code review fixes for PR NousResearch#13

code-review-bot · ampagent · code-review-bot · commit 7490cc3abc00 · 2026-04-27T18:41:08.000+09:00
- safe_print: strip rich markup tags when rich is unavailable so plain fallback output doesn't leak literal '[bold red]...[/bold red]' tags
- profiling.aggregate_profiling_stats: stop reconstructing per-call timings by repeating the mean (statistically wrong; gave incorrect min/max/median across workers). Combine summary stats directly and flag median as approximate via median_time_approximate
- run_agent: hoist 'reset_profiler' import to module level; drop noisy 'dir(tc)' / 'model_dump()' debug logging spam from the verbose path
- batch_runner: drop unused 'import re'; replace bare 'except:' with specific (JSONDecodeError, TypeError, AttributeError) and guard isinstance(content, str) before calling .strip()
- tools/simple_terminal_tool: replace bare 'except:' with 'except Exception' on the SSH-context cleanup paths

Amp-Thread-ID: https://ampcode.com/threads/T-019dce4d-5fc2-703c-b2e4-b8a87ec42105
Co-authored-by: Amp <amp@ampcode.com>
diff --git a/batch_runner.py b/batch_runner.py
@@ -35,7 +35,6 @@
 from datetime import datetime
 from multiprocessing import Pool, Manager, Lock
 import traceback
-import re
 
 import fire
 
@@ -187,9 +186,9 @@ def _extract_tool_errors_from_messages(messages: List[Dict[str, Any]]) -> List[D
                             if not error_msg:
                                 error_msg = str(content_json.get("message", content_json.get("error", "Unknown error")))
 
-            except:
+            except (json.JSONDecodeError, TypeError, AttributeError):
                 # If not JSON, check if content explicitly states an error
-                if content.strip().lower().startswith("error:"):
+                if isinstance(content, str) and content.strip().lower().startswith("error:"):
                     has_error = True
                     error_msg = content.strip()
 
@@ -275,13 +274,13 @@ def _extract_tool_stats(messages: List[Dict[str, Any]]) -> Dict[str, Dict[str, i
                         if content_json.get("success") is False:
                             is_success = False
 
-            except:
+            except (json.JSONDecodeError, TypeError, AttributeError):
                 # If not JSON, check if content is empty or explicitly states an error
                 # Note: We avoid simple substring matching to prevent false positives
                 if not content:
                     is_success = False
                 # Only mark as failure if it explicitly starts with "Error:" or "ERROR:"
-                elif content.strip().lower().startswith("error:"):
+                elif isinstance(content, str) and content.strip().lower().startswith("error:"):
                     is_success = False
             
             # Update success/failure count
diff --git a/profiling.py b/profiling.py
@@ -260,64 +260,68 @@ def aggregate_profiling_stats(stats_list: List[Dict]) -> Dict:
     Returns:
         Dict: Aggregated statistics with combined tool and API call data
     """
-    aggregated = {
-        "tools": defaultdict(lambda: {"times": []}),
-        "api_calls": {"times": []}
-    }
-
-    # Aggregate tool statistics
-    for stats in stats_list:
-        # Aggregate tool timings
-        for tool_name, tool_stats in stats.get("tools", {}).items():
-            # Reconstruct individual timings from aggregated stats
-            # Since we have mean_time and call_count, we approximate
-            aggregated["tools"][tool_name]["times"].extend(
-                [tool_stats.get("mean_time", 0.0)] * tool_stats.get("call_count", 0)
-            )
-
-        # Aggregate API call timings
-        api_stats = stats.get("api_calls", {})
-        if api_stats.get("call_count", 0) > 0:
-            aggregated["api_calls"]["times"].extend(
-                [api_stats.get("mean_time", 0.0)] * api_stats.get("call_count", 0)
-            )
-
-    # Calculate final statistics for tools
-    final_stats = {"tools": {}, "api_calls": {}}
-
-    for tool_name, data in aggregated["tools"].items():
-        times = data["times"]
-        if times:
-            final_stats["tools"][tool_name] = {
-                "call_count": len(times),
-                "total_time": sum(times),
-                "min_time": min(times),
-                "max_time": max(times),
-                "mean_time": statistics.mean(times),
-                "median_time": statistics.median(times)
-            }
+    # Note: per-call timings are not preserved across worker boundaries, so we
+    # combine the per-conversation summary stats directly. This gives correct
+    # call_count/total_time/min/max/mean. ``median`` cannot be reconstructed
+    # exactly from summaries; we surface mean as a best-effort approximation
+    # and flag it via the ``median_time_approximate`` field.
 
-    # Calculate final statistics for API calls
-    api_times = aggregated["api_calls"]["times"]
-    if api_times:
-        final_stats["api_calls"] = {
-            "call_count": len(api_times),
-            "total_time": sum(api_times),
-            "min_time": min(api_times),
-            "max_time": max(api_times),
-            "mean_time": statistics.mean(api_times),
-            "median_time": statistics.median(api_times)
-        }
-    else:
-        final_stats["api_calls"] = {
+    def _empty():
+        return {
             "call_count": 0,
             "total_time": 0.0,
-            "min_time": 0.0,
+            "min_time": float("inf"),
             "max_time": 0.0,
-            "mean_time": 0.0,
-            "median_time": 0.0
         }
 
+    tool_acc: Dict[str, Dict] = defaultdict(_empty)
+    api_acc = _empty()
+
+    def _merge(acc: Dict, summary: Dict) -> None:
+        count = summary.get("call_count", 0)
+        if count <= 0:
+            return
+        acc["call_count"] += count
+        acc["total_time"] += summary.get("total_time", 0.0)
+        # Reached only when the source recorded calls (early return above); a
+        # source with no calls carries the sentinel min_time 0.0 from to_dict().
+        acc["min_time"] = min(acc["min_time"], summary.get("min_time", float("inf")))
+        acc["max_time"] = max(acc["max_time"], summary.get("max_time", 0.0))
+
+    for stats in stats_list:
+        for tool_name, tool_stats in stats.get("tools", {}).items():
+            _merge(tool_acc[tool_name], tool_stats)
+        _merge(api_acc, stats.get("api_calls", {}))
+
+    def _finalize(acc: Dict) -> Dict:
+        count = acc["call_count"]
+        if count == 0:
+            return {
+                "call_count": 0,
+                "total_time": 0.0,
+                "min_time": 0.0,
+                "max_time": 0.0,
+                "mean_time": 0.0,
+                "median_time": 0.0,
+                "median_time_approximate": True,
+            }
+        mean_time = acc["total_time"] / count
+        return {
+            "call_count": count,
+            "total_time": acc["total_time"],
+            "min_time": acc["min_time"] if acc["min_time"] != float("inf") else 0.0,
+            "max_time": acc["max_time"],
+            "mean_time": mean_time,
+            # Real median requires per-call data we don't carry across workers.
+            "median_time": mean_time,
+            "median_time_approximate": True,
+        }
+
+    final_stats = {
+        "tools": {name: _finalize(acc) for name, acc in tool_acc.items()},
+        "api_calls": _finalize(api_acc),
+    }
+
     return final_stats
 
 
diff --git a/run_agent.py b/run_agent.py
@@ -46,7 +46,7 @@
 from tools.terminal_tool import cleanup_vm
 
 # Import profiling
-from profiling import get_profiler
+from profiling import get_profiler, reset_profiler
 
 
 class AIAgent:
@@ -368,8 +368,7 @@ def run_conversation(
             Dict: Complete conversation result with final response and message history
         """
         # Reset profiler for this conversation to get fresh stats
-        from profiling import reset_profiler as reset_prof
-        reset_prof()
+        reset_profiler()
 
         # Generate unique task_id if not provided to isolate VMs between concurrent tasks
         import uuid
@@ -461,11 +460,6 @@ def run_conversation(
                     if self.verbose_logging:
                         for tc in assistant_message.tool_calls:
                             logging.debug(f"Tool call: {tc.function.name} with args: {tc.function.arguments[:200]}...")
-                            # Debug: Check what attributes are available on tool_call
-                            logging.debug(f"Tool call attributes: {dir(tc)}")
-                            # Try to dump the model to see all fields
-                            if hasattr(tc, 'model_dump'):
-                                logging.debug(f"Tool call data: {tc.model_dump()}")
                     
                     # Add assistant message with tool calls to conversation
                     # Extract thought_signature if present (required for Gemini models)
diff --git a/safe_print.py b/safe_print.py
@@ -1,5 +1,12 @@
 #!/usr/bin/env python3
-"""Simple safe print that tries rich, falls back to regular print."""
+"""Simple safe print that tries rich, falls back to regular print.
+
+When rich is unavailable, any rich-style markup like ``[bold red]...[/bold red]``
+is stripped from string arguments so the plain output stays readable instead of
+leaking literal tags.
+"""
+
+import re
 
 try:
     from rich import print as rich_print
@@ -8,13 +15,25 @@
     RICH_AVAILABLE = False
 
 
+# Matches rich markup tags like ``[bold red]``, ``[/bold red]``, ``[/]``, etc.
+# Strips any bracketed token made only of letters, digits, spaces, and ``#_/-``
+# (so non-markup text such as ``[0]``, ``[INFO]`` or ``[]`` is stripped too).
+_RICH_MARKUP_RE = re.compile(r"\[/?[a-zA-Z0-9 #_/-]*\]")
+
+
+def _strip_markup(arg):
+    if isinstance(arg, str):
+        return _RICH_MARKUP_RE.sub("", arg)
+    return arg
+
+
 def safe_print(*args, **kwargs):
-    """Try rich.print, fall back to regular print if it fails."""
+    """Try rich.print, fall back to regular print (with markup stripped)."""
     if RICH_AVAILABLE:
         try:
             rich_print(*args, **kwargs)
             return
         except Exception:
             pass
-    # Fallback to regular print
-    print(*args, **kwargs)
+    # Fallback: strip rich markup so we don't print literal "[bold red]..." tags
+    print(*(_strip_markup(a) for a in args), **kwargs)
diff --git a/tools/simple_terminal_tool.py b/tools/simple_terminal_tool.py
@@ -196,7 +196,7 @@ def _execute_ssh_command(instance, command: str, timeout: Optional[int] = None)
         if ssh_context_manager:
             try:
                 ssh_context_manager.__exit__(None, None, None)
-            except:
+            except Exception:
                 pass
 
         return {
@@ -210,7 +210,7 @@ def _execute_ssh_command(instance, command: str, timeout: Optional[int] = None)
         if ssh_context_manager:
             try:
                 ssh_context_manager.__exit__(None, None, None)
-            except:
+            except Exception:
                 pass
 
         # Check if it's a timeout