Add --parallel N flag to vera-bench run

MarkEdmondson1234 · claude · MarkEdmondson1234 · commit a367de831676 · 2026-05-21T14:25:49.000+02:00
Run N problems concurrently via ThreadPoolExecutor. Each worker
is I/O-bound on its LLM HTTP call + subprocess-based check/run,
so the GIL is not a bottleneck.

Use case: slow models like Kimi K2.5 averaged 49s/problem
sequentially across the 60-problem AILANG sweep (~50 min total).
With --parallel 10 the same sweep should drop to ~5 min, which
makes release-time re-evals practical.

Implementation:
- ThreadPoolExecutor with max_workers=parallel
- Per-problem futures collected via as_completed
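- threading.Lock around the JSONL append. Results are written
  from the main thread inside the as_completed loop, so the lock
  is a defensive guard rather than a strict requirement. Lines
  are still self-contained (carry problem_id) so
  completion-order writes are fine.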
- Workers share the same work_dir; per-problem temp files are
  uniquified by problem_id (existing behavior).
- Exception per worker is caught and logged; the sweep continues.
  A failed problem contributes no results and no JSONL lines.

Default parallel=1 preserves the existing sequential path with
no behavior change.

Smoke-tested with claude-haiku-4-5 --tier 1 --parallel 4:
10/10 problems, no duplicates, 100%/100% run_correct.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
diff --git a/vera_bench/cli.py b/vera_bench/cli.py
@@ -76,6 +76,17 @@ def validate(problems_dir: Path | None, solutions_dir: Path | None):
     is_flag=True,
     help="Keep temporary generated files",
 )
+@click.option(
+    "--parallel",
+    type=int,
+    default=1,
+    show_default=True,
+    help=(
+        "Run N problems concurrently via ThreadPoolExecutor. "
+        "Use >1 for slow models (e.g. Kimi K2.5). "
+        "Each worker is I/O-bound on its LLM call + subprocess runs."
+    ),
+)
 def run(
     model: str,
     tier: int | None,
@@ -86,6 +97,7 @@ def run(
     output_dir: Path | None,
     max_tokens: int,
     keep_temps: bool,
+    parallel: int,
 ):
     """Run benchmark against an LLM model."""
     from vera_bench.metrics import compute_metrics
@@ -274,6 +286,7 @@ def _ver_slug(v: str) -> str:
         keep_temps=keep_temps,
         bench_version=bench_ver,
         vera_version=vera_ver,
+        parallel=parallel,
     )
 
     # Print summary
diff --git a/vera_bench/runner.py b/vera_bench/runner.py
@@ -1108,20 +1108,62 @@ def run_benchmark(
     keep_temps: bool = False,
     bench_version: str = "",
     vera_version: str = "",
+    parallel: int = 1,
 ) -> list[ProblemResult]:
     """Run the full benchmark across all problems.
 
     Results are written to JSONL incrementally (survives crashes).
+
+    When ``parallel > 1``, problems are dispatched to a ThreadPoolExecutor
+    with ``parallel`` workers. Each problem runs independently (its own
+    LLM call, its own subprocess-based check/run), so threads only block
+    on I/O (HTTP to the LLM provider, subprocess spawns to the toolchain).
+    The GIL is not a bottleneck. Use this when sweeping slow models —
+    e.g. Kimi K2.5 at ~50s/problem sequential becomes ~5s/problem with
+    parallel=10.
+
+    JSONL output ordering is by completion order, not by problem index,
+    when running in parallel. Each line is self-contained (carries
+    ``problem_id``) so downstream consumers can sort if needed.
     """
     work_dir = Path(tempfile.mkdtemp(prefix="verabench_"))
     all_results: list[ProblemResult] = []
 
     try:
-        with Progress(console=console) as progress:
-            task = progress.add_task("Running benchmark...", total=len(problems))
-            for problem in problems:
-                problem_results = run_single_problem(
-                    problem=problem,
+        if parallel <= 1:
+            with Progress(console=console) as progress:
+                task = progress.add_task("Running benchmark...", total=len(problems))
+                for problem in problems:
+                    problem_results = run_single_problem(
+                        problem=problem,
+                        client=client,
+                        skill_md=skill_md,
+                        vera=vera,
+                        work_dir=work_dir,
+                        mode=mode,
+                        language=language,
+                        max_fix_attempts=max_fix_attempts,
+                        max_tokens=max_tokens,
+                        bench_version=bench_version,
+                        vera_version=vera_version,
+                    )
+                    all_results.extend(problem_results)
+
+                    if output_path:
+                        with open(output_path, "a", encoding="utf-8") as f:
+                            for r in problem_results:
+                                f.write(r.to_jsonl() + "\n")
+
+                    progress.advance(task)
+        else:
+            import threading
+            from concurrent.futures import ThreadPoolExecutor, as_completed
+
+            write_lock = threading.Lock()
+
+            def _run_one(p: dict) -> list[ProblemResult]:
+                return run_single_problem(
+                    problem=p,
                     client=client,
                     skill_md=skill_md,
                     vera=vera,
@@ -1133,15 +1175,32 @@ def run_benchmark(
                     bench_version=bench_version,
                     vera_version=vera_version,
                 )
-                all_results.extend(problem_results)
-
-                # Write JSONL incrementally
-                if output_path:
-                    with open(output_path, "a", encoding="utf-8") as f:
-                        for r in problem_results:
-                            f.write(r.to_jsonl() + "\n")
 
-                progress.advance(task)
+            with Progress(console=console) as progress:
+                task = progress.add_task(
+                    f"Running benchmark (parallel={parallel})...",
+                    total=len(problems),
+                )
+                with ThreadPoolExecutor(max_workers=parallel) as executor:
+                    futures = {executor.submit(_run_one, p): p for p in problems}
+                    for fut in as_completed(futures):
+                        try:
+                            problem_results = fut.result()
+                        except Exception as exc:  # noqa: BLE001
+                            pid = futures[fut].get("id", "?")
+                            console.print(
+                                f"[red]Worker failed on {pid}: {exc}[/red]"
+                            )
+                            progress.advance(task)
+                            continue
+                        all_results.extend(problem_results)
+                        if output_path:
+                            with write_lock, open(
+                                output_path, "a", encoding="utf-8"
+                            ) as f:
+                                for r in problem_results:
+                                    f.write(r.to_jsonl() + "\n")
+                        progress.advance(task)
     finally:
         if not keep_temps:
             shutil.rmtree(work_dir, ignore_errors=True)