GH-39303: [Archery][Benchmarking] Allow setting C++ repetition min time (#39324)

pitrou · web-flow · commit 1c1fa746f8d1 · 2024-01-07T06:58:21.000+09:00
### Rationale for this change

We want to be able to increase the number of repetitions for each C++ micro-benchmark without increasing the total runtime.

### What changes are included in this PR?

* Add a `--repetition-min-time` argument to set the minimum duration of each repetition, in seconds
* Add a `--cpp-benchmark-extras` argument to pass arbitrary arguments to Google Benchmark executables
* Add a couple of tests with multiple benchmark repetitions

### Are these changes tested?

Not entirely. Command-line argument passing is not unit-tested.

### Are there any user-facing changes?

No.

* Closes: #39303

Authored-by: Antoine Pitrou <antoine@python.org>
Signed-off-by: Sutou Kouhei <kou@clear-code.com>
diff --git a/.github/workflows/archery.yml b/.github/workflows/archery.yml
@@ -63,7 +63,9 @@ jobs:
       - name: Install pygit2 binary wheel
         run: pip install pygit2 --only-binary pygit2
       - name: Install Archery, Crossbow- and Test Dependencies
-        run: pip install pytest responses -e dev/archery[all]
+        run: |
+          pip install -e dev/archery[all]
+          pip install -r dev/archery/requirements-test.txt
       - name: Archery Unittests
         working-directory: dev/archery
         run: pytest -v archery
diff --git a/dev/archery/archery/benchmark/google.py b/dev/archery/archery/benchmark/google.py
@@ -37,9 +37,10 @@ class GoogleBenchmarkCommand(Command):
     notably `--benchmark_filter`, `--benchmark_format`, etc...
     """
 
-    def __init__(self, benchmark_bin, benchmark_filter=None):
+    def __init__(self, benchmark_bin, benchmark_filter=None, benchmark_extras=None):
         self.bin = benchmark_bin
         self.benchmark_filter = benchmark_filter
+        self.benchmark_extras = benchmark_extras or []
 
     def list_benchmarks(self):
         argv = ["--benchmark_list_tests"]
@@ -49,16 +50,19 @@ def list_benchmarks(self):
                           stderr=subprocess.PIPE)
         return str.splitlines(result.stdout.decode("utf-8"))
 
-    def results(self, repetitions=1):
+    def results(self, repetitions=1, repetition_min_time=None):
         with NamedTemporaryFile() as out:
-            argv = ["--benchmark_repetitions={}".format(repetitions),
-                    "--benchmark_out={}".format(out.name),
+            argv = [f"--benchmark_repetitions={repetitions}",
+                    f"--benchmark_out={out.name}",
                     "--benchmark_out_format=json"]
 
+            if repetition_min_time is not None:
+                argv.append(f"--benchmark_min_time={repetition_min_time:.6f}")
+
             if self.benchmark_filter:
-                argv.append(
-                    "--benchmark_filter={}".format(self.benchmark_filter)
-                )
+                argv.append(f"--benchmark_filter={self.benchmark_filter}")
+
+            argv += self.benchmark_extras
 
             self.run(*argv, check=True)
             return json.load(out)
diff --git a/dev/archery/archery/benchmark/runner.py b/dev/archery/archery/benchmark/runner.py
@@ -42,10 +42,11 @@ def regex_filter(re_expr):
 
 class BenchmarkRunner:
     def __init__(self, suite_filter=None, benchmark_filter=None,
-                 repetitions=DEFAULT_REPETITIONS):
+                 repetitions=DEFAULT_REPETITIONS, repetition_min_time=None):
         self.suite_filter = suite_filter
         self.benchmark_filter = benchmark_filter
         self.repetitions = repetitions
+        self.repetition_min_time = repetition_min_time
 
     @property
     def suites(self):
@@ -107,9 +108,10 @@ def __repr__(self):
 class CppBenchmarkRunner(BenchmarkRunner):
     """ Run suites from a CMakeBuild. """
 
-    def __init__(self, build, **kwargs):
+    def __init__(self, build, benchmark_extras, **kwargs):
         """ Initialize a CppBenchmarkRunner. """
         self.build = build
+        self.benchmark_extras = benchmark_extras
         super().__init__(**kwargs)
 
     @staticmethod
@@ -142,14 +144,17 @@ def suites_binaries(self):
 
     def suite(self, name, suite_bin):
         """ Returns the resulting benchmarks for a given suite. """
-        suite_cmd = GoogleBenchmarkCommand(suite_bin, self.benchmark_filter)
+        suite_cmd = GoogleBenchmarkCommand(suite_bin, self.benchmark_filter,
+                                           self.benchmark_extras)
 
         # Ensure there will be data
         benchmark_names = suite_cmd.list_benchmarks()
         if not benchmark_names:
             return None
 
-        results = suite_cmd.results(repetitions=self.repetitions)
+        results = suite_cmd.results(
+            repetitions=self.repetitions,
+            repetition_min_time=self.repetition_min_time)
         benchmarks = GoogleBenchmark.from_json(results.get("benchmarks"))
         return BenchmarkSuite(name, benchmarks)
 
@@ -252,6 +257,7 @@ def suite(self, name):
         if not benchmark_names:
             return None
 
+        # TODO: support `repetition_min_time`
         results = suite_cmd.results(repetitions=self.repetitions)
         benchmarks = JavaMicrobenchmarkHarness.from_json(results)
         return BenchmarkSuite(name, benchmarks)
diff --git a/dev/archery/archery/cli.py b/dev/archery/archery/cli.py
@@ -377,7 +377,10 @@ def check_language(ctx, param, value):
                      "Can be stacked. For language=java"),
         click.option("--cmake-extras", type=str, multiple=True,
                      help="Extra flags/options to pass to cmake invocation. "
-                     "Can be stacked. For language=cpp")
+                     "Can be stacked. For language=cpp"),
+        click.option("--cpp-benchmark-extras", type=str, multiple=True,
+                     help="Extra flags/options to pass to C++ benchmark executables. "
+                     "Can be stacked. For language=cpp"),
     ]
 
     cmd = java_toolchain_options(cmd)
@@ -440,12 +443,16 @@ def benchmark_list(ctx, rev_or_path, src, preserve, output, cmake_extras,
 @click.option("--repetitions", type=int, default=-1,
               help=("Number of repetitions of each benchmark. Increasing "
                     "may improve result precision. "
-                    "[default: 1 for cpp, 5 for java"))
+                    "[default: 1 for cpp, 5 for java]"))
+@click.option("--repetition-min-time", type=float, default=None,
+              help=("Minimum duration of each repetition in seconds. "
+                    "Currently only supported for language=cpp. "
+                    "[default: use runner-specific defaults]"))
 @click.pass_context
 def benchmark_run(ctx, rev_or_path, src, preserve, output, cmake_extras,
                   java_home, java_options, build_extras, benchmark_extras,
                   language, suite_filter, benchmark_filter, repetitions,
-                  **kwargs):
+                  repetition_min_time, cpp_benchmark_extras, **kwargs):
     """ Run benchmark suite.
 
     This command will run the benchmark suite for a single build. This is
@@ -468,13 +475,18 @@ def benchmark_run(ctx, rev_or_path, src, preserve, output, cmake_extras,
     \b
     archery benchmark run
 
+    \b
+    # Run the benchmarks on an existing build directory
+    \b
+    archery benchmark run /build/cpp
+
     \b
     # Run the benchmarks on current previous commit
     \b
     archery benchmark run HEAD~1
 
     \b
-    # Run the benchmarks on current previous commit
+    # Run the benchmarks on current git workspace and output results as a JSON file.
     \b
     archery benchmark run --output=run.json
     """
@@ -488,8 +500,9 @@ def benchmark_run(ctx, rev_or_path, src, preserve, output, cmake_extras,
             repetitions = repetitions if repetitions != -1 else 1
             runner_base = CppBenchmarkRunner.from_rev_or_path(
                 src, root, rev_or_path, conf,
-                repetitions=repetitions,
-                suite_filter=suite_filter, benchmark_filter=benchmark_filter)
+                repetitions=repetitions, repetition_min_time=repetition_min_time,
+                suite_filter=suite_filter, benchmark_filter=benchmark_filter,
+                benchmark_extras=cpp_benchmark_extras)
 
         elif language == "java":
             for key in {'cpp_package_prefix', 'cxx_flags', 'cxx', 'cc'}:
diff --git a/dev/archery/archery/tests/test_benchmarks.py b/dev/archery/archery/tests/test_benchmarks.py
@@ -81,6 +81,53 @@ def test_static_runner_from_json_not_a_regression():
     assert not comparison.regression
 
 
+def test_static_runner_from_json_multiple_values_not_a_regression():
+    # Same as above, but with multiple repetitions
+    archery_result = {
+        "suites": [
+            {
+                "name": "arrow-value-parsing-benchmark",
+                "benchmarks": [
+                    {
+                        "name": "FloatParsing<DoubleType>",
+                        "unit": "items_per_second",
+                        "less_is_better": False,
+                        "values": [
+                            93588476.22327498,
+                            94873831.3818328,
+                            95593675.20810866,
+                            95797325.6543961,
+                            96134728.05794072
+                        ],
+                        "time_unit": "ns",
+                        "times": [
+                            10537.724568456104,
+                            10575.162068480413,
+                            10599.271208720838,
+                            10679.028059166194,
+                            10827.995119861762
+                        ],
+                        "counters": {
+                            "family_index": 0,
+                            "per_family_instance_index": 0,
+                            "run_name": "FloatParsing<DoubleType>",
+                            "repetitions": 5,
+                            "repetition_index": 0,
+                            "threads": 1,
+                            "iterations": 10656
+                        }
+                    }
+                ]
+            }
+        ]
+    }
+
+    contender = StaticBenchmarkRunner.from_json(json.dumps(archery_result))
+    baseline = StaticBenchmarkRunner.from_json(json.dumps(archery_result))
+    [comparison] = RunnerComparator(contender, baseline).comparisons
+    assert not comparison.regression
+
+
 def test_static_runner_from_json_regression():
     archery_result = {
         "suites": [
@@ -114,6 +161,58 @@ def test_static_runner_from_json_regression():
     assert comparison.regression
 
 
+def test_static_runner_from_json_multiple_values_regression():
+    # Same as above, but with multiple repetitions
+    archery_result = {
+        "suites": [
+            {
+                "name": "arrow-value-parsing-benchmark",
+                "benchmarks": [
+                    {
+                        "name": "FloatParsing<DoubleType>",
+                        "unit": "items_per_second",
+                        "less_is_better": False,
+                        "values": [
+                            93588476.22327498,
+                            94873831.3818328,
+                            95593675.20810866,
+                            95797325.6543961,
+                            96134728.05794072
+                        ],
+                        "time_unit": "ns",
+                        "times": [
+                            10537.724568456104,
+                            10575.162068480413,
+                            10599.271208720838,
+                            10679.028059166194,
+                            10827.995119861762
+                        ],
+                        "counters": {
+                            "family_index": 0,
+                            "per_family_instance_index": 0,
+                            "run_name": "FloatParsing<DoubleType>",
+                            "repetitions": 5,
+                            "repetition_index": 0,
+                            "threads": 1,
+                            "iterations": 10656
+                        }
+                    }
+                ]
+            }
+        ]
+    }
+
+    contender = StaticBenchmarkRunner.from_json(json.dumps(archery_result))
+
+    # introduce artificial regression
+    values = archery_result['suites'][0]['benchmarks'][0]['values']
+    values[:] = [v * 2 for v in values]
+    baseline = StaticBenchmarkRunner.from_json(json.dumps(archery_result))
+
+    [comparison] = RunnerComparator(contender, baseline).comparisons
+    assert comparison.regression
+
+
 def test_benchmark_median():
     assert median([10]) == 10
     assert median([1, 2, 3]) == 2
@@ -381,3 +480,77 @@ def test_omits_aggregates():
     benchmark = GoogleBenchmark(name, [observation1, observation2])
     result = json.dumps(benchmark, cls=JsonEncoder)
     assert json.loads(result) == archery_result
+
+
+def test_multiple_observations():
+    name = "FloatParsing<DoubleType>"
+    google_results = [
+        {
+            'cpu_time': 10627.38199641615,
+            'family_index': 0,
+            'items_per_second': 94096551.75067839,
+            'iterations': 9487,
+            'name': 'FloatParsing<DoubleType>',
+            'per_family_instance_index': 0,
+            'real_time': 10628.84905663701,
+            'repetition_index': 0,
+            'repetitions': 3,
+            'run_name': 'FloatParsing<DoubleType>',
+            'run_type': 'iteration',
+            'threads': 1,
+            'time_unit': 'ns'
+        },
+        {
+            'cpu_time': 10633.318014124594,
+            'family_index': 0,
+            'items_per_second': 94044022.63448404,
+            'iterations': 9487,
+            'name': 'FloatParsing<DoubleType>',
+            'per_family_instance_index': 0,
+            'real_time': 10634.858754122948,
+            'repetition_index': 1,
+            'repetitions': 3,
+            'run_name': 'FloatParsing<DoubleType>',
+            'run_type': 'iteration',
+            'threads': 1,
+            'time_unit': 'ns'
+        },
+        {
+            'cpu_time': 10664.315484347,
+            'family_index': 0,
+            'items_per_second': 93770669.24434038,
+            'iterations': 9487,
+            'name': 'FloatParsing<DoubleType>',
+            'per_family_instance_index': 0,
+            'real_time': 10665.584589337563,
+            'repetition_index': 2,
+            'repetitions': 3,
+            'run_name': 'FloatParsing<DoubleType>',
+            'run_type': 'iteration',
+            'threads': 1,
+            'time_unit': 'ns'
+        }
+    ]
+
+    archery_result = {
+        'counters': {
+            'family_index': 0,
+            'iterations': 9487,
+            'per_family_instance_index': 0,
+            'repetition_index': 2,
+            'repetitions': 3,
+            'run_name': 'FloatParsing<DoubleType>',
+            'threads': 1
+        },
+        'less_is_better': False,
+        'name': 'FloatParsing<DoubleType>',
+        'time_unit': 'ns',
+        'times': [10628.84905663701, 10634.858754122948, 10665.584589337563],
+        'unit': 'items_per_second',
+        'values': [93770669.24434038, 94044022.63448404, 94096551.75067839]
+    }
+
+    observations = [GoogleBenchmarkObservation(**g) for g in google_results]
+    benchmark = GoogleBenchmark(name, observations)
+    result = json.dumps(benchmark, cls=JsonEncoder)
+    assert json.loads(result) == archery_result
diff --git a/dev/archery/requirements-test.txt b/dev/archery/requirements-test.txt
@@ -0,0 +1,2 @@
+pytest
+responses