fix(bench): address review feedback and seed ubuntu baselines

clean6378-max-it · clean6378-max-it · commit fcda8edcb005 · 2026-06-26T05:21:11.000+08:00
diff --git a/.gitignore b/.gitignore
@@ -46,3 +46,4 @@ coverage.xml
 .hypothesis/
 benchmark-results.json
 benchmarks/_raw.json
+benchmarks/_ci/
diff --git a/benchmarks/baselines.json b/benchmarks/baselines.json
@@ -1,29 +1,29 @@
 {
-  "_note": "Gated means seeded locally (Windows, 1.5× slack) — refresh from ubuntu-latest CI benchmark-results.json artifact before merge. Excluded from gate: test_list_workspace_projects_nocache[composers-10], test_search_full_corpus.",
-  "updated": "2026-06-25T20:34:07Z",
-  "machine": "Windows",
+  "_note": "Gated means from ubuntu-latest CI benchmark-results.json. Values multiplied by 1.5\u00d7 slack at generation time. Excluded from gate (recorded for reference): test_list_workspace_projects_nocache[composers-10], test_search_full_corpus. Refresh after intentional speedups via reduce_baselines.py.",
+  "updated": "2026-06-25T21:14:16Z",
+  "machine": "Linux",
   "groups": {
     "parse": {
-      "test_list_workspace_projects_nocache[composers-10]": 0.01313006085768828,
-      "test_list_workspace_projects_nocache[composers-50]": 0.04705098008271307,
-      "test_list_workspace_projects_nocache[composers-200]": 0.19944224995560944
+      "test_list_workspace_projects_nocache[composers-10]": 0.01702312019643009,
+      "test_list_workspace_projects_nocache[composers-50]": 0.07538331990000699,
+      "test_list_workspace_projects_nocache[composers-200]": 0.251991555999993
     },
     "export": {
-      "test_post_export_zip[composers-10]": 0.0170322916819714,
-      "test_post_export_zip[composers-50]": 0.040990050032269215
+      "test_post_export_zip[composers-10]": 0.0112034034344294,
+      "test_post_export_zip[composers-50]": 0.04482855966665985
     },
     "search": {
-      "test_search_full_corpus": 0.057670830062124874
+      "test_search_full_corpus": 0.047164217833331655
     },
     "summary-cache": {
-      "test_summary_cache_lookup[hit]": 0.00014543285277406022,
-      "test_summary_cache_lookup[miss]": 0.0001437347241805802,
-      "test_fingerprint_workspace_entries[10]": 0.001866654586096193,
-      "test_fingerprint_workspace_entries[50]": 0.00636450619807407,
-      "test_fingerprint_workspace_entries[200]": 0.020523441289855247,
-      "test_summary_cache_round_trip": 0.0019650292328056915,
-      "test_tab_summary_cache_lookup[hit]": 0.00015344636292124477,
-      "test_tab_summary_cache_lookup[miss]": 0.00012440098537902896
+      "test_summary_cache_lookup[hit]": 9.224067718099102e-05,
+      "test_summary_cache_lookup[miss]": 9.128770315496628e-05,
+      "test_fingerprint_workspace_entries[10]": 0.0024789120309553535,
+      "test_fingerprint_workspace_entries[50]": 0.010901568931818675,
+      "test_fingerprint_workspace_entries[200]": 0.03069810573000666,
+      "test_summary_cache_round_trip": 0.0004966099535917549,
+      "test_tab_summary_cache_lookup[hit]": 0.00010487297799045405,
+      "test_tab_summary_cache_lookup[miss]": 0.00010309520517204601
     }
   }
 }
diff --git a/scripts/check_benchmark_regression.py b/scripts/check_benchmark_regression.py
@@ -179,6 +179,12 @@ def main(argv: list[str] | None = None) -> int:
         help="fail when current mean is below this fraction of baseline (default: 0.50)",
     )
     args = parser.parse_args(argv)
+    if args.threshold <= 1:
+        print("ERROR: --threshold must be greater than 1", file=sys.stderr)
+        return 2
+    if not 0 < args.stale_floor < 1:
+        print("ERROR: --stale-floor must be between 0 and 1 (exclusive)", file=sys.stderr)
+        return 2
     try:
         return check_regression(
             args.results_path,
diff --git a/scripts/reduce_baselines.py b/scripts/reduce_baselines.py
@@ -33,6 +33,7 @@ def reduce_baselines(
     out_path: str | Path,
     *,
     slack: float = 1.0,
+    source: str = "local",
 ) -> dict[str, object]:
     path = Path(raw_path)
     try:
@@ -70,9 +71,14 @@ def reduce_baselines(
     slack_note = f" Values multiplied by {slack}× slack at generation time." if slack != 1.0 else ""
     machine_info = raw.get("machine_info")
     machine = machine_info.get("system") if isinstance(machine_info, dict) else None
+    source_labels = {
+        "ubuntu-latest-ci": "ubuntu-latest CI benchmark-results.json",
+        "local": "local benchmark-results.json",
+    }
+    source_label = source_labels.get(source, source)
     output: dict[str, object] = {
         "_note": (
-            "Gated means from ubuntu-latest CI benchmark-results.json."
+            f"Gated means from {source_label}."
             f"{slack_note} "
             f"Excluded from gate (recorded for reference): {excluded}. "
             "Refresh after intentional speedups via reduce_baselines.py."
@@ -99,9 +105,14 @@ def main(argv: list[str] | None = None) -> int:
         default=1.0,
         help="multiply means by this factor (must be > 0)",
     )
+    parser.add_argument(
+        "--source",
+        default="local",
+        help="provenance label for _note (e.g. ubuntu-latest-ci, local)",
+    )
     args = parser.parse_args(argv)
     try:
-        reduce_baselines(args.raw_path, args.out_path, slack=args.slack)
+        reduce_baselines(args.raw_path, args.out_path, slack=args.slack, source=args.source)
     except BenchmarkDataError as exc:
         print(f"ERROR: {exc}", file=sys.stderr)
         return 2
diff --git a/tests/benchmarks/test_parse_bench.py b/tests/benchmarks/test_parse_bench.py
@@ -23,5 +23,6 @@ def test_list_workspace_projects_nocache(
     def _run() -> object:
         return list_workspace_projects(workspace_path, [], nocache=True)
 
-    projects, _warnings = benchmark(_run)
+    projects, warnings = benchmark(_run)
     assert isinstance(projects, list) and len(projects) > 0
+    assert warnings == []
diff --git a/tests/benchmarks/test_summary_cache_bench.py b/tests/benchmarks/test_summary_cache_bench.py
@@ -29,7 +29,14 @@ def test_summary_cache_lookup(
     """Time ``get_cached_projects`` only; miss = fingerprint mismatch, not rebuild."""
     set_cached_projects(workspace_fingerprint, sample_projects, [])
     lookup_fp = workspace_fingerprint if mode == "hit" else stale_fingerprint
-    benchmark(get_cached_projects, lookup_fp)
+    result = benchmark(get_cached_projects, lookup_fp)
+    if mode == "hit":
+        assert result is not None
+        projects, warnings = result
+        assert projects == sample_projects
+        assert warnings == []
+    else:
+        assert result is None
 
 
 @pytest.mark.benchmark(group="summary-cache")
@@ -82,4 +89,11 @@ def test_tab_summary_cache_lookup(
     payload = {"tabs": [{"id": "cmp_0000", "title": "Bench"}]}
     set_cached_tab_summaries(workspace_fingerprint, workspace_id, payload, 200)
     lookup_fp = workspace_fingerprint if mode == "hit" else stale_fingerprint
-    benchmark(get_cached_tab_summaries, lookup_fp, workspace_id)
+    result = benchmark(get_cached_tab_summaries, lookup_fp, workspace_id)
+    if mode == "hit":
+        assert result is not None
+        cached_payload, status = result
+        assert status == 200
+        assert cached_payload == payload
+    else:
+        assert result is None
diff --git a/tests/test_check_benchmark_regression.py b/tests/test_check_benchmark_regression.py
@@ -251,3 +251,27 @@ def test_excluded_benchmark_not_gated(tmp_path, capsys: pytest.CaptureFixture[st
     out = capsys.readouterr().out
     assert "REGRESSION" not in out
     assert "STALE" not in out
+
+
+def test_main_rejects_invalid_threshold(tmp_path, capsys: pytest.CaptureFixture[str]) -> None:
+    from scripts.check_benchmark_regression import main
+
+    results = tmp_path / "results.json"
+    baselines = tmp_path / "baselines.json"
+    _write_results(results, [{"name": GATED_BENCH, "stats": {"mean": 0.0001}}])
+    _write_baselines(baselines, {"summary-cache": {GATED_BENCH: 0.0002}})
+
+    assert main([str(results), str(baselines), "--threshold", "1.0"]) == 2
+    assert "--threshold must be greater than 1" in capsys.readouterr().err
+
+
+def test_main_rejects_invalid_stale_floor(tmp_path, capsys: pytest.CaptureFixture[str]) -> None:
+    from scripts.check_benchmark_regression import main
+
+    results = tmp_path / "results.json"
+    baselines = tmp_path / "baselines.json"
+    _write_results(results, [{"name": GATED_BENCH, "stats": {"mean": 0.0001}}])
+    _write_baselines(baselines, {"summary-cache": {GATED_BENCH: 0.0002}})
+
+    assert main([str(results), str(baselines), "--stale-floor", "1.5"]) == 2
+    assert "--stale-floor must be between 0 and 1" in capsys.readouterr().err