Skip to content

Commit fcda8ed

Browse files
fix(bench): address review feedback and seed ubuntu baselines
1 parent 5eef897 commit fcda8ed

7 files changed

Lines changed: 79 additions & 22 deletions

File tree

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,3 +46,4 @@ coverage.xml
4646
.hypothesis/
4747
benchmark-results.json
4848
benchmarks/_raw.json
49+
benchmarks/_ci/

benchmarks/baselines.json

Lines changed: 17 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,29 +1,29 @@
11
{
2-
"_note": "Gated means seeded locally (Windows, 1.5× slack) — refresh from ubuntu-latest CI benchmark-results.json artifact before merge. Excluded from gate: test_list_workspace_projects_nocache[composers-10], test_search_full_corpus.",
3-
"updated": "2026-06-25T20:34:07Z",
4-
"machine": "Windows",
2+
"_note": "Gated means from ubuntu-latest CI benchmark-results.json. Values multiplied by 1.5\u00d7 slack at generation time. Excluded from gate (recorded for reference): test_list_workspace_projects_nocache[composers-10], test_search_full_corpus. Refresh after intentional speedups via reduce_baselines.py.",
3+
"updated": "2026-06-25T21:14:16Z",
4+
"machine": "Linux",
55
"groups": {
66
"parse": {
7-
"test_list_workspace_projects_nocache[composers-10]": 0.01313006085768828,
8-
"test_list_workspace_projects_nocache[composers-50]": 0.04705098008271307,
9-
"test_list_workspace_projects_nocache[composers-200]": 0.19944224995560944
7+
"test_list_workspace_projects_nocache[composers-10]": 0.01702312019643009,
8+
"test_list_workspace_projects_nocache[composers-50]": 0.07538331990000699,
9+
"test_list_workspace_projects_nocache[composers-200]": 0.251991555999993
1010
},
1111
"export": {
12-
"test_post_export_zip[composers-10]": 0.0170322916819714,
13-
"test_post_export_zip[composers-50]": 0.040990050032269215
12+
"test_post_export_zip[composers-10]": 0.0112034034344294,
13+
"test_post_export_zip[composers-50]": 0.04482855966665985
1414
},
1515
"search": {
16-
"test_search_full_corpus": 0.057670830062124874
16+
"test_search_full_corpus": 0.047164217833331655
1717
},
1818
"summary-cache": {
19-
"test_summary_cache_lookup[hit]": 0.00014543285277406022,
20-
"test_summary_cache_lookup[miss]": 0.0001437347241805802,
21-
"test_fingerprint_workspace_entries[10]": 0.001866654586096193,
22-
"test_fingerprint_workspace_entries[50]": 0.00636450619807407,
23-
"test_fingerprint_workspace_entries[200]": 0.020523441289855247,
24-
"test_summary_cache_round_trip": 0.0019650292328056915,
25-
"test_tab_summary_cache_lookup[hit]": 0.00015344636292124477,
26-
"test_tab_summary_cache_lookup[miss]": 0.00012440098537902896
19+
"test_summary_cache_lookup[hit]": 9.224067718099102e-05,
20+
"test_summary_cache_lookup[miss]": 9.128770315496628e-05,
21+
"test_fingerprint_workspace_entries[10]": 0.0024789120309553535,
22+
"test_fingerprint_workspace_entries[50]": 0.010901568931818675,
23+
"test_fingerprint_workspace_entries[200]": 0.03069810573000666,
24+
"test_summary_cache_round_trip": 0.0004966099535917549,
25+
"test_tab_summary_cache_lookup[hit]": 0.00010487297799045405,
26+
"test_tab_summary_cache_lookup[miss]": 0.00010309520517204601
2727
}
2828
}
2929
}

scripts/check_benchmark_regression.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -179,6 +179,12 @@ def main(argv: list[str] | None = None) -> int:
179179
help="fail when current mean is below this fraction of baseline (default: 0.50)",
180180
)
181181
args = parser.parse_args(argv)
182+
if args.threshold <= 1:
183+
print("ERROR: --threshold must be greater than 1", file=sys.stderr)
184+
return 2
185+
if not 0 < args.stale_floor < 1:
186+
print("ERROR: --stale-floor must be between 0 and 1 (exclusive)", file=sys.stderr)
187+
return 2
182188
try:
183189
return check_regression(
184190
args.results_path,

scripts/reduce_baselines.py

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ def reduce_baselines(
3333
out_path: str | Path,
3434
*,
3535
slack: float = 1.0,
36+
source: str = "local",
3637
) -> dict[str, object]:
3738
path = Path(raw_path)
3839
try:
@@ -70,9 +71,14 @@ def reduce_baselines(
7071
slack_note = f" Values multiplied by {slack}× slack at generation time." if slack != 1.0 else ""
7172
machine_info = raw.get("machine_info")
7273
machine = machine_info.get("system") if isinstance(machine_info, dict) else None
74+
source_labels = {
75+
"ubuntu-latest-ci": "ubuntu-latest CI benchmark-results.json",
76+
"local": "local benchmark-results.json",
77+
}
78+
source_label = source_labels.get(source, source)
7379
output: dict[str, object] = {
7480
"_note": (
75-
"Gated means from ubuntu-latest CI benchmark-results.json."
81+
f"Gated means from {source_label}."
7682
f"{slack_note} "
7783
f"Excluded from gate (recorded for reference): {excluded}. "
7884
"Refresh after intentional speedups via reduce_baselines.py."
@@ -99,9 +105,14 @@ def main(argv: list[str] | None = None) -> int:
99105
default=1.0,
100106
help="multiply means by this factor (must be > 0)",
101107
)
108+
parser.add_argument(
109+
"--source",
110+
default="local",
111+
help="provenance label for _note (e.g. ubuntu-latest-ci, local)",
112+
)
102113
args = parser.parse_args(argv)
103114
try:
104-
reduce_baselines(args.raw_path, args.out_path, slack=args.slack)
115+
reduce_baselines(args.raw_path, args.out_path, slack=args.slack, source=args.source)
105116
except BenchmarkDataError as exc:
106117
print(f"ERROR: {exc}", file=sys.stderr)
107118
return 2

tests/benchmarks/test_parse_bench.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,5 +23,6 @@ def test_list_workspace_projects_nocache(
2323
def _run() -> object:
2424
return list_workspace_projects(workspace_path, [], nocache=True)
2525

26-
projects, _warnings = benchmark(_run)
26+
projects, warnings = benchmark(_run)
2727
assert isinstance(projects, list) and len(projects) > 0
28+
assert warnings == []

tests/benchmarks/test_summary_cache_bench.py

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,14 @@ def test_summary_cache_lookup(
2929
"""Time ``get_cached_projects`` only; miss = fingerprint mismatch, not rebuild."""
3030
set_cached_projects(workspace_fingerprint, sample_projects, [])
3131
lookup_fp = workspace_fingerprint if mode == "hit" else stale_fingerprint
32-
benchmark(get_cached_projects, lookup_fp)
32+
result = benchmark(get_cached_projects, lookup_fp)
33+
if mode == "hit":
34+
assert result is not None
35+
projects, warnings = result
36+
assert projects == sample_projects
37+
assert warnings == []
38+
else:
39+
assert result is None
3340

3441

3542
@pytest.mark.benchmark(group="summary-cache")
@@ -82,4 +89,11 @@ def test_tab_summary_cache_lookup(
8289
payload = {"tabs": [{"id": "cmp_0000", "title": "Bench"}]}
8390
set_cached_tab_summaries(workspace_fingerprint, workspace_id, payload, 200)
8491
lookup_fp = workspace_fingerprint if mode == "hit" else stale_fingerprint
85-
benchmark(get_cached_tab_summaries, lookup_fp, workspace_id)
92+
result = benchmark(get_cached_tab_summaries, lookup_fp, workspace_id)
93+
if mode == "hit":
94+
assert result is not None
95+
cached_payload, status = result
96+
assert status == 200
97+
assert cached_payload == payload
98+
else:
99+
assert result is None

tests/test_check_benchmark_regression.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -251,3 +251,27 @@ def test_excluded_benchmark_not_gated(tmp_path, capsys: pytest.CaptureFixture[st
251251
out = capsys.readouterr().out
252252
assert "REGRESSION" not in out
253253
assert "STALE" not in out
254+
255+
256+
def test_main_rejects_invalid_threshold(tmp_path, capsys: pytest.CaptureFixture[str]) -> None:
    """``main`` returns 2 and reports the problem on stderr for ``--threshold 1.0``."""
    from scripts.check_benchmark_regression import main

    # Minimal, otherwise valid inputs so only the option value is at issue.
    results_file = tmp_path / "results.json"
    baselines_file = tmp_path / "baselines.json"
    current_runs = [{"name": GATED_BENCH, "stats": {"mean": 0.0001}}]
    _write_results(results_file, current_runs)
    _write_baselines(baselines_file, {"summary-cache": {GATED_BENCH: 0.0002}})

    argv = [str(results_file), str(baselines_file), "--threshold", "1.0"]
    exit_code = main(argv)

    assert exit_code == 2
    stderr_text = capsys.readouterr().err
    assert "--threshold must be greater than 1" in stderr_text
266+
267+
268+
def test_main_rejects_invalid_stale_floor(tmp_path, capsys: pytest.CaptureFixture[str]) -> None:
    """``main`` returns 2 and reports the problem on stderr for ``--stale-floor 1.5``."""
    from scripts.check_benchmark_regression import main

    # Minimal, otherwise valid inputs so only the option value is at issue.
    results_file = tmp_path / "results.json"
    baselines_file = tmp_path / "baselines.json"
    current_runs = [{"name": GATED_BENCH, "stats": {"mean": 0.0001}}]
    _write_results(results_file, current_runs)
    _write_baselines(baselines_file, {"summary-cache": {GATED_BENCH: 0.0002}})

    argv = [str(results_file), str(baselines_file), "--stale-floor", "1.5"]
    exit_code = main(argv)

    assert exit_code == 2
    stderr_text = capsys.readouterr().err
    assert "--stale-floor must be between 0 and 1" in stderr_text

0 commit comments

Comments
 (0)