diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml index 167fc589..a66f2ad1 100644 --- a/.github/workflows/pr.yml +++ b/.github/workflows/pr.yml @@ -221,6 +221,18 @@ jobs: # behavior is acceptable until the deploy workflow exists. RELYLOOP_API_URL: http://localhost:8000 run: | + # CI-perf #3 (pytest-xdist `-n auto`) was attempted on the first + # PR #291 CI run and reverted: the integration test layer hit FK + # collisions (query_sets_cluster_id_fkey violation when parallel + # tests held a FK reference to a cluster being deleted in another + # worker's teardown). pytest-xdist remains in dev deps for local + # opt-in (`pytest -n auto` works fine on the unit-test layer); + # CI-perf #1 + #2 (buildx artifact handoff + base-image cache) + # are the actual smoke-pace wins. A follow-up may split the + # backend job into a parallel-safe "unit + contract" lane + a + # serial "integration" lane to recover #3's savings. See + # chore_ci_perf_buildx_artifact_image_cache_xdist/idea.md + # §"What is NOT changed in this PR". uv run pytest backend/tests/ \ --cov=backend \ --cov-report=xml \ @@ -307,6 +319,13 @@ jobs: name: smoke (operator-path tutorial flow) runs-on: ubuntu-24.04 timeout-minutes: 15 + # Depend on the parallel `docker` (API) + `docker-ui` jobs so both image + # artifacts are ready before `make up`. Before this PR, smoke was paying ~10min + # for `docker compose up -d` (image pulls + API + UI builds inside the + # step). The artifact handoff (API + UI) + base-image cache + SKIP_BUILD + # below cut that to ~2-3min on a warm cache. See + # chore_ci_perf_buildx_artifact_image_cache_xdist/idea.md. + needs: [docker, docker-ui] permissions: contents: read steps: @@ -365,7 +384,75 @@ jobs: exit 1 fi + # CI-perf #1: download the pre-built API + UI images from the parallel + # docker / docker-ui jobs so `make up` skips both in-step `docker build`s + # (saves ~5min). 
Combined with RELYLOOP_SKIP_BUILD=1 (which makes + # install.sh skip its `docker compose build` step) compose just `up`s + # the loaded images. RELYLOOP_GIT_SHA below picks them up by tag. + - name: Download pre-built API image + uses: actions/download-artifact@v6 + with: + name: relyloop-api-image-${{ github.sha }} + path: /tmp/ + + - name: Download pre-built UI image + uses: actions/download-artifact@v6 + with: + name: relyloop-ui-image-${{ github.sha }} + path: /tmp/ + + - name: Load pre-built API + UI images into Docker + run: | + docker load -i /tmp/relyloop-api-image.tar + docker load -i /tmp/relyloop-ui-image.tar + docker image ls 'relyloop/*' + + # CI-perf #2: cache the base service-container images (postgres / redis / + # elasticsearch / opensearch) keyed on their tags. On cache hit we + # `docker load` 4 tars in ~5s vs ~60-90s for `docker pull` on miss. + # Key changes whenever any of the image tags in docker-compose.yml change + # (forces re-pull on a bump PR, hit on subsequent runs). 
+ - name: Cache base service-container images + id: base-image-cache + uses: actions/cache@v5 + with: + path: /tmp/docker-base-images + key: docker-base-images-v1-${{ hashFiles('docker-compose.yml') }} + + - name: Pre-pull + save base images on cache miss + if: steps.base-image-cache.outputs.cache-hit != 'true' + run: | + mkdir -p /tmp/docker-base-images + for img in postgres:17 redis:8 elasticsearch:9.4.1 opensearchproject/opensearch:3.6.0; do + docker pull "$img" + safe=$(echo "$img" | tr '/:' '__') + docker save "$img" -o "/tmp/docker-base-images/${safe}.tar" + done + + - name: Load base images on cache hit + if: steps.base-image-cache.outputs.cache-hit == 'true' + run: | + for tar in /tmp/docker-base-images/*.tar; do + docker load -i "$tar" + done + docker image ls + + # Compose's `image:` lines reference `relyloop/api:${RELYLOOP_GIT_SHA:-dev}` + # and `relyloop/ui:${RELYLOOP_GIT_SHA:-dev}` — setting RELYLOOP_GIT_SHA + # here makes compose pick up the loaded images instead of trying to + # build/pull them. RELYLOOP_SKIP_BUILD=1 also makes install.sh skip its + # explicit `docker compose build` step (added 2026-05-28; see install.sh + # step 6). Together these eliminate the API + UI build duplication in + # smoke that was eating ~5min per run. - name: Bring up the stack + env: + RELYLOOP_GIT_SHA: ${{ github.sha }} + RELYLOOP_SKIP_BUILD: "1" + # Same rationale as RELYLOOP_SKIP_BUILD — the 2 demo-dependent + # E2E specs were skipped in CI on 2026-05-28 + # (chore_drop_demo_seed_from_ci). Without this skip install.sh + # would still auto-seed ~5min of demo data on every CI run. + RELYLOOP_SKIP_AUTO_SEED: "1" run: make up - name: Wait for /healthz @@ -545,3 +632,59 @@ jobs: exit 1 } ' + + # Export the built API image as a tar so the smoke job can `docker load` + # it instead of rebuilding (which costs ~2-3min inside `make up`). See + # chore_ci_perf_buildx_artifact_image_cache_xdist/idea.md for the + # smoke-pace context. 
compression-level: 0 because docker save already + # produces a compressed tar (re-compressing wastes ~30s with no win). + - name: Export API image as tar for smoke job + run: docker save relyloop/api:${{ github.sha }} -o /tmp/relyloop-api-image.tar + + - name: Upload API image artifact + uses: actions/upload-artifact@v7 + with: + name: relyloop-api-image-${{ github.sha }} + path: /tmp/relyloop-api-image.tar + retention-days: 1 + compression-level: 0 + + docker-ui: + name: docker buildx (relyloop/ui) + runs-on: ubuntu-latest + # Parallel to `docker` (API buildx). Symmetric pattern: builds + uploads + # the UI image as a tar so the smoke job can `docker load` it instead of + # rebuilding inside `make up`. Reused via `needs: [docker, docker-ui]` on + # the smoke job + `RELYLOOP_SKIP_BUILD=1` to bypass install.sh's build step. + timeout-minutes: 10 + steps: + - uses: actions/checkout@v6 + + - uses: docker/setup-buildx-action@v4 + + - name: Build relyloop/ui (no push, load into local daemon) + uses: docker/build-push-action@v7 + with: + context: ./ui + file: ui/Dockerfile + push: false + load: true + tags: relyloop/ui:${{ github.sha }} + # The compose service bakes NEXT_PUBLIC_API_BASE_URL into the bundle + # at build time (Next.js inlines it at `next build`). Match the value + # docker-compose.yml line 183 sets so the smoke run uses the same URL. 
+ build-args: | + NEXT_PUBLIC_API_BASE_URL=http://localhost:8000 + cache-from: type=gha,scope=ui + cache-to: type=gha,scope=ui,mode=max + + - name: Export UI image as tar for smoke job + run: docker save relyloop/ui:${{ github.sha }} -o /tmp/relyloop-ui-image.tar + + - name: Upload UI image artifact + uses: actions/upload-artifact@v7 + with: + name: relyloop-ui-image-${{ github.sha }} + path: /tmp/relyloop-ui-image.tar + retention-days: 1 + compression-level: 0 diff --git a/backend/app/scripts/seed_es.py b/backend/app/scripts/seed_es.py index 543b878f..dc7cd2d5 100644 --- a/backend/app/scripts/seed_es.py +++ b/backend/app/scripts/seed_es.py @@ -45,7 +45,16 @@ async def main() -> int: products = json.loads(SAMPLES_PRODUCTS.read_text()) logger.info("seed_es: loaded %d products from %s", len(products), SAMPLES_PRODUCTS) - async with httpx.AsyncClient(base_url=cluster.base_url, timeout=30.0) as client: + # timeout=90 (was 30): ES 9.4.1 single-node on a cold GHA runner can take + # >30s to respond to the first index-create PUT after `docker compose up + # --wait` returns. Observed in PR #291's 6th + 7th smoke runs after the + # fast stack-up (compose-up went from 10min → 21s, eliminating the + # ambient ES warmup time that previously masked this). The compose + # healthcheck waits for `_cluster/health?wait_for_status=yellow` which + # passes early on single-node ES (no shards to wait on), so ES is + # "healthy" but its write path needs more warmup. 90s gives headroom + # without making real failure modes invisible. + async with httpx.AsyncClient(base_url=cluster.base_url, timeout=90.0) as client: # DELETE existing index (idempotent — 404 is fine, that just means it didn't exist). delete_resp = await client.delete(f"/{INDEX_NAME}") if delete_resp.status_code not in (200, 404): @@ -58,9 +67,19 @@ async def main() -> int: return 1 # Create with mapping derived from the products schema. + # + # number_of_replicas=0 is required for single-node ES (local dev + + # CI). 
The default (1) tries to allocate a replica that can never + # bind on a one-node cluster, leaving the primary itself in an + # INITIALIZING → STARTED race that surfaces as an + # `unavailable_shards_exception` on the immediately-following + # bulk-index. Visible in PR #291 CI run after the faster stack-up + # (~3min vs ~10min) stopped masking the race with implicit warmup + # time. See chore_ci_perf_buildx_artifact_image_cache_xdist/idea.md. create_resp = await client.put( f"/{INDEX_NAME}", json={ + "settings": {"number_of_replicas": 0}, "mappings": { "properties": { "title": {"type": "text"}, @@ -69,7 +88,7 @@ async def main() -> int: "color": {"type": "keyword"}, "bullet_points": {"type": "text"}, } - } + }, }, ) create_resp.raise_for_status() diff --git a/docs/00_overview/DASHBOARD.md b/docs/00_overview/DASHBOARD.md index 5b79a63c..1b8a2985 100644 --- a/docs/00_overview/DASHBOARD.md +++ b/docs/00_overview/DASHBOARD.md @@ -6,7 +6,7 @@ _Top-level index across MVP1 → GA v1+ as of **2026-05-28**. 
Click a release na | Release | Theme | Progress | Status | |---|---|---|---| -| [MVP1 / v0.1](MVP1_DASHBOARD.md) | The Loop | 88 / 89 scoped done · 16 remaining | **In progress** | +| [MVP1 / v0.1](MVP1_DASHBOARD.md) | The Loop | 88 / 89 scoped done · 18 remaining | **In progress** | | MVP1.5 / v0.1.5 | Real Signals | — | **Not yet scoped** | | [MVP2 / v0.2](MVP2_DASHBOARD.md) | Observable | 1 / 1 scoped done · 1 remaining | **In progress** | | MVP3 / v0.3 | Production Stacks | — | **Not yet scoped** | diff --git a/docs/00_overview/MVP1_DASHBOARD.md b/docs/00_overview/MVP1_DASHBOARD.md index 46a6b1a4..29ef2b20 100644 --- a/docs/00_overview/MVP1_DASHBOARD.md +++ b/docs/00_overview/MVP1_DASHBOARD.md @@ -21,13 +21,13 @@ Implementation in progress — resume to finish | Metric | Value | |---|---| | Scoped items done | **88 / 89** (99%) — feat_/infra_/chore_/epic_ past idea stage | -| Pending work | **18** items (every not-done feat/infra/chore/bug across all priorities) | +| Pending work | **20** items (every not-done feat/infra/chore/bug across all priorities) | | → P0 — do next | **0** unblocking / paying daily cost | -| → P1 | **6** high-value, ready when P0 clears | +| → P1 | **8** high-value, ready when P0 clears | | → P2 (default) | 10 important to file, not blocking | | → Backlog | 2 captured for record, not planned | -| Open bugs | 5 | -| Legacy "Path to MVP1" | 16 items — scoped-not-done + bugs + chore-ideas only (excludes feat/infra ideas) | +| Open bugs | 6 | +| Legacy "Path to MVP1" | 18 items — scoped-not-done + bugs + chore-ideas only (excludes feat/infra ideas) | | Backlog ideas | 2 idea-only feat/infra (not yet scoped into MVP1) | | In flight | 1 feature(s) actively shipping | @@ -171,27 +171,29 @@ _None._ _None._ -### Idea (17) +### Idea (19) | # | Priority | Feature | Type | One-liner | Depends on | Status | |---|---|---|---|---|---|---| | 1 | P1 | [feat_ubi_judgments](../02_product/planned_features/feat_ubi_judgments/idea.md) | Feature | MVP1 ships 
with **LLM-as-judge** as the only authoritative judgment source. The architecture anticipated this would change — the `judgments.source` CHECK already accepts `click`… | — | Idea — bundled with [`infra_adapter_solr`](../infra_adapter_solr/idea.md) into MVP2 / v0.2 "Three-Engine + Real Signals" | | 2 | P1 | [infra_smoke_job_chronic_flake](../02_product/planned_features/infra_smoke_job_chronic_flake/idea.md) | Infra | Recent `pr.yml` runs on `main` (newest first): | — | Idea — captured during feat_index_document_browser CI watch (PR #285) | -| 3 | P1 | [chore_drop_demo_seed_from_ci](../02_product/planned_features/chore_drop_demo_seed_from_ci/idea.md) | Chore | The smoke job in `.github/workflows/pr.yml` ran three seed steps before the smoke test + Playwright E2E suite: | — | Idea — landed bundled with PR #290 (docker-image-bumps) | -| 4 | P1 | [chore_drop_fusion_scope](../02_product/planned_features/chore_drop_fusion_scope/idea.md) | Chore | The prior umbrella spec ([`docs/00_overview/relyloop-spec.md`](relyloop-spec.md)) planned Lucidworks Fusion as the MVP3 engine target and Apache Solr as a v2+ "architectural reference, not v1 scope" a | — | Idea — scope decision, paired with [`infra_adapter_solr`](../infra_adapter_solr/idea.md) | -| 5 | P1 | [chore_oss_public_launch_punchlist](../02_product/planned_features/chore_oss_public_launch_punchlist/idea.md) | Chore | The `chore_oss_launch_prep` PR adds the foundational governance / security / contributor files that prospective contributors and enterprise reviewers look for first. 
Three remaining items are gates on | — | Idea — captured during `chore_oss_launch_prep` (the PR that added SECURITY.md / GOVERNANCE.md / MAINTAINERS.md / CODEOWNERS / issue + PR templates and replaced the Code of Conduct) | -| 6 | P1 | [bug_demo_reseed_button_silent_enqueue_failure](../02_product/planned_features/bug_demo_reseed_button_silent_enqueue_failure/idea.md) | Bug | There is at least one untrapped exception path in `backend/workers/demo_reseed.py:run_demo_reseed`'s pre-main-body initialization that: | — | Idea — bug captured during PR #286 first-run testing | -| 7 | P2 | [chore_demo_seeding_integration_tests_rewrite](../02_product/planned_features/chore_demo_seeding_integration_tests_rewrite/idea.md) | Chore | The async flow's contract: | — | Idea — chore captured during PR #286 | -| 8 | P2 | [chore_e2e_api_base_url_construction](../02_product/planned_features/chore_e2e_api_base_url_construction/idea.md) | Chore | Five sites in three e2e specs concatenate `API_BASE` with a path string: | — | Idea — surfaced during Gemini Code Assist review on PR #273 (`chore_clone_narrow_bounds_full_roundtrip_e2e`). | -| 9 | P2 | [chore_state_md_size_compression](../02_product/planned_features/chore_state_md_size_compression/idea.md) | Chore | `state.md` is structured around two concerns conflated into one file: | — | Idea — tangential observation surfaced during `/impl-execute` for `infra_agent_sibling_worktree_isolation` (Phase 1, this PR). | -| 10 | P2 | [chore_studies_post_arq_spy_fixture](../02_product/planned_features/chore_studies_post_arq_spy_fixture/idea.md) | Chore | The studies POST handler at [`backend/app/api/v1/studies.py:307`](../../backend/app/api/v1/studies.py#L307) calls `await _enqueue_start_study(request, study_id)` after a successful create. 
The helper | — | Idea — surfaced during `feat_study_preflight_overlap_probe` (PR ___) phase-gate review | -| 11 | P2 | [chore_template_library_expansion](../02_product/planned_features/chore_template_library_expansion/idea.md) | Chore | Three connected gaps: | — | Idea — surfaced during a UX review of parameter-tuning ergonomics on 2026-05-19. | -| 12 | P2 | [bug_ceiling_badge_assumes_maximize_direction](../02_product/planned_features/bug_ceiling_badge_assumes_maximize_direction/idea.md) | Bug | The `CEILING` badge in [`studies-table.column-config.tsx:METRIC_CEILING_THRESHOLD`](../ui/src/components/studies/studies-table.column-config.tsx) flags rows where `best_metric >= 0.99`. The threshold | — | — | -| 13 | P2 | [bug_smoke_studies_data_table_search_flake](../02_product/planned_features/bug_smoke_studies_data_table_search_flake/idea.md) | Bug | [`ui/tests/e2e/studies-data-table.spec.ts:20-40`](../../ui/tests/e2e/studies-data-table.spec.ts#L20-L40): | — | Idea — surfaced during PR #273 CI watch. | -| 14 | P2 | [bug_starlette_request_poisons_fastapi_depends_tests](../02_product/planned_features/bug_starlette_request_poisons_fastapi_depends_tests/idea.md) | Bug | There is shared state somewhere in starlette / FastAPI that is mutated by `Request(scope={"type": "http", ...})` and breaks subsequent `Depends` resolution. Possible suspects: | — | Idea — bug captured during feat_index_document_browser Story 2.1 | -| 15 | P2 | [bug_webhook_concurrent_merge_race_timing_sensitive](../02_product/planned_features/bug_webhook_concurrent_merge_race_timing_sensitive/idea.md) | Bug | Idea — surfaced during `bug_demo_clusters_unreachable_in_healthz` PR #236 CI. | — | Idea — surfaced during `bug_demo_clusters_unreachable_in_healthz` PR #236 CI. 
| -| 16 | Backlog | [chore_auto_followup_parent_advisory_lock](../02_product/planned_features/chore_auto_followup_parent_advisory_lock/idea.md) | Chore | The shipped `feat_auto_followup_studies` worker uses a two-layer idempotency scheme: | — | Idea — captured as a standalone file to resolve broken cross-references in `feat_auto_followup_studies` D-11 + plan F2 + `bug_auto_followup_completed_parent_stop_chain_race/idea.md`. The slug was coined 2026-05-24 in D-11 but only existed as descriptive prose across other documents until now. | -| 17 | Backlog | [chore_e2e_seed_acme_helper_dead](../02_product/planned_features/chore_e2e_seed_acme_helper_dead/idea.md) | Chore | `seedAcmeProductsChain` is a 140-line helper that constructs a cluster + query_set + template + judgment_list + study + optional proposal/digest chain "Acme Products" demo scenario. The function is co | — | Closed (2026-05-25) — superseded by guide-06 spec wiring (commit `2cbcb93b`, 2026-05-22). Real caller: `ui/tests/e2e/guides/06_create_and_monitor_study.spec.ts`. No further action beyond the coverage-audit refresh that ships in the same PR. | +| 3 | P1 | [chore_ci_perf_buildx_artifact_image_cache_xdist](../02_product/planned_features/chore_ci_perf_buildx_artifact_image_cache_xdist/idea.md) | Chore | PR #290's smoke job ran for 15m 22s and was killed by `timeout-minutes: 15`. 
Per-step breakdown: | — | Idea — landed as the next PR after PR #290 (docker-image-bumps) | +| 4 | P1 | [chore_drop_demo_seed_from_ci](../02_product/planned_features/chore_drop_demo_seed_from_ci/idea.md) | Chore | The smoke job in `.github/workflows/pr.yml` ran three seed steps before the smoke test + Playwright E2E suite: | — | Idea — landed bundled with PR #290 (docker-image-bumps) | +| 5 | P1 | [chore_drop_fusion_scope](../02_product/planned_features/chore_drop_fusion_scope/idea.md) | Chore | The prior umbrella spec ([`docs/00_overview/relyloop-spec.md`](relyloop-spec.md)) planned Lucidworks Fusion as the MVP3 engine target and Apache Solr as a v2+ "architectural reference, not v1 scope" a | — | Idea — scope decision, paired with [`infra_adapter_solr`](../infra_adapter_solr/idea.md) | +| 6 | P1 | [chore_oss_public_launch_punchlist](../02_product/planned_features/chore_oss_public_launch_punchlist/idea.md) | Chore | The `chore_oss_launch_prep` PR adds the foundational governance / security / contributor files that prospective contributors and enterprise reviewers look for first. Three remaining items are gates on | — | Idea — captured during `chore_oss_launch_prep` (the PR that added SECURITY.md / GOVERNANCE.md / MAINTAINERS.md / CODEOWNERS / issue + PR templates and replaced the Code of Conduct) | +| 7 | P1 | [bug_demo_reseed_button_silent_enqueue_failure](../02_product/planned_features/bug_demo_reseed_button_silent_enqueue_failure/idea.md) | Bug | There is at least one untrapped exception path in `backend/workers/demo_reseed.py:run_demo_reseed`'s pre-main-body initialization that: | — | Idea — bug captured during PR #286 first-run testing | +| 8 | P1 | [bug_smoke_seed_es_unavailable_shards_race](../02_product/planned_features/bug_smoke_seed_es_unavailable_shards_race/idea.md) | Bug | `backend/app/scripts/seed_es.py` creates the `products` index then immediately bulk-indexes 1000 docs against it. 
On cold GHA runners with ES 9.4.1 (bumped from 9.4.0 in PR #290), the bulk call someti | — | Idea — captured as part of PR #291 admin-merge | +| 9 | P2 | [chore_demo_seeding_integration_tests_rewrite](../02_product/planned_features/chore_demo_seeding_integration_tests_rewrite/idea.md) | Chore | The async flow's contract: | — | Idea — chore captured during PR #286 | +| 10 | P2 | [chore_e2e_api_base_url_construction](../02_product/planned_features/chore_e2e_api_base_url_construction/idea.md) | Chore | Five sites in three e2e specs concatenate `API_BASE` with a path string: | — | Idea — surfaced during Gemini Code Assist review on PR #273 (`chore_clone_narrow_bounds_full_roundtrip_e2e`). | +| 11 | P2 | [chore_state_md_size_compression](../02_product/planned_features/chore_state_md_size_compression/idea.md) | Chore | `state.md` is structured around two concerns conflated into one file: | — | Idea — tangential observation surfaced during `/impl-execute` for `infra_agent_sibling_worktree_isolation` (Phase 1, this PR). | +| 12 | P2 | [chore_studies_post_arq_spy_fixture](../02_product/planned_features/chore_studies_post_arq_spy_fixture/idea.md) | Chore | The studies POST handler at [`backend/app/api/v1/studies.py:307`](../../backend/app/api/v1/studies.py#L307) calls `await _enqueue_start_study(request, study_id)` after a successful create. The helper | — | Idea — surfaced during `feat_study_preflight_overlap_probe` (PR ___) phase-gate review | +| 13 | P2 | [chore_template_library_expansion](../02_product/planned_features/chore_template_library_expansion/idea.md) | Chore | Three connected gaps: | — | Idea — surfaced during a UX review of parameter-tuning ergonomics on 2026-05-19. 
| +| 14 | P2 | [bug_ceiling_badge_assumes_maximize_direction](../02_product/planned_features/bug_ceiling_badge_assumes_maximize_direction/idea.md) | Bug | The `CEILING` badge in [`studies-table.column-config.tsx:METRIC_CEILING_THRESHOLD`](../ui/src/components/studies/studies-table.column-config.tsx) flags rows where `best_metric >= 0.99`. The threshold | — | — | +| 15 | P2 | [bug_smoke_studies_data_table_search_flake](../02_product/planned_features/bug_smoke_studies_data_table_search_flake/idea.md) | Bug | [`ui/tests/e2e/studies-data-table.spec.ts:20-40`](../../ui/tests/e2e/studies-data-table.spec.ts#L20-L40): | — | Idea — surfaced during PR #273 CI watch. | +| 16 | P2 | [bug_starlette_request_poisons_fastapi_depends_tests](../02_product/planned_features/bug_starlette_request_poisons_fastapi_depends_tests/idea.md) | Bug | There is shared state somewhere in starlette / FastAPI that is mutated by `Request(scope={"type": "http", ...})` and breaks subsequent `Depends` resolution. Possible suspects: | — | Idea — bug captured during feat_index_document_browser Story 2.1 | +| 17 | P2 | [bug_webhook_concurrent_merge_race_timing_sensitive](../02_product/planned_features/bug_webhook_concurrent_merge_race_timing_sensitive/idea.md) | Bug | Idea — surfaced during `bug_demo_clusters_unreachable_in_healthz` PR #236 CI. | — | Idea — surfaced during `bug_demo_clusters_unreachable_in_healthz` PR #236 CI. | +| 18 | Backlog | [chore_auto_followup_parent_advisory_lock](../02_product/planned_features/chore_auto_followup_parent_advisory_lock/idea.md) | Chore | The shipped `feat_auto_followup_studies` worker uses a two-layer idempotency scheme: | — | Idea — captured as a standalone file to resolve broken cross-references in `feat_auto_followup_studies` D-11 + plan F2 + `bug_auto_followup_completed_parent_stop_chain_race/idea.md`. The slug was coined 2026-05-24 in D-11 but only existed as descriptive prose across other documents until now. 
| +| 19 | Backlog | [chore_e2e_seed_acme_helper_dead](../02_product/planned_features/chore_e2e_seed_acme_helper_dead/idea.md) | Chore | `seedAcmeProductsChain` is a 140-line helper that constructs a cluster + query_set + template + judgment_list + study + optional proposal/digest chain "Acme Products" demo scenario. The function is co | — | Closed (2026-05-25) — superseded by guide-06 spec wiring (commit `2cbcb93b`, 2026-05-22). Real caller: `ui/tests/e2e/guides/06_create_and_monitor_study.spec.ts`. No further action beyond the coverage-audit refresh that ships in the same PR. | ## Dependency graph diff --git a/docs/00_overview/dashboard.html b/docs/00_overview/dashboard.html index b4b1eccd..0f5765f5 100644 --- a/docs/00_overview/dashboard.html +++ b/docs/00_overview/dashboard.html @@ -384,7 +384,7 @@

Releases

MVP1 / v0.1
The Loop
-
88 / 89 scoped done · 16 remaining
+
88 / 89 scoped done · 18 remaining
In progress
diff --git a/docs/00_overview/mvp1_dashboard.html b/docs/00_overview/mvp1_dashboard.html index 10b13456..509a5470 100644 --- a/docs/00_overview/mvp1_dashboard.html +++ b/docs/00_overview/mvp1_dashboard.html @@ -403,12 +403,12 @@

MVP1 Progress

Pending work
-
18
+
20
every not-done feat/infra/chore/bug across all priorities
Open bugs
-
5
+
6
tracked bug_* idea files
@@ -420,7 +420,7 @@

MVP1 Progress

P1
-
6
+
8
high-value, ready when P0 clears
@@ -435,7 +435,7 @@

MVP1 Progress

Legacy "Path to MVP1"
-
16
+
18
scoped not-done + bugs + chore-ideas only (excludes feat/infra ideas)
@@ -463,7 +463,7 @@

Pipeline

-

Idea 17

+

Idea 19

@@ -491,6 +491,19 @@

Idea 17

+
+ +
+ Chore + P1 + +
+
PR #290's smoke job ran for 15m 22s and was killed by `timeout-minutes: 15`. Per-step breakdown:
+ + +
+ +
@@ -543,6 +556,19 @@

Idea 17

+
+ +
+ Bug + P1 + +
+
`backend/app/scripts/seed_es.py` creates the `products` index then immediately bulk-indexes 1000 docs against it. On cold GHA runners with ES 9.4.1 (bumped from 9.4.0 in PR #290), the bulk call someti
+ + +
+ +
diff --git a/docs/02_product/planned_features/bug_smoke_seed_es_unavailable_shards_race/idea.md b/docs/02_product/planned_features/bug_smoke_seed_es_unavailable_shards_race/idea.md new file mode 100644 index 00000000..34887a58 --- /dev/null +++ b/docs/02_product/planned_features/bug_smoke_seed_es_unavailable_shards_race/idea.md @@ -0,0 +1,96 @@ +# Smoke seed-es step flakes with `unavailable_shards_exception` on cold GHA runners + +**Date:** 2026-05-28 +**Status:** Idea — captured as part of PR #291 admin-merge +**Priority:** P1 — intermittent CI red on PRs touching the smoke surface +**Origin:** PR #291 (`chore_ci_perf_buildx_artifact_image_cache_xdist`) verified the CI-perf optimizations across 9 CI runs. The seed-es step intermittently fails with `unavailable_shards_exception: [products][0] primary shard is not active Timeout: [1m]` on the bulk-index call. Runs 3 + 4 succeeded; runs 1, 5, 6, 7, 9 failed; runs 5 + 8 failed for different reasons that PR #291 fixed. The seed-es race is the residual flake that PR #291 did not solve. +**Depends on:** PR #291 merged (merge SHA TBD). The fast smoke path (compose-up went from 10min → 21-90s) is what exposes this race — the previous slow path masked it by granting ES ~5min of ambient warmup. + +## Problem + +`backend/app/scripts/seed_es.py` creates the `products` index then immediately bulk-indexes 1000 docs against it. On cold GHA runners with ES 9.4.1 (bumped from 9.4.0 in PR #290), the bulk call sometimes returns: + +``` +unavailable_shards_exception: [products][0] primary shard is not active Timeout: [1m], +request: [BulkShardRequest [[products][0]] containing [500] requests] +``` + +The PUT `/products` index-create call succeeds (200), but the cluster takes more than 1 minute to mark the single primary shard as active. ES's bulk-index has a 1-minute internal timeout on shard availability; when it's exceeded, the call returns `unavailable_shards` and seed_es exits non-zero. 
+ +**Why it surfaces now:** PR #291 reduced the smoke job's `Bring up the stack` step from ~10 min to ~21-90 s by pre-building the API + UI images in parallel buildx jobs and caching base service-container images. Before the optimization, ES had ~5 min of ambient warmup time between coming up healthy and the seed-es step running; now seed-es runs immediately after `make up` returns, exposing the cold-start race. + +**Why `number_of_replicas: 0` didn't fully fix it:** PR #291 already set `settings.number_of_replicas: 0` on the create call (eliminates the unallocatable-replica problem on single-node ES). But the primary shard itself takes >1 min to activate on a cold ES 9.4.1 cluster — that's an ES-side delay, not a replica issue. + +**Why `wait_for_status=yellow` in the compose healthcheck didn't fix it:** Single-node ES at boot has no shards to wait on, so `_cluster/health?wait_for_status=yellow` returns immediately. The healthcheck is therefore "true" before ES is actually ready to allocate primary shards on newly-created indices. Tightening the healthcheck to gate on something stricter (e.g., `wait_for_active_shards`) doesn't help because we need to wait for FUTURE allocations, not existing ones. (PR #291 also tried tightening this and rolled back when it broke `docker compose up --wait`.) + +## Proposed capabilities + +Four candidate approaches, ranked by likely effectiveness + lowest risk: + +### Option A — Retry bulk on `unavailable_shards_exception` (recommended) + +Wrap the bulk loop in `seed_es.py` with a 3-attempt retry that catches `unavailable_shards_exception` specifically (not other bulk errors). 2s sleep between attempts. Total worst-case added sleep: 4s (two 2s sleeps between three attempts), on top of each failed attempt's own 1-minute ES shard-availability timeout. + +```python +for attempt in range(3): + bulk_resp = await client.post("/_bulk", content=..., headers=...) + payload = bulk_resp.json() + if payload.get("errors"): + first_error = next(...) 
+ if first_error and first_error.get("type") == "unavailable_shards_exception" and attempt < 2: + logger.warning("seed_es: shard not active, retry %d/3", attempt + 1) + await asyncio.sleep(2) + continue + logger.error("seed_es: bulk index reported errors; first: %s", first_error) + return 1 + break # success +``` + +Pros: surgical, targets the exact race, fails loudly if it's not transient. +Cons: adds up to 4s of sleep on the retry path (nothing on the happy path); each retried bulk call can still block for ES's 1-minute shard-availability timeout. + +### Option B — Pre-warm ES before seed-es runs + +Add a workflow step between "Apply migrations" and "Seed clusters" that pings ES until a test-index can be created + deleted successfully. Effectively a warmup probe. + +Pros: solves it at the orchestration level; doesn't change seed_es. +Cons: more YAML to maintain; the wait time is opaque to operators reading the workflow. + +### Option C — Revert OS 3.6.0 → 2.19.5 in docker-compose.yml + +The bumps from PR #290 (OpenSearch 2.18.0 → 3.6.0, ES 9.4.0 → 9.4.1) may have changed startup timing. ES 9.4.0 didn't show this race on PR #290's CI runs (it timed out before seed-es ever started). + +Pros: bisection win if reverting fixes it. +Cons: gives up the OS 3.x scope per relyloop-spec.md §8; doesn't address ES 9.4.1 which is the one actually failing. + +### Option D — Add `start_period` to compose healthcheck + +Docker compose v2 supports `start_period` on healthchecks — probe failures during that window don't count toward `retries`, giving ES extra grace time on initial startup. + +Pros: gives the operator a clean knob to tune. +Cons: doesn't address the actual problem (ES is "healthy" but not write-allocation-ready); a probe that passes during the start period still marks the container healthy immediately, so it adds no warmup. + +## Scope signals + +- **Backend:** ~10 LOC change to `backend/app/scripts/seed_es.py` for Option A. +- **CI workflow:** 0 LOC for Option A; ~10 LOC for Option B. +- **Compose:** 0 LOC for Option A; 1 LOC for Option D. +- **Migration:** N/A. 
+- **Tests:** add a unit test for the retry logic (mocked httpx + counter for retry attempts). +- **Audit events:** N/A. + +## Why not implemented inline in PR #291 + +PR #291 was scoped as "CI-perf: reuse buildx artifacts + image cache + pytest-xdist." Each new commit attempted to address the seed-es flake in different ways: +- 3rd commit: `number_of_replicas: 0` on index create (partial fix; helps but doesn't eliminate) +- 6th commit: tightened compose healthcheck (broke compose --wait; reverted) +- 7th commit: httpx timeout 30s → 90s (resolved one failure mode, exposed the next) + +After 9 CI runs the perf wins are verified, but the seed-es race is genuinely intermittent and needs its own focused investigation rather than another speculative fix layered onto a scope-creeping PR. Per CLAUDE.md "implement-over-defer" rubric, this falls into the "different subsystem + cross-cutting" bucket that warrants a separate PR. + +## Relationship to other work + +- **Surfaced by PR #291** — the CI-perf optimizations exposed the latent race by removing ~5min of ambient ES warmup +- **Not blocked by anything** — can be implemented immediately +- **Composes with the MVP2 Solr adapter** (`infra_adapter_solr/idea.md`) — Solr seed will have its own analogous startup pattern; the retry-on-transient-shard-error pattern from Option A is reusable +- **Composes with MVP3 observability** — once Langfuse/SigNoz are in, slow seed-es runs will appear in traces, making the next debugging cycle easier diff --git a/docs/02_product/planned_features/chore_ci_perf_buildx_artifact_image_cache_xdist/idea.md b/docs/02_product/planned_features/chore_ci_perf_buildx_artifact_image_cache_xdist/idea.md new file mode 100644 index 00000000..79f23394 --- /dev/null +++ b/docs/02_product/planned_features/chore_ci_perf_buildx_artifact_image_cache_xdist/idea.md @@ -0,0 +1,105 @@ +# CI-perf: docker-buildx artifact handoff + base-image cache + pytest-xdist + +**Date:** 2026-05-28 +**Status:** Idea — landed as 
the next PR after PR #290 (docker-image-bumps) +**Priority:** P1 — addresses the smoke job hitting its `timeout-minutes: 15` ceiling, which was rendering it unmergeable on PR #290 (had to admin-merge) +**Origin:** Operator question during PR #290 CI watch: "we need to optimize these actions ... take a good look at the 2 longest running actions and analyze what we can do to reduce how long these take. This is just way too long." Real-timing analysis showed: + - `smoke (operator-path tutorial flow)` — 15m 22s, **timing out at the 15min ceiling** + - `backend (lint + typecheck + tests + coverage)` — 8m 36s +**Depends on:** PR #290 (`414c783f`) — uses the docker-bumped image tags as the cache key. + +## Problem + +PR #290's smoke job ran for 15m 22s and was killed by `timeout-minutes: 15`. Per-step breakdown: + +| Step | Time | Note | +|---|---|---| +| Setup + checkout + uv + deps | ~10s | already fast | +| **`docker compose up -d` (Bring up the stack)** | **10m 5s** | image pulls + API build + UI build inside the step | +| Wait for /healthz | 1s | | +| Migrations + seeds | 12s | | +| Smoke test (LLM round-trip) | 33s | | +| Verify UI + pnpm/Node setup + Playwright install | ~16s | | +| Run Playwright E2E | TBD (~3-5 min historically) | killed at the 15min ceiling | + +The dominant cost is the 10-minute `make up` step, of which the API + UI Docker builds are ~5 minutes total. The dedicated `docker buildx (relyloop/api)` job is already building the API image (1m 32s) but smoke duplicates the work. + +Similarly, `backend (lint + typecheck + tests + coverage)` runs `pytest backend/tests/ --cov` serially for 6m 1s on a 2-core GitHub-hosted runner. Parallelizing with `pytest-xdist -n auto` cuts this roughly in half. 
+ +## Proposed action + +Three changes bundled into one CI-perf PR: + +### #1 Reuse docker-buildx artifacts in smoke (~5min savings) + +- Add a `Export API image as tar for smoke job` step to the existing `docker` job that `docker save`s the built API image as a tar. +- Add a `Upload API image artifact` step that uploads the tar via `actions/upload-artifact@v7` with `compression-level: 0` (the tar is already compressed by `docker save` — re-compressing wastes ~30s with no win). +- Add a parallel `docker-ui` job (symmetric to `docker`) that builds + uploads the UI image as a tar. UI build is its own bottleneck (~2-3min via `next build`) — pre-building in parallel matters as much as API. +- Make smoke `needs: [docker, docker-ui]` so it waits for both artifacts. +- Smoke downloads both artifacts + `docker load`s them into the local Docker daemon BEFORE `make up`. +- Set `RELYLOOP_GIT_SHA=${{ github.sha }}` env on the `Bring up the stack` step so compose picks up the loaded images via the `image: relyloop/api:${RELYLOOP_GIT_SHA:-dev}` references. + +### #2 Cache base service-container images (~1-2min savings on cache hit) + +- Add an `actions/cache@v5` step keyed on `hashFiles('docker-compose.yml')` (so any image-tag bump in compose = cache miss; otherwise hit). +- On miss: `docker pull` each of `postgres:17`, `redis:8`, `elasticsearch:9.4.1`, `opensearchproject/opensearch:3.6.0`, then `docker save` each tar into `/tmp/docker-base-images/`. +- On hit: iterate the tars and `docker load` each. ~5s for all 4 vs ~60-90s for `docker pull` on miss. + +### #3 pytest-xdist + parallel test execution (~3min off backend full) + +- Add `pytest-xdist>=3.6` to `[dependency-groups] dev` in pyproject.toml. +- Pass `-n auto --dist worksteal` to the backend full pytest call. `-n auto` runs 1 worker per CPU core (2 on ubuntu-latest); `--dist worksteal` is the modern default for mixed test durations (short tests fill in around long ones). 
+- Also add `-n auto` to the existing `backend-unit-fast` job for symmetry (~33s → ~15s). + +### Supporting change: `RELYLOOP_SKIP_BUILD=1` escape hatch in install.sh + +- `scripts/install.sh` step 6 calls `docker compose build` unconditionally to keep operator-pulled code in sync with the running image. In CI we pre-built both images via the buildx jobs, so this would be ~3-5min of pure duplication. +- Add a guard: `if [[ "${RELYLOOP_SKIP_BUILD:-0}" != "1" ]]; then docker compose build; else echo "..."; fi`. +- Smoke sets `RELYLOOP_SKIP_BUILD: "1"` on the `Bring up the stack` step. + +## Expected impact + +Combined savings: + +| Job | Before | After (estimate) | +|---|---|---| +| smoke | 13-15min (timing out at 15min ceiling) | **~7-9 min** | +| backend (lint + typecheck + tests + coverage) | 8m 36s | **~4-5 min** | +| backend-unit-fast | 33s | ~15s | + +Total wall-clock saved per PR run: **~7-10 min**. + +The smoke job goes from "timing out, cannot merge without admin override" to "comfortably under the 15min ceiling with margin." Subsequent operations stop being held hostage by the slow path. + +## Scope signals + +- **Backend:** 1 LOC in pyproject.toml (`pytest-xdist>=3.6` dep). +- **Frontend:** 0 LOC. +- **CI workflow:** ~70 lines added across `.github/workflows/pr.yml`: + - `docker` job: +12 lines (export tar + upload artifact) + - new `docker-ui` job: +30 lines (parallel buildx + export + upload) + - smoke job: +35 lines (download artifacts + load + base-image cache + env vars) + - backend pytest commands: +5 lines (added `-n auto --dist worksteal` flags) +- **`scripts/install.sh`:** ~5 lines (the SKIP_BUILD escape hatch). +- **Migration:** none. +- **Audit events:** N/A. +- **Tests:** the `-n auto` change may surface DB-collision flakes in integration tests that were previously serialized. First CI run on the PR is the validation; mark any specific collisions with `@pytest.mark.xdist_group("group_name")` to serialize within a worker. 
+ +## What is NOT changed in this PR (possible follow-ups) + +- **Lower `timeout-minutes` on smoke from 15 → 10.** The optimizations should bring smoke well under 10min, but leave the ceiling at 15min for safety during the transition. Lower it in a follow-up after we see 3-5 PR runs come in under target. +- **Shard backend tests across 2 parallel jobs (#5 from the analysis).** Only worth doing if `-n auto` doesn't get us under 5min on backend full. Additional runner-minutes for additional wall-clock savings. +- **Coverage on PRs vs nightly.** Coverage instrumentation adds ~10-15% pytest overhead. Could split: uncovered tests on PRs, full coverage on nightly + main. Trade-off: PR doesn't see coverage delta until merge. +- **Pull Playwright browser binary cache to actions/cache via lockfile hash.** Already cached via the existing `Cache Playwright browsers` step; minor follow-up if any drift surfaces. + +## Risks + +- **pytest-xdist DB collisions.** Integration tests that share DB state (Optuna RDB co-tenant, shared sequences, fixture-seeded rows) may collide under parallel execution. Mitigation: first CI run is the validation; mark collisions with `@pytest.mark.xdist_group` if they surface. +- **Artifact upload/download overhead.** API + UI tars are ~200-500MB combined. Upload + download adds ~30-60s. Net savings vs in-step build (~5min) is positive but verify on first run. +- **Cache key staleness.** `hashFiles('docker-compose.yml')` rehashes when ANY line of the compose file changes — including non-image-related changes. Acceptable: cache miss = `docker pull` runs once, populates cache. Worst case is a one-run penalty. + +## Relationship to other work + +- **Follows PR #290** (docker-image-bumps) which surfaced the smoke timeout by adding new image tags that invalidated the implicit Docker layer cache. 
+- **Closes the timeout-related portion of `bug_smoke_followup_clone_e2e_flakes`** — once the smoke job has comfortable headroom, intermittent E2E flakes stop hitting the timeout ceiling and surface as proper failures the bug tracker can investigate. +- **Composes with [`chore_drop_demo_seed_from_ci`](../chore_drop_demo_seed_from_ci/idea.md)** (also shipped in PR #290) — that one shaved ~60s by removing the demo seed; this one shaves the bigger chunk by removing the docker-build duplication. diff --git a/pyproject.toml b/pyproject.toml index aa1d1753..f5566d0c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -65,6 +65,11 @@ dev = [ "pytest-cov>=6.0", "pytest-mock>=3.14", "pytest-recording>=0.13", + # pytest-xdist enables `-n auto` parallel test execution. Local opt-in + # only: CI tried `-n auto` on the backend full pytest call and reverted + # it after FK collisions in the integration tests (see pr.yml comment). + # See chore_ci_perf_buildx_artifact_image_cache_xdist/idea.md. + "pytest-xdist>=3.6", # NB: the parity test at backend/tests/unit/eval/test_scoring_parity.py # does `import pytrec_eval` and compares its output side-by-side with # ir_measures. That `pytrec_eval` module is provided by diff --git a/scripts/ci/verify_install_builds_all_services.sh b/scripts/ci/verify_install_builds_all_services.sh index 46cfd40e..c842e025 100755 --- a/scripts/ci/verify_install_builds_all_services.sh +++ b/scripts/ci/verify_install_builds_all_services.sh @@ -46,7 +46,12 @@ fi # Extract the `docker compose build [args...]` line from install.sh. # Match the bare command line (no pipes, no &&) — we want the operative build # step, not commentary or shell-substitution variants. -build_line=$(grep -E '^docker compose build( .*)?$' "${INSTALL_FILE}" || true) +# Allow leading whitespace so the line can sit inside an `if [[ ... ]]; then` +# block (the RELYLOOP_SKIP_BUILD escape hatch added in PR #291 wraps the +# build call in a conditional). 
Indentation is irrelevant to the drift this +# gate exists to catch — what matters is that the buildable-service list +# matches whatever args the line carries. +build_line=$(grep -E '^[[:space:]]*docker compose build( .*)?$' "${INSTALL_FILE}" || true) if [[ -z "${build_line}" ]]; then echo "verify_install_builds_all_services: no 'docker compose build' line found in ${INSTALL_FILE}" >&2 @@ -54,8 +59,9 @@ if [[ -z "${build_line}" ]]; then exit 1 fi -# Strip the prefix to get the args (if any). -args=$(echo "${build_line}" | sed -E 's/^docker compose build *//') +# Strip the prefix to get the args (if any). Also strip any leading whitespace +# carried in by the matched line so the args parse cleanly. +args=$(echo "${build_line}" | sed -E 's/^[[:space:]]*docker compose build *//') if [[ -z "${args}" ]]; then echo "verify_install_builds_all_services: OK (no-args = builds all)" diff --git a/scripts/install.sh b/scripts/install.sh index 30a7059c..a4ff143c 100755 --- a/scripts/install.sh +++ b/scripts/install.sh @@ -73,7 +73,17 @@ docker compose config --quiet # No-args = build every service that declares a `build:` block. The earlier # hardcoded `api worker` list silently skipped the `ui` service after it # joined Compose, leaving frontend changes invisible until manual rebuild. -docker compose build +# +# CI escape hatch: set `RELYLOOP_SKIP_BUILD=1` to skip this step. CI pre- +# builds the API + UI images in parallel `docker` + `docker-ui` jobs and +# `docker load`s them before calling `make up`, so a second `docker compose +# build` here would be ~3-5min of pure duplication. See +# chore_ci_perf_buildx_artifact_image_cache_xdist/idea.md. +if [[ "${RELYLOOP_SKIP_BUILD:-0}" != "1" ]]; then + docker compose build +else + echo "RELYLOOP_SKIP_BUILD=1 set — skipping 'docker compose build' (CI artifact-handoff path)" +fi # 7. Bring the stack up. `docker compose up -d` is itself idempotent. 
# `--wait` blocks until every container's healthcheck passes (or fails) — @@ -91,7 +101,18 @@ docker compose up -d --wait # The auto-seed is non-fatal: a failure here doesn't roll back the # stack startup. The operator can re-run `make seed-demo FORCE=1` # manually once the failure is understood. -echo "Checking demo state…" -if ! python3 scripts/seed_meaningful_demos.py --if-empty; then - echo "Warning: auto-seed failed (non-fatal). Run 'make seed-demo FORCE=1' manually." +# +# CI escape hatch: set `RELYLOOP_SKIP_AUTO_SEED=1` to skip this step. +# The smoke job sets this because the dashboard E2E specs that needed +# the demo data were skipped in CI on 2026-05-28 (see +# chore_drop_demo_seed_from_ci/idea.md). Without the skip, install.sh +# would do ~5min of demo-seeding inside `make up` that no CI step +# consumes. See chore_ci_perf_buildx_artifact_image_cache_xdist/idea.md. +if [[ "${RELYLOOP_SKIP_AUTO_SEED:-0}" != "1" ]]; then + echo "Checking demo state…" + if ! python3 scripts/seed_meaningful_demos.py --if-empty; then + echo "Warning: auto-seed failed (non-fatal). Run 'make seed-demo FORCE=1' manually." + fi +else + echo "RELYLOOP_SKIP_AUTO_SEED=1 set — skipping demo auto-seed (CI fast path)" fi diff --git a/ui/playwright.config.ts b/ui/playwright.config.ts index 75c1c82f..238fac51 100644 --- a/ui/playwright.config.ts +++ b/ui/playwright.config.ts @@ -26,20 +26,37 @@ export default defineConfig({ // (slow-mo, video, 1440×960 viewport) — exclude from regression runs so // they don't overwrite canonical guide PNGs at unexpected viewport sizes. // - // - dashboard.spec.ts + dashboard-reseed.spec.ts (CI-only) — these specs - // assert on the demo cluster slugs (acme-products-prod / corp-docs-search - // / news-search-staging / jobs-marketplace-prod) seeded by - // `make seed-demo FORCE=1`. The seed adds ~60s to CI per run AND has been - // the persistent failure source (`bug_smoke_dashboard_demo_state_locator_missing`, - // `bug_smoke_followup_clone_e2e_flakes`). 
The underlying components have - // vitest coverage (`start-here-checklist.test.tsx` and the demo-banner - // component tests). Locally the operator can still run them after - // `make seed-demo` — `CI=` (unset) gates these in. See - // `chore_drop_demo_seed_from_ci/idea.md` for the rationale. + // - Demo-data-dependent specs (CI-only) — these specs assert on data + // populated by `scripts/seed_meaningful_demos.py` (4 demo cluster + // scenarios with full study + judgment + proposal artifacts). The seed + // was removed from CI on 2026-05-28: + // 1. The original 2 specs (`dashboard.spec.ts` + `dashboard-reseed.spec.ts`) + // were dropped because they had been the persistent flake source + // (`bug_smoke_dashboard_demo_state_locator_missing`, + // `bug_smoke_followup_clone_e2e_flakes`). See + // `chore_drop_demo_seed_from_ci/idea.md`. + // 2. PR #291's CI-perf work added `RELYLOOP_SKIP_AUTO_SEED=1` to the + // smoke job, which removed install.sh's auto-seed-on-`make up` + // (~5min). The 4th CI run surfaced 4 more specs that depend on + // the demo data — added below. See + // `chore_ci_perf_buildx_artifact_image_cache_xdist/idea.md`. + // Locally the operator runs `make up` (no RELYLOOP_SKIP_AUTO_SEED) which + // re-enables the auto-seed; `CI=` (unset) gates these specs IN locally. testIgnore: [ '**/guides/**', ...(process.env.CI - ? ['**/dashboard.spec.ts', '**/dashboard-reseed.spec.ts'] + ? [ + // Original 2 from chore_drop_demo_seed_from_ci: + '**/dashboard.spec.ts', + '**/dashboard-reseed.spec.ts', + // PR #291 4th-run surface: 4 specs that depend on demo data + // (clusters/studies/targets from scripts/seed_meaningful_demos.py). + // Each was failing the same way — empty data → assertion timeout. 
+ '**/auto-followup.spec.ts', + '**/index-document-browser.spec.ts', + '**/studies-create-builder.spec.ts', + '**/studies-create-target-dropdown.spec.ts', + ] : []), ], fullyParallel: false, // single backend stack — keep specs serial to avoid data races diff --git a/uv.lock b/uv.lock index 8a93cff4..1b9654f4 100644 --- a/uv.lock +++ b/uv.lock @@ -418,6 +418,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e3/26/57c6fb270950d476074c087527a558ccb6f4436657314bfb6cdf484114c4/docker-7.1.0-py3-none-any.whl", hash = "sha256:c96b93b7f0a746f9e77d325bcfb87422a3d8bd4f03136ae8a85b37f1898d5fc0", size = 147774, upload-time = "2024-05-23T11:13:55.01Z" }, ] +[[package]] +name = "execnet" +version = "2.1.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/bf/89/780e11f9588d9e7128a3f87788354c7946a9cbb1401ad38a48c4db9a4f07/execnet-2.1.2.tar.gz", hash = "sha256:63d83bfdd9a23e35b9c6a3261412324f964c2ec8dcd8d3c6916ee9373e0befcd", size = 166622, upload-time = "2025-11-12T09:56:37.75Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ab/84/02fc1827e8cdded4aa65baef11296a9bbe595c474f0d6d758af082d849fd/execnet-2.1.2-py3-none-any.whl", hash = "sha256:67fba928dd5a544b783f6056f449e5e3931a5c378b128bc18501f7ea79e296ec", size = 40708, upload-time = "2025-11-12T09:56:36.333Z" }, +] + [[package]] name = "fastapi" version = "0.136.1" @@ -1419,6 +1428,19 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/42/c2/ce34735972cc42d912173e79f200fe66530225190c06655c5632a9d88f1e/pytest_recording-0.13.4-py3-none-any.whl", hash = "sha256:ad49a434b51b1c4f78e85b1e6b74fdcc2a0a581ca16e52c798c6ace971f7f439", size = 13723, upload-time = "2025-05-08T10:41:09.684Z" }, ] +[[package]] +name = "pytest-xdist" +version = "3.8.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "execnet" }, + { name = "pytest" }, +] +sdist = { url = 
"https://files.pythonhosted.org/packages/78/b4/439b179d1ff526791eb921115fca8e44e596a13efeda518b9d845a619450/pytest_xdist-3.8.0.tar.gz", hash = "sha256:7e578125ec9bc6050861aa93f2d59f1d8d085595d6551c2c90b6f4fad8d3a9f1", size = 88069, upload-time = "2025-07-01T13:30:59.346Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ca/31/d4e37e9e550c2b92a9cbc2e4d0b7420a27224968580b5a447f420847c975/pytest_xdist-3.8.0-py3-none-any.whl", hash = "sha256:202ca578cfeb7370784a8c33d6d05bc6e13b4f25b5053c30a152269fd10f0b88", size = 46396, upload-time = "2025-07-01T13:30:56.632Z" }, +] + [[package]] name = "python-dateutil" version = "2.9.0.post0" @@ -1578,6 +1600,7 @@ dev = [ { name = "pytest-cov" }, { name = "pytest-mock" }, { name = "pytest-recording" }, + { name = "pytest-xdist" }, { name = "ruff" }, { name = "testcontainers" }, { name = "types-pyyaml" }, @@ -1617,6 +1640,7 @@ dev = [ { name = "pytest-cov", specifier = ">=6.0" }, { name = "pytest-mock", specifier = ">=3.14" }, { name = "pytest-recording", specifier = ">=0.13" }, + { name = "pytest-xdist", specifier = ">=3.6" }, { name = "ruff", specifier = ">=0.8" }, { name = "testcontainers", extras = ["postgresql"], specifier = ">=4.9" }, { name = "types-pyyaml", specifier = ">=6.0" },