Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
17 commits
Select commit Hold shift + click to select a range
eb7b270
docs: capture 4 Karpathy-loop audit follow-ups as idea files
SoundMindsAI May 21, 2026
599f436
docs: finalize bug_e2e_target_dropdown_flake + advance feat_pr_metric…
SoundMindsAI May 21, 2026
20b9b43
docs: revise chore_study_default_stop_conditions with measured per-tr…
SoundMindsAI May 21, 2026
90e1297
docs(plan): approve feat_pr_metric_confidence implementation_plan + a…
SoundMindsAI May 21, 2026
9c95021
feat(trials): add per_query_metrics JSONB column with CHECK constrain…
SoundMindsAI May 21, 2026
032f342
feat(worker): persist per_query_metrics on successful run_trial compl…
SoundMindsAI May 21, 2026
3dc0349
feat(domain): add confidence helpers + ConfidenceShape Pydantic model…
SoundMindsAI May 21, 2026
5936c09
feat(studies): wire ConfidenceShape into StudyDetail.confidence (Stor…
SoundMindsAI May 21, 2026
d92fd5f
feat(worker): emit ## Confidence section in study-backed PR body (Sto…
SoundMindsAI May 21, 2026
cc14164
fix(domain): align confidence per-query lookup with @k-suffixed score…
SoundMindsAI May 21, 2026
29132dd
docs(planned): capture Guides Glossary + FAQ follow-ups (2 idea files)
SoundMindsAI May 21, 2026
3924bb2
feat(worker): wire ConfidenceShape into digest narrative prompt (Stor…
SoundMindsAI May 21, 2026
0814700
chore(domain): Epic 1 gate — GPT-5.5 review fixes + docs + live curl …
SoundMindsAI May 21, 2026
8405270
feat(guides): integrate FAQ and glossary processes for improved docum…
SoundMindsAI May 21, 2026
614496a
feat(ui): ConfidencePanel + glossary entries + real-backend E2E (Epic 2)
SoundMindsAI May 21, 2026
1b6b16a
docs: update state.md for Epic 2 + capture guide-06 screenshot follow-up
SoundMindsAI May 21, 2026
6c34a5b
chore(review): apply final GPT-5.5 cross-model findings (3 Low, all a…
SoundMindsAI May 21, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 15 additions & 2 deletions architecture.md
Original file line number Diff line number Diff line change
Expand Up @@ -109,10 +109,20 @@ backend/
judgment.py from feat_llm_judgments)
services/ use-case orchestrators — cluster.py from infra_adapter_elastic;
study_state.py (state machine + FR-7 protection listener,
feat_study_lifecycle Phase 2)
feat_study_lifecycle Phase 2);
study_confidence.py (async glue that runs the 4-query
read pattern from feat_pr_metric_confidence FR-2 and
hands pre-fetched data to the pure orchestrator —
consumed by studies._detail, the open_pr worker, and
the digest worker)
domain/ pure business logic — query/render.py from
infra_adapter_elastic; study/{search_space,template_validator,
csv_parser}.py from feat_study_lifecycle Phase 2;
study/confidence.py (feat_pr_metric_confidence —
ConfidenceShape Pydantic model + 7 sub-shapes + bootstrap
CI / runner-up gap / late-trial 1σ / convergence regime /
per-query outcome helpers; pure-Python orchestrator
returning None on every FR-7 degraded path);
git/{redaction,validation}.py from feat_github_pr_worker
(GitHub PAT redaction + repo_url + config_path validators)
adapters/ engine adapters — protocol.py (SearchAdapter Protocol +
Expand Down Expand Up @@ -180,7 +190,10 @@ migrations/ Alembic config + versions/ (0001 baseline + 0002 clusters
+ 0003 study_lifecycle_schema + 0004_judgments + 0005_digests
+ 0006 proposals_pr_url_idx + 0007 conversations_messages +
0008–0013 search_vector + GIN indexes from
feat_data_table_primitive)
feat_data_table_primitive + 0014 clusters_target_filter
from feat_cluster_target_filter + 0015 trials_per_query_metrics
from feat_pr_metric_confidence — nullable JSONB column +
CHECK constraint enforcing IS NULL OR jsonb_typeof = 'object')
docs/ 00_overview / 01_architecture / 02_product / 03_runbooks /
04_security / 05_quality / 08_guides
```
Expand Down
22 changes: 21 additions & 1 deletion backend/app/api/v1/_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@

from __future__ import annotations

from typing import Annotated
from typing import Annotated, Any

from fastapi import APIRouter, Depends, HTTPException, status
from pydantic import BaseModel, ConfigDict, Field
Expand Down Expand Up @@ -77,6 +77,24 @@ class SeedCompletedStudyRequest(BaseModel):
"aria-disabled-button + tooltip path."
),
)
winner_per_query: dict[str, dict[str, Any]] | None = Field(
default=None,
description=(
"Optional per-query metrics dict to populate on the winner "
"trial. Shape: `{query_id: {metric_token: float}}` where "
"metric_token matches what `scoring.score()` emits (e.g. "
"`ndcg@10`). Set alongside `runner_up_per_query` to drive the "
"ConfidencePanel happy path on `/studies/[id]`. When omitted, "
"the seeded trials have `per_query_metrics IS NULL` (the "
"pre-feat_pr_metric_confidence shape)."
),
)
runner_up_per_query: dict[str, dict[str, Any]] | None = Field(
default=None,
description=(
"Optional per-query metrics for the runner-up trial; pairs with `winner_per_query`."
),
)


class SeedCompletedStudyResponse(BaseModel):
Expand Down Expand Up @@ -125,6 +143,8 @@ async def seed_completed_study( # pragma: no cover - integration only
template_id=body.template_id,
judgment_list_id=body.judgment_list_id,
with_pending_proposal=body.with_pending_proposal,
winner_per_query=body.winner_per_query,
runner_up_per_query=body.runner_up_per_query,
)
await db.commit()
return SeedCompletedStudyResponse(
Expand Down
13 changes: 13 additions & 0 deletions backend/app/api/v1/schemas.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,12 @@

from backend.app.adapters.protocol import TargetInfo
from backend.app.core.settings import get_settings
from backend.app.domain.study.confidence import ConfidenceShape as ConfidenceShape

# ``ConfidenceShape`` is defined in :mod:`backend.app.domain.study.confidence`
# (the canonical assembler module per Story 1.3). The explicit ``as`` re-export
# above keeps it importable via ``from backend.app.api.v1.schemas import
# ConfidenceShape`` under mypy strict's ``no_implicit_reexport``.

EngineType = Literal["elasticsearch", "opensearch"]
"""Response-only: values are guaranteed by service-layer validation before the
Expand Down Expand Up @@ -634,6 +640,13 @@ class StudyDetail(BaseModel):
started_at: datetime | None
completed_at: datetime | None
trials_summary: TrialsSummaryShape
confidence: ConfidenceShape | None = None
"""Per-study metric-confidence analytics (feat_pr_metric_confidence FR-5a).

``None`` when the study has no winner trial (still running or
``best_trial_id`` points at a deleted row — AC-3a). Otherwise a partial
or full :class:`ConfidenceShape` per FR-7's graceful-degradation
contract."""


class StudySummary(BaseModel):
Expand Down
3 changes: 3 additions & 0 deletions backend/app/api/v1/studies.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@
validate_against_template,
)
from backend.app.services import study_state
from backend.app.services.study_confidence import fetch_study_confidence

router = APIRouter()

Expand Down Expand Up @@ -117,6 +118,7 @@ def _decode_trial_cursor(raw: str, sort_key: str) -> tuple[Any, str]:

async def _detail(db: AsyncSession, row: Study) -> StudyDetail:
summary = await repo.aggregate_trials_summary(db, row.id)
confidence = await fetch_study_confidence(db, row)
return StudyDetail(
id=row.id,
name=row.name,
Expand Down Expand Up @@ -145,6 +147,7 @@ async def _detail(db: AsyncSession, row: Study) -> StudyDetail:
pruned=summary.pruned,
best_primary_metric=summary.best_primary_metric,
),
confidence=confidence,
)


Expand Down
26 changes: 26 additions & 0 deletions backend/app/db/models/trial.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,19 @@
The ``trials_study_metric`` index on ``(study_id, primary_metric DESC NULLS
LAST)`` is created in the migration (Story 1.2) — not declared at the ORM
level — so the ``DESC NULLS LAST`` ordering survives ``--autogenerate``.

The ``per_query_metrics`` JSONB column (nullable; added by migration
``0015_trials_per_query_metrics`` for feat_pr_metric_confidence) carries the
per-query pytrec_eval scores from ``scoring.py::score()``'s ``per_query``
dict. Shape: ``{query_id: {metric_token: float}}`` where ``metric_token`` is
the user-facing token emitted by :func:`backend.app.eval.scoring.score` —
i.e. ``@<k>``-suffixed for cutoff-aware metrics (``ndcg@10``, ``map@10``,
``precision@10``, ``recall@10``) and bare names for cutoff-free metrics
(``mrr``, plain ``map``). The base name (everything before any ``@``) is
constrained to ``MetricCatalog`` (``ndcg``, ``map``, ``precision``,
``recall``, ``mrr``). The ``trials_per_query_metrics_object_check`` CHECK
constraint enforces NULL-or-object at the DB level (since the write path is
the Arq ``run_trial`` worker, not a Pydantic-validated HTTP request).
"""

from __future__ import annotations
Expand All @@ -36,6 +49,10 @@ class Trial(Base):
"status IN ('complete', 'failed', 'pruned')",
name="trials_status_check",
),
CheckConstraint(
"per_query_metrics IS NULL OR jsonb_typeof(per_query_metrics) = 'object'",
name="trials_per_query_metrics_object_check",
),
)

id: Mapped[str] = mapped_column(String(36), primary_key=True)
Expand All @@ -62,6 +79,15 @@ class Trial(Base):
"""``{ndcg@10: ..., map: ..., p@10: ...}`` — every metric the study's
objective enumerated, scored by ``backend/eval/scoring.py`` (lands in
``infra_optuna_eval``)."""
per_query_metrics: Mapped[dict[str, Any] | None] = mapped_column(JSONB, nullable=True)
"""Per-query pytrec_eval scores from ``scoring.py::score()``'s
``per_query`` dict, persisted on every successful trial (NULL on
failure/pruned and on trials predating migration 0015). Shape:
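``{query_id: {metric_token: float}}`` where ``metric_token`` is the
user-facing token emitted by ``scoring.score()`` — ``@<k>``-suffixed for
cutoff-aware metrics (``ndcg@10``, ``map@10``, ``precision@10``,
``recall@10``) and bare for cutoff-free ones (``mrr``). Consumed by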
``backend.app.domain.study.confidence`` to compute the
``ConfidenceShape`` surfaced on ``StudyDetail`` + PR body + digest
narrative (feat_pr_metric_confidence)."""
duration_ms: Mapped[int | None] = mapped_column(Integer, nullable=True)
"""Wall-clock from ``study.ask()`` to ``study.tell()`` for this trial."""
status: Mapped[str] = mapped_column(Text, nullable=False)
Expand Down
Loading
Loading