Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 2 additions & 6 deletions architecture/dataset-builders.md
Original file line number Diff line number Diff line change
Expand Up @@ -71,11 +71,7 @@ Both execution modes integrate skip at the same points:
- **Sequential**: `_run_full_column_generator` and the fan-out methods (`_fan_out_with_threads`, `_fan_out_with_async`) call `_should_skip_cell` per record. Skipped rows are excluded from the generator input, then merged back with skip metadata preserved. A fast `_column_can_skip` check short-circuits the per-record evaluation when no skip config or propagation applies.
- **Async**: `_run_cell` and `_run_batch` in `AsyncTaskScheduler` call `_should_skip_record` / `_apply_skip_to_record` with the same logic. Skipped cells report as skipped (not success) in progress tracking.

DAG edges are added for `skip.when` column references (both in `dag.py` and `ExecutionGraph.create`) so skip-gate columns are generated before the gated column.

### DAG (Config-Level)

`dataset_builders/utils/dag.py` provides `topologically_sort_column_configs` — builds a NetworkX graph from `required_columns`, side-effect columns, and `skip.when` references, returns a topological ordering. Used by both execution modes for initial column ordering.
DAG edges are added for `skip.when` column references in `ExecutionGraph.create` so skip-gate columns are generated before the gated column.
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
DAG edges are added for `skip.when` column references in `ExecutionGraph.create` so skip-gate columns are generated before the gated column.
DAG edges are added for `skip.when` column references in both `topologically_sort_column_configs` (compile-time sort) and `ExecutionGraph.create` (async runtime) so skip-gate columns are generated before the gated column.


### DatasetBatchManager

Expand Down Expand Up @@ -118,7 +114,7 @@ DatasetBuilder.build()
- **Dual execution engines behind one API.** The sequential engine is simpler and easier to debug; the async engine adds row-group parallelism for throughput. Users switch via an environment variable without changing their code.
- **DAG-driven ordering** ensures columns with dependencies (e.g., a judge column that depends on a text column) are generated in the correct order, regardless of the order they appear in the config.
- **Salvage rounds in async mode** retry failed tasks after all other tasks in a round complete, improving resilience against transient LLM failures without blocking the entire generation.
- **Separate config-level and runtime DAGs.** The config-level DAG (`dag.py`) determines column ordering; the runtime `ExecutionGraph` adds strategy-aware dependency tracking for the async scheduler.
- **Unified DAG construction.** `topologically_sort_column_configs` (in `execution_graph.py`) determines column ordering using Kahn's algorithm; the runtime `ExecutionGraph` adds strategy-aware dependency tracking for the async scheduler.

## Cross-References

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,8 @@
SamplerMultiColumnConfig,
SeedDatasetMultiColumnConfig,
)
from data_designer.engine.dataset_builders.utils.dag import topologically_sort_column_configs
from data_designer.engine.dataset_builders.utils.errors import ConfigCompilationError
from data_designer.engine.dataset_builders.utils.execution_graph import topologically_sort_column_configs


def compile_dataset_builder_column_configs(config: DataDesignerConfig) -> list[DatasetBuilderColumnConfigT]:
Expand Down

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,15 @@
from typing import TYPE_CHECKING

from data_designer.config.column_configs import GenerationStrategy
from data_designer.config.column_types import ColumnConfigT
from data_designer.engine.column_generators.utils.generator_classification import column_type_used_in_execution_dag
from data_designer.engine.dataset_builders.multi_column_configs import (
DatasetBuilderColumnConfigT,
MultiColumnConfig,
)
from data_designer.engine.dataset_builders.utils.errors import ConfigCompilationError, DAGCircularDependencyError
from data_designer.engine.dataset_builders.utils.task_model import SliceRef
from data_designer.logging import LOG_INDENT

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -330,3 +333,94 @@ def to_mermaid(self) -> str:
for dep in sorted(self._upstream.get(col, set())):
lines.append(f" {dep} --> {col}")
return "\n".join(lines)


def _resolve_dag_column(
col_name: str,
dag_col_dict: dict[str, ColumnConfigT],
side_effect_map: dict[str, str],
) -> str | None:
"""Resolve a column name to its DAG producer.

Returns the column itself if it is a direct DAG column, the producing
column if it is a declared side-effect, or ``None`` if the name is not
known to this DAG (e.g. a seed or sampler column).
"""
if col_name in dag_col_dict:
return col_name
return side_effect_map.get(col_name)
Comment on lines +349 to +351
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This method can be absorbed by _add_dag_edge since no other call sites reference it. WDYT?



def _add_dag_edge(
    name: str,
    dep: str,
    label: str,
    dag_col_dict: dict[str, ColumnConfigT],
    side_effect_map: dict[str, str],
    upstream: dict[str, set[str]],
    downstream: dict[str, set[str]],
) -> None:
    """Record that *name* depends on the DAG column producing *dep*.

    *dep* is first resolved to its producer via ``_resolve_dag_column``. No
    edge is recorded when the dependency is unknown to the DAG or when it
    resolves to *name* itself (self-edges are skipped, consistent with
    ``ExecutionGraph.create``). Otherwise the edge is written into both
    adjacency maps, which are mutated in place.

    *label* (``"required"`` or ``"skip.when"``) only appears in the debug log
    line, so the origin of each edge is visible while tracing.
    """
    # NOTE(review): a reviewer asked that this helper and ``_resolve_dag_column``
    # be placed after the public functions (style guide: public before private).
    resolved = _resolve_dag_column(dep, dag_col_dict, side_effect_map)
    if resolved is not None and resolved != name:
        logger.debug(f"{LOG_INDENT}🔗 `{name}` depends on `{resolved}` [{label}]")
        upstream[name].add(resolved)
        downstream[resolved].add(name)


def topologically_sort_column_configs(column_configs: list[ColumnConfigT]) -> list[ColumnConfigT]:
    """Return column configs in dependency order using Kahn's algorithm.

    Columns whose type is not used in the execution DAG (e.g. samplers, seeds)
    are placed first, in their input order. They are followed by the DAG
    columns, ordered so that every column comes after the producers of its
    ``required_columns`` and ``skip.when`` references. References to
    side-effect columns are resolved to the column that produces them;
    references to names outside the DAG add no edge.

    The ordering is deterministic for a given input: the ready queue is seeded
    in input order and each column's dependents are visited in input order, so
    the result does not depend on set iteration order.

    Args:
        column_configs: Column configs in any order.

    Returns:
        A new list with the non-DAG columns first, then the DAG columns in a
        valid topological order. If there are no DAG columns, only the non-DAG
        columns are returned.

    Raises:
        ConfigCompilationError: If two different columns declare the same
            side-effect column name.
        DAGCircularDependencyError: If the dependency graph contains a cycle.
    """
    non_dag_cols = [col for col in column_configs if not column_type_used_in_execution_dag(col.column_type)]
    dag_col_dict = {col.name: col for col in column_configs if column_type_used_in_execution_dag(col.column_type)}

    if not dag_col_dict:
        return non_dag_cols

    # side_effect_col_name -> producing column name
    side_effect_map: dict[str, str] = {}
    for name, col in dag_col_dict.items():
        for se_col in col.side_effect_columns:
            existing = side_effect_map.get(se_col)
            if existing is not None and existing != name:
                raise ConfigCompilationError(
                    f"Side-effect column {se_col!r} is already produced by {existing!r}; "
                    f"cannot register a second producer {name!r}. "
                    f"Use distinct side-effect column names for each pipeline stage."
                )
            side_effect_map[se_col] = name

    upstream: dict[str, set[str]] = {name: set() for name in dag_col_dict}
    downstream: dict[str, set[str]] = {name: set() for name in dag_col_dict}

    logger.info("⛓️ Sorting column configs into a Directed Acyclic Graph")
    for name, col in dag_col_dict.items():
        for req in col.required_columns:
            _add_dag_edge(name, req, "required", dag_col_dict, side_effect_map, upstream, downstream)
        if col.skip is not None:
            for skip_col in col.skip.columns:
                _add_dag_edge(name, skip_col, "skip.when", dag_col_dict, side_effect_map, upstream, downstream)

    # Input position of every DAG column; used to visit dependents in a stable
    # order instead of relying on (hash-dependent) set iteration order.
    declared_position = {name: idx for idx, name in enumerate(dag_col_dict)}

    # NOTE(review): a reviewer pointed out that the loop below closely mirrors
    # ``get_topological_order`` earlier in this module and could move into a
    # shared Kahn's-algorithm helper.
    in_degree = {name: len(ups) for name, ups in upstream.items()}
    queue: deque[str] = deque(name for name, deg in in_degree.items() if deg == 0)
    order: list[str] = []
    while queue:
        name = queue.popleft()
        order.append(name)
        for child in sorted(downstream[name], key=declared_position.__getitem__):
            in_degree[child] -= 1
            if in_degree[child] == 0:
                queue.append(child)

    # Any column left with a non-zero in-degree sits on (or behind) a cycle.
    if len(order) != len(dag_col_dict):
        raise DAGCircularDependencyError(
            "🛑 The Data Designer column configurations contain cyclic dependencies. Please "
            "inspect the column configurations and ensure they can be sorted without "
            "circular references."
        )

    return non_dag_cols + [dag_col_dict[n] for n in order]
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

from __future__ import annotations

from typing import Any

import pytest

from data_designer.config.base import SkipConfig
from data_designer.config.column_configs import (
CustomColumnConfig,
ExpressionColumnConfig,
Expand All @@ -21,13 +24,13 @@
from data_designer.config.utils.code_lang import CodeLang
from data_designer.config.validator_params import CodeValidatorParams
from data_designer.engine.dataset_builders.multi_column_configs import SamplerMultiColumnConfig
from data_designer.engine.dataset_builders.utils.dag import topologically_sort_column_configs
from data_designer.engine.dataset_builders.utils.errors import ConfigCompilationError, DAGCircularDependencyError
from data_designer.engine.dataset_builders.utils.execution_graph import topologically_sort_column_configs

MODEL_ALIAS = "stub-model-alias"


def test_dag_construction():
def test_dag_construction() -> None:
column_configs = []
column_configs.append(
SamplerMultiColumnConfig(
Expand Down Expand Up @@ -82,17 +85,17 @@ def test_dag_construction():

assert sorted_column_configs[0].column_type == DataDesignerColumnType.SAMPLER

assert [c.name for c in sorted_column_configs[1:]] == [
"test_code",
"test_validation",
"depends_on_validation",
"test_judge",
"test_code_and_depends_on_validation_reasoning_traces",
"uses_all_the_stuff",
]
names = [c.name for c in sorted_column_configs[1:]]
assert names[0] == "test_code"
assert names[1] == "test_validation"
assert names[2] == "depends_on_validation"
# test_judge and test_code_and_depends_on_validation_reasoning_traces have no mutual
# dependency, so their relative order is not guaranteed by topological sort.
assert set(names[3:5]) == {"test_judge", "test_code_and_depends_on_validation_reasoning_traces"}
assert names[5] == "uses_all_the_stuff"


def test_circular_dependencies():
def test_circular_dependencies() -> None:
column_configs = []
column_configs.append(
SamplerMultiColumnConfig(
Expand Down Expand Up @@ -135,3 +138,37 @@ def gen_b(row: dict[str, Any]) -> dict[str, Any]:
]
with pytest.raises(ConfigCompilationError, match="already produced by"):
topologically_sort_column_configs(column_configs)


def test_side_effect_column_ordering() -> None:
    """The producer of a side-effect column is sorted before a column that consumes it."""

    @custom_column_generator(required_columns=["seed"], side_effect_columns=["seed_trace"])
    def gen_with_trace(row: dict[str, Any]) -> dict[str, Any]:
        return row

    seed = LLMTextColumnConfig(name="seed", prompt="generate seed", model_alias=MODEL_ALIAS)
    # The consumer is listed ahead of the producer in the input, so the final
    # assertion only holds if the sort actually reorders them.
    consumer = ExpressionColumnConfig(name="consumer", expr="{{ seed_trace }}")
    producer = CustomColumnConfig(name="producer", generator_function=gen_with_trace)

    ordered = topologically_sort_column_configs([seed, consumer, producer])
    ordered_names = [cfg.name for cfg in ordered]

    assert ordered_names.index("producer") < ordered_names.index("consumer")


def test_skip_when_column_ordering() -> None:
    """A column whose skip.when references another DAG column is sorted after that column."""
    seed = LLMTextColumnConfig(name="seed", prompt="generate seed", model_alias=MODEL_ALIAS)
    # `gated` does not mention `seed` in its prompt; the only reference to it
    # is inside the skip.when expression.
    gated = LLMTextColumnConfig(
        name="gated",
        prompt="generate gated",
        model_alias=MODEL_ALIAS,
        skip=SkipConfig(when="{{ seed == 'bad' }}"),
    )

    ordered = topologically_sort_column_configs([seed, gated])
    ordered_names = [cfg.name for cfg in ordered]

    assert ordered_names.index("seed") < ordered_names.index("gated")
Loading