Skip to content

Commit 6e508b4

Browse files
committed
feat(interface): typed DataDesignerEarlyShutdownError on zero-record runs
When the async scheduler hits early shutdown and produces zero records, the buffer manager (correctly) skips writing parquet, so ArtifactStorage.load_dataset_with_dropped_columns() raises FileNotFoundError. Previously this surfaced as a generic DataDesignerGenerationError wrapping the FileNotFoundError, which is ambiguous (the files could be missing for any reason).

This commit:

- Adds DataDesignerEarlyShutdownError as a subclass of DataDesignerGenerationError, so existing handlers still match while callers that want to react programmatically (retry on a different alias, surface a degraded-provider message, etc.) can catch the specific type.
- Plumbs the scheduler's structured signals (early_shutdown, partial_row_groups) up through the builder so they're available at data_designer.create() time without re-introspecting the scheduler.
- Makes create() raise the typed error in both failure modes (the load fails, or an empty DataFrame is returned) when builder.early_shutdown is True.

Refs #575.
1 parent 49cc9bf commit 6e508b4

5 files changed

Lines changed: 114 additions & 2 deletions

File tree

packages/data-designer-engine/src/data_designer/engine/dataset_builders/dataset_builder.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,10 @@ def __init__(
113113
self._registry = registry or DataDesignerRegistry()
114114
self._graph: ExecutionGraph | None = None
115115
self._use_async: bool = DATA_DESIGNER_ASYNC_ENGINE
116+
# Structured signal: set by _build_async if the scheduler hit early shutdown.
117+
# Stays at defaults for sync-engine and successful async runs.
118+
self._early_shutdown: bool = False
119+
self._partial_row_groups: tuple[int, ...] = ()
116120

117121
self._data_designer_config = compile_data_designer_config(data_designer_config, resource_provider)
118122
self._column_configs = compile_dataset_builder_column_configs(self._data_designer_config)
@@ -135,6 +139,16 @@ def processors(self) -> tuple[Processor, ...]:
135139
def task_traces(self) -> list[TaskTrace]:
136140
return self._task_traces
137141

142+
@property
143+
def early_shutdown(self) -> bool:
144+
"""True if the most recent async run terminated via the early-shutdown gate."""
145+
return self._early_shutdown
146+
147+
@property
148+
def partial_row_groups(self) -> tuple[int, ...]:
149+
"""Row group ids that were partially salvaged after early shutdown (most recent run)."""
150+
return self._partial_row_groups
151+
138152
def set_processor_runner(self, processors: list[Processor]) -> None:
139153
"""Replace the processor runner with a new one using the given processors."""
140154
self._processor_runner = ProcessorRunner(
@@ -326,6 +340,8 @@ def on_complete(final_path: Path | str | None) -> None:
326340
future.result()
327341

328342
self._task_traces = scheduler.traces
343+
self._early_shutdown = scheduler.early_shutdown
344+
self._partial_row_groups = scheduler.partial_row_groups
329345

330346
# Emit telemetry
331347
try:

packages/data-designer/src/data_designer/interface/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,13 +9,15 @@
99
if TYPE_CHECKING:
1010
from data_designer.interface.data_designer import DataDesigner # noqa: F401
1111
from data_designer.interface.errors import ( # noqa: F401
12+
DataDesignerEarlyShutdownError,
1213
DataDesignerGenerationError,
1314
DataDesignerProfilingError,
1415
)
1516
from data_designer.interface.results import DatasetCreationResults # noqa: F401
1617

1718
_LAZY_IMPORTS: dict[str, tuple[str, str]] = {
1819
"DataDesigner": ("data_designer.interface.data_designer", "DataDesigner"),
20+
"DataDesignerEarlyShutdownError": ("data_designer.interface.errors", "DataDesignerEarlyShutdownError"),
1921
"DataDesignerGenerationError": ("data_designer.interface.errors", "DataDesignerGenerationError"),
2022
"DataDesignerProfilingError": ("data_designer.interface.errors", "DataDesignerProfilingError"),
2123
"DatasetCreationResults": ("data_designer.interface.results", "DatasetCreationResults"),

packages/data-designer/src/data_designer/interface/data_designer.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,7 @@
6161
)
6262
from data_designer.engine.storage.artifact_storage import ArtifactStorage
6363
from data_designer.interface.errors import (
64+
DataDesignerEarlyShutdownError,
6465
DataDesignerGenerationError,
6566
DataDesignerProfilingError,
6667
)
@@ -234,6 +235,17 @@ def create(
234235
try:
235236
dataset_for_profiler = builder.artifact_storage.load_dataset_with_dropped_columns()
236237
except Exception as e:
238+
# Distinguish "early shutdown produced zero records" from generic load failures
239+
# so callers can react programmatically (e.g. retry on a different alias) instead
240+
# of parsing a wrapped FileNotFoundError. The scheduler's structured signal lives
241+
# on the builder for the duration of the run.
242+
if builder.early_shutdown:
243+
raise DataDesignerEarlyShutdownError(
244+
"🛑 Generation produced zero records — early shutdown was triggered. "
245+
"The non-retryable error rate exceeded the configured threshold; check the "
246+
"warnings above (and any 'Provider showing degraded performance' logs) for "
247+
"the contributing failures."
248+
) from e
237249
raise DataDesignerGenerationError(
238250
f"🛑 Failed to load generated dataset — all records may have been dropped "
239251
f"due to generation failures. Check the warnings above for details. Original error: {e}"
@@ -243,6 +255,11 @@ def create(
243255
# practice load_dataset_with_dropped_columns() would raise before returning a
244256
# zero-row DataFrame. This guard protects against future changes to that contract.
245257
if len(dataset_for_profiler) == 0:
258+
if builder.early_shutdown:
259+
raise DataDesignerEarlyShutdownError(
260+
"🛑 Dataset is empty — early shutdown was triggered before any records "
261+
"could complete. Check the warnings above for the contributing failures."
262+
)
246263
raise DataDesignerGenerationError(
247264
"🛑 Dataset is empty — all records were dropped due to generation failures. "
248265
"Check the warnings above for details on which columns failed."

packages/data-designer/src/data_designer/interface/errors.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,5 +14,15 @@ class DataDesignerGenerationError(DataDesignerError):
1414
"""Raised for errors related to a Data Designer dataset generation."""
1515

1616

17+
class DataDesignerEarlyShutdownError(DataDesignerGenerationError):
18+
"""Raised when a run terminated via early shutdown and produced no records.
19+
20+
Subclass of ``DataDesignerGenerationError`` so existing handlers still catch
21+
it; callers that want to distinguish the early-shutdown case (e.g. to retry
22+
with a different model alias or surface a degraded-provider message to the
23+
user) can catch this specific type.
24+
"""
25+
26+
1727
class InvalidBufferValueError(DataDesignerError):
1828
"""Raised for errors related to an invalid buffer value."""

packages/data-designer/tests/interface/test_data_designer.py

Lines changed: 69 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
from datetime import datetime
99
from pathlib import Path
1010
from typing import Any
11-
from unittest.mock import MagicMock, patch
11+
from unittest.mock import MagicMock, PropertyMock, patch
1212

1313
import pytest
1414
from pydantic import ValidationError
@@ -39,7 +39,11 @@
3939
from data_designer.engine.testing.seed_readers import LineFanoutDirectorySeedReader
4040
from data_designer.engine.testing.stubs import StubHuggingFaceSeedReader
4141
from data_designer.interface.data_designer import DataDesigner
42-
from data_designer.interface.errors import DataDesignerGenerationError, DataDesignerProfilingError
42+
from data_designer.interface.errors import (
43+
DataDesignerEarlyShutdownError,
44+
DataDesignerGenerationError,
45+
DataDesignerProfilingError,
46+
)
4347

4448

4549
class CustomDirectorySeedReader(FileSystemSeedReader[DirectorySeedSource]):
@@ -682,6 +686,69 @@ def test_create_raises_generation_error_when_load_dataset_fails(
682686
assert isinstance(exc_info.value.__cause__, FileNotFoundError)
683687

684688

689+
def test_create_raises_early_shutdown_error_when_load_fails_after_shutdown(
690+
stub_artifact_path: Path,
691+
stub_model_providers: list[ModelProvider],
692+
stub_sampler_only_config_builder: DataDesignerConfigBuilder,
693+
stub_managed_assets_path: Path,
694+
) -> None:
695+
"""When the scheduler hit early shutdown and zero records were produced, surface the
696+
typed DataDesignerEarlyShutdownError instead of the generic load-failure wrap."""
697+
data_designer = DataDesigner(
698+
artifact_path=stub_artifact_path,
699+
model_providers=stub_model_providers,
700+
secret_resolver=PlaintextResolver(),
701+
managed_assets_path=stub_managed_assets_path,
702+
)
703+
704+
with (
705+
patch(
706+
"data_designer.engine.storage.artifact_storage.ArtifactStorage.load_dataset_with_dropped_columns",
707+
side_effect=FileNotFoundError("No parquet files found"),
708+
),
709+
patch(
710+
"data_designer.engine.dataset_builders.dataset_builder.DatasetBuilder.early_shutdown",
711+
new_callable=PropertyMock,
712+
return_value=True,
713+
),
714+
):
715+
with pytest.raises(DataDesignerEarlyShutdownError, match="early shutdown was triggered") as exc_info:
716+
data_designer.create(stub_sampler_only_config_builder, num_records=1)
717+
# Subclass of DataDesignerGenerationError so existing handlers still match.
718+
assert isinstance(exc_info.value, DataDesignerGenerationError)
719+
assert isinstance(exc_info.value.__cause__, FileNotFoundError)
720+
721+
722+
def test_create_raises_early_shutdown_error_on_empty_dataframe_after_shutdown(
723+
stub_artifact_path: Path,
724+
stub_model_providers: list[ModelProvider],
725+
stub_sampler_only_config_builder: DataDesignerConfigBuilder,
726+
stub_managed_assets_path: Path,
727+
) -> None:
728+
"""Defensive guard path: when load_dataset_with_dropped_columns returns an empty DF
729+
AND the scheduler hit early shutdown, the typed error wins over the generic one."""
730+
data_designer = DataDesigner(
731+
artifact_path=stub_artifact_path,
732+
model_providers=stub_model_providers,
733+
secret_resolver=PlaintextResolver(),
734+
managed_assets_path=stub_managed_assets_path,
735+
)
736+
737+
with (
738+
patch(
739+
"data_designer.engine.storage.artifact_storage.ArtifactStorage.load_dataset_with_dropped_columns",
740+
return_value=lazy.pd.DataFrame(),
741+
),
742+
patch(
743+
"data_designer.engine.dataset_builders.dataset_builder.DatasetBuilder.early_shutdown",
744+
new_callable=PropertyMock,
745+
return_value=True,
746+
),
747+
):
748+
with pytest.raises(DataDesignerEarlyShutdownError, match="early shutdown was triggered"):
749+
data_designer.create(stub_sampler_only_config_builder, num_records=1)
750+
751+
685752
def test_preview_raises_generation_error_when_dataset_is_empty(
686753
stub_artifact_path, stub_model_providers, stub_sampler_only_config_builder, stub_managed_assets_path
687754
):

0 commit comments

Comments
 (0)