Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .claude/sweep-accuracy-state.csv
Original file line number Diff line number Diff line change
Expand Up @@ -33,4 +33,4 @@ terrain,2026-04-10T12:00:00Z,,,,Perlin/Worley/ridged noise correct. Dask chunk b
terrain_metrics,2026-04-30,,LOW,2;5,"LOW: Inf input not rejected, propagates as Inf (consistent across backends but undocumented). LOW: dask+cupy non-nan boundary path double-pads (wasted compute, central output values still correct). No CRIT/HIGH; tests cover NaN propagation, all 4 backends, all 4 boundary modes, dtype acceptance."
visibility,2026-04-13T12:00:00Z,,,,"Bresenham line, LOS kernel, Fresnel zone all correct. All backends converge to numpy."
worley,2026-05-01,,MEDIUM,2;5,"MEDIUM: numpy backend uses np.empty_like(data) so integer input dtype produces integer output (distances truncated to 0); cupy/dask paths always produce float32. LOW: freq=inf produces 100000 sentinel (sqrt of initial min_dist=1e10), no validation of freq/seed for non-finite values."
zonal,2026-03-30T12:00:00Z,1090,,,
zonal,2026-05-27,2528,MEDIUM,5,"Pass 2 (2026-05-27): MEDIUM fixed -- issue #2528. zonal_stats() on dask-backed inputs silently dropped 'majority' from the requested stats list. The mutable default stats_funcs included 'majority' (added in commit 7c8d5759), but the dask path filtered it out at xrspatial/zonal.py:459 (computed_stats = [s for s in stats_funcs.keys() if s in stats_dict]) because 'majority' is not in _DASK_BLOCK_STATS. Symptom: stats(zones=dask, values=dask) returned 7 columns instead of the 8 the docstring promises; stats(..., stats_funcs=['mean','majority']) returned only ['zone','mean'] with no error or warning. Both dask+numpy and dask+cupy were affected (dask+cupy delegates to dask+numpy). Fix: replaced the mutable list literal default with stats_funcs=None and resolved the default per backend inside the function -- numpy/cupy get the full 8-stat list, dask gets the 7-stat subset (no majority). Explicit majority on dask now raises ValueError with a clear supported-stats message instead of silently filtering. 4 regression tests in test_zonal.py: explicit majority raises on dask, bare default omits majority on dask, bare default keeps majority on numpy, default list is not mutated across calls (covers the historical mutable-default pitfall). All 129 test_zonal.py tests pass (125 pre-existing + 4 new); test_dasymetric.py 61 tests still pass (dasymetric uses zonal.stats internally). Categories: Cat 5 (backend inconsistency: numpy/cupy honoured majority; dask paths silently dropped it). | Pass 1 (2026-03-30T12:00:00Z): historical entry #1090."
72 changes: 72 additions & 0 deletions xrspatial/tests/test_zonal.py
Original file line number Diff line number Diff line change
Expand Up @@ -1900,6 +1900,78 @@ def test_crop_missing_zone_ids_raises():
crop(raster, raster)


# ---------------------------------------------------------------------------
# Regression tests for #2528: dask backend must not silently drop 'majority'
# from the requested stats list.
# ---------------------------------------------------------------------------


@pytest.mark.skipif(not has_dask_array(), reason="dask.array not available")
def test_stats_dask_explicit_majority_raises_2528():
"""Explicit 'majority' on dask must raise instead of being silently dropped."""
zones_data = np.array([[1, 1, 2, 2], [1, 1, 2, 2]], dtype=float)
values_data = np.array([[1.0, 1, 2, 2], [3, 3, 5, 5]])

zones = xr.DataArray(da.from_array(zones_data, chunks=(2, 2)), dims=['y', 'x'])
values = xr.DataArray(da.from_array(values_data, chunks=(2, 2)), dims=['y', 'x'])

with pytest.raises(ValueError, match="majority"):
stats(zones=zones, values=values, stats_funcs=['mean', 'majority'])

# Single-stat majority request also raises.
with pytest.raises(ValueError, match="majority"):
stats(zones=zones, values=values, stats_funcs=['majority'])


@pytest.mark.skipif(not has_dask_array(), reason="dask.array not available")
def test_stats_dask_default_omits_majority_2528():
"""Bare stats() on dask should resolve to the dask default (no majority).

Regression for #2528: the previous mutable default included 'majority'
and the dask path silently dropped it. After the fix the dask default
resolves to the supported subset, so the result columns match the
documented contract and no stat is silently filtered.
"""
zones_data = np.array([[1, 1, 2, 2], [1, 1, 2, 2]], dtype=float)
values_data = np.array([[1.0, 1, 2, 2], [3, 3, 5, 5]])

zones = xr.DataArray(da.from_array(zones_data, chunks=(2, 2)), dims=['y', 'x'])
values = xr.DataArray(da.from_array(values_data, chunks=(2, 2)), dims=['y', 'x'])

df = stats(zones=zones, values=values).compute()
expected_cols = ['zone', 'mean', 'max', 'min', 'sum', 'std', 'var', 'count']
assert df.columns.tolist() == expected_cols
assert 'majority' not in df.columns


def test_stats_numpy_default_includes_majority_2528():
"""Bare stats() on numpy keeps 'majority' in the resolved default list."""
zones = xr.DataArray(np.array([[1, 1, 2, 2], [1, 1, 2, 2]], dtype=float),
dims=['y', 'x'])
values = xr.DataArray(np.array([[1.0, 1, 2, 2], [3, 3, 5, 5]]),
dims=['y', 'x'])

df = stats(zones=zones, values=values)
assert 'majority' in df.columns


def test_stats_default_list_is_not_mutated_2528():
"""Repeated calls with the default must not accumulate state.

The previous implementation used a mutable list literal as the default
argument. After the switch to ``stats_funcs=None``, the resolved list
is freshly constructed each call, so a caller that mutates it locally
cannot leak state into the next caller.
"""
zones = xr.DataArray(np.array([[1, 1], [2, 2]], dtype=float), dims=['y', 'x'])
values = xr.DataArray(np.array([[1.0, 2.0], [3.0, 4.0]]), dims=['y', 'x'])

df1 = stats(zones=zones, values=values)
df2 = stats(zones=zones, values=values)
assert df1.columns.tolist() == df2.columns.tolist()
assert 'majority' in df1.columns and 'majority' in df2.columns


# ---------------------------------------------------------------------------
# Regression tests for #881: np.unique / np.isfinite must not materialise
# the full dask array.
Expand Down
63 changes: 43 additions & 20 deletions xrspatial/zonal.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,16 @@ class cupy(object):

TOTAL_COUNT = '_total_count'

_DEFAULT_STATS_NUMPY = [
"mean", "max", "min", "sum", "std", "var", "count", "majority",
]

# 'majority' cannot be computed block-by-block, so it is omitted from the
# dask default list and rejected when explicitly requested on a dask input.
_DEFAULT_STATS_DASK = [
"mean", "max", "min", "sum", "std", "var", "count",
]


def _maybe_rasterize_zones(zones, values, column=None, rasterize_kw=None):
"""If *zones* is vector data, rasterize it using *values* as the template.
Expand Down Expand Up @@ -634,16 +644,7 @@ def stats(
zones,
values: xr.DataArray,
zone_ids: Optional[List[Union[int, float]]] = None,
stats_funcs: Union[Dict, List] = [
"mean",
"max",
"min",
"sum",
"std",
"var",
"count",
"majority",
],
stats_funcs: Optional[Union[Dict, List]] = None,
nodata_values: Union[int, float] = None,
return_type: str = 'pandas.DataFrame',
column: Optional[str] = None,
Expand Down Expand Up @@ -688,16 +689,19 @@ def stats(
List of zones to be included in calculation. If no zone_ids provided,
all zones will be used.

stats_funcs : dict, or list of strings, default=['mean', 'max', 'min',
'sum', 'std', 'var', 'count', 'majority']
The statistics to calculate for each zone. If a list, possible
stats_funcs : dict, or list of strings, optional
The statistics to calculate for each zone. If a list, possible
choices are subsets of the default options.
In the dictionary case, all of its values must be
callable. Function takes only one argument that is the `values` raster.
The key become the column name in the output DataFrame.
Note that if `zones` and `values` are dask backed DataArrays,
`stats_funcs` must be provided as a list that is a subset of
default supported stats.
The key becomes the column name in the output DataFrame.
Defaults: ``['mean', 'max', 'min', 'sum', 'std', 'var', 'count',
'majority']`` for numpy/cupy and ``['mean', 'max', 'min', 'sum',
'std', 'var', 'count']`` for dask-backed inputs. ``'majority'``
cannot be computed block-by-block so requesting it on a dask
input raises ``ValueError`` instead of being silently dropped.
Note that if `zones` and `values` are dask-backed DataArrays,
`stats_funcs` must be provided as a list (or left unset).

nodata_values: int, float, default=None
Nodata value in `values` raster.
Expand Down Expand Up @@ -844,14 +848,33 @@ def stats(

validate_arrays(zones, values)

is_dask_values = has_dask_array() and isinstance(values.data, da.Array)

# Resolve the default stats_funcs based on backend. The dask path cannot
# compute 'majority' block-by-block, so its default list omits it. Using
# None as the sentinel default also avoids the mutable-default pitfall.
if stats_funcs is None:
stats_funcs = (
list(_DEFAULT_STATS_DASK) if is_dask_values
else list(_DEFAULT_STATS_NUMPY)
)

# validate stats_funcs
if has_dask_array() and isinstance(values.data, da.Array) and not isinstance(stats_funcs, list):
if is_dask_values and not isinstance(stats_funcs, list):
raise ValueError(
"Got dask-backed DataArray as `values` aggregate. "
"`stats_funcs` must be a subset of default supported stats "
"`[\'mean\', \'max\', \'min\', \'sum\', \'std\', \'var\', \'count\']`"
"`stats_funcs` must be a list that is a subset of "
f"{_DEFAULT_STATS_DASK!r}."
)

if is_dask_values and isinstance(stats_funcs, list):
unsupported = [s for s in stats_funcs if s not in _DEFAULT_STATS_DASK]
if unsupported:
raise ValueError(
f"stats_funcs={unsupported!r} not supported on dask-backed "
f"input. Supported on dask: {_DEFAULT_STATS_DASK!r}."
)

if isinstance(stats_funcs, list):
# create a dict of stats
stats_funcs_dict = {}
Expand Down
Loading