diff --git a/.claude/sweep-accuracy-state.csv b/.claude/sweep-accuracy-state.csv index 40432e971..400e2e19d 100644 --- a/.claude/sweep-accuracy-state.csv +++ b/.claude/sweep-accuracy-state.csv @@ -33,4 +33,4 @@ terrain,2026-04-10T12:00:00Z,,,,Perlin/Worley/ridged noise correct. Dask chunk b terrain_metrics,2026-04-30,,LOW,2;5,"LOW: Inf input not rejected, propagates as Inf (consistent across backends but undocumented). LOW: dask+cupy non-nan boundary path double-pads (wasted compute, central output values still correct). No CRIT/HIGH; tests cover NaN propagation, all 4 backends, all 4 boundary modes, dtype acceptance." visibility,2026-04-13T12:00:00Z,,,,"Bresenham line, LOS kernel, Fresnel zone all correct. All backends converge to numpy." worley,2026-05-01,,MEDIUM,2;5,"MEDIUM: numpy backend uses np.empty_like(data) so integer input dtype produces integer output (distances truncated to 0); cupy/dask paths always produce float32. LOW: freq=inf produces 100000 sentinel (sqrt of initial min_dist=1e10), no validation of freq/seed for non-finite values." -zonal,2026-03-30T12:00:00Z,1090,,, +zonal,2026-05-27,2528,MEDIUM,5,"Pass 2 (2026-05-27): MEDIUM fixed -- issue #2528. zonal_stats() on dask-backed inputs silently dropped 'majority' from the requested stats list. The mutable default stats_funcs included 'majority' (added in commit 7c8d5759), but the dask path filtered it out at xrspatial/zonal.py:459 (computed_stats = [s for s in stats_funcs.keys() if s in stats_dict]) because 'majority' is not in _DASK_BLOCK_STATS. Symptom: stats(zones=dask, values=dask) returned 7 columns instead of the 8 the docstring promises; stats(..., stats_funcs=['mean','majority']) returned only ['zone','mean'] with no error or warning. Both dask+numpy and dask+cupy were affected (dask+cupy delegates to dask+numpy). Fix: replaced the mutable list literal default with stats_funcs=None and resolved the default per backend inside the function -- numpy/cupy get the full 8-stat list, dask gets the 7-stat subset (no majority). Explicit majority on dask now raises ValueError with a clear supported-stats message instead of silently filtering. 4 regression tests in test_zonal.py: explicit majority raises on dask, bare default omits majority on dask, bare default keeps majority on numpy, default list is not mutated across calls (covers the historical mutable-default pitfall). All 129 test_zonal.py tests pass (125 pre-existing + 4 new); test_dasymetric.py 61 tests still pass (dasymetric uses zonal.stats internally). Categories: Cat 5 (backend inconsistency: numpy/cupy honoured majority; dask paths silently dropped it). | Pass 1 (2026-03-30T12:00:00Z): historical entry #1090." diff --git a/xrspatial/tests/test_zonal.py b/xrspatial/tests/test_zonal.py index d2bbcb6c4..9738a10c6 100644 --- a/xrspatial/tests/test_zonal.py +++ b/xrspatial/tests/test_zonal.py @@ -1900,6 +1900,78 @@ def test_crop_missing_zone_ids_raises(): crop(raster, raster) +# --------------------------------------------------------------------------- +# Regression tests for #2528: dask backend must not silently drop 'majority' +# from the requested stats list. +# --------------------------------------------------------------------------- + + +@pytest.mark.skipif(not has_dask_array(), reason="dask.array not available") +def test_stats_dask_explicit_majority_raises_2528(): + """Explicit 'majority' on dask must raise instead of being silently dropped.""" + zones_data = np.array([[1, 1, 2, 2], [1, 1, 2, 2]], dtype=float) + values_data = np.array([[1.0, 1, 2, 2], [3, 3, 5, 5]]) + + zones = xr.DataArray(da.from_array(zones_data, chunks=(2, 2)), dims=['y', 'x']) + values = xr.DataArray(da.from_array(values_data, chunks=(2, 2)), dims=['y', 'x']) + + with pytest.raises(ValueError, match="majority"): + stats(zones=zones, values=values, stats_funcs=['mean', 'majority']) + + # Single-stat majority request also raises. + with pytest.raises(ValueError, match="majority"): + stats(zones=zones, values=values, stats_funcs=['majority']) + + +@pytest.mark.skipif(not has_dask_array(), reason="dask.array not available") +def test_stats_dask_default_omits_majority_2528(): + """Bare stats() on dask should resolve to the dask default (no majority). + + Regression for #2528: the previous mutable default included 'majority' + and the dask path silently dropped it. After the fix the dask default + resolves to the supported subset, so the result columns match the + documented contract and no stat is silently filtered. + """ + zones_data = np.array([[1, 1, 2, 2], [1, 1, 2, 2]], dtype=float) + values_data = np.array([[1.0, 1, 2, 2], [3, 3, 5, 5]]) + + zones = xr.DataArray(da.from_array(zones_data, chunks=(2, 2)), dims=['y', 'x']) + values = xr.DataArray(da.from_array(values_data, chunks=(2, 2)), dims=['y', 'x']) + + df = stats(zones=zones, values=values).compute() + expected_cols = ['zone', 'mean', 'max', 'min', 'sum', 'std', 'var', 'count'] + assert df.columns.tolist() == expected_cols + assert 'majority' not in df.columns + + +def test_stats_numpy_default_includes_majority_2528(): + """Bare stats() on numpy keeps 'majority' in the resolved default list.""" + zones = xr.DataArray(np.array([[1, 1, 2, 2], [1, 1, 2, 2]], dtype=float), + dims=['y', 'x']) + values = xr.DataArray(np.array([[1.0, 1, 2, 2], [3, 3, 5, 5]]), + dims=['y', 'x']) + + df = stats(zones=zones, values=values) + assert 'majority' in df.columns + + +def test_stats_default_list_is_not_mutated_2528(): + """Repeated calls with the default must not accumulate state. + + The previous implementation used a mutable list literal as the default + argument. After the switch to ``stats_funcs=None``, the resolved list + is freshly constructed each call, so a caller that mutates it locally + cannot leak state into the next caller. + """ + zones = xr.DataArray(np.array([[1, 1], [2, 2]], dtype=float), dims=['y', 'x']) + values = xr.DataArray(np.array([[1.0, 2.0], [3.0, 4.0]]), dims=['y', 'x']) + + df1 = stats(zones=zones, values=values) + df2 = stats(zones=zones, values=values) + assert df1.columns.tolist() == df2.columns.tolist() + assert 'majority' in df1.columns and 'majority' in df2.columns + + # --------------------------------------------------------------------------- # Regression tests for #881: np.unique / np.isfinite must not materialise # the full dask array. diff --git a/xrspatial/zonal.py b/xrspatial/zonal.py index e07b7859e..18eba18ba 100644 --- a/xrspatial/zonal.py +++ b/xrspatial/zonal.py @@ -41,6 +41,16 @@ class cupy(object): TOTAL_COUNT = '_total_count' +_DEFAULT_STATS_NUMPY = [ + "mean", "max", "min", "sum", "std", "var", "count", "majority", +] + +# 'majority' cannot be computed block-by-block, so it is omitted from the +# dask default list and rejected when explicitly requested on a dask input. +_DEFAULT_STATS_DASK = [ + "mean", "max", "min", "sum", "std", "var", "count", +] + def _maybe_rasterize_zones(zones, values, column=None, rasterize_kw=None): """If *zones* is vector data, rasterize it using *values* as the template. @@ -634,16 +644,7 @@ def stats( zones, values: xr.DataArray, zone_ids: Optional[List[Union[int, float]]] = None, - stats_funcs: Union[Dict, List] = [ - "mean", - "max", - "min", - "sum", - "std", - "var", - "count", - "majority", - ], + stats_funcs: Optional[Union[Dict, List]] = None, nodata_values: Union[int, float] = None, return_type: str = 'pandas.DataFrame', column: Optional[str] = None, @@ -688,16 +689,19 @@ def stats( List of zones to be included in calculation. If no zone_ids provided, all zones will be used. - stats_funcs : dict, or list of strings, default=['mean', 'max', 'min', - 'sum', 'std', 'var', 'count', 'majority'] - The statistics to calculate for each zone. If a list, possible + stats_funcs : dict, or list of strings, optional + The statistics to calculate for each zone. If a list, possible choices are subsets of the default options. In the dictionary case, all of its values must be callable. Function takes only one argument that is the `values` raster. - The key become the column name in the output DataFrame. - Note that if `zones` and `values` are dask backed DataArrays, - `stats_funcs` must be provided as a list that is a subset of - default supported stats. + The key becomes the column name in the output DataFrame. + Defaults: ``['mean', 'max', 'min', 'sum', 'std', 'var', 'count', + 'majority']`` for numpy/cupy and ``['mean', 'max', 'min', 'sum', + 'std', 'var', 'count']`` for dask-backed inputs. ``'majority'`` + cannot be computed block-by-block so requesting it on a dask + input raises ``ValueError`` instead of being silently dropped. + Note that if `zones` and `values` are dask-backed DataArrays, + `stats_funcs` must be provided as a list (or left unset). nodata_values: int, float, default=None Nodata value in `values` raster. @@ -844,14 +848,33 @@ def stats( validate_arrays(zones, values) + is_dask_values = has_dask_array() and isinstance(values.data, da.Array) + + # Resolve the default stats_funcs based on backend. The dask path cannot + # compute 'majority' block-by-block, so its default list omits it. Using + # None as the sentinel default also avoids the mutable-default pitfall. + if stats_funcs is None: + stats_funcs = ( + list(_DEFAULT_STATS_DASK) if is_dask_values + else list(_DEFAULT_STATS_NUMPY) + ) + # validate stats_funcs - if has_dask_array() and isinstance(values.data, da.Array) and not isinstance(stats_funcs, list): + if is_dask_values and not isinstance(stats_funcs, list): raise ValueError( "Got dask-backed DataArray as `values` aggregate. " - "`stats_funcs` must be a subset of default supported stats " - "`[\'mean\', \'max\', \'min\', \'sum\', \'std\', \'var\', \'count\']`" + "`stats_funcs` must be a list that is a subset of " + f"{_DEFAULT_STATS_DASK!r}." ) + if is_dask_values and isinstance(stats_funcs, list): + unsupported = [s for s in stats_funcs if s not in _DEFAULT_STATS_DASK] + if unsupported: + raise ValueError( + f"stats_funcs={unsupported!r} not supported on dask-backed " + f"input. Supported on dask: {_DEFAULT_STATS_DASK!r}." + ) + if isinstance(stats_funcs, list): # create a dict of stats stats_funcs_dict = {}