xarray-contrib · brendancol · May 28, 2026 · May 27, 2026 · May 28, 2026
diff --git a/.claude/sweep-accuracy-state.csv b/.claude/sweep-accuracy-state.csv
@@ -33,4 +33,4 @@ terrain,2026-04-10T12:00:00Z,,,,Perlin/Worley/ridged noise correct. Dask chunk b
 terrain_metrics,2026-04-30,,LOW,2;5,"LOW: Inf input not rejected, propagates as Inf (consistent across backends but undocumented). LOW: dask+cupy non-nan boundary path double-pads (wasted compute, central output values still correct). No CRIT/HIGH; tests cover NaN propagation, all 4 backends, all 4 boundary modes, dtype acceptance."
 visibility,2026-04-13T12:00:00Z,,,,"Bresenham line, LOS kernel, Fresnel zone all correct. All backends converge to numpy."
 worley,2026-05-01,,MEDIUM,2;5,"MEDIUM: numpy backend uses np.empty_like(data) so integer input dtype produces integer output (distances truncated to 0); cupy/dask paths always produce float32. LOW: freq=inf produces 100000 sentinel (sqrt of initial min_dist=1e10), no validation of freq/seed for non-finite values."
-zonal,2026-03-30T12:00:00Z,1090,,,
+zonal,2026-05-27,2528,MEDIUM,5,"Pass 2 (2026-05-27): MEDIUM fixed -- issue #2528. zonal_stats() on dask-backed inputs silently dropped 'majority' from the requested stats list. The mutable default stats_funcs included 'majority' (added in commit 7c8d5759), but the dask path filtered it out at xrspatial/zonal.py:459 (computed_stats = [s for s in stats_funcs.keys() if s in stats_dict]) because 'majority' is not in _DASK_BLOCK_STATS. Symptom: stats(zones=dask, values=dask) returned 7 columns instead of the 8 the docstring promises; stats(..., stats_funcs=['mean','majority']) returned only ['zone','mean'] with no error or warning. Both dask+numpy and dask+cupy were affected (dask+cupy delegates to dask+numpy). Fix: replaced the mutable list literal default with stats_funcs=None and resolved the default per backend inside the function -- numpy/cupy get the full 8-stat list, dask gets the 7-stat subset (no majority).  Explicit majority on dask now raises ValueError with a clear supported-stats message instead of silently filtering. 4 regression tests in test_zonal.py: explicit majority raises on dask, bare default omits majority on dask, bare default keeps majority on numpy, default list is not mutated across calls (covers the historical mutable-default pitfall). All 129 test_zonal.py tests pass (125 pre-existing + 4 new); test_dasymetric.py 61 tests still pass (dasymetric uses zonal.stats internally). Categories: Cat 5 (backend inconsistency: numpy/cupy honoured majority; dask paths silently dropped it). | Pass 1 (2026-03-30T12:00:00Z): historical entry #1090."
diff --git a/xrspatial/tests/test_zonal.py b/xrspatial/tests/test_zonal.py
@@ -1900,6 +1900,78 @@ def test_crop_missing_zone_ids_raises():
         crop(raster, raster)
 
 
+# ---------------------------------------------------------------------------
+# Regression tests for #2528: dask backend must not silently drop 'majority'
+# from the requested stats list.
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.skipif(not has_dask_array(), reason="dask.array not available")
+def test_stats_dask_explicit_majority_raises_2528():
+    """Explicit 'majority' on dask must raise instead of being silently dropped."""
+    zones_data = np.array([[1, 1, 2, 2], [1, 1, 2, 2]], dtype=float)
+    values_data = np.array([[1.0, 1, 2, 2], [3, 3, 5, 5]])
+
+    zones = xr.DataArray(da.from_array(zones_data, chunks=(2, 2)), dims=['y', 'x'])
+    values = xr.DataArray(da.from_array(values_data, chunks=(2, 2)), dims=['y', 'x'])
+
+    with pytest.raises(ValueError, match="majority"):
+        stats(zones=zones, values=values, stats_funcs=['mean', 'majority'])
+
+    # Single-stat majority request also raises.
+    with pytest.raises(ValueError, match="majority"):
+        stats(zones=zones, values=values, stats_funcs=['majority'])
+
+
+@pytest.mark.skipif(not has_dask_array(), reason="dask.array not available")
+def test_stats_dask_default_omits_majority_2528():
+    """Bare stats() on dask should resolve to the dask default (no majority).
+
+    Regression for #2528: the previous mutable default included 'majority'
+    and the dask path silently dropped it.  After the fix the dask default
+    resolves to the supported subset, so the result columns match the
+    documented contract and no stat is silently filtered.
+    """
+    zones_data = np.array([[1, 1, 2, 2], [1, 1, 2, 2]], dtype=float)
+    values_data = np.array([[1.0, 1, 2, 2], [3, 3, 5, 5]])
+
+    zones = xr.DataArray(da.from_array(zones_data, chunks=(2, 2)), dims=['y', 'x'])
+    values = xr.DataArray(da.from_array(values_data, chunks=(2, 2)), dims=['y', 'x'])
+
+    df = stats(zones=zones, values=values).compute()
+    expected_cols = ['zone', 'mean', 'max', 'min', 'sum', 'std', 'var', 'count']
+    assert df.columns.tolist() == expected_cols
+    assert 'majority' not in df.columns
+
+
+def test_stats_numpy_default_includes_majority_2528():
+    """Bare stats() on numpy keeps 'majority' in the resolved default list."""
+    zones = xr.DataArray(np.array([[1, 1, 2, 2], [1, 1, 2, 2]], dtype=float),
+                         dims=['y', 'x'])
+    values = xr.DataArray(np.array([[1.0, 1, 2, 2], [3, 3, 5, 5]]),
+                          dims=['y', 'x'])
+
+    df = stats(zones=zones, values=values)
+    assert 'majority' in df.columns
+
+
+def test_stats_default_list_is_not_mutated_2528():
+    """Repeated calls with the default must not accumulate state.
+
+    The previous implementation used a mutable list literal as the default
+    argument.  After the switch to ``stats_funcs=None``, the resolved list
+    is freshly constructed each call, so a caller that mutates it locally
+    cannot leak state into the next caller.
+    """
+    zones = xr.DataArray(np.array([[1, 1], [2, 2]], dtype=float), dims=['y', 'x'])
+    values = xr.DataArray(np.array([[1.0, 2.0], [3.0, 4.0]]), dims=['y', 'x'])
+
+    df1 = stats(zones=zones, values=values)
+    df2 = stats(zones=zones, values=values)
+    assert df1.columns.tolist() == df2.columns.tolist()
+    assert 'majority' in df1.columns and 'majority' in df2.columns
+
+
 # ---------------------------------------------------------------------------
 # Regression tests for #881: np.unique / np.isfinite must not materialise
 # the full dask array.

diff --git a/xrspatial/zonal.py b/xrspatial/zonal.py
@@ -41,6 +41,16 @@ class cupy(object):
 
 TOTAL_COUNT = '_total_count'
 
+_DEFAULT_STATS_NUMPY = [
+    "mean", "max", "min", "sum", "std", "var", "count", "majority",
+]
+
+# 'majority' cannot be computed block-by-block, so it is omitted from the
+# dask default list and rejected when explicitly requested on a dask input.
+_DEFAULT_STATS_DASK = [
+    "mean", "max", "min", "sum", "std", "var", "count",
+]
+
 
 def _maybe_rasterize_zones(zones, values, column=None, rasterize_kw=None):
     """If *zones* is vector data, rasterize it using *values* as the template.
@@ -634,16 +644,7 @@ def stats(
     zones,
     values: xr.DataArray,
     zone_ids: Optional[List[Union[int, float]]] = None,
-    stats_funcs: Union[Dict, List] = [
-        "mean",
-        "max",
-        "min",
-        "sum",
-        "std",
-        "var",
-        "count",
-        "majority",
-    ],
+    stats_funcs: Optional[Union[Dict, List]] = None,
     nodata_values: Union[int, float] = None,
     return_type: str = 'pandas.DataFrame',
     column: Optional[str] = None,
@@ -688,16 +689,19 @@ def stats(
         List of zones to be included in calculation. If no zone_ids provided,
         all zones will be used.
 
-    stats_funcs : dict, or list of strings, default=['mean', 'max', 'min',
-        'sum', 'std', 'var', 'count', 'majority']
-        The statistics to calculate for each zone. If a list, possible
+    stats_funcs : dict, or list of strings, optional
+        The statistics to calculate for each zone.  If a list, possible
         choices are subsets of the default options.
         In the dictionary case, all of its values must be
         callable. Function takes only one argument that is the `values` raster.
-        The key become the column name in the output DataFrame.
-        Note that if `zones` and `values` are dask backed DataArrays,
-        `stats_funcs` must be provided as a list that is a subset of
-        default supported stats.
+        The key becomes the column name in the output DataFrame.
+        Defaults: ``['mean', 'max', 'min', 'sum', 'std', 'var', 'count',
+        'majority']`` for numpy/cupy and ``['mean', 'max', 'min', 'sum',
+        'std', 'var', 'count']`` for dask-backed inputs.  ``'majority'``
+        cannot be computed block-by-block so requesting it on a dask
+        input raises ``ValueError`` instead of being silently dropped.
+        Note that if `zones` and `values` are dask-backed DataArrays,
+        `stats_funcs` must be provided as a list (or left unset).
 
     nodata_values: int, float, default=None
         Nodata value in `values` raster.
@@ -844,14 +848,33 @@ def stats(
 
     validate_arrays(zones, values)
 
+    is_dask_values = has_dask_array() and isinstance(values.data, da.Array)
+
+    # Resolve the default stats_funcs based on backend. The dask path cannot
+    # compute 'majority' block-by-block, so its default list omits it.  Using
+    # None as the sentinel default also avoids the mutable-default pitfall.
+    if stats_funcs is None:
+        stats_funcs = (
+            list(_DEFAULT_STATS_DASK) if is_dask_values
+            else list(_DEFAULT_STATS_NUMPY)
+        )
+
     # validate stats_funcs
-    if has_dask_array() and isinstance(values.data, da.Array) and not isinstance(stats_funcs, list):
+    if is_dask_values and not isinstance(stats_funcs, list):
         raise ValueError(
             "Got dask-backed DataArray as `values` aggregate. "
-            "`stats_funcs` must be a subset of default supported stats "
-            "`[\'mean\', \'max\', \'min\', \'sum\', \'std\', \'var\', \'count\']`"
+            "`stats_funcs` must be a list that is a subset of "
+            f"{_DEFAULT_STATS_DASK!r}."
         )
 
+    if is_dask_values and isinstance(stats_funcs, list):
+        unsupported = [s for s in stats_funcs if s not in _DEFAULT_STATS_DASK]
+        if unsupported:
+            raise ValueError(
+                f"stats_funcs={unsupported!r} not supported on dask-backed "
+                f"input.  Supported on dask: {_DEFAULT_STATS_DASK!r}."
+            )
+
     if isinstance(stats_funcs, list):
         # create a dict of stats
         stats_funcs_dict = {}