diff --git a/xrspatial/tests/test_zonal.py b/xrspatial/tests/test_zonal.py index 764b1d9a4..9bf63380e 100644 --- a/xrspatial/tests/test_zonal.py +++ b/xrspatial/tests/test_zonal.py @@ -626,9 +626,9 @@ def test_stats_all_nan_zone_preserved(backend): 'min': [1.0, 3.0, 9.0, np.nan], 'max': [5.0, 7.0, 10.0, np.nan], 'mean': [(1.0 + 2.0 + 5.0) / 3, (3.0 + 4.0 + 7.0) / 3, 9.5, np.nan], - # numpy's _calc_stats leaves an all-NaN zone at NaN for every stat, - # including count (the func is only called when len(zone_values) > 0). - 'count': [3, 3, 2, np.nan], + # An empty zone (zone 5, all NaN) reports count == 0 because the + # number of valid cells is zero. Other stats stay NaN (issue #2644). + 'count': [3, 3, 2, 0], } check_results(backend, df_result, expected_result) @@ -800,9 +800,10 @@ def test_zonal_stats_against_qgis(elevation_raster_no_nans, raster, qgis_zonal_s def test_stats_all_nan_zone(backend): """Zone where every value is NaN should not crash. - All backends now agree: the empty zone stays in the output with NaN - stats. The cupy path used to drop the zone (issue #2562) — that is - now fixed and the per-backend branching is gone. + All backends agree: the empty zone stays in the output. Its count is + 0 (zero valid cells) while every other stat is NaN (issue #2644). The + cupy path used to drop the zone entirely (issue #2562); that is fixed + and the per-backend branching is gone. """ if 'cupy' in backend and not has_cuda_and_cupy(): pytest.skip("Requires CUDA and CuPy") @@ -826,7 +827,7 @@ def test_stats_all_nan_zone(backend): 'max': [np.nan, 7.0], 'min': [np.nan, 5.0], 'sum': [np.nan, 12.0], - 'count': [np.nan, 2], + 'count': [0, 2], } check_results(backend, df_result, expected) @@ -897,9 +898,11 @@ def test_stats_negative_zone_ids(backend): @pytest.mark.filterwarnings("ignore:Degrees of freedom:RuntimeWarning") @pytest.mark.parametrize("backend", ['numpy', 'dask+numpy', 'cupy', 'dask+cupy']) def test_stats_nodata_wipes_zone(backend): - """When nodata_values filters out every finite value in a zone, stats are NaN. + """When nodata_values filters out every finite value in a zone, the zone + is empty: count is 0 and the other stats are NaN. - All backends now agree after the issue #2562 fix. + All backends agree after the issue #2562 fix. The empty-zone count is 0 + rather than NaN per issue #2644. """ if 'cupy' in backend and not has_cuda_and_cupy(): pytest.skip("Requires CUDA and CuPy") @@ -923,7 +926,7 @@ def test_stats_nodata_wipes_zone(backend): 'max': [np.nan, 7.0], 'min': [np.nan, 3.0], 'sum': [np.nan, 10.0], - 'count': [np.nan, 2], + 'count': [0, 2], } check_results(backend, df_result, expected) diff --git a/xrspatial/zonal.py b/xrspatial/zonal.py index 18f864e88..4e1b7883c 100644 --- a/xrspatial/zonal.py +++ b/xrspatial/zonal.py @@ -223,11 +223,23 @@ def _nanreduce_preserve_allnan(blocks, func): return result +def _count_reduce(blocks): + """Sum per-block counts. An empty zone (NaN in every block) totals 0. + + Unlike the other reducers, count does not preserve all-NaN as NaN: an + empty zone has zero valid cells, so its count is 0, not undefined. + """ + import warnings + with warnings.catch_warnings(): + warnings.simplefilter('ignore', RuntimeWarning) + return np.nansum(blocks, axis=0) + + _DASK_STATS = dict( max=lambda blocks: _nanreduce_preserve_allnan(blocks, np.nanmax), min=lambda blocks: _nanreduce_preserve_allnan(blocks, np.nanmin), sum=lambda blocks: _nanreduce_preserve_allnan(blocks, np.nansum), - count=lambda blocks: _nanreduce_preserve_allnan(blocks, np.nansum), + count=_count_reduce, sum_squares=lambda blocks: _nanreduce_preserve_allnan(blocks, np.nansum), ) @@ -319,6 +331,13 @@ def _sort_and_stride(zones, values, unique_zones): return sorted_indices, values_by_zones, zone_breaks +def _empty_zone_value(stat_name: str) -> float: + # 'count' is a cardinality: an empty zone has zero valid cells, so its + # count is 0. Every other built-in stat (and any custom callable) is + # undefined over no values, so it stays NaN. + return 0.0 if stat_name == 'count' else np.nan + + def _calc_stats( values_by_zones: np.array, zone_breaks: np.array, @@ -326,9 +345,14 @@ def _calc_stats( zone_ids: np.array, func: Callable, nodata_values: Union[int, float] = None, + empty_zone_value: float = np.nan, ): + # An "empty" zone exists in the zones raster but has no valid values + # (all NaN, or all equal to nodata_values). Most stats leave NaN there + # (empty_zone_value defaults to NaN), but count passes 0 because the + # count of no values is a cardinality of zero, not undefined. start = 0 - results = np.full(unique_zones.shape, np.nan) + results = np.full(unique_zones.shape, empty_zone_value, dtype=np.float64) for i in range(len(unique_zones)): end = zone_breaks[i] if unique_zones[i] in zone_ids: @@ -505,7 +529,8 @@ def _stats_numpy( func = stats_funcs.get(stats) stats_dict[stats] = _calc_stats( values_by_zones, zone_breaks, - unique_zones, zone_ids, func, nodata_values + unique_zones, zone_ids, func, nodata_values, + empty_zone_value=_empty_zone_value(stats), ) stats_dict[stats] = stats_dict[stats][selected_indexes] result = pd.DataFrame(stats_dict) @@ -519,7 +544,8 @@ def _stats_numpy( func = stats_funcs.get(stats) stats_results = _calc_stats( values_by_zones, zone_breaks, - unique_zones, zone_ids, func, nodata_values + unique_zones, zone_ids, func, nodata_values, + empty_zone_value=_empty_zone_value(stats), ) for zone in zone_ids: iz = zone_ids_map[zone] # position of zone in unique_zones @@ -604,7 +630,8 @@ def _stats_cupy( # extract zone_values, then filter per-zone for non-finite values # and the nodata sentinel. If the zone has no valid values left, - # emit NaN for every stat instead of dropping the zone. + # emit the empty-zone value for every stat instead of dropping the + # zone: 0 for count (a cardinality), NaN for everything else. zone_values = values_by_zone[unique_index[i]:unique_index[i]+unique_counts[i]] zone_mask = cupy.isfinite(zone_values) if nodata_values is not None: @@ -613,7 +640,7 @@ def _stats_cupy( if zone_values.size == 0: for stats in stats_funcs: - stats_dict[stats].append(float('nan')) + stats_dict[stats].append(_empty_zone_value(stats)) continue # apply stats on the zone data @@ -741,6 +768,17 @@ def stats( Extra keyword arguments forwarded to ``rasterize()`` when *zones* is vector input (e.g. ``{'all_touched': True}``). + Notes + ----- + Empty zones. A zone that exists in ``zones`` but has no valid values + (every cell is NaN, or every cell equals ``nodata_values``) still + appears as a row in the output. For such a zone, ``count`` is ``0`` + because the number of valid cells is zero. Every other statistic + (``mean``, ``min``, ``max``, ``sum``, ``std``, ``var``, ``majority``, + and any custom callable) is ``NaN``, since those values are undefined + over an empty set. This holds across the numpy, cupy, and dask + backends. + Returns ------- stats_df : Union[pandas.DataFrame, dask.dataframe.DataFrame]