Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 13 additions & 10 deletions xrspatial/tests/test_zonal.py
Original file line number Diff line number Diff line change
Expand Up @@ -626,9 +626,9 @@ def test_stats_all_nan_zone_preserved(backend):
'min': [1.0, 3.0, 9.0, np.nan],
'max': [5.0, 7.0, 10.0, np.nan],
'mean': [(1.0 + 2.0 + 5.0) / 3, (3.0 + 4.0 + 7.0) / 3, 9.5, np.nan],
# numpy's _calc_stats leaves an all-NaN zone at NaN for every stat,
# including count (the func is only called when len(zone_values) > 0).
'count': [3, 3, 2, np.nan],
# An empty zone (zone 5, all NaN) reports count == 0 because the
# number of valid cells is zero. Other stats stay NaN (issue #2644).
'count': [3, 3, 2, 0],
}
check_results(backend, df_result, expected_result)

Expand Down Expand Up @@ -800,9 +800,10 @@ def test_zonal_stats_against_qgis(elevation_raster_no_nans, raster, qgis_zonal_s
def test_stats_all_nan_zone(backend):
"""Zone where every value is NaN should not crash.

All backends now agree: the empty zone stays in the output with NaN
stats. The cupy path used to drop the zone (issue #2562) — that is
now fixed and the per-backend branching is gone.
All backends agree: the empty zone stays in the output. Its count is
0 (zero valid cells) while every other stat is NaN (issue #2644). The
cupy path used to drop the zone entirely (issue #2562); that is fixed
and the per-backend branching is gone.
"""
if 'cupy' in backend and not has_cuda_and_cupy():
pytest.skip("Requires CUDA and CuPy")
Expand All @@ -826,7 +827,7 @@ def test_stats_all_nan_zone(backend):
'max': [np.nan, 7.0],
'min': [np.nan, 5.0],
'sum': [np.nan, 12.0],
'count': [np.nan, 2],
'count': [0, 2],
}
check_results(backend, df_result, expected)

Expand Down Expand Up @@ -897,9 +898,11 @@ def test_stats_negative_zone_ids(backend):
@pytest.mark.filterwarnings("ignore:Degrees of freedom:RuntimeWarning")
@pytest.mark.parametrize("backend", ['numpy', 'dask+numpy', 'cupy', 'dask+cupy'])
def test_stats_nodata_wipes_zone(backend):
"""When nodata_values filters out every finite value in a zone, stats are NaN.
"""When nodata_values filters out every finite value in a zone, the zone
is empty: count is 0 and the other stats are NaN.

All backends now agree after the issue #2562 fix.
All backends agree after the issue #2562 fix. The empty-zone count is 0
rather than NaN per issue #2644.
"""
if 'cupy' in backend and not has_cuda_and_cupy():
pytest.skip("Requires CUDA and CuPy")
Expand All @@ -923,7 +926,7 @@ def test_stats_nodata_wipes_zone(backend):
'max': [np.nan, 7.0],
'min': [np.nan, 3.0],
'sum': [np.nan, 10.0],
'count': [np.nan, 2],
'count': [0, 2],
}
check_results(backend, df_result, expected)

Expand Down
50 changes: 44 additions & 6 deletions xrspatial/zonal.py
Original file line number Diff line number Diff line change
Expand Up @@ -223,11 +223,23 @@ def _nanreduce_preserve_allnan(blocks, func):
return result


def _count_reduce(blocks):
"""Sum per-block counts. An empty zone (NaN in every block) totals 0.

Unlike the other reducers, count does not preserve all-NaN as NaN: an
empty zone has zero valid cells, so its count is 0, not undefined.
"""
import warnings
with warnings.catch_warnings():
warnings.simplefilter('ignore', RuntimeWarning)
return np.nansum(blocks, axis=0)


_DASK_STATS = dict(
max=lambda blocks: _nanreduce_preserve_allnan(blocks, np.nanmax),
min=lambda blocks: _nanreduce_preserve_allnan(blocks, np.nanmin),
sum=lambda blocks: _nanreduce_preserve_allnan(blocks, np.nansum),
count=lambda blocks: _nanreduce_preserve_allnan(blocks, np.nansum),
count=_count_reduce,
sum_squares=lambda blocks: _nanreduce_preserve_allnan(blocks, np.nansum),
)

Expand Down Expand Up @@ -319,16 +331,28 @@ def _sort_and_stride(zones, values, unique_zones):
return sorted_indices, values_by_zones, zone_breaks


def _empty_zone_value(stat_name: str) -> float:
# 'count' is a cardinality: an empty zone has zero valid cells, so its
# count is 0. Every other built-in stat (and any custom callable) is
# undefined over no values, so it stays NaN.
return 0.0 if stat_name == 'count' else np.nan


def _calc_stats(
values_by_zones: np.array,
zone_breaks: np.array,
unique_zones: np.array,
zone_ids: np.array,
func: Callable,
nodata_values: Union[int, float] = None,
empty_zone_value: float = np.nan,
):
# An "empty" zone exists in the zones raster but has no valid values
# (all NaN, or all equal to nodata_values). Most stats leave NaN there
# (empty_zone_value defaults to NaN), but count passes 0 because the
# count of no values is a cardinality of zero, not undefined.
start = 0
results = np.full(unique_zones.shape, np.nan)
results = np.full(unique_zones.shape, empty_zone_value, dtype=np.float64)
for i in range(len(unique_zones)):
end = zone_breaks[i]
if unique_zones[i] in zone_ids:
Expand Down Expand Up @@ -505,7 +529,8 @@ def _stats_numpy(
func = stats_funcs.get(stats)
stats_dict[stats] = _calc_stats(
values_by_zones, zone_breaks,
unique_zones, zone_ids, func, nodata_values
unique_zones, zone_ids, func, nodata_values,
empty_zone_value=_empty_zone_value(stats),
)
stats_dict[stats] = stats_dict[stats][selected_indexes]
result = pd.DataFrame(stats_dict)
Expand All @@ -519,7 +544,8 @@ def _stats_numpy(
func = stats_funcs.get(stats)
stats_results = _calc_stats(
values_by_zones, zone_breaks,
unique_zones, zone_ids, func, nodata_values
unique_zones, zone_ids, func, nodata_values,
empty_zone_value=_empty_zone_value(stats),
)
for zone in zone_ids:
iz = zone_ids_map[zone] # position of zone in unique_zones
Expand Down Expand Up @@ -604,7 +630,8 @@ def _stats_cupy(

# extract zone_values, then filter per-zone for non-finite values
# and the nodata sentinel. If the zone has no valid values left,
# emit NaN for every stat instead of dropping the zone.
# emit the empty-zone value for every stat instead of dropping the
# zone: 0 for count (a cardinality), NaN for everything else.
zone_values = values_by_zone[unique_index[i]:unique_index[i]+unique_counts[i]]
zone_mask = cupy.isfinite(zone_values)
if nodata_values is not None:
Expand All @@ -613,7 +640,7 @@ def _stats_cupy(

if zone_values.size == 0:
for stats in stats_funcs:
stats_dict[stats].append(float('nan'))
stats_dict[stats].append(_empty_zone_value(stats))
continue

# apply stats on the zone data
Expand Down Expand Up @@ -741,6 +768,17 @@ def stats(
Extra keyword arguments forwarded to ``rasterize()`` when
*zones* is vector input (e.g. ``{'all_touched': True}``).

Notes
-----
Empty zones. A zone that exists in ``zones`` but has no valid values
(every cell is NaN, or every cell equals ``nodata_values``) still
appears as a row in the output. For such a zone, ``count`` is ``0``
because the number of valid cells is zero. Every other statistic
(``mean``, ``min``, ``max``, ``sum``, ``std``, ``var``, ``majority``,
and any custom callable) is ``NaN``, since those values are undefined
over an empty set. This holds across the numpy, cupy, and dask
backends.

Returns
-------
stats_df : Union[pandas.DataFrame, dask.dataframe.DataFrame]
Expand Down
Loading