diff --git a/.claude/sweep-security-state.csv b/.claude/sweep-security-state.csv index a7dcefba7..444b469bd 100644 --- a/.claude/sweep-security-state.csv +++ b/.claude/sweep-security-state.csv @@ -45,4 +45,4 @@ terrain,2026-05-03,1443,MEDIUM,1;3,,"Re-audit 2026-05-03. MEDIUM Cat 1 + Cat 3 f viewshed,2026-04-22,1229,HIGH,1,,"HIGH (fixed #1229): _viewshed_cpu allocated ~500 bytes/pixel of working memory (event_list 3*H*W*7*8 bytes + status_values/status_struct/idle + visibility_grid + lexsort temporary) with no guard. A 20000x20000 raster tried to allocate ~200 GB. Fixed by adding peak-memory guard mirroring the _viewshed_dask pattern (_available_memory_bytes() check, raises MemoryError with max_distance= hint). No other HIGH findings: dask path already guarded, _validate_raster is called, distance-sweep uses dtype=float64, _calc_dist_n_grad guards zero distance." visibility,2026-04-28,,,,,"Clean. line_of_sight (line 190) and cumulative_viewshed (line 259) call _validate_raster; visibility_frequency delegates. Cat 1: cumulative_viewshed allocates int32 accumulator (4 B/px) but delegates per-observer to viewshed() which has 500 B/px memory guard at viewshed.py:1523-1531; viewshed will fail first on oversize rasters. _bresenham_line (line 35) and _los_kernel (lines 112-143) bounded by transect length (<=W+H+1). Cat 2: int64 throughout, no int32 overflow path. Cat 3: divisions in _los_kernel guarded (D==0 in _fresnel_radius_1 line 87, distance[i]==0 continue line 133, total_dist>0 check line 123); NaN elevation at observer cell would taint los_height but is a correctness not DoS concern. Cat 4: no CUDA kernels. Cat 5: no file I/O. Cat 6: elevations cast to float64 in _extract_transect line 79." worley,2026-04-28,,,,,"Clean. worley() calls _validate_raster at line 234 (Cat 6 OK). Cat 1: output allocation matches input agg.shape (np.empty_like at line 80, cupy.empty at line 174); not a width/height generator like bump, so unbounded alloc N/A. Cat 2: cell_x/cell_y use & 255 mask before perm-table indexing, no overflow risk; tid/block_size math bounded by hardware limits. Cat 3: no division by data-derived values; out.shape guards prevent zero-div in coordinate computation; no NaN read from input (pure noise generator). Cat 4 (PRIMARY): both @cuda.jit kernels (_worley_gpu line 99, _worley_gpu_xy line 135) have correct bounds guard 'if i < out.shape[0] and j < out.shape[1]'. cuda.shared.array(512, nb.int32) uses HARDCODED constant 512 (matches 256*2 perm table size), NOT derived from caller input — safe. cuda.syncthreads() called at line 110/147 between strided shared-mem write and reads. Each thread writes distinct sp[k] indices via 'range(tid, 512, block_size)', no race. All threads (incl. out-of-bounds) participate in the load loop before the bounds check, so syncthreads divergence is avoided. Cat 5: no file I/O. Minor: freq/seed not range-validated, _worley_numpy uses np.empty_like(data) which preserves int dtype if input is int (truncation). Functional, not security." -zonal,2026-04-22,1227,HIGH,1;2;6,,"HIGH (fixed #1227): _stats_cupy used `if nodata_values:` (truthy) so nodata_values=0 silently skipped the filter on the cupy backend, producing wrong stats vs every other backend. MEDIUM (unfixed): _strides uses np.int32 for stride indices — can wrap for arrays > ~2B elements in the numpy path. MEDIUM (unfixed): hypsometric_integral() skips _validate_raster on zones/values; _regions_numpy has no memory guard (numpy-only path, bounded by caller-allocated input). MEDIUM (unfixed): _stats_numpy return_type='xarray.DataArray' allocates np.full((n_stats, values.size)) with no guard." +zonal,2026-05-27,2523,HIGH,1;2;6,,"Re-audit 2026-05-27. HIGH Cat 1 (fixed #2523): _stats_numpy xarray.DataArray return path allocated np.full((n_stats, H*W), float64) with no memory guard; n_stats user-controlled via stats_funcs dict. Fixed by adding _check_stats_dataarray_memory helper that calls _available_memory_bytes() and raises MemoryError when n_stats*H*W*8 > 0.5*avail. Carry-over MEDIUMs still present (no new commits to zonal.py since 2026-04-22): _strides uses np.int32 stride indices (wraps at H*W > ~2.1B elements); hypsometric_integral() skips _validate_raster on zones/values (only validate_arrays for shape parity); _regions_numpy/_regions_cupy have no memory guard but allocations match input shape (bounded by caller). HIGH #1227 remains fixed. No CUDA bounds issues: _apply CUDA kernel has (y < zones.shape[0] and x < zones.shape[1]) guard. No file I/O beyond hardcoded /proc/meminfo read." diff --git a/xrspatial/tests/test_zonal.py b/xrspatial/tests/test_zonal.py index f790d2b6c..fbf0e65e7 100644 --- a/xrspatial/tests/test_zonal.py +++ b/xrspatial/tests/test_zonal.py @@ -1550,6 +1550,38 @@ def test_regions_dask_memory_guard(): _regions_dask(huge.data if hasattr(huge, 'data') else huge, 4) +def test_stats_dataarray_return_type_memory_guard_2523(): + """stats(return_type='xarray.DataArray') should refuse to allocate + an oversized (n_stats, H*W) float64 working buffer. + + Regression guard for issue #2523: the numpy backend's xarray.DataArray + return path allocated np.full((n_stats, values.size), nan) with no + memory check, scaling linearly with the user-supplied stats_funcs. + """ + from unittest.mock import patch + + zones_arr = np.array([[0, 0, 1, 1], [0, 0, 1, 1]], dtype=np.int32) + values_arr = np.array([[1.0, 2.0, 3.0, 4.0], [5.0, 6.0, 7.0, 8.0]], + dtype=np.float64) + zones = xr.DataArray(zones_arr) + values = xr.DataArray(values_arr) + + # Mock available memory to a tiny budget so the (n_stats, 8) buffer + # exceeds 50% of it. The default 8 stats * 8 cells * 8 bytes = 512 B; + # with avail = 100 B the guard must trip. + with patch('xrspatial.zonal._available_memory_bytes', return_value=100): + with pytest.raises(MemoryError, match="xarray.DataArray"): + stats(zones=zones, values=values, + return_type='xarray.DataArray') + + # Sanity: with the normal memory budget the same call succeeds. + out = stats(zones=zones, values=values, return_type='xarray.DataArray') + assert isinstance(out, xr.DataArray) + # n_stats=8 default, output shape = (8, *values.shape) + assert out.shape[0] == 8 + assert out.shape[1:] == values_arr.shape + + @pytest.mark.skipif(da is None, reason="dask not installed") def test_stats_dask_zone_filter(): """stats() with zone_ids filter should return only requested zones.""" diff --git a/xrspatial/zonal.py b/xrspatial/zonal.py index 9db5e3d6f..81faab5ea 100644 --- a/xrspatial/zonal.py +++ b/xrspatial/zonal.py @@ -502,6 +502,7 @@ def _stats_numpy( result = pd.DataFrame(stats_dict) else: + _check_stats_dataarray_memory(len(stats_funcs), values.shape) result = np.full((len(stats_funcs), values.size), np.nan) zone_ids_map = {z: i for i, z in enumerate(unique_zones) if z in zone_ids} stats_id = 0 @@ -2013,6 +2014,30 @@ def _available_memory_bytes(): return 2 * 1024 ** 3 +def _check_stats_dataarray_memory(n_stats, values_shape): + """Guard the (n_stats, H*W) float64 buffer in ``_stats_numpy``. + + The ``return_type='xarray.DataArray'`` branch allocates a same-shape + output replicated per requested statistic, so peak memory scales + linearly with ``len(stats_funcs)``. Refuse the request when the + buffer would exceed half of available RAM. + """ + n_cells = 1 + for s in values_shape: + n_cells *= int(s) + required = n_stats * n_cells * 8 # float64 + avail = _available_memory_bytes() + if required > 0.5 * avail: + raise MemoryError( + f"stats(return_type='xarray.DataArray') needs " + f"~{required / 1e9:.1f} GB for an " + f"({n_stats}, {n_cells}) float64 result buffer " + f"but only ~{avail / 1e9:.1f} GB is available. " + "Reduce `stats_funcs`, use a smaller raster, or call " + "stats(..., return_type='pandas.DataFrame') instead." + ) + + def _regions_dask(data, neighborhood): """Dask backend: compute to numpy and run scipy label.""" avail = _available_memory_bytes()