diff --git a/xrspatial/geotiff/__init__.py b/xrspatial/geotiff/__init__.py index 31c54f86c..60167b5d3 100644 --- a/xrspatial/geotiff/__init__.py +++ b/xrspatial/geotiff/__init__.py @@ -489,9 +489,13 @@ def open_geotiff(source: str | BinaryIO, *, ``on_gpu_failure='strict'`` is also set. No cross-backend numerical parity claim outside the Tier 1 codec set. max_pixels : int or None - [stable] Maximum allowed pixel count (width * height * - samples). None uses the default (~1 billion). Raise to read - legitimately large files. + [stable] Maximum allowed pixel count per materialised buffer. + Without ``chunks=`` the cap bounds the full windowed region + (width * height * samples); with ``chunks=`` the cap bounds + each chunk's decode buffer instead, so a small ``max_pixels`` + no longer rejects a large lazy raster up front. None uses the + default (~1 billion). Raise it to read legitimately large + files. max_cloud_bytes : int or None, optional [advanced] fsspec cloud reads can run up cost on large objects; the budget defends against accidental large downloads but the diff --git a/xrspatial/geotiff/_backends/dask.py b/xrspatial/geotiff/_backends/dask.py index d21b65ee9..868c90cdc 100644 --- a/xrspatial/geotiff/_backends/dask.py +++ b/xrspatial/geotiff/_backends/dask.py @@ -85,12 +85,12 @@ def read_geotiff_dask(source: str, *, multi-band files, 2D for single-band). Selecting a single band produces a 2D DataArray. max_pixels : int or None - [stable] Maximum allowed pixel count (width * height * - samples) for the windowed region. None uses the reader default - (~1 billion). The cap is checked once up-front against the - lazy region; each chunk task also re-checks against - ``max_pixels`` so windowed reads stay bounded even when - ``read_to_array`` is invoked directly. + [stable] Maximum allowed pixel count for a single materialised + chunk buffer (chunk_h * chunk_w * samples). None uses the + reader default (~1 billion). The full windowed region can + exceed this cap; only per-chunk decodes are bounded. To bound + the whole region instead, call ``read_to_array`` directly (or + ``open_geotiff`` without ``chunks=``). name : str or None [stable] Name for the DataArray. band_nodata : {'first', None}, optional @@ -468,26 +468,13 @@ def read_geotiff_dask(source: str, *, raise IndexError( f"band={band} out of range for {n_bands}-band file.") - # Up-front pixel-count guard against the windowed extent. Chunk - # tasks re-check via read_to_array's own ``max_pixels`` (which we - # forward through ``_delayed_read_window``), but catching an - # oversized request before any task is scheduled saves the caller - # from a misleading "tile size exceeds max_pixels" error in a - # chunk that happens to align with the file's tile grid. - # ``max_pixels=None`` substitutes the module default to match the - # eager (``read_to_array``) and VRT chunked paths. Without the - # substitution the guard would skip entirely on ``None`` and a - # caller could build a lazy graph over a region far larger than the - # documented safety cap. - from .._reader import MAX_PIXELS_DEFAULT as _MAX_PIXELS_DEFAULT - effective_max_pixels = (max_pixels if max_pixels is not None - else _MAX_PIXELS_DEFAULT) - eff_bands = (1 if band is not None - else (n_bands if n_bands > 0 else 1)) - if full_h * full_w * eff_bands > effective_max_pixels: - raise ValueError( - f"Requested region {full_h}x{full_w}x{eff_bands} " - f"exceeds max_pixels={effective_max_pixels:,}.") + # ``max_pixels`` bounds each chunk's materialised buffer, not the + # full windowed region. The per-chunk cap is enforced inside + # ``_delayed_read_window`` -> ``_read_to_array`` -> ``_check_dimensions`` + # against the chunk's output window. The eager (no-``chunks``) path + # still applies the cap to the full image, so callers who want the + # old semantics can drop ``chunks=`` to get a single-shot decode + # under the same limit. if name is None: import os diff --git a/xrspatial/geotiff/_backends/gpu.py b/xrspatial/geotiff/_backends/gpu.py index bd97c9e50..7db2f11bb 100644 --- a/xrspatial/geotiff/_backends/gpu.py +++ b/xrspatial/geotiff/_backends/gpu.py @@ -140,8 +140,12 @@ def read_geotiff_gpu(source: str, *, name : str or None [experimental] Name for the DataArray. max_pixels : int or None - [experimental] Maximum allowed pixel count - (width * height * samples). None uses the default (~1 billion). + [experimental] Maximum allowed pixel count per materialised + buffer. With ``chunks=None`` it bounds the full image + (width * height * samples); with ``chunks=`` it bounds the + per-chunk decode buffer instead so chunked reads of large + rasters do not need to widen the cap to the full file. None + uses the default (~1 billion). on_gpu_failure : {'auto', 'strict'}, default 'auto' [experimental] Behaviour when any GPU decode stage raises an exception. @@ -1485,7 +1489,11 @@ def _read_geotiff_gpu_chunked_gds(source, ifd, geo_info, header, *, offsets = list(ifd.tile_offsets) byte_counts = list(ifd.tile_byte_counts) - _check_dimensions(full_w, full_h, samples, max_pixels) + # Per-TIFF-tile guard: hostile-input defense against a forged + # tile-width / tile-length tag, independent of the chunk-scoped + # ``max_pixels`` contract below. A real chunk task allocates a + # buffer the chunk's shape, not a single tile, so the chunk-extent + # guard further down is what bounds peak GPU memory per task. _check_dimensions(tw, th, samples, max_pixels) validate_tile_layout(ifd) @@ -1532,6 +1540,12 @@ def _read_geotiff_gpu_chunked_gds(source, ifd, geo_info, header, *, if ch_h <= 0 or ch_w <= 0: raise ValueError(f"Invalid chunks: {chunks}") + # Chunk-extent guard: each ``_chunk_task`` allocates a + # ``(ch_h, ch_w, samples)`` buffer on the GPU. Cap that against + # ``max_pixels`` to match the per-chunk semantics the CPU dask + # path enforces inside ``_read_to_array``. + _check_dimensions(ch_w, ch_h, samples, max_pixels) + # Validate band kwarg against the file's band count. n_bands_out = samples if samples > 1 else 0 if band is not None: diff --git a/xrspatial/geotiff/tests/integration/test_dask_pipeline.py b/xrspatial/geotiff/tests/integration/test_dask_pipeline.py index 927756474..acce7e03f 100644 --- a/xrspatial/geotiff/tests/integration/test_dask_pipeline.py +++ b/xrspatial/geotiff/tests/integration/test_dask_pipeline.py @@ -252,22 +252,36 @@ def _write_oversized_dask_max_pixels_default_guard(path, *, h: int, w: int) -> N path.write_bytes(bytes(raw)) -def test_default_max_pixels_guard_fires_for_full_region(tmp_path): - """``max_pixels=None`` must apply the module default cap at the - up-front region guard, matching the eager / VRT paths. +def test_default_max_pixels_guard_does_not_fire_up_front(tmp_path): + """The dask path no longer rejects an oversized full image at graph- + build time (#2501). The cap is now per-chunk, so a forged + multi-billion-pixel header builds a lazy graph without erroring; + each chunk task decodes against ``max_pixels`` separately. """ - path = tmp_path / "tmp_1838_oversized.tif" + path = tmp_path / "tmp_2501_oversized.tif" side = int((MAX_PIXELS_DEFAULT ** 0.5)) + 2 _write_oversized_dask_max_pixels_default_guard(path, h=side, w=side) - with pytest.raises(ValueError, match=r"max_pixels"): - read_geotiff_dask(str(path)) + # The graph builds; the up-front guard is gone. Chunk-level decode + # may still fail when a task actually runs, but that is a separate + # path. ``read_geotiff_dask(...)`` itself returns a DataArray. + da = read_geotiff_dask(str(path)) + assert da.shape == (side, side) -def test_explicit_max_pixels_still_enforced(tmp_path): - path = tmp_path / "tmp_1838_explicit_cap.tif" - _write_oversized_dask_max_pixels_default_guard(path, h=2048, w=2048) - with pytest.raises(ValueError, match=r"max_pixels"): - read_geotiff_dask(str(path), max_pixels=1024) +def test_explicit_max_pixels_enforced_per_chunk(tmp_path): + """Per-chunk decode honours ``max_pixels`` even when the full image + would have fit under it (#2501). Build a real 64x64 file, request + chunks of 32, and set ``max_pixels`` below the per-chunk cost. + """ + arr = np.arange(64 * 64, dtype=np.uint8).reshape(64, 64) + path = tmp_path / "tmp_2501_per_chunk_cap.tif" + tifffile_dask_max_pixels_default_guard.imwrite( + str(path), arr, tile=(16, 16), + photometric="minisblack", compression="none") + # 32x32 chunk = 1024 pixels; cap is 100. Graph builds, compute fails. + da = read_geotiff_dask(str(path), chunks=32, max_pixels=100) + with pytest.raises(ValueError, match="exceed the safety limit"): + da.compute() def test_small_region_unaffected(tmp_path): diff --git a/xrspatial/geotiff/tests/parity/test_pixel_equality.py b/xrspatial/geotiff/tests/parity/test_pixel_equality.py index 0ffcb31f6..4967280b8 100644 --- a/xrspatial/geotiff/tests/parity/test_pixel_equality.py +++ b/xrspatial/geotiff/tests/parity/test_pixel_equality.py @@ -479,11 +479,49 @@ def test_read_geotiff_dask_band_via_dispatcher(small_multiband_tiff_path): np.testing.assert_array_equal(da.values, arr[:, :, 2]) -def test_read_geotiff_dask_max_pixels_rejects_oversized(small_tiff_path): - """``max_pixels=`` rejects the windowed region up front.""" - path, _ = small_tiff_path - with pytest.raises(ValueError, match="exceeds max_pixels"): - read_geotiff_dask(path, chunks=2, max_pixels=10) +def test_read_geotiff_dask_max_pixels_bounds_chunk_not_full_image( + small_tiff_path): + """``max_pixels`` bounds each chunk, not the full lazy region (#2501). + + The 4x6 fixture is 24 pixels in total. With ``chunks=2`` each chunk + is at most 2x2 = 4 pixels, so ``max_pixels=10`` permits the read + even though the full image exceeds the cap. With ``chunks=4`` a + chunk reaches 4x4 = 16 pixels, which trips the per-chunk guard at + ``.compute()`` time. + """ + path, arr = small_tiff_path + + # Per-chunk cap is satisfied; full image is not. Should succeed. + da = read_geotiff_dask(path, chunks=2, max_pixels=10) + np.testing.assert_array_equal(da.values, arr) + + # Per-chunk cap is exceeded. The graph builds, but the chunk task + # raises the safety-limit error on compute. + da = read_geotiff_dask(path, chunks=4, max_pixels=10) + with pytest.raises(ValueError, match="exceed the safety limit"): + da.compute() + + +def test_read_geotiff_dask_max_pixels_chunk_includes_band_count( + small_multiband_tiff_path): + """Multi-band: per-chunk cap is ``chunk_h * chunk_w * samples`` (#2501). + + The 4x6x3 fixture is 72 pixels. With ``chunks=2`` each chunk + materialises 2x2x3 = 12 pixels, fitting under ``max_pixels=20`` + even though the full image (72) does not. With ``chunks=4`` the + largest chunk is 4x4x3 = 48 pixels and the per-chunk decode trips + the cap on compute. + """ + path, arr = small_multiband_tiff_path + + # Per-chunk cap satisfied (12 px <= 20) but full image (72) is not. + da = read_geotiff_dask(path, chunks=2, max_pixels=20) + np.testing.assert_array_equal(da.values, arr) + + # Per-chunk cap exceeded (48 px > 20). Graph builds, compute raises. + da = read_geotiff_dask(path, chunks=4, max_pixels=20) + with pytest.raises(ValueError, match="exceed the safety limit"): + da.compute() def test_read_geotiff_dask_window_band_combined(small_multiband_tiff_path): diff --git a/xrspatial/geotiff/tests/test_security.py b/xrspatial/geotiff/tests/test_security.py index c117347e5..d1b2d253a 100644 --- a/xrspatial/geotiff/tests/test_security.py +++ b/xrspatial/geotiff/tests/test_security.py @@ -131,6 +131,38 @@ def test_open_geotiff_max_pixels(self, tmp_path): with pytest.raises(ValueError, match="exceed the safety limit"): open_geotiff(path, max_pixels=10) + def test_open_geotiff_max_pixels_chunked_bounds_chunk(self, tmp_path): + """``open_geotiff(chunks=...)`` scopes max_pixels to the chunk (#2501). + + A 6x6 image is 36 pixels. ``max_pixels=10`` would reject the eager + read, but ``chunks=2`` keeps each materialised buffer at 2x2=4 + pixels, well under the cap. The cap still fires when a chunk + actually exceeds it. + """ + from xrspatial.geotiff import open_geotiff + + expected = np.arange(36, dtype=np.float32).reshape(6, 6) + data = make_minimal_tiff(6, 6, np.dtype('float32'), + pixel_data=expected) + path = str(tmp_path / "small_2501_chunked.tif") + with open(path, 'wb') as f: + f.write(data) + + # Eager path still rejects: full image > max_pixels. + with pytest.raises(ValueError, match="exceed the safety limit"): + open_geotiff(path, max_pixels=10) + + # Dask path with chunks small enough to fit: succeeds and + # round-trips the values. + da = open_geotiff(path, chunks=2, max_pixels=10) + np.testing.assert_array_equal(da.values, expected) + + # Dask path with chunks too large for the cap: per-chunk guard + # fires at compute time. + da = open_geotiff(path, chunks=4, max_pixels=10) + with pytest.raises(ValueError, match="exceed the safety limit"): + da.compute() + # --------------------------------------------------------------------------- # Cat 1c: Tile dimension guard diff --git a/xrspatial/geotiff/tests/unit/test_signatures.py b/xrspatial/geotiff/tests/unit/test_signatures.py index a599f74bb..cf69efdc1 100644 --- a/xrspatial/geotiff/tests/unit/test_signatures.py +++ b/xrspatial/geotiff/tests/unit/test_signatures.py @@ -2077,10 +2077,20 @@ def test_read_geotiff_gpu_max_pixels_rejects_oversized(small_tiff_path): @requires_gpu def test_read_geotiff_gpu_chunks_max_pixels_rejects_oversized(small_tiff_path): - """Dask+GPU path also enforces ``max_pixels``.""" + """Dask+GPU path enforces ``max_pixels`` per chunk (#2501). + + The cap is now chunk-scoped, so a per-chunk allocation over + ``max_pixels`` raises either at graph-build (the GDS fast path's + per-tile and per-chunk checks against the 16x16 TIFF tile dim and + the 4x4 chunk shape) or on ``.compute()`` (the CPU-fallback path's + per-chunk ``_check_dimensions`` inside ``read_to_array``). Wrap + both calls in ``pytest.raises`` so either branch satisfies the + test depending on whether KvikIO is installed. + """ path, _ = small_tiff_path with pytest.raises(ValueError, match="safety limit|exceeds max_pixels"): - read_geotiff_gpu(path, chunks=4, max_pixels=10) + da = read_geotiff_gpu(path, chunks=4, max_pixels=10) + da.compute() def test_open_geotiff_chunks_name_flows_through(small_tiff_path):