Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 7 additions & 3 deletions xrspatial/geotiff/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -489,9 +489,13 @@ def open_geotiff(source: str | BinaryIO, *,
``on_gpu_failure='strict'`` is also set. No cross-backend
numerical parity claim outside the Tier 1 codec set.
max_pixels : int or None
[stable] Maximum allowed pixel count (width * height *
samples). None uses the default (~1 billion). Raise to read
legitimately large files.
[stable] Maximum allowed pixel count per materialised buffer.
Without ``chunks=`` the cap bounds the full windowed region
(width * height * samples); with ``chunks=`` the cap bounds
each chunk's decode buffer instead, so a small ``max_pixels``
no longer rejects a large lazy raster up front. None uses the
default (~1 billion). Raise it to read legitimately large
files.
max_cloud_bytes : int or None, optional
[advanced] fsspec cloud reads can run up cost on large objects;
the budget defends against accidental large downloads but the
Expand Down
39 changes: 13 additions & 26 deletions xrspatial/geotiff/_backends/dask.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,12 +85,12 @@ def read_geotiff_dask(source: str, *,
multi-band files, 2D for single-band). Selecting a single band
produces a 2D DataArray.
max_pixels : int or None
[stable] Maximum allowed pixel count (width * height *
samples) for the windowed region. None uses the reader default
(~1 billion). The cap is checked once up-front against the
lazy region; each chunk task also re-checks against
``max_pixels`` so windowed reads stay bounded even when
``read_to_array`` is invoked directly.
[stable] Maximum allowed pixel count for a single materialised
chunk buffer (chunk_h * chunk_w * samples). None uses the
reader default (~1 billion). The full windowed region can
exceed this cap; only per-chunk decodes are bounded. To bound
the whole region instead, call ``read_to_array`` directly (or
``open_geotiff`` without ``chunks=``).
name : str or None
[stable] Name for the DataArray.
band_nodata : {'first', None}, optional
Expand Down Expand Up @@ -468,26 +468,13 @@ def read_geotiff_dask(source: str, *,
raise IndexError(
f"band={band} out of range for {n_bands}-band file.")

# Up-front pixel-count guard against the windowed extent. Chunk
# tasks re-check via read_to_array's own ``max_pixels`` (which we
# forward through ``_delayed_read_window``), but catching an
# oversized request before any task is scheduled saves the caller
# from a misleading "tile size exceeds max_pixels" error in a
# chunk that happens to align with the file's tile grid.
# ``max_pixels=None`` substitutes the module default to match the
# eager (``read_to_array``) and VRT chunked paths. Without the
# substitution the guard would skip entirely on ``None`` and a
# caller could build a lazy graph over a region far larger than the
# documented safety cap.
from .._reader import MAX_PIXELS_DEFAULT as _MAX_PIXELS_DEFAULT
effective_max_pixels = (max_pixels if max_pixels is not None
else _MAX_PIXELS_DEFAULT)
eff_bands = (1 if band is not None
else (n_bands if n_bands > 0 else 1))
if full_h * full_w * eff_bands > effective_max_pixels:
raise ValueError(
f"Requested region {full_h}x{full_w}x{eff_bands} "
f"exceeds max_pixels={effective_max_pixels:,}.")
# ``max_pixels`` bounds each chunk's materialised buffer, not the
# full windowed region. The per-chunk cap is enforced inside
# ``_delayed_read_window`` -> ``_read_to_array`` -> ``_check_dimensions``
# against the chunk's output window. The eager (no-``chunks``) path
# still applies the cap to the full image, so callers who want the
# old semantics can drop ``chunks=`` to get a single-shot decode
# under the same limit.

if name is None:
import os
Expand Down
20 changes: 17 additions & 3 deletions xrspatial/geotiff/_backends/gpu.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,8 +140,12 @@ def read_geotiff_gpu(source: str, *,
name : str or None
[experimental] Name for the DataArray.
max_pixels : int or None
[experimental] Maximum allowed pixel count
(width * height * samples). None uses the default (~1 billion).
[experimental] Maximum allowed pixel count per materialised
buffer. With ``chunks=None`` it bounds the full image
(width * height * samples); with ``chunks=`` it bounds the
per-chunk decode buffer instead so chunked reads of large
rasters do not need to widen the cap to the full file. None
uses the default (~1 billion).
on_gpu_failure : {'auto', 'strict'}, default 'auto'
[experimental] Behaviour when any GPU decode stage raises an
exception.
Expand Down Expand Up @@ -1485,7 +1489,11 @@ def _read_geotiff_gpu_chunked_gds(source, ifd, geo_info, header, *,
offsets = list(ifd.tile_offsets)
byte_counts = list(ifd.tile_byte_counts)

_check_dimensions(full_w, full_h, samples, max_pixels)
# Per-TIFF-tile guard: hostile-input defense against a forged
# tile-width / tile-length tag, independent of the chunk-scoped
# ``max_pixels`` contract below. A real chunk task allocates a
# buffer the chunk's shape, not a single tile, so the chunk-extent
# guard further down is what bounds peak GPU memory per task.
_check_dimensions(tw, th, samples, max_pixels)
validate_tile_layout(ifd)

Expand Down Expand Up @@ -1532,6 +1540,12 @@ def _read_geotiff_gpu_chunked_gds(source, ifd, geo_info, header, *,
if ch_h <= 0 or ch_w <= 0:
raise ValueError(f"Invalid chunks: {chunks}")

# Chunk-extent guard: each ``_chunk_task`` allocates a
# ``(ch_h, ch_w, samples)`` buffer on the GPU. Cap that against
# ``max_pixels`` to match the per-chunk semantics the CPU dask
# path enforces inside ``_read_to_array``.
_check_dimensions(ch_w, ch_h, samples, max_pixels)

# Validate band kwarg against the file's band count.
n_bands_out = samples if samples > 1 else 0
if band is not None:
Expand Down
36 changes: 25 additions & 11 deletions xrspatial/geotiff/tests/integration/test_dask_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -252,22 +252,36 @@ def _write_oversized_dask_max_pixels_default_guard(path, *, h: int, w: int) -> N
path.write_bytes(bytes(raw))


def test_default_max_pixels_guard_fires_for_full_region(tmp_path):
"""``max_pixels=None`` must apply the module default cap at the
up-front region guard, matching the eager / VRT paths.
def test_default_max_pixels_guard_does_not_fire_up_front(tmp_path):
"""The dask path no longer rejects an oversized full image at graph-
build time (#2501). The cap is now per-chunk, so a forged
multi-billion-pixel header builds a lazy graph without erroring;
each chunk task decodes against ``max_pixels`` separately.
"""
path = tmp_path / "tmp_1838_oversized.tif"
path = tmp_path / "tmp_2501_oversized.tif"
side = int((MAX_PIXELS_DEFAULT ** 0.5)) + 2
_write_oversized_dask_max_pixels_default_guard(path, h=side, w=side)
with pytest.raises(ValueError, match=r"max_pixels"):
read_geotiff_dask(str(path))
# The graph builds; the up-front guard is gone. Chunk-level decode
# may still fail when a task actually runs, but that is a separate
# path. ``read_geotiff_dask(...)`` itself returns a DataArray.
da = read_geotiff_dask(str(path))
assert da.shape == (side, side)


def test_explicit_max_pixels_still_enforced(tmp_path):
path = tmp_path / "tmp_1838_explicit_cap.tif"
_write_oversized_dask_max_pixels_default_guard(path, h=2048, w=2048)
with pytest.raises(ValueError, match=r"max_pixels"):
read_geotiff_dask(str(path), max_pixels=1024)
def test_explicit_max_pixels_enforced_per_chunk(tmp_path):
"""Per-chunk decode honours ``max_pixels`` even when the full image
would have fit under it (#2501). Build a real 64x64 file, request
chunks of 32, and set ``max_pixels`` below the per-chunk cost.
"""
arr = np.arange(64 * 64, dtype=np.uint8).reshape(64, 64)
path = tmp_path / "tmp_2501_per_chunk_cap.tif"
tifffile_dask_max_pixels_default_guard.imwrite(
str(path), arr, tile=(16, 16),
photometric="minisblack", compression="none")
# 32x32 chunk = 1024 pixels; cap is 100. Graph builds, compute fails.
da = read_geotiff_dask(str(path), chunks=32, max_pixels=100)
with pytest.raises(ValueError, match="exceed the safety limit"):
da.compute()


def test_small_region_unaffected(tmp_path):
Expand Down
48 changes: 43 additions & 5 deletions xrspatial/geotiff/tests/parity/test_pixel_equality.py
Original file line number Diff line number Diff line change
Expand Up @@ -479,11 +479,49 @@ def test_read_geotiff_dask_band_via_dispatcher(small_multiband_tiff_path):
np.testing.assert_array_equal(da.values, arr[:, :, 2])


def test_read_geotiff_dask_max_pixels_rejects_oversized(small_tiff_path):
"""``max_pixels=`` rejects the windowed region up front."""
path, _ = small_tiff_path
with pytest.raises(ValueError, match="exceeds max_pixels"):
read_geotiff_dask(path, chunks=2, max_pixels=10)
def test_read_geotiff_dask_max_pixels_bounds_chunk_not_full_image(
small_tiff_path):
"""``max_pixels`` bounds each chunk, not the full lazy region (#2501).

The 4x6 fixture is 24 pixels in total. With ``chunks=2`` each chunk
is at most 2x2 = 4 pixels, so ``max_pixels=10`` permits the read
even though the full image exceeds the cap. With ``chunks=4`` a
chunk reaches 4x4 = 16 pixels, which trips the per-chunk guard at
``.compute()`` time.
"""
path, arr = small_tiff_path

# Per-chunk cap is satisfied; full image is not. Should succeed.
da = read_geotiff_dask(path, chunks=2, max_pixels=10)
np.testing.assert_array_equal(da.values, arr)

# Per-chunk cap is exceeded. The graph builds, but the chunk task
# raises the safety-limit error on compute.
da = read_geotiff_dask(path, chunks=4, max_pixels=10)
with pytest.raises(ValueError, match="exceed the safety limit"):
da.compute()


def test_read_geotiff_dask_max_pixels_chunk_includes_band_count(
small_multiband_tiff_path):
"""Multi-band: per-chunk cap is ``chunk_h * chunk_w * samples`` (#2501).

The 4x6x3 fixture is 72 pixels. With ``chunks=2`` each chunk
materialises 2x2x3 = 12 pixels, fitting under ``max_pixels=20``
even though the full image (72) does not. With ``chunks=4`` the
largest chunk is 4x4x3 = 48 pixels and the per-chunk decode trips
the cap on compute.
"""
path, arr = small_multiband_tiff_path

# Per-chunk cap satisfied (12 px <= 20) but full image (72) is not.
da = read_geotiff_dask(path, chunks=2, max_pixels=20)
np.testing.assert_array_equal(da.values, arr)

# Per-chunk cap exceeded (48 px > 20). Graph builds, compute raises.
da = read_geotiff_dask(path, chunks=4, max_pixels=20)
with pytest.raises(ValueError, match="exceed the safety limit"):
da.compute()


def test_read_geotiff_dask_window_band_combined(small_multiband_tiff_path):
Expand Down
32 changes: 32 additions & 0 deletions xrspatial/geotiff/tests/test_security.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,38 @@ def test_open_geotiff_max_pixels(self, tmp_path):
with pytest.raises(ValueError, match="exceed the safety limit"):
open_geotiff(path, max_pixels=10)

def test_open_geotiff_max_pixels_chunked_bounds_chunk(self, tmp_path):
"""``open_geotiff(chunks=...)`` scopes max_pixels to the chunk (#2501).

A 6x6 image is 36 pixels. ``max_pixels=10`` would reject the eager
read, but ``chunks=2`` keeps each materialised buffer at 2x2=4
pixels, well under the cap. The cap still fires when a chunk
actually exceeds it.
"""
from xrspatial.geotiff import open_geotiff

expected = np.arange(36, dtype=np.float32).reshape(6, 6)
data = make_minimal_tiff(6, 6, np.dtype('float32'),
pixel_data=expected)
path = str(tmp_path / "small_2501_chunked.tif")
with open(path, 'wb') as f:
f.write(data)

# Eager path still rejects: full image > max_pixels.
with pytest.raises(ValueError, match="exceed the safety limit"):
open_geotiff(path, max_pixels=10)

# Dask path with chunks small enough to fit: succeeds and
# round-trips the values.
da = open_geotiff(path, chunks=2, max_pixels=10)
np.testing.assert_array_equal(da.values, expected)

# Dask path with chunks too large for the cap: per-chunk guard
# fires at compute time.
da = open_geotiff(path, chunks=4, max_pixels=10)
with pytest.raises(ValueError, match="exceed the safety limit"):
da.compute()


# ---------------------------------------------------------------------------
# Cat 1c: Tile dimension guard
Expand Down
14 changes: 12 additions & 2 deletions xrspatial/geotiff/tests/unit/test_signatures.py
Original file line number Diff line number Diff line change
Expand Up @@ -2077,10 +2077,20 @@ def test_read_geotiff_gpu_max_pixels_rejects_oversized(small_tiff_path):

@requires_gpu
def test_read_geotiff_gpu_chunks_max_pixels_rejects_oversized(small_tiff_path):
"""Dask+GPU path also enforces ``max_pixels``."""
"""Dask+GPU path enforces ``max_pixels`` per chunk (#2501).

The cap is now chunk-scoped, so a per-chunk allocation over
``max_pixels`` raises either at graph-build (the GDS fast path's
per-tile and per-chunk checks against the 16x16 TIFF tile dim and
the 4x4 chunk shape) or on ``.compute()`` (the CPU-fallback path's
per-chunk ``_check_dimensions`` inside ``read_to_array``). Wrap
both calls in ``pytest.raises`` so either branch satisfies the
test depending on whether KvikIO is installed.
"""
path, _ = small_tiff_path
with pytest.raises(ValueError, match="safety limit|exceeds max_pixels"):
read_geotiff_gpu(path, chunks=4, max_pixels=10)
da = read_geotiff_gpu(path, chunks=4, max_pixels=10)
da.compute()


def test_open_geotiff_chunks_name_flows_through(small_tiff_path):
Expand Down
Loading