Skip to content
Merged
11 changes: 6 additions & 5 deletions docs/source/reference/geotiff.rst
Original file line number Diff line number Diff line change
Expand Up @@ -181,11 +181,12 @@ this section is the brief.
(``xrspatial/geotiff/tests/write/test_nodata.py``).
* Float nodata. The on-disk sentinel is recorded on
``attrs['nodata']`` and surfaces as NaN in pixel data only when the
read promotes via ``mask_nodata=True`` (the default for float
outputs). With ``mask_nodata=False`` the raw float sentinel passes
through, so downstream callers can branch on the exact value;
``xrspatial/geotiff/tests/write/test_nodata.py`` pins this
split.
read promotes via ``masked=True``. The default is ``masked=False``
(matching rioxarray's ``open_rasterio``), so by default the raw float
sentinel passes through and downstream callers can branch on the exact
value; pass ``masked=True`` to get NaN-masked output.
``xrspatial/geotiff/tests/write/test_nodata.py`` pins this split.
(``mask_nodata`` is a deprecated alias of ``masked``.)
* NaN nodata. A file that declares ``nodata=NaN`` is read with NaN in
both ``attrs['nodata']`` and pixel data (NaN propagates either way).
* ``attrs['masked_nodata']``. Every read sets a boolean lifecycle
Expand Down
4 changes: 2 additions & 2 deletions docs/source/reference/release_gate_geotiff.rst
Original file line number Diff line number Diff line change
Expand Up @@ -199,7 +199,7 @@ Local GeoTIFF read and write
keys (``transform``, ``crs``, ``crs_wkt``, ``nodata``,
``masked_nodata``, ``georef_status``, ``raster_type``) across
four scenarios: integer-nodata, float-NaN-nodata, MinIsWhite,
and the ``mask_nodata=False`` raw-sentinel branch of the
and the ``masked=False`` raw-sentinel branch of the
nodata lifecycle.
- ``xrspatial/geotiff/tests/release_gates/test_stable_features.py``
(eager / dask full parity section)
Expand Down Expand Up @@ -384,7 +384,7 @@ Nodata lifecycle
- stable
- The sentinel survives read and write across every backend; integer
sentinels are preserved bit-exact, float sentinels surface as NaN
only when ``mask_nodata=True``.
only when ``masked=True`` (default ``masked=False``).
- ``xrspatial/geotiff/tests/read/test_nodata.py``,
``xrspatial/geotiff/tests/write/test_nodata.py``
- `#2341`_
Expand Down
4 changes: 2 additions & 2 deletions docs/source/user_guide/attrs_contract.rst
Original file line number Diff line number Diff line change
Expand Up @@ -89,8 +89,8 @@ write.
replaced sentinel pixels with NaN (so the buffer is NaN-aware);
``False`` when the array still carries the literal sentinel
values, including the case where the array is float dtype
because the caller passed ``mask_nodata=False`` together with
``dtype=float...``. Only set when ``nodata`` is set; absence
because the caller passed ``masked=False`` (the default) together
with ``dtype=float...``. Only set when ``nodata`` is set; absence
means no declared sentinel. See issue #2092.
* - ``nodata_pixels_present``
- bool
Expand Down
210 changes: 173 additions & 37 deletions xrspatial/geotiff/__init__.py

Large diffs are not rendered by default.

63 changes: 58 additions & 5 deletions xrspatial/geotiff/_attrs.py
Original file line number Diff line number Diff line change
Expand Up @@ -1527,6 +1527,35 @@ def _apply_eager_nodata_mask(arr, *, mask_sentinel, mask_nodata):
return arr, nodata_pixels_present


def _extract_scale_offset(gdal_metadata):
"""Pull SCALE / OFFSET from parsed GDAL_METADATA for ``mask_and_scale``.

Returns ``(scale, offset)`` floats, defaulting to ``(1.0, 0.0)`` when the
source carries no scale / offset. GDAL stores these in the GDAL_METADATA
XML either as dataset-level ``SCALE`` / ``OFFSET`` items or per-band items
keyed ``(name, band_index)`` by :func:`_parse_gdal_metadata`. Dataset-level
values are preferred; band 0's per-band values are the fallback. A single
pair is applied to the whole array, so a source with differing per-band
scale / offset is read with band 0's values (documented limitation).
"""
scale, offset = 1.0, 0.0
if not gdal_metadata:
return scale, offset

def _num(keys, default):
for k in keys:
if k in gdal_metadata:
try:
return float(gdal_metadata[k])
except (TypeError, ValueError):
return default
return default

scale = _num(['SCALE', ('SCALE', 0)], 1.0)
offset = _num(['OFFSET', ('OFFSET', 0)], 0.0)
return scale, offset


def _finalize_eager_read(
arr,
*,
Expand All @@ -1540,6 +1569,8 @@ def _finalize_eager_read(
allow_rotated: bool = False,
allow_unparseable_crs: bool = False,
attrs_in: dict | None = None,
mask_and_scale: bool = False,
parse_coordinates: bool = True,
):
"""Validate, populate attrs, mask, cast, and build an eager DataArray.

Expand Down Expand Up @@ -1593,15 +1624,33 @@ def _finalize_eager_read(
attrs: dict = dict(attrs_in) if attrs_in else {}
_populate_attrs_from_geo_info(attrs, geo_info, window=window)

# ``mask_and_scale`` implies masking (rioxarray applies scale / offset
# AND masks the nodata sentinel to NaN), so fold it into the mask gate.
effective_mask = mask_nodata or mask_and_scale

# Apply the nodata-to-NaN mask (or compute pixels_present
# without rewriting if ``mask_nodata=False``). Skipped entirely when
# without rewriting if masking is off). Skipped entirely when
# the source declared no sentinel.
nodata_pixels_present: bool | None = None
if nodata is not None:
arr, nodata_pixels_present = _apply_eager_nodata_mask(
arr, mask_sentinel=mask_sentinel, mask_nodata=mask_nodata,
arr, mask_sentinel=mask_sentinel, mask_nodata=effective_mask,
)

# ``mask_and_scale``: apply ``data * scale + offset`` from the source's
# GDAL_METADATA. Runs before the caller's ``dtype=`` cast so a
# ``dtype=<integer>`` request raises the same float-to-int ValueError the
# mask path raises (scaling promotes to float).
if mask_and_scale:
scale, offset = _extract_scale_offset(
getattr(geo_info, 'gdal_metadata', None))
if scale != 1.0 or offset != 0.0:
if arr.dtype.kind != 'f':
arr = arr.astype(np.float64)
arr = arr * scale + offset
attrs['scale_factor'] = scale
attrs['add_offset'] = offset

# Caller-requested dtype cast (post-mask so the integer
# promotion above runs first). ``_validate_dtype_cast`` lives in
# ``_validation``; local import keeps ``_attrs`` free of a top-level
Expand All @@ -1622,16 +1671,20 @@ def _finalize_eager_read(
# masked" rather than "masking was disabled").
_set_nodata_attrs(
attrs, nodata,
masked=(mask_nodata and np.dtype(str(arr.dtype)).kind == 'f'),
masked=(effective_mask and np.dtype(str(arr.dtype)).kind == 'f'),
pixels_present=nodata_pixels_present,
dtype_cast=dtype_cast_attr,
)

# Build the DataArray. ``_coords_from_geo_info`` honours the
# windowed-read contract (origin shifted to the window's top-left).
# ``parse_coordinates=False`` skips the x / y coordinate arrays
# (matching rioxarray); the transform / crs attrs still carry the
# georeferencing, and the band coord is kept.
height, width = arr.shape[:2]
coords = _coords_from_geo_info(
geo_info, height, width, window=window,
coords = (
_coords_from_geo_info(geo_info, height, width, window=window)
if parse_coordinates else {}
)
if arr.ndim == 3:
dims = ['y', 'x', 'band']
Expand Down
39 changes: 36 additions & 3 deletions xrspatial/geotiff/_backends/dask.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,9 @@ def _read_geotiff_dask(source: str, *,
allow_experimental_codecs: bool = False,
allow_internal_only_jpeg: bool = False,
band_nodata: str | None = None,
mask_nodata: bool = True) -> xr.DataArray:
mask_nodata: bool = True,
mask_and_scale: bool = False,
parse_coordinates: bool = True) -> xr.DataArray:
"""Read a GeoTIFF as a dask-backed DataArray for out-of-core processing.

Release-contract tier (see
Expand Down Expand Up @@ -105,6 +107,15 @@ def _read_geotiff_dask(source: str, *,
either way. Pass ``mask_nodata=False`` together with
``dtype=<integer>`` to keep an integer source dtype; the
default promotes to ``float64`` and the cast then raises.
mask_and_scale : bool, default False
[advanced] If True, apply the source's GDAL ``SCALE`` / ``OFFSET``
(``data * scale + offset``) lazily on the assembled dask array and
mask the nodata sentinel. Records ``attrs['scale_factor']`` /
``attrs['add_offset']``. No-op when the source carries no scale /
offset metadata.
parse_coordinates : bool, default True
[stable] If False, skip the ``x`` / ``y`` coordinate arrays; the
``transform`` / ``crs`` attrs still carry the georeferencing.
allow_rotated : bool, default False
[advanced] Read-side opt-in for rotated / sheared
``ModelTransformationTag`` files. Forwarded to every per-chunk
Expand Down Expand Up @@ -418,6 +429,10 @@ def _read_geotiff_dask(source: str, *,
and file_dtype.kind in ('u', 'i')
and lifecycle.sentinel_fits_buffer):
effective_dtype = np.dtype('float64')
# ``mask_and_scale`` applies ``data * scale + offset`` (and masks), which
# promotes any integer source to float regardless of the sentinel.
if mask_and_scale and file_dtype.kind != 'f':
effective_dtype = np.dtype('float64')

if dtype is not None:
target_dtype = np.dtype(dtype)
Expand Down Expand Up @@ -505,7 +520,7 @@ def _read_geotiff_dask(source: str, *,
attrs = _finalize_lazy_read_attrs(
geo_info=geo_info,
nodata=nodata_attr,
mask_nodata=mask_nodata,
mask_nodata=(mask_nodata or mask_and_scale),
graph_dtype=target_dtype,
caller_dtype=dtype,
window=window,
Expand Down Expand Up @@ -580,7 +595,7 @@ def _read_geotiff_dask(source: str, *,
# int-promotion branches in ``_delayed_read_window``. The
# original sentinel is still carried in ``attrs['nodata']`` via
# ``nodata_attr`` so write round-trips preserve the tag.
chunk_nodata = nodata if mask_nodata else None
chunk_nodata = nodata if (mask_nodata or mask_and_scale) else None
# ``effective_source`` swaps in the sidecar URL when the
# requested overview lives in an external ``.tif.ovr``.
# For local files and non-sidecar remote
Expand Down Expand Up @@ -613,6 +628,24 @@ def _read_geotiff_dask(source: str, *,

dask_arr = da.concatenate(dask_rows, axis=0)

# ``mask_and_scale``: apply ``data * scale + offset`` lazily on the
# assembled dask array. The per-chunk mask above already promoted the
# graph to float and replaced sentinels with NaN.
if mask_and_scale:
from .._attrs import _extract_scale_offset
scale, offset = _extract_scale_offset(
getattr(geo_info, 'gdal_metadata', None))
if scale != 1.0 or offset != 0.0:
dask_arr = dask_arr * scale + offset
attrs['scale_factor'] = scale
attrs['add_offset'] = offset

# ``parse_coordinates=False`` drops the x / y coordinate arrays (the
# transform / crs attrs still carry georeferencing); the band coord is
# kept.
if not parse_coordinates:
coords = {}

if out_has_band_axis:
dims = ['y', 'x', 'band']
coords['band'] = np.arange(n_bands)
Expand Down
8 changes: 8 additions & 0 deletions xrspatial/geotiff/_runtime.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,14 @@
# ``path is None`` branch and raised a "missing required argument"
# TypeError for the wrong reason.
_VRT_PATH_MISSING_SENTINEL = object()
# ``open_geotiff`` renamed ``mask_nodata`` -> ``masked`` (and flipped the
# default from True to False) and ``name`` -> ``default_name`` to match
# rioxarray's ``open_rasterio``. Each sentinel distinguishes "caller passed
# the deprecated alias" from "caller passed nothing", so passing both the old
# and new name raises TypeError and the old name alone warns. Same rationale
# as ``_GPU_DEPRECATED_SENTINEL`` above.
_MASK_NODATA_DEPRECATED_SENTINEL = object()
_NAME_DEPRECATED_SENTINEL = object()


# Spatial dim names recognised on 3D writer inputs. ``y``/``x`` are the
Expand Down
24 changes: 12 additions & 12 deletions xrspatial/geotiff/tests/gpu/test_kernels_and_kwargs.py
Original file line number Diff line number Diff line change
Expand Up @@ -2483,7 +2483,7 @@ def test_open_geotiff_gpu_mask_nodata_false_threads_through_2052(
from xrspatial.geotiff import open_geotiff

path, arr = uint16_with_matching_sentinel_2052
da = open_geotiff(path, gpu=True, mask_nodata=False)
da = open_geotiff(path, gpu=True, masked=False)

assert da.dtype == np.uint16
np.testing.assert_array_equal(da.data.get(), arr)
Expand Down Expand Up @@ -2544,7 +2544,7 @@ def test_open_geotiff_dask_gpu_mask_nodata_false_threads_through_2052(
from xrspatial.geotiff import open_geotiff

path, arr = uint16_with_matching_sentinel_2052
da = open_geotiff(path, gpu=True, chunks=2, mask_nodata=False)
da = open_geotiff(path, gpu=True, chunks=2, masked=False)

assert da.dtype == np.uint16
computed = da.compute()
Expand Down Expand Up @@ -2584,7 +2584,7 @@ def test_open_geotiff_vrt_mask_nodata_false_threads_through_2052(
from xrspatial.geotiff import open_geotiff

vrt_path, arr = uint16_vrt_with_matching_sentinel_2052
da = open_geotiff(vrt_path, mask_nodata=False)
da = open_geotiff(vrt_path, masked=False)

assert da.dtype == np.uint16
np.testing.assert_array_equal(np.asarray(da.values), arr)
Expand Down Expand Up @@ -2637,7 +2637,7 @@ def test_open_geotiff_vrt_chunked_mask_nodata_false_threads_through_2052(
from xrspatial.geotiff import open_geotiff

vrt_path, arr = uint16_vrt_with_matching_sentinel_2052
da = open_geotiff(vrt_path, chunks=2, mask_nodata=False)
da = open_geotiff(vrt_path, chunks=2, masked=False)

assert da.dtype == np.uint16
computed = da.compute()
Expand All @@ -2651,8 +2651,8 @@ def test_cross_backend_parity_eager_dask_numpy_2052(

path, arr = uint16_with_matching_sentinel_2052

eager = open_geotiff(path, mask_nodata=False)
dask_ = open_geotiff(path, chunks=2, mask_nodata=False).compute()
eager = open_geotiff(path, masked=False)
dask_ = open_geotiff(path, chunks=2, masked=False).compute()

assert eager.dtype == np.uint16
assert dask_.dtype == np.uint16
Expand All @@ -2668,8 +2668,8 @@ def test_cross_backend_parity_eager_gpu_2052(

path, arr = uint16_with_matching_sentinel_2052

eager = open_geotiff(path, mask_nodata=False)
gpu = open_geotiff(path, gpu=True, mask_nodata=False)
eager = open_geotiff(path, masked=False)
gpu = open_geotiff(path, gpu=True, masked=False)

assert eager.dtype == np.uint16
assert gpu.dtype == np.uint16
Expand All @@ -2685,9 +2685,9 @@ def test_cross_backend_parity_eager_dask_gpu_2052(

path, arr = uint16_with_matching_sentinel_2052

eager = open_geotiff(path, mask_nodata=False)
eager = open_geotiff(path, masked=False)
dgpu = open_geotiff(
path, gpu=True, chunks=2, mask_nodata=False).compute()
path, gpu=True, chunks=2, masked=False).compute()

assert eager.dtype == np.uint16
assert dgpu.dtype == np.uint16
Expand All @@ -2705,8 +2705,8 @@ def test_cross_backend_parity_eager_vrt_2052(
tif_path, arr = uint16_with_matching_sentinel_2052
vrt_path, _ = uint16_vrt_with_matching_sentinel_2052

eager = open_geotiff(tif_path, mask_nodata=False)
vrt = open_geotiff(vrt_path, mask_nodata=False)
eager = open_geotiff(tif_path, masked=False)
vrt = open_geotiff(vrt_path, masked=False)

assert eager.dtype == vrt.dtype == np.uint16
np.testing.assert_array_equal(eager.values, np.asarray(vrt.values))
Expand Down
20 changes: 10 additions & 10 deletions xrspatial/geotiff/tests/gpu/test_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -615,8 +615,8 @@ def test_gpu_uint16_nodata_promoted_and_masked_tiled_1542(tmp_path):
write(arr, path, nodata=65535, compression='deflate',
tiled=True, tile_size=16)

cpu = open_geotiff(path)
gpu = open_geotiff(path, gpu=True)
cpu = open_geotiff(path, masked=True)
gpu = open_geotiff(path, gpu=True, masked=True)

assert cpu.dtype == gpu.dtype == np.float64
assert cpu.attrs.get('nodata') == 65535.0
Expand All @@ -638,8 +638,8 @@ def test_gpu_uint16_nodata_promoted_and_masked_stripped_1542(tmp_path):
path = str(tmp_path / 'gpu_u16_nodata_1542_stripped.tif')
write(arr, path, nodata=65535, compression='deflate', tiled=False)

cpu = open_geotiff(path)
gpu = open_geotiff(path, gpu=True)
cpu = open_geotiff(path, masked=True)
gpu = open_geotiff(path, gpu=True, masked=True)

assert cpu.dtype == gpu.dtype == np.float64
assert gpu.attrs.get('nodata') == 65535.0
Expand Down Expand Up @@ -718,10 +718,10 @@ def test_gpu_all_four_backends_agree_on_nodata_1542(tmp_path):
write(arr, path, nodata=65535, compression='deflate',
tiled=True, tile_size=16)

da_np = open_geotiff(path)
da_dask = open_geotiff(path, chunks=512)
da_gpu = open_geotiff(path, gpu=True)
da_gpu_dask = open_geotiff(path, gpu=True, chunks=512)
da_np = open_geotiff(path, masked=True)
da_dask = open_geotiff(path, chunks=512, masked=True)
da_gpu = open_geotiff(path, gpu=True, masked=True)
da_gpu_dask = open_geotiff(path, gpu=True, chunks=512, masked=True)

for label, da in [('np', da_np), ('dask+np', da_dask),
('gpu', da_gpu), ('gpu+dask', da_gpu_dask)]:
Expand Down Expand Up @@ -750,8 +750,8 @@ def test_gpu_int16_negative_nodata_1542(tmp_path):
write(arr, path, nodata=-9999, compression='deflate',
tiled=True, tile_size=16)

cpu = open_geotiff(path)
gpu = open_geotiff(path, gpu=True)
cpu = open_geotiff(path, masked=True)
gpu = open_geotiff(path, gpu=True, masked=True)
assert cpu.dtype == gpu.dtype == np.float64
assert gpu.attrs.get('nodata') == -9999.0
np.testing.assert_array_equal(np.isnan(cpu.values),
Expand Down
6 changes: 3 additions & 3 deletions xrspatial/geotiff/tests/gpu/test_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -1454,9 +1454,9 @@ def test_gpu_writer_cog_overview_sentinel_roundtrip_1948():
)

# Read full-resolution and the two overview levels.
full = open_geotiff(path)
ov1 = open_geotiff(path, overview_level=1)
ov2 = open_geotiff(path, overview_level=2)
full = open_geotiff(path, masked=True)
ov1 = open_geotiff(path, overview_level=1, masked=True)
ov2 = open_geotiff(path, overview_level=2, masked=True)

# Full-resolution: sentinel pixels survive as NaN (the read path
# masks the sentinel back to NaN since attrs['nodata'] is set).
Expand Down
Loading
Loading