Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 12 additions & 7 deletions docs/source/user_guide/attrs_contract.rst
Original file line number Diff line number Diff line change
Expand Up @@ -85,13 +85,18 @@ write.
``_resolve_nodata_attr``.
* - ``masked_nodata``
- bool
- Paired with ``nodata``. ``True`` when the reader actually
replaced sentinel pixels with NaN (so the buffer is NaN-aware);
``False`` when the array still carries the literal sentinel
values, including the case where the array is float dtype
because the caller passed ``masked=False`` (the default) together
with ``dtype=float...``. Only set when ``nodata`` is set; absence
means no declared sentinel. See issue #2092.
- Paired with ``nodata``. ``True`` when the reader ran the
sentinel-to-NaN step so the buffer is NaN-aware; ``False`` when
the array still carries the literal sentinel values, including the
case where the array is float dtype because the caller passed
``masked=False`` (the default) together with ``dtype=float...``.
The flag tracks whether masking ran, not whether any sentinel
pixel matched: a masked read of a maskable integer source promotes
to float and sets ``True`` even when zero pixels match, so the
eager and dask paths agree for the same input (issue #2990). Use
``nodata_pixels_present`` for the did-any-pixel-match question.
Only set when ``nodata`` is set; absence means no declared
sentinel. See issue #2092.
* - ``nodata_pixels_present``
- bool
- Paired with ``nodata``. ``True`` iff the read window contained
Expand Down
16 changes: 10 additions & 6 deletions xrspatial/geotiff/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -688,9 +688,10 @@ def open_geotiff(source: str | BinaryIO, *,
(``masked=False``); note that earlier xrspatial releases masked
by default (``mask_nodata=True``), so a bare ``open_geotiff(path)``
no longer promotes the sentinel to NaN. Pass ``masked=True`` and
``dtype=<integer>`` together and the read raises ``ValueError``
once the sentinel matches a pixel, because the float64 promotion
then makes the integer cast lossy.
``dtype=<integer>`` together on a source with a maskable sentinel
and the read raises ``ValueError``, because the unconditional
float64 promotion (issue #2990) makes the integer cast lossy
whether or not a sentinel pixel is present.
mask_nodata : bool
[deprecated] Deprecated alias of ``masked``; emits a
``DeprecationWarning``. Passing both ``masked`` and ``mask_nodata``
Expand Down Expand Up @@ -841,9 +842,12 @@ def open_geotiff(source: str | BinaryIO, *,
passing ``dtype=<integer>`` as well is not enough to keep an integer
dtype: the sentinel-to-NaN promotion runs first and the subsequent
integer cast then raises ``ValueError`` (float-to-int is lossy in a
way users rarely intend). When the file has no in-range sentinel
match, the promotion is skipped and ``dtype=<integer>`` works either
way.
way users rarely intend). The promotion runs whenever the sentinel is
maskable (finite, integer, in-range), whether or not any pixel matches
it, so the eager and dask paths return the same float64 dtype for the
same input (issue #2990). A sentinel that cannot match (out-of-range,
non-finite, or fractional) leaves the source dtype, so
``dtype=<integer>`` works in that case.

Examples
--------
Expand Down
49 changes: 36 additions & 13 deletions xrspatial/geotiff/_attrs.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,12 @@
the in-memory array is float dtype and the reader's sentinel-to-NaN
step ran; ``False`` iff the array still carries the literal integer
sentinel. Only emitted when ``nodata`` is set; absence is the
"no declared sentinel" signal. See ``_set_nodata_attrs``.
"no declared sentinel" signal. See ``_set_nodata_attrs``. The flag
tracks whether masking ran, not whether any sentinel pixel matched:
a masked read of a maskable integer source promotes to float and sets
``True`` even when zero pixels match, so the eager and dask paths agree
for the same input (issue #2990). Use ``nodata_pixels_present`` for the
did-any-pixel-match question.
- ``nodata_pixels_present`` (#2135): bool, only emitted when
``nodata`` is set and the backend computed the answer cheaply.
True iff the read window contained at least one pixel matching the
Expand Down Expand Up @@ -1452,11 +1457,14 @@ def _apply_eager_nodata_mask(arr, *, mask_sentinel, mask_nodata):

Mirrors the inline block in ``open_geotiff`` so the eager helper can
share one implementation. Returns ``(arr, nodata_pixels_present)``
where ``arr`` may have been promoted from an integer dtype to float64
when the sentinel matched at least one pixel, and
``nodata_pixels_present`` is the bool used to populate
``attrs['nodata_pixels_present']``. ``None`` means "no scan was
appropriate for this dtype / sentinel combination."
where ``arr`` is promoted from an integer dtype to float64 whenever
``mask_nodata`` is set and the sentinel is maskable (finite, integer,
in-range), independent of whether any pixel matches. This matches the
dask path (which declares float64 up front from the same gate) and
rioxarray's ``masked=True`` (issue #2990). ``nodata_pixels_present``
is the bool used to populate ``attrs['nodata_pixels_present']``;
``None`` means "no scan was appropriate for this dtype / sentinel
combination."

The sentinel is taken as the ``mask_sentinel`` parameter rather than
being read from ``geo_info``. Three GPU eager sites derive it three
Expand Down Expand Up @@ -1491,10 +1499,23 @@ def _apply_eager_nodata_mask(arr, *, mask_sentinel, mask_nodata):
nodata_int = int(mask_sentinel)
info = np.iinfo(arr.dtype)
if info.min <= nodata_int <= info.max:
mask = arr == arr.dtype.type(nodata_int)
# Promote to float64 whenever the sentinel is
# maskable, independent of whether any pixel matches.
# The dask path declares float64 up front from the
# same finite/integer/in-range gate (see
# ``_read_geotiff_dask``'s ``sentinel_fits_buffer``
# branch), and rioxarray's ``masked=True`` always
# promotes an integer source to float. Gating the
# promotion on a matching pixel made the eager output
# diverge (uint16 + ``masked_nodata=False``) from the
# lazy output (float64 + ``masked_nodata=True``) for
# the same file when no sentinel pixel was present
# (issue #2990). ``nodata_pixels_present`` still
# records whether a pixel matched.
arr = arr.astype(np.float64)
mask = arr == np.float64(nodata_int)
nodata_pixels_present = bool(mask.any())
if nodata_pixels_present:
arr = arr.astype(np.float64)
arr[mask] = np.nan
else:
nodata_pixels_present = False
Expand Down Expand Up @@ -1664,11 +1685,13 @@ def _finalize_eager_read(
dtype_cast_attr = target.name

# Stamp the nodata lifecycle attrs. ``masked`` is True iff
# the caller opted into masking AND the final buffer dtype is float,
# mirroring the existing call sites (the integer promotion above
# only runs when the sentinel matched at least one pixel, so an
# ``int`` buffer + ``mask_nodata=True`` here means "no pixels were
# masked" rather than "masking was disabled").
# the caller opted into masking AND the final buffer dtype is float.
# ``_apply_eager_nodata_mask`` promotes a maskable integer source to
# float64 whenever masking is on (issue #2990), so an ``int`` buffer +
# ``mask_nodata=True`` here means the sentinel was unmaskable
# (out-of-range / non-finite / fractional) and could never match, not
# that masking was disabled. Either way ``masked`` is correctly False
# because the literal sentinel still occupies its integer slot.
_set_nodata_attrs(
attrs, nodata,
masked=(effective_mask and np.dtype(str(arr.dtype)).kind == 'f'),
Expand Down
76 changes: 66 additions & 10 deletions xrspatial/geotiff/tests/read/test_nodata.py
Original file line number Diff line number Diff line change
Expand Up @@ -2449,21 +2449,23 @@ def test_int_source_with_sentinel_hit(self, tmp_path):
assert da.attrs["masked_nodata"] is True
assert np.isnan(da.values).sum() == 2

def test_int_source_no_hit_keeps_sentinel(self, tmp_path):
"""Int source + sentinel declared but no hit -> nodata set, masked_nodata=False.

The eager numpy path only promotes integer arrays to float on
the first sentinel hit. When the sentinel is in-range but never
matches a pixel, the array stays at the source integer dtype
and ``masked_nodata`` is False so downstream code knows the
literal sentinel is still in-band.
def test_int_source_no_hit_promotes_to_float(self, tmp_path):
"""Int source + in-range sentinel, no hit -> float64, masked_nodata=True.

When ``masked=True`` and the sentinel is in-range, the eager
numpy path promotes the integer array to float64 even when no
pixel matches the sentinel. This matches the dask path (which
declares float64 up front) and rioxarray's ``masked=True``, both
of which promote unconditionally. ``nodata_pixels_present``
records that no pixel matched (issue #2990).
"""
path = str(tmp_path / "tnss1988_int_no_hit.tif")
_write_int_tiff_1988(path, with_sentinel_hit=False)
da = open_geotiff(path, masked=True)
assert da.attrs["nodata"] == 65535
assert da.dtype.kind in ("u", "i")
assert da.attrs["masked_nodata"] is False
assert da.dtype.kind == "f"
assert da.attrs["masked_nodata"] is True
assert da.attrs["nodata_pixels_present"] is False


class TestDaskNumpy:
Expand Down Expand Up @@ -2516,6 +2518,60 @@ def test_int_source_with_out_of_range_sentinel(self, tmp_path):
assert da.attrs["masked_nodata"] is False


class TestEagerLazyMaskParity2990:
"""Eager and lazy paths agree on dtype and masked_nodata (issue #2990).

The lazy dask path declares float64 up front from the in-range
integer-sentinel gate and stamps ``masked_nodata=True`` before any
chunk decodes. The eager path used to gate the float promotion on a
matching sentinel pixel, so a uint16 source with an in-range sentinel
but no matching pixel came back uint16 with ``masked_nodata=False``
from the eager reader and float64 with ``masked_nodata=True`` from the
dask reader. These tests pin the two paths to the same answer.
"""

def test_no_hit_eager_matches_dask(self, tmp_path):
path = str(tmp_path / "t2990_no_hit.tif")
_write_int_tiff_1988(path, with_sentinel_hit=False)
eager = open_geotiff(path, masked=True)
lazy = _read_geotiff_dask(path, chunks=2, mask_nodata=True)
assert eager.dtype == lazy.dtype == np.float64
assert eager.attrs["masked_nodata"] is True
assert lazy.attrs["masked_nodata"] is True
# Eager scans the buffer, so it can report the presence bool; the
# lazy path leaves it absent per the documented dask contract.
assert eager.attrs["nodata_pixels_present"] is False

def test_hit_eager_matches_dask(self, tmp_path):
path = str(tmp_path / "t2990_hit.tif")
_write_int_tiff_1988(path, with_sentinel_hit=True)
eager = open_geotiff(path, masked=True)
lazy = _read_geotiff_dask(path, chunks=2, mask_nodata=True)
assert eager.dtype == lazy.dtype == np.float64
assert eager.attrs["masked_nodata"] is True
assert lazy.attrs["masked_nodata"] is True
assert eager.attrs["nodata_pixels_present"] is True
# The masked pixel materialises to NaN on both paths.
assert bool(np.isnan(eager.values).any())
assert bool(np.isnan(np.asarray(lazy)).any())

def test_out_of_range_keeps_int_on_both(self, tmp_path):
"""Out-of-range sentinel cannot match: both paths stay integer.

The promotion gate (finite + integer + in-range) is unchanged, so
an unmatchable sentinel leaves the source dtype and reports
``masked_nodata=False`` on both paths.
"""
path = str(tmp_path / "t2990_oor.tif")
_build_uint16_with_out_of_range_nodata_1988(path)
eager = open_geotiff(path, masked=True)
lazy = _read_geotiff_dask(path, chunks=2, mask_nodata=True)
assert eager.dtype.kind == "u"
assert lazy.dtype.kind == "u"
assert eager.attrs["masked_nodata"] is False
assert lazy.attrs["masked_nodata"] is False


def _write_uint16_vrt_source_1988(tmp_path, *, sentinel_hit: bool,
filename: str):
"""Write a 2x2 uint16 source raster with declared sentinel 65535."""
Expand Down
Loading