Scale offset from item asset #202

Merged · 9 commits · Jul 23, 2023
README.md (1 addition, 1 deletion)

@@ -72,7 +72,7 @@ It's a good idea to use `conda` to handle installing rasterio on Windows. There'
 * Transfer the STAC metadata into [xarray coordinates](http://xarray.pydata.org/en/stable/data-structures.html#coordinates) for easy indexing, filtering, and provenance of metadata.
 * Efficiently generate a Dask graph for loading the data in parallel.
 * Mediate between Dask's parallelism and GDAL's aversion to it, allowing for fast, multi-threaded reads when possible, and at least preventing segfaults when not.
-* Mask nodata and rescale by dataset-level scales/offsets.
+* Mask nodata and rescale by STAC item asset scales/offsets.
 * Display data in interactive maps in a notebook, computed on the fly by Dask.

 ## Limitations:
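
For reference, the scale and offset now come from the STAC `raster:bands` extension on each asset, rather than from the GDAL dataset. A minimal sketch of what that metadata looks like on an item asset (the href and values here are hypothetical, not from this PR):

```python
# Hypothetical STAC asset using the raster extension; stackstac reads
# raster_bands[0]["scale"] and raster_bands[0]["offset"].
asset = {
    "href": "https://example.com/scene/lwir.tif",  # hypothetical
    "type": "image/tiff; application=geotiff",
    "raster:bands": [
        {"data_type": "uint16", "scale": 0.00341802, "offset": 149.0}
    ],
}
# Physical value = scale * stored_value + offset
```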

stackstac/prepare.py (19 additions, 2 deletions)

@@ -23,10 +23,11 @@
 import xarray as xr

 from .raster_spec import IntFloat, Bbox, Resolutions, RasterSpec
+
 from .stac_types import ItemSequence
 from . import accumulate_metadata, geom_utils

-ASSET_TABLE_DT = np.dtype([("url", object), ("bounds", "float64", 4)])
+ASSET_TABLE_DT = np.dtype([("url", object), ("bounds", "float64", 4), ("scale_offset", "float64", 2)])


 class Mimetype(NamedTuple):

@@ -143,6 +144,22 @@ def prepare_items(
             asset_bbox = asset.get("proj:bbox", item_bbox)
             asset_shape = asset.get("proj:shape", item_shape)
             asset_transform = asset.get("proj:transform", item_transform)
+            raster_bands = asset.get('raster:bands')
+
+            if raster_bands is not None:
+                if len(raster_bands) != 1:
+                    raise ValueError(
+                        f"raster:bands has {len(raster_bands)} elements for asset {asset_id!r}. "
+                        "Multi-band rasters are not currently supported.\n"
+                        "If you don't care about this asset, you can skip it by giving a list "
+                        "of asset IDs you *do* want in `assets=`, and leaving this one out."
+                    )
+                asset_scale = raster_bands[0].get('scale', 1)
+                asset_offset = raster_bands[0].get('offset', 0)
+            else:
+                asset_scale = 1
+                asset_offset = 0
+
             asset_affine = None

             # Auto-compute CRS

@@ -322,7 +339,7 @@ def prepare_items(
                 continue

             # Phew, we figured out all the spatial stuff! Now actually store the information we care about.
-            asset_table[item_i, asset_i] = (asset["href"], asset_bbox_proj)
+            asset_table[item_i, asset_i] = (asset["href"], asset_bbox_proj, (asset_scale, asset_offset))
             # ^ NOTE: If `asset_bbox_proj` is None, NumPy automatically converts it to NaNs

             # At this point, everything has been set (or there was an error)
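
A standalone sketch of how the widened structured dtype carries the new pair (the URL, bounds, and scale/offset values below are hypothetical):

```python
import numpy as np

ASSET_TABLE_DT = np.dtype(
    [("url", object), ("bounds", "float64", 4), ("scale_offset", "float64", 2)]
)

# One row per item, one column per asset; fill a single hypothetical entry.
asset_table = np.zeros((1, 1), dtype=ASSET_TABLE_DT)
asset_table[0, 0] = (
    "https://example.com/scene/lwir.tif",
    (399960.0, 4090200.0, 509760.0, 4200000.0),
    (0.00341802, 149.0),
)

record = asset_table[0, 0]
print(record["scale_offset"])  # -> float64 array of [scale, offset]
```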

stackstac/reader_protocol.py (1 addition, 3 deletions)

@@ -34,7 +34,7 @@ def __init__(
         resampling: Resampling,
         dtype: np.dtype,
         fill_value: Union[int, float],
-        rescale: bool,
+        scale_offset: Tuple[Union[int, float], Union[int, float]],
         gdal_env: Optional[LayeredEnv],
         errors_as_nodata: Tuple[Exception, ...] = (),
     ) -> None:

@@ -55,8 +55,6 @@
         fill_value:
             Fill nodata pixels in the output array with this value.
             If None, whatever nodata value is set in the asset will be used.
-        rescale:
-            Rescale the output array according to any scales and offsets set in the asset.
         gdal_env:
             A `~.LayeredEnv` of GDAL configuration options to use while opening
             and reading datasets. If None (default), `~.DEFAULT_GDAL_ENV` is used.

stackstac/rio_reader.py (12 additions, 12 deletions)

@@ -32,7 +32,6 @@ def _curthread():

 # /TODO

-
 # Default GDAL configuration options
 DEFAULT_GDAL_ENV = LayeredEnv(
     always=dict(

@@ -64,11 +63,12 @@ def _curthread():
 # See `ThreadLocalRioDataset` for more.
 # https://github.com/pangeo-data/pangeo-example-notebooks/issues/21#issuecomment-432457955
 # https://gdal.org/drivers/raster/vrt.html#multi-threading-issues
+
 MULTITHREADED_DRIVER_ALLOWLIST = {"GTiff"}


 class ThreadsafeRioDataset(Protocol):
-    scale_offset: Tuple[float, float]
+    scale_offset: Tuple[Union[int, float], Union[int, float]]

     def read(self, window: Window, **kwargs) -> np.ndarray:
         ...

@@ -280,7 +280,7 @@ class PickleState(TypedDict):
     resampling: Resampling
     dtype: np.dtype
     fill_value: Union[int, float]
-    rescale: bool
+    scale_offset: Tuple[Union[int, float], Union[int, float]]
     gdal_env: Optional[LayeredEnv]
     errors_as_nodata: Tuple[Exception, ...]

@@ -303,16 +303,16 @@ def __init__(
         resampling: Resampling,
         dtype: np.dtype,
         fill_value: Union[int, float],
-        rescale: bool,
+        scale_offset: Tuple[Union[int, float], Union[int, float]],
         gdal_env: Optional[LayeredEnv] = None,
         errors_as_nodata: Tuple[Exception, ...] = (),
     ) -> None:
         self.url = url
         self.spec = spec
         self.resampling = resampling
         self.dtype = dtype
-        self.rescale = rescale
         self.fill_value = fill_value
+        self.scale_offset = scale_offset
         self.gdal_env = gdal_env or DEFAULT_GDAL_ENV
         self.errors_as_nodata = errors_as_nodata

@@ -398,12 +398,12 @@ def read(self, window: Window, **kwargs) -> np.ndarray:

             raise RuntimeError(msg) from e

-        if self.rescale:
-            scale, offset = reader.scale_offset
-            if scale != 1:
-                result *= scale
-            if offset != 0:
-                result += offset
+        scale, offset = self.scale_offset
+
+        if scale != 1:
+            result *= scale
+        if offset != 0:
+            result += offset

         result = np.ma.filled(result, fill_value=self.fill_value)
         assert np.issubdtype(result.dtype, self.dtype), (

@@ -436,7 +436,7 @@ def __getstate__(
             "resampling": self.resampling,
             "dtype": self.dtype,
             "fill_value": self.fill_value,
-            "rescale": self.rescale,
+            "scale_offset": self.scale_offset,
             "gdal_env": self.gdal_env,
             "errors_as_nodata": self.errors_as_nodata,
         }
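
The rescaling itself is plain masked-array arithmetic. A standalone sketch of the logic in `read` (the block values, mask, and scale/offset are hypothetical):

```python
import numpy as np

# Hypothetical raw block with one nodata pixel masked out.
result = np.ma.masked_array(
    np.array([[100.0, 200.0], [300.0, 65535.0]]),
    mask=[[False, False], [False, True]],
)
scale, offset = 0.0001, -0.1  # e.g. from raster:bands

# Same guards as the diff: skip the no-op identity transform.
if scale != 1:
    result *= scale
if offset != 0:
    result += offset

# Masked (nodata) pixels are then filled with the fill value.
result = np.ma.filled(result, fill_value=np.nan)
print(result)  # [[-0.09 -0.08] [-0.07 nan]]
```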

stackstac/stack.py (2 additions, 1 deletion)

@@ -202,7 +202,8 @@ def stack(
             be represented in a smaller data type (like ``uint16``), using a different ``fill_value``
             (like 0) and managing it yourself could save a lot of memory.
         rescale:
-            Whether to rescale pixel values by the scale and offset set on the dataset.
+            Whether to rescale pixel values by the scale and offset present in the ``raster:bands`` metadata
+            for each asset.
             Default: True. Note that this could produce floating-point data when the
             original values are ints, so set ``dtype`` accordingly. You will NOT be warned
             if the cast to ``dtype`` is losing information!
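
A minimal usage sketch of the documented behavior (`items` and the asset ID are assumptions, not from this PR):

```python
import numpy as np
import stackstac

# `items` is assumed: any sequence of STAC items (e.g. from a
# pystac-client search) whose assets carry raster:bands scale/offset.
stack = stackstac.stack(
    items,
    assets=["lwir"],            # hypothetical asset ID
    rescale=True,               # default; applies each asset's scale/offset
    dtype=np.dtype("float64"),  # rescaled ints become floats, so use a float dtype
    fill_value=np.nan,
)
```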

stackstac/to_dask.py (5 additions, 1 deletion)

@@ -132,6 +132,10 @@ def asset_table_to_reader_and_window(
         if url:
             asset_bounds: Bbox = asset_entry["bounds"]
             asset_window = windows.from_bounds(*asset_bounds, spec.transform)
+            if rescale:
+                asset_scale_offset = asset_entry["scale_offset"]
+            else:
+                asset_scale_offset = (1, 0)

             entry: ReaderTableEntry = (
                 reader(

@@ -140,7 +144,7 @@
                     resampling=resampling,
                     dtype=dtype,
                     fill_value=fill_value,
-                    rescale=rescale,
+                    scale_offset=asset_scale_offset,
                     gdal_env=gdal_env,
                     errors_as_nodata=errors_as_nodata,
                 ),
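
Resolving `rescale` here means the reader no longer needs a boolean: with the identity pair, the scale/offset guards in `read` become no-ops. The selection boils down to something like this (a hypothetical helper for illustration, not code from the PR):

```python
from typing import Tuple, Union

Number = Union[int, float]

def pick_scale_offset(
    entry_scale_offset: Tuple[Number, Number], rescale: bool
) -> Tuple[Number, Number]:
    # rescale=False degrades to the identity transform (scale 1, offset 0),
    # which the reader skips entirely at read time.
    return entry_scale_offset if rescale else (1, 0)
```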