Add utility for opening remote files with fsspec #9797

Merged · 8 commits · Nov 20, 2024
3 changes: 3 additions & 0 deletions doc/whats-new.rst
@@ -57,6 +57,9 @@ New Features
- Optimize :py:meth:`DataArray.polyfit` and :py:meth:`Dataset.polyfit` with dask, when used with
arrays of more than two dimensions.
(:issue:`5629`). By `Deepak Cherian <https://github.com/dcherian>`_.
- Support for directly opening remote files as string paths (for example, ``s3://bucket/data.nc``)
with ``fsspec`` when using the ``h5netcdf`` engine (:issue:`9723`, :pull:`9797`).
By `James Bourbeau <https://github.com/jrbourbeau>`_.
- Re-implement the :py:mod:`ufuncs` module, which now dynamically dispatches to the
underlying array's backend. Provides better support for certain wrapped array types
like ``jax.numpy.ndarray``. (:issue:`7848`, :pull:`9776`).
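With this change merged, the feature reads end-to-end as below; a minimal usage sketch, assuming ``fsspec`` and ``s3fs`` are installed (the bucket, key, and credentials are hypothetical):

```python
import xarray as xr

# Pass the remote URL straight to open_dataset; storage_options is
# forwarded to fsspec (here requesting anonymous S3 access).
ds = xr.open_dataset(
    "s3://my-bucket/data.nc",
    engine="h5netcdf",
    storage_options={"anon": True},
)
```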
9 changes: 9 additions & 0 deletions xarray/backends/common.py
@@ -178,6 +178,15 @@ def _normalize_path_list(
return _normalize_path_list(paths)


def _open_remote_file(file, mode, storage_options=None):
import fsspec

fs, _, paths = fsspec.get_fs_token_paths(
file, mode=mode, storage_options=storage_options
)
return fs.open(paths[0], mode=mode)


def _encode_variable_name(name):
if name is None:
name = NONE_VAR_NAME
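A rough sketch of what ``_open_remote_file`` resolves to at call time, again assuming ``fsspec`` and ``s3fs`` are installed and a hypothetical bucket:

```python
import fsspec

# get_fs_token_paths maps the URL to a filesystem instance plus the
# normalized path(s); fs.open then yields a file-like object that
# h5netcdf/h5py can read directly.
fs, _, paths = fsspec.get_fs_token_paths(
    "s3://my-bucket/data.nc", mode="rb", storage_options={"anon": True}
)
f = fs.open(paths[0], mode="rb")
```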
12 changes: 11 additions & 1 deletion xarray/backends/h5netcdf_.py
@@ -13,6 +13,7 @@
BackendEntrypoint,
WritableCFDataStore,
_normalize_path,
_open_remote_file,
datatree_from_dict_with_io_cleanup,
find_root_and_group,
)
@@ -149,9 +150,16 @@ def open(
decode_vlen_strings=True,
driver=None,
driver_kwds=None,
storage_options: dict[str, Any] | None = None,
):
import h5netcdf

if isinstance(filename, str) and is_remote_uri(filename) and driver is None:
mode_ = "rb" if mode == "r" else mode
filename = _open_remote_file(
filename, mode=mode_, storage_options=storage_options
)

if isinstance(filename, bytes):
raise ValueError(
"can't open netCDF4/HDF5 as bytes "
@@ -161,7 +169,7 @@
magic_number = read_magic_number_from_file(filename)
if not magic_number.startswith(b"\211HDF\r\n\032\n"):
raise ValueError(
f"{magic_number} is not the signature of a valid netCDF4 file"
f"{magic_number!r} is not the signature of a valid netCDF4 file"
)

if format not in [None, "NETCDF4"]:
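The ``!r`` added above renders the offending bytes with ``repr`` in the error message (sidestepping a ``BytesWarning`` when Python runs with ``-b``). The signature being checked is HDF5's fixed 8-byte magic number; a standalone illustration with a hypothetical local file:

```python
# Every valid HDF5 file (and hence every netCDF4 file) begins with the
# 8-byte signature b"\211HDF\r\n\032\n", i.e. b"\x89HDF\r\n\x1a\n".
with open("data.nc", "rb") as f:
    magic_number = f.read(8)
assert magic_number.startswith(b"\211HDF\r\n\032\n")
```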
@@ -425,6 +433,7 @@ def open_dataset(
decode_vlen_strings=True,
driver=None,
driver_kwds=None,
storage_options: dict[str, Any] | None = None,
) -> Dataset:
filename_or_obj = _normalize_path(filename_or_obj)
store = H5NetCDFStore.open(
@@ -437,6 +446,7 @@
decode_vlen_strings=decode_vlen_strings,
driver=driver,
driver_kwds=driver_kwds,
storage_options=storage_options,
)

store_entrypoint = StoreBackendEntrypoint()
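For comparison, the route available before this PR was to open the file object yourself and hand it to xarray; the new ``storage_options`` keyword folds that step into the backend (URL hypothetical):

```python
import fsspec
import xarray as xr

# Manual workaround: open the remote file with fsspec and pass the
# resulting file-like object to open_dataset. Storage options go to
# fsspec.open as plain keyword arguments here.
with fsspec.open("s3://my-bucket/data.nc", mode="rb", anon=True) as f:
    ds = xr.open_dataset(f, engine="h5netcdf")
```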
21 changes: 21 additions & 0 deletions xarray/tests/test_backends.py
@@ -6489,3 +6489,24 @@ def test_zarr_safe_chunk_region(tmp_path):
chunk = ds.isel(region)
chunk = chunk.chunk()
chunk.chunk().to_zarr(store, region=region)


@requires_h5netcdf
@requires_fsspec
def test_h5netcdf_storage_options() -> None:
with create_tmp_files(2, allow_cleanup_failure=ON_WINDOWS) as (f1, f2):
ds1 = create_test_data()
ds1.to_netcdf(f1, engine="h5netcdf")

ds2 = create_test_data()
ds2.to_netcdf(f2, engine="h5netcdf")

files = [f"file://{f}" for f in [f1, f2]]
ds = xr.open_mfdataset(
files,
engine="h5netcdf",
concat_dim="time",
combine="nested",
storage_options={"skip_instance_cache": False},
)
assert_identical(xr.concat([ds1, ds2], dim="time"), ds)
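The test stays network-free by prefixing local temp files with ``file://``, which fsspec resolves with its local filesystem; a quick sketch of that assumption (path hypothetical):

```python
import fsspec

# file:// URLs map to fsspec's LocalFileSystem, so the new remote-file
# code path is exercised without any network access.
fs, _, paths = fsspec.get_fs_token_paths("file:///tmp/data.nc", mode="rb")
print(type(fs).__name__)  # LocalFileSystem
```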