Add support for remote string paths to h5netcdf engine #8424

Open

wants to merge 9 commits into base: main
8 changes: 7 additions & 1 deletion xarray/backends/file_manager.py
@@ -157,11 +157,17 @@ def __init__(

    def _make_key(self):
        """Make a key for caching files in the LRU cache."""
        kwargs = self._kwargs
        # storage_options is a non-hashable dict, so we implement special logic for hashing
        if self._kwargs.get("storage_options", None) is not None:
            kwargs = self._kwargs.copy()
            kwargs["storage_options"] = tuple(sorted(kwargs["storage_options"].items()))

        value = (
            self._opener,
            self._args,
            "a" if self._mode == "w" else self._mode,
            tuple(sorted(self._kwargs.items())),
            tuple(sorted(kwargs.items())),
            self._manager_id,
        )
        return _HashedSequence(value)
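The copy above is needed because storage_options is a plain dict, which Python cannot hash and therefore cannot go directly into the LRU cache key. A minimal standalone sketch of the problem and of the sorted-tuple workaround (the option names below are arbitrary examples, not ones xarray requires):

storage_options = {"anon": True, "default_fill_cache": False}

try:
    hash(storage_options)  # dicts are mutable and therefore unhashable
except TypeError as err:
    print(err)  # unhashable type: 'dict'

# A sorted tuple of items is hashable and insensitive to key order, so it can
# participate in the cache key. This assumes the option values themselves are
# hashable; a nested dict value would still raise TypeError on hash().
key = tuple(sorted(storage_options.items()))
print(hash(key))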
24 changes: 21 additions & 3 deletions xarray/backends/h5netcdf_.py
@@ -88,6 +88,20 @@ def _h5netcdf_create_group(dataset, name):
    return dataset.create_group(name)


def _h5netcdf_opener(filename, mode, storage_options=None, **kwargs):
    import h5netcdf

    if isinstance(filename, str) and is_remote_uri(filename):
        import fsspec

        mode_ = "rb" if mode == "r" else mode
        fs, _, _ = fsspec.get_fs_token_paths(
            filename, mode=mode_, storage_options=storage_options
        )
        filename = fs.open(filename, mode=mode_)
@dcherian (Contributor) commented on Nov 8, 2023:
Not an expert in this bit, but there's a _find_absolute_paths in backends/common.py that shares a lot of code with the first three lines here. It includes a nice error message if fsspec is not installed.

Contributor:

Do we really want to unconditionally open remote urls with fsspec? This contradicts the usage of native implementations (via "driver"-kwarg, see #8360) in h5py/hdf5.
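For reference, the native route mentioned above would look roughly like this. This is only a sketch: it assumes an h5py/libhdf5 build with the ros3 virtual file driver enabled, and the URL is a placeholder.

import h5py

# Read-only remote access through HDF5's ros3 driver, bypassing fsspec entirely.
f = h5py.File("https://example.org/data.nc", mode="r", driver="ros3")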

Contributor Author:

> Do we really want to unconditionally open remote urls with fsspec?

My guess is yes by default. That's what other pydata libraries like pandas, Dask, Zarr, etc. have converged on for file handling, so it would be familiar to many users. That said, if driver= offers some benefits over fsspec (again, I'm not familiar with the new driver functionality), it'd be easy for these two approaches to live alongside each other, roughly as sketched after this list:

  • Use fsspec by default if no driver is specified.
  • If driver is specified, use that instead.
  • If there are conflicting options provided for some reason (e.g. driver= and storage_options=), then raise an informative error.
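A rough sketch of that dispatch; the function name and signature below are illustrative only, not the PR's API:

import fsspec


def _open_remote_or_local(filename, mode="r", driver=None, storage_options=None):
    """Hypothetical dispatch between fsspec and h5py's native drivers."""
    if driver is not None and storage_options is not None:
        raise ValueError("driver= and storage_options= conflict; pass only one of them")
    if driver is not None:
        # Defer to h5py's native remote support (e.g. driver='ros3'): return the
        # path untouched along with the driver kwargs for h5netcdf/h5py.
        return filename, {"driver": driver}
    if "://" in str(filename):  # crude stand-in for is_remote_uri()
        mode_ = "rb" if mode == "r" else mode
        # fsspec yields an open file-like object that h5netcdf can consume.
        return fsspec.open(filename, mode=mode_, **(storage_options or {})).open(), {}
    return filename, {}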

Contributor:

The algorithm shown here looks good to me. I also think fsspec is more widely used, although keeping the ros3 alternative is desirable too (see this).

Contributor:

These changes indeed look very similar to _find_absolute_paths; can we instead get that one working for this case as well?

def _find_absolute_paths(
    paths: str | os.PathLike | NestedSequence[str | os.PathLike], **kwargs
) -> list[str]:
    """
    Find absolute paths from the pattern.

    Parameters
    ----------
    paths :
        Path(s) to file(s). Can include wildcards like * .
    **kwargs :
        Extra kwargs. Mainly for fsspec.

    Examples
    --------
    >>> from pathlib import Path
    >>> directory = Path(xr.backends.common.__file__).parent
    >>> paths = str(Path(directory).joinpath("comm*n.py"))  # Find common with wildcard
    >>> paths = xr.backends.common._find_absolute_paths(paths)
    >>> [Path(p).name for p in paths]
    ['common.py']
    """
    if isinstance(paths, str):
        if is_remote_uri(paths) and kwargs.get("engine", None) == "zarr":
            try:
                from fsspec.core import get_fs_token_paths
            except ImportError as e:
                raise ImportError(
                    "The use of remote URLs for opening zarr requires the package fsspec"
                ) from e

            fs, _, _ = get_fs_token_paths(
                paths,
                mode="rb",
                storage_options=kwargs.get("backend_kwargs", {}).get(
                    "storage_options", {}
                ),
                expand=False,
            )
            tmp_paths = fs.glob(fs._strip_protocol(paths))  # finds directories
            paths = [fs.get_mapper(path) for path in tmp_paths]
        elif is_remote_uri(paths):
            raise ValueError(
                "cannot do wild-card matching for paths that are remote URLs "
                f"unless engine='zarr' is specified. Got paths: {paths}. "
                "Instead, supply paths as an explicit list of strings."
            )
        else:
            paths = sorted(glob(_normalize_path(paths)))
    elif isinstance(paths, os.PathLike):
        paths = [os.fspath(paths)]
    else:
        paths = [os.fspath(p) if isinstance(p, os.PathLike) else p for p in paths]

    return paths

    return h5netcdf.File(filename, mode=mode, **kwargs)


class H5NetCDFStore(WritableCFDataStore):
"""Store for reading and writing data via h5netcdf"""

@@ -140,9 +154,8 @@ def open(
        invalid_netcdf=None,
        phony_dims=None,
        decode_vlen_strings=True,
        storage_options=None,
    ):
        import h5netcdf

        if isinstance(filename, bytes):
            raise ValueError(
                "can't open netCDF4/HDF5 as bytes "
@@ -161,6 +174,7 @@
        kwargs = {
            "invalid_netcdf": invalid_netcdf,
            "decode_vlen_strings": decode_vlen_strings,
            "storage_options": storage_options,
Contributor:
If we can obtain the fsspec first, we could simplify this significantly along those lines:

# get open fsspec-handle first
if storage_options is not None:
    filename = _find_absolute_paths(filename, engine="h5netcdf", backend_kwargs=dict(storage_options=storage_options))

# other code

manager = CachingFileManager(
            h5netcdf.File, filename, mode=mode, kwargs=kwargs
        )

_find_absolute_paths would need changes to cover for this, though.

        }
        if phony_dims is not None:
            kwargs["phony_dims"] = phony_dims
@@ -171,7 +185,9 @@
        else:
            lock = combine_locks([HDF5_LOCK, get_write_lock(filename)])

        manager = CachingFileManager(h5netcdf.File, filename, mode=mode, kwargs=kwargs)
        manager = CachingFileManager(
            _h5netcdf_opener, filename, mode=mode, kwargs=kwargs
        )
        return cls(manager, group=group, mode=mode, lock=lock, autoclose=autoclose)

    def _acquire(self, needs_lock=True):
@@ -397,6 +413,7 @@ def open_dataset(  # type: ignore[override]  # allow LSP violation, not supporti
        invalid_netcdf=None,
        phony_dims=None,
        decode_vlen_strings=True,
        storage_options=None,
    ) -> Dataset:
        filename_or_obj = _normalize_path(filename_or_obj)
        store = H5NetCDFStore.open(
@@ -407,6 +424,7 @@
            invalid_netcdf=invalid_netcdf,
            phony_dims=phony_dims,
            decode_vlen_strings=decode_vlen_strings,
            storage_options=storage_options,
        )

        store_entrypoint = StoreBackendEntrypoint()
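Taken together, the changes in this file would allow opening a remote path roughly like this. A usage sketch only: the S3 URL is a placeholder, and anonymous access is just one example of an fsspec storage option.

import xarray as xr

ds = xr.open_dataset(
    "s3://some-bucket/some-file.nc",  # remote string path
    engine="h5netcdf",
    storage_options={"anon": True},  # forwarded to fsspec
)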
21 changes: 21 additions & 0 deletions xarray/tests/test_backends.py
@@ -2915,6 +2915,27 @@
assert_identical(ds, ds_a)


@requires_h5netcdf
@requires_fsspec
def test_h5netcdf_storage_options() -> None:
    with create_tmp_files(2) as (f1, f2):
        ds1 = create_test_data()
        ds1.to_netcdf(f1, engine="h5netcdf")

        ds2 = create_test_data()
        ds2.to_netcdf(f2, engine="h5netcdf")

        files = [f"file://{f}" for f in [f1, f2]]
        ds = xr.open_mfdataset(
            files,
            engine="h5netcdf",
            concat_dim="time",
            combine="nested",
            storage_options={"skip_instance_cache": False},
        )
        assert_identical(xr.concat([ds1, ds2], dim="time"), ds)


@requires_scipy
class TestScipyInMemoryData(CFEncodedBase, NetCDF3Only):
    engine: T_NetcdfEngine = "scipy"
@@ -4217,14 +4238,14 @@
assert_identical(data, on_disk)

def test_deterministic_names(self) -> None:
with create_tmp_file() as tmp:

data = create_test_data()
data.to_netcdf(tmp)
with open_mfdataset(tmp, combine="by_coords") as ds:
original_names = {k: v.data.name for k, v in ds.data_vars.items()}
with open_mfdataset(tmp, combine="by_coords") as ds:
repeat_names = {k: v.data.name for k, v in ds.data_vars.items()}
for var_name, dask_name in original_names.items():

assert var_name in dask_name
assert dask_name[:13] == "open_dataset-"
assert original_names == repeat_names
@@ -4242,7 +4263,7 @@
def test_save_mfdataset_compute_false_roundtrip(self) -> None:
from dask.delayed import Delayed

original = Dataset({"foo": ("x", np.random.randn(10))}).chunk()

datasets = [original.isel(x=slice(5)), original.isel(x=slice(5, 10))]
with create_tmp_file(allow_cleanup_failure=ON_WINDOWS) as tmp1:
with create_tmp_file(allow_cleanup_failure=ON_WINDOWS) as tmp2:
@@ -4255,7 +4276,7 @@
[tmp1, tmp2], combine="nested", concat_dim="x"
) as actual:
assert_identical(actual, original)

def test_load_dataset(self) -> None:
with create_tmp_file() as tmp:
original = Dataset({"foo": ("x", np.random.randn(10))})
@@ -4264,7 +4285,7 @@
# this would fail if we used open_dataset instead of load_dataset
ds.to_netcdf(tmp)

def test_load_dataarray(self) -> None:

with create_tmp_file() as tmp:
original = Dataset({"foo": ("x", np.random.randn(10))})
original.to_netcdf(tmp)
@@ -4272,7 +4293,7 @@
# this would fail if we used open_dataarray instead of
# load_dataarray
ds.to_netcdf(tmp)

@pytest.mark.skipif(
ON_WINDOWS,
reason="counting number of tasks in graph fails on windows for some reason",
@@ -4285,7 +4306,7 @@

def num_graph_nodes(obj):
return len(obj.__dask_graph__())

not_inlined_ds = open_dataset(tmp, inline_array=False, chunks=chunks)
inlined_ds = open_dataset(tmp, inline_array=True, chunks=chunks)
assert num_graph_nodes(inlined_ds) < num_graph_nodes(not_inlined_ds)
@@ -4332,7 +4353,7 @@

# global attributes should be global attributes on the dataset
assert "NC_GLOBAL" not in actual.attrs
assert "history" in actual.attrs


# we don't check attributes exactly with assertDatasetIdentical()
# because the test DAP server seems to insert some extra
@@ -4366,7 +4387,7 @@
actual.to_netcdf(tmp_file)
with open_dataset(tmp_file) as actual2:
actual2["bears"] = actual2["bears"].astype(str)
assert_equal(actual2, expected)


@requires_dask
def test_dask(self) -> None:
@@ -4375,7 +4396,7 @@


@network
@requires_scipy_or_netCDF4

@requires_pydap
class TestPydapOnline(TestPydap):
@contextlib.contextmanager