From 3a6f5438ce06b0d70860a998767143cc3fe56dae Mon Sep 17 00:00:00 2001
From: euronion <42553970+euronion@users.noreply.github.com>
Date: Wed, 5 Apr 2023 11:08:53 +0200
Subject: [PATCH] Enable download of large cutouts from ERA5 via cdsapi (#236)
 and address xarray encoding bug

* Update era5.py

* Address memory errors for writing large cutouts.

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Add month being retrieved to informative output during cutout.prepare().

* Add note on how to compress cutouts from terminal

* Update doc-string for monthly retrieval

* Update RELEASE_NOTES.rst

* update release notes

* era5: ensure correct info print out format for data retrieval

* Update doc to refer to new compression feature.

* Fix nan issue with encoding in xarray/netcdf

* Use nicer Python syntax

* Increase default compression level

* Address bug in xarray encoding for ERA5 data

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Fabian
---
 RELEASE_NOTES.rst            | 11 +++++++++-
 atlite/data.py               | 16 ++++++++------
 atlite/datasets/era5.py      | 39 ++++++++++++++++++++++------------
 examples/create_cutout.ipynb | 41 +++++++++++++++++++++++++++++++-----
 4 files changed, 82 insertions(+), 25 deletions(-)

diff --git a/RELEASE_NOTES.rst b/RELEASE_NOTES.rst
index 766a8a33..5669e6b8 100644
--- a/RELEASE_NOTES.rst
+++ b/RELEASE_NOTES.rst
@@ -16,10 +16,19 @@ Upcoming Release
 * Added 1-axis vertical and 2-axis tracking option for solar pv and trigon_model = "simple"
 * Added small documentation for get_windturbineconfig
 * The deprecated functions `grid_cells` and `grid_coordinates` were removed.
-* Feature: Cutouts are now compressed during the `.prepare(...)` stepusing the native compression feature of netCDF files.
+* Feature: Cutouts are now compressed differently during the `.prepare(...)` step using the native compression feature of netCDF files.
+  This increases the time to build a cutout but should reduce cutout file sizes.
   Existing cutouts are not affected. To also compress existing cutouts, load and save them
   using `xarray` with compression specified, see `the xarray documentation
   <https://docs.xarray.dev/en/stable/generated/xarray.Dataset.to_netcdf.html>`_ for details.
+* Feature: Cutouts from `ERA5` are now downloaded for each month rather than for each year.
+  This allows for spatially larger cutouts (worldwide) which previously exceeded the maximum
+  download size from ERA5.
+* Doc: A subsection on how to reduce `cutout` sizes has been added to the documentation.
+* Bug notice: A bug in one of the `atlite` package dependencies (`xarray`) can lead to `nan` values when using `atlite`.
+  A workaround is implemented in `atlite` which reduces the performance when building cutouts, especially for ERA5 cutouts.
+  The `nan` values in `cutouts` which are affected by the bug cannot be recovered, and the `cutout` needs to be downloaded again.
+  For more details on the bug, see the `xarray issue tracker <https://github.com/pydata/xarray/issues/7691>`_.

 Version 0.2.10
 ==============

diff --git a/atlite/data.py b/atlite/data.py
index dc73a873..43420d4d 100644
--- a/atlite/data.py
+++ b/atlite/data.py
@@ -115,7 +115,7 @@ def cutout_prepare(
     features=None,
     tmpdir=None,
     overwrite=False,
-    compression={"zlib": True, "complevel": 4},
+    compression={"zlib": True, "complevel": 9, "shuffle": True},
 ):
     """
     Prepare all or a selection of features in a cutout.
@@ -144,9 +144,10 @@
     compression : None/dict, optional
         Compression level to use for all features which are being prepared.
        The compression is handled via xarray.Dataset.to_netcdf(...), for details see:
-        https://docs.xarray.dev/en/stable/generated/xarray.Dataset.to_netcdf.html
-        To disable compression, set to None. As a trade-off between speed and
-        compression, the default is {'zlib': True, 'complevel': 4}.
+        https://docs.xarray.dev/en/stable/generated/xarray.Dataset.to_netcdf.html .
+        To efficiently reduce cutout sizes, additionally specify 'least_significant_digit': n here.
+        To disable compression, set "complevel" to None.
+        Default is {'zlib': True, 'complevel': 9, 'shuffle': True}.

     Returns
     -------
@@ -194,9 +195,12 @@
     fd, tmp = mkstemp(suffix=filename, dir=directory)
     os.close(fd)

+    logger.debug("Writing cutout to file...")
+    # Delayed writing for large cutouts,
+    # cf. https://stackoverflow.com/questions/69810367/python-how-to-write-large-netcdf-with-xarray
+    write_job = ds.to_netcdf(tmp, compute=False)
     with ProgressBar():
-        ds.to_netcdf(tmp)
-
+        write_job.compute()
     if cutout.path.exists():
         cutout.data.close()
         cutout.path.unlink()

diff --git a/atlite/datasets/era5.py b/atlite/datasets/era5.py
index 4c06b432..7e4b3ef5 100644
--- a/atlite/datasets/era5.py
+++ b/atlite/datasets/era5.py
@@ -252,9 +252,10 @@ def retrieval_times(coords, static=False):
     """
     Get list of retrieval cdsapi arguments for time dimension in coordinates.

-    If static is False, this function creates a query for each year in the
-    time axis in coords. This ensures not running into query limits of the
-    cdsapi. If static is True, the function return only one set of parameters
+    If static is False, this function creates a query for each month and year
+    in the time axis in coords. This avoids running into the query size limits
+    of the cdsapi, even with (spatially) very large cutouts.
+    If static is True, the function returns only one set of parameters
     for the very first time point.
    Parameters

@@ -274,16 +275,18 @@
             "time": time[0].strftime("%H:00"),
         }

+    # Prepare requests for all months and years
     times = []
     for year in time.year.unique():
         t = time[time.year == year]
-        query = {
-            "year": str(year),
-            "month": list(t.month.unique()),
-            "day": list(t.day.unique()),
-            "time": ["%02d:00" % h for h in t.hour.unique()],
-        }
-        times.append(query)
+        for month in t.month.unique():
+            query = {
+                "year": str(year),
+                "month": str(month),
+                "day": list(t[t.month == month].day.unique()),
+                "time": ["%02d:00" % h for h in t[t.month == month].hour.unique()],
+            }
+            times.append(query)
     return times

@@ -324,10 +327,11 @@ def retrieve_data(product, chunks=None, tmpdir=None, lock=None, **updates):
     fd, target = mkstemp(suffix=".nc", dir=tmpdir)
     os.close(fd)

-    yearstr = ", ".join(atleast_1d(request["year"]))
+    # Inform the user about the data being downloaded as "variable (year-month)"
+    timestr = f"{request['year']}-{request['month']}"
     variables = atleast_1d(request["variable"])
-    varstr = "".join(["\t * " + v + f" ({yearstr})\n" for v in variables])
-    logger.info(f"CDS: Downloading variables\n{varstr}")
+    varstr = "\n\t".join([f"{v} ({timestr})" for v in variables])
+    logger.info(f"CDS: Downloading variables\n\t{varstr}\n")
     result.download(target)

     ds = xr.open_dataset(target, chunks=chunks or {})
@@ -335,6 +339,15 @@
         logger.debug(f"Adding finalizer for {target}")
         weakref.finalize(ds._file_obj._manager, noisy_unlink, target)

+    # Remove the default encoding we get from the CDSAPI, which can lead to NaN values after loading
+    # and subsequently saving, due to how xarray handles netcdf compression (only floats encoded as short ints seem affected).
+    # Fixes the issue by keeping "float32" encoded as "float32" instead of internally saving it as "short int", see:
+    # https://stackoverflow.com/questions/75755441/why-does-saving-to-netcdf-without-encoding-change-some-values-to-nan
+    # Hopefully fixed upstream soon (this workaround could then be removed), see https://github.com/pydata/xarray/issues/7691
+    for v in ds.data_vars:
+        if ds[v].encoding["dtype"] == "int16":
+            ds[v].encoding.clear()
+
     return ds

diff --git a/examples/create_cutout.ipynb b/examples/create_cutout.ipynb
index 256af8db..239fefb5 100644
--- a/examples/create_cutout.ipynb
+++ b/examples/create_cutout.ipynb
@@ -1456,11 +1456,9 @@
    "plotting functionality from `xarray` to plot features from\n",
    "the cutout's data.\n",
    "\n",
-    "<div class=\"alert alert-warning\">\n",
\n", - "\n", - "**Warning:** This will trigger `xarray` to load all the corresponding data from disk into memory!\n", - "\n", - "
" + "> **Warning**\n", + "> This will trigger `xarray` to load all the corresponding data from disk into memory!\n", + "\n" ] }, { @@ -1469,6 +1467,39 @@ "source": [ "Now that your cutout is created and prepared, you can call conversion functions as `cutout.pv` or `cutout.wind`. Note that this requires a bit more information, like what kind of pv panels to use, where do they stand etc. Please have a look at the other examples to get a picture of application cases." ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Reducing Cutout file sizes\n", + "\n", + "Cutouts can become quite large, depending on the spatial and temporal scope they cover.\n", + "By default `atlite` uses a trade-off between speed and compression to reduce the file size of cutouts.\n", + "\n", + "Stronger compression can be selected when creating a new cutout by choosing a higher `complevel` (`1` to `9`, default: `4`)\n", + "```\n", + "cutout.prepare(compression={\"zlib\": True, \"complevel\": 9})\n", + "```\n", + "\n", + "To change the compression for an existing cutout:\n", + "```\n", + "cutout = atlite.Cutout(\"cutout-path.nc\")\n", + "\n", + "compression = {\"zlib\": True, \"complevel\": 9}\n", + "for var in cutout.data.data_vars:\n", + " cutout.data[var].encoding.update(compression)\n", + "\n", + "cutout.to_file()\n", + "```\n", + "For details and more arguments for `compression`, see the [xarray documentation](https://docs.xarray.dev/en/stable/generated/xarray.Dataset.to_netcdf.html) for details.\n", + "\n", + "Alternatively a cutout can also be compressed by using the `netcdf` utility `nccopy` from the commandline:\n", + "\n", + "```\n", + "nccopy -d4 -s \n", + "```" + ] } ], "metadata": {