From 3a6f5438ce06b0d70860a998767143cc3fe56dae Mon Sep 17 00:00:00 2001
From: euronion <42553970+euronion@users.noreply.github.com>
Date: Wed, 5 Apr 2023 11:08:53 +0200
Subject: [PATCH] Enable download of large cutouts from ERA5 via cdsapi (#236)
 and address xarray encoding bug

* Update era5.py

* Address memory errors for writing large cutouts.

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Add month being retrieved to informative output during cutout.prepare().

* Add note on how to compress cutouts from terminal

* Update doc-string for monthly retrieval

* Update RELEASE_NOTES.rst

* update release notes

* era5: ensure correct info print out format for data retrieval

* Update doc to refer to new compression feature.

* Fix nan issue with encoding in xarray/netcdf

* Use nicer Python syntax

* Increase default compression level

* Address bug in xarray encoding for ERA5 data

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Fabian
---
 RELEASE_NOTES.rst            | 11 +++++++++-
 atlite/data.py               | 16 ++++++++------
 atlite/datasets/era5.py      | 39 ++++++++++++++++++++++------------
 examples/create_cutout.ipynb | 41 +++++++++++++++++++++++++++++++-----
 4 files changed, 82 insertions(+), 25 deletions(-)

diff --git a/RELEASE_NOTES.rst b/RELEASE_NOTES.rst
index 766a8a33..5669e6b8 100644
--- a/RELEASE_NOTES.rst
+++ b/RELEASE_NOTES.rst
@@ -16,10 +16,19 @@ Upcoming Release
 * Added 1-axis vertical and 2-axis tracking option for solar pv and trigon_model = "simple"
 * Added small documentation for get_windturbineconfig
 * The deprecated functions `grid_cells` and `grid_coordinates` were removed.
-* Feature: Cutouts are now compressed during the `.prepare(...)` stepusing the native compression feature of netCDF files.
+* Feature: Cutouts are now compressed differently during the `.prepare(...)` step using the native compression feature of netCDF files.
+  This increases the time to build a cutout but should reduce cutout file sizes.
   Existing cutouts are not affected. To also compress existing cutouts, load and save them
   using `xarray` with compression specified, see `the xarray documentation
   <https://docs.xarray.dev/en/stable/generated/xarray.Dataset.to_netcdf.html>`_ for details.
+* Feature: Cutouts from `ERA5` are now downloaded for each month rather than for each year.
+  This allows for spatially larger cutouts (worldwide) which previously exceeded the maximum
+  download size from ERA5.
+* Doc: A subsection on how to reduce `cutout` sizes has been added to the documentation.
+* Bug notice: A bug in one of the `atlite` package dependencies (`xarray`) can lead to `nan` values when using `atlite`.
+  A workaround is implemented in `atlite` which reduces the performance when building cutouts, especially for ERA5 cutouts.
+  The `nan` values in `cutouts` which are affected by the bug cannot be recovered, and the `cutout` needs to be downloaded again.
+  For more details on the bug, see the `xarray issue tracker <https://github.com/pydata/xarray/issues/7691>`_.

 Version 0.2.10
 ==============

diff --git a/atlite/data.py b/atlite/data.py
index dc73a873..43420d4d 100644
--- a/atlite/data.py
+++ b/atlite/data.py
@@ -115,7 +115,7 @@ def cutout_prepare(
     features=None,
     tmpdir=None,
     overwrite=False,
-    compression={"zlib": True, "complevel": 4},
+    compression={"zlib": True, "complevel": 9, "shuffle": True},
 ):
     """
     Prepare all or a selection of features in a cutout.
@@ -144,9 +144,10 @@
     compression : None/dict, optional
         Compression level to use for all features which are being prepared.
        The compression is handled via xarray.Dataset.to_netcdf(...), for details see:
-        https://docs.xarray.dev/en/stable/generated/xarray.Dataset.to_netcdf.html
-        To disable compression, set to None. As a trade-off between speed and
-        compression, the default is {'zlib': True, 'complevel': 4}.
+        https://docs.xarray.dev/en/stable/generated/xarray.Dataset.to_netcdf.html .
+        To efficiently reduce cutout sizes, additionally specify 'least_significant_digit': n here.
+        To disable compression, set "complevel" to None.
+        Default is {'zlib': True, 'complevel': 9, 'shuffle': True}.

     Returns
     -------
@@ -194,9 +195,12 @@
     fd, tmp = mkstemp(suffix=filename, dir=directory)
     os.close(fd)

+    logger.debug("Writing cutout to file...")
+    # Delayed writing for large cutouts,
+    # cf. https://stackoverflow.com/questions/69810367/python-how-to-write-large-netcdf-with-xarray
+    write_job = ds.to_netcdf(tmp, compute=False)
     with ProgressBar():
-        ds.to_netcdf(tmp)
-
+        write_job.compute()
     if cutout.path.exists():
         cutout.data.close()
         cutout.path.unlink()

diff --git a/atlite/datasets/era5.py b/atlite/datasets/era5.py
index 4c06b432..7e4b3ef5 100644
--- a/atlite/datasets/era5.py
+++ b/atlite/datasets/era5.py
@@ -252,9 +252,10 @@ def retrieval_times(coords, static=False):
     """
     Get list of retrieval cdsapi arguments for time dimension in coordinates.

-    If static is False, this function creates a query for each year in the
-    time axis in coords. This ensures not running into query limits of the
-    cdsapi. If static is True, the function return only one set of parameters
+    If static is False, this function creates a query for each month and year
+    in the time axis in coords. This avoids running into the query size limits
+    of the cdsapi, even with (spatially) very large cutouts.
+    If static is True, the function returns only one set of parameters
     for the very first time point.
    Parameters

@@ -274,16 +275,18 @@
             "time": time[0].strftime("%H:00"),
         }

+    # Prepare requests for all months and years
     times = []
     for year in time.year.unique():
         t = time[time.year == year]
-        query = {
-            "year": str(year),
-            "month": list(t.month.unique()),
-            "day": list(t.day.unique()),
-            "time": ["%02d:00" % h for h in t.hour.unique()],
-        }
-        times.append(query)
+        for month in t.month.unique():
+            query = {
+                "year": str(year),
+                "month": str(month),
+                "day": list(t[t.month == month].day.unique()),
+                "time": ["%02d:00" % h for h in t[t.month == month].hour.unique()],
+            }
+            times.append(query)
     return times

@@ -324,10 +327,11 @@ def retrieve_data(product, chunks=None, tmpdir=None, lock=None, **updates):
     fd, target = mkstemp(suffix=".nc", dir=tmpdir)
     os.close(fd)

-    yearstr = ", ".join(atleast_1d(request["year"]))
+    # Inform the user about the data being downloaded as "variable (year-month)"
+    timestr = f"{request['year']}-{request['month']}"
     variables = atleast_1d(request["variable"])
-    varstr = "".join(["\t * " + v + f" ({yearstr})\n" for v in variables])
-    logger.info(f"CDS: Downloading variables\n{varstr}")
+    varstr = "\n\t".join([f"{v} ({timestr})" for v in variables])
+    logger.info(f"CDS: Downloading variables\n\t{varstr}\n")
     result.download(target)

     ds = xr.open_dataset(target, chunks=chunks or {})
@@ -335,6 +339,15 @@
         logger.debug(f"Adding finalizer for {target}")
         weakref.finalize(ds._file_obj._manager, noisy_unlink, target)

+    # Remove the default encoding we get from the CDSAPI, which can lead to NaN values after loading
+    # and subsequently saving, due to how xarray handles netcdf compression (only floats encoded as short ints seem affected).
+    # Fixes the issue by keeping "float32" encoded as "float32" instead of internally saving it as "short int", see:
+    # https://stackoverflow.com/questions/75755441/why-does-saving-to-netcdf-without-encoding-change-some-values-to-nan
+    # Hopefully fixed upstream soon (this workaround could then be removed), see https://github.com/pydata/xarray/issues/7691
+    for v in ds.data_vars:
+        if ds[v].encoding["dtype"] == "int16":
+            ds[v].encoding.clear()
+
     return ds

diff --git a/examples/create_cutout.ipynb b/examples/create_cutout.ipynb
index 256af8db..239fefb5 100644
--- a/examples/create_cutout.ipynb
+++ b/examples/create_cutout.ipynb
@@ -1456,11 +1456,9 @@
    "plotting functionality from `xarray` to plot features from\n",
    "the cutout's data.\n",
    "\n",
-    "<div class=\"alert alert-warning\">\n",
\n", - "\n", - "**Warning:** This will trigger `xarray` to load all the corresponding data from disk into memory!\n", - "\n", - "
" + "> **Warning**\n", + "> This will trigger `xarray` to load all the corresponding data from disk into memory!\n", + "\n" ] }, { @@ -1469,6 +1467,39 @@ "source": [ "Now that your cutout is created and prepared, you can call conversion functions as `cutout.pv` or `cutout.wind`. Note that this requires a bit more information, like what kind of pv panels to use, where do they stand etc. Please have a look at the other examples to get a picture of application cases." ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Reducing Cutout file sizes\n", + "\n", + "Cutouts can become quite large, depending on the spatial and temporal scope they cover.\n", + "By default `atlite` uses a trade-off between speed and compression to reduce the file size of cutouts.\n", + "\n", + "Stronger compression can be selected when creating a new cutout by choosing a higher `complevel` (`1` to `9`, default: `4`)\n", + "```\n", + "cutout.prepare(compression={\"zlib\": True, \"complevel\": 9})\n", + "```\n", + "\n", + "To change the compression for an existing cutout:\n", + "```\n", + "cutout = atlite.Cutout(\"cutout-path.nc\")\n", + "\n", + "compression = {\"zlib\": True, \"complevel\": 9}\n", + "for var in cutout.data.data_vars:\n", + " cutout.data[var].encoding.update(compression)\n", + "\n", + "cutout.to_file()\n", + "```\n", + "For details and more arguments for `compression`, see the [xarray documentation](https://docs.xarray.dev/en/stable/generated/xarray.Dataset.to_netcdf.html) for details.\n", + "\n", + "Alternatively a cutout can also be compressed by using the `netcdf` utility `nccopy` from the commandline:\n", + "\n", + "```\n", + "nccopy -d4 -s \n", + "```" + ] } ], "metadata": {