Enable download of large (spatial extent) cutouts from ERA5 via cdsapi. #236

Merged (19 commits) on Apr 5, 2023
Changes from 11 commits
4 changes: 4 additions & 0 deletions RELEASE_NOTES.rst
@@ -18,6 +18,10 @@ Upcoming Release
This is now fixed and influx data is **always** shifted by minus 30 minutes.
See `#256 <https://github.com/PyPSA/atlite/issues/256#issuecomment-1271446531>`_ for details.
* Bugfix: The hydro inflow calculation was relying on a wrong distance calculation in `atlite.hydro.shift_and_aggregate_runoff_for_plants`. This is now fixed.
+* Feature: Cutouts from `ERA5` are now downloaded for each month rather than for each year.
+  This allows for spatially larger cutouts (worldwide) which previously exceeded the maximum
+  download size from ERA5.
+* Doc: A subsection on how to reduce `cutout` sizes has been added to the documentation.

Version 0.2.9
=============
7 changes: 5 additions & 2 deletions atlite/data.py
@@ -175,9 +175,12 @@ def cutout_prepare(cutout, features=None, tmpdir=None, overwrite=False):
fd, tmp = mkstemp(suffix=filename, dir=directory)
os.close(fd)

+logger.debug("Writing cutout to file...")
+# Delayed writing for large cutout
+# cf. https://stackoverflow.com/questions/69810367/python-how-to-write-large-netcdf-with-xarray
+write_job = ds.to_netcdf(tmp, compute=False)
-with ProgressBar():
-    ds.to_netcdf(tmp)
+write_job.compute()
if cutout.path.exists():
cutout.data.close()
cutout.path.unlink()
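The delayed-write pattern introduced in this hunk can be exercised on a toy dataset. A minimal sketch, assuming `xarray`, `dask`, and a netCDF backend are installed; the dataset and variable names are illustrative, not atlite's:

```python
import os
import tempfile

import numpy as np
import xarray as xr
from dask.diagnostics import ProgressBar

# Small chunked dataset standing in for a (potentially huge) cutout.
ds = xr.Dataset(
    {"influx": (("time", "x"), np.random.rand(8, 4))},
    coords={"time": np.arange(8), "x": np.arange(4)},
).chunk({"time": 4})

fd, tmp = tempfile.mkstemp(suffix=".nc")
os.close(fd)

# compute=False returns a dask delayed object instead of writing eagerly,
# so the data is streamed to disk chunk by chunk when compute() runs.
write_job = ds.to_netcdf(tmp, compute=False)
with ProgressBar():
    write_job.compute()

roundtrip = xr.open_dataset(tmp)
```

This is the point of the change: the worldwide cutout is never fully materialized in memory before the write starts.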
30 changes: 17 additions & 13 deletions atlite/datasets/era5.py
@@ -225,9 +225,10 @@ def retrieval_times(coords, static=False):
"""
Get list of retrieval cdsapi arguments for time dimension in coordinates.

-If static is False, this function creates a query for each year in the
-time axis in coords. This ensures not running into query limits of the
-cdsapi. If static is True, the function return only one set of parameters
+If static is False, this function creates a query for each month and year
+in the time axis in coords. This ensures not running into size query limits
+of the cdsapi even with very (spatially) large cutouts.
+If static is True, the function returns only one set of parameters
for the very first time point.

Parameters
@@ -248,16 +249,18 @@
"time": time[0].strftime("%H:00"),
}

+# Prepare request for all months and years
 times = []
 for year in time.year.unique():
     t = time[time.year == year]
-    query = {
-        "year": str(year),
-        "month": list(t.month.unique()),
-        "day": list(t.day.unique()),
-        "time": ["%02d:00" % h for h in t.hour.unique()],
-    }
-    times.append(query)
+    for month in t.month.unique():
+        query = {
+            "year": str(year),
+            "month": str(month),
+            "day": list(t[t.month == month].day.unique()),
+            "time": ["%02d:00" % h for h in t[t.month == month].hour.unique()],
+        }
+        times.append(query)
return times
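The per-month splitting can be reproduced standalone with `pandas`. A sketch under the assumption of an hourly time index; the function name `monthly_queries` is illustrative, not atlite's API:

```python
import pandas as pd


def monthly_queries(time: pd.DatetimeIndex) -> list[dict]:
    """Split a time index into one cdsapi-style request per (year, month)."""
    queries = []
    for year in time.year.unique():
        t = time[time.year == year]
        for month in t.month.unique():
            tm = t[t.month == month]
            queries.append(
                {
                    "year": str(year),
                    "month": str(month),
                    "day": list(tm.day.unique()),
                    "time": ["%02d:00" % h for h in tm.hour.unique()],
                }
            )
    return queries


# A range spanning two calendar months yields two separate requests.
idx = pd.date_range("2012-12-15", "2013-01-20", freq="h")
reqs = monthly_queries(idx)
```

Issuing one request per calendar month keeps each CDS query below the API's download-size limit even for worldwide bounding boxes, which is exactly what this PR enables.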


@@ -296,10 +299,11 @@ def retrieve_data(product, chunks=None, tmpdir=None, lock=None, **updates):
fd, target = mkstemp(suffix=".nc", dir=tmpdir)
os.close(fd)

-yearstr = ", ".join(atleast_1d(request["year"]))
+# Inform user about data being downloaded as "* variable (year-month)"
+timestr = f"{request['year']}-{request['month']}"
 variables = atleast_1d(request["variable"])
-varstr = "".join(["\t * " + v + f" ({yearstr})\n" for v in variables])
-logger.info(f"CDS: Downloading variables\n{varstr}")
+varstr = "\n\t".join([f"{v} ({timestr})" for v in variables])
+logger.info(f"CDS: Downloading variables\n\t{varstr}\n")
result.download(target)

ds = xr.open_dataset(target, chunks=chunks or {})
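The effect of the new log formatting can be seen in isolation. A small sketch with a hypothetical request dict, no real CDS call involved:

```python
# Hypothetical per-(year, month) request, as assembled by the retrieval code.
request = {"year": "2013", "month": "1", "variable": ["runoff", "2m_temperature"]}

# One "variable (year-month)" entry per line, matching the new log layout.
timestr = f"{request['year']}-{request['month']}"
varstr = "\n\t".join(f"{v} ({timestr})" for v in request["variable"])
message = f"CDS: Downloading variables\n\t{varstr}\n"
print(message)
```

With monthly requests the year alone would be ambiguous, so tagging each variable with `year-month` tells the user exactly which chunk is being fetched.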
30 changes: 25 additions & 5 deletions examples/create_cutout.ipynb
@@ -1454,11 +1454,9 @@
"plotting functionality from `xarray` to plot features from\n",
"the cutout's data.\n",
"\n",
-"<div class=\"alert alert-info\">\n",
-"\n",
-"**Warning:** This will trigger `xarray` to load all the corresponding data from disk into memory!\n",
-"\n",
-"</div>"
+"> **Warning**\n",
+"> This will trigger `xarray` to load all the corresponding data from disk into memory!\n",
+"\n"
]
},
{
@@ -1467,6 +1465,28 @@
"source": [
"Now that your cutout is created and prepared, you can call conversion functions such as `cutout.pv` or `cutout.wind`. Note that these require a bit more information, like what kind of PV panels to use, where they are located, etc. Please have a look at the other examples to get a picture of the application cases."
]
 },
+{
+"cell_type": "markdown",
+"metadata": {},
+"source": [
+"## Reducing Cutout file sizes\n",
+"\n",
+"Cutouts can become quite large, depending on the spatial and temporal scope they cover.\n",
+"`atlite` does not use a specific compression method to reduce the file sizes of cutouts.\n",
+"If you wish to reduce the file size of a cutout, the `netcdf` utility `nccopy` can be \n",
+"used from the command line:\n",
+"\n",
+"```\n",
+"nccopy -d4 -s <input cutout .nc file> <output cutout .nc file>\n",
+"```\n",
+"\n",
+"This usually reduces the file size by ~50% without notably affecting the performance of `atlite`.\n",
+"Compression levels above `-d4` do not seem to offer significant advantages.\n",
+"\n",
+"The duration of the compression depends on the cutout size.\n",
+"For 10 GiB cutouts we have observed durations of ca. 15 min for the compression to finish."
+]
+}
],
"metadata": {