# Multiple Timeseries (#209)
## Description

Extends the time series utility to process multiple time series. This
allows for time series comparisons between different collections. Note
that the `time_series` section of the configuration must now be a list.
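After YAML parsing, the driver therefore iterates over `time_series` as a list of dictionaries, one entry per collection. A minimal sketch of the parsed structure (values abbreviated from the test configuration added below; illustrative, not a complete Eva config):

```python
# Illustrative parsed configuration: 'time_series' is now a list with one
# entry per collection to accumulate (fields taken from the test YAML below).
eva_dict = {
    'time_series': [
        {'begin_date': '2023-07-26T03:00:00',
         'final_date': '2023-07-26T09:00:00',
         'interval': 'PT6H',
         'collection': 'control',
         'variables': ['ObsValueMinusHofx::brightnessTemperature']},
        {'begin_date': '2023-07-26T03:00:00',
         'final_date': '2023-07-26T09:00:00',
         'interval': 'PT6H',
         'collection': 'experiment',
         'variables': ['ObsValueMinusHofx::brightnessTemperature']},
    ],
}
```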
asewnath authored Oct 9, 2024
1 parent cab65a4 commit 767125f
Showing 12 changed files with 272 additions and 122 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/yaml_coding_norms.yml
@@ -18,7 +18,7 @@ jobs:
     name: Check YAML Coding Norms
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v4
       - id: yaml-lint
         name: yaml-lint
         uses: ibiqlik/action-yamllint@v3
@@ -27,7 +27,7 @@
           format: colored
           config_file: .yamllint.yml
 
-      - uses: actions/upload-artifact@v2
+      - uses: actions/upload-artifact@v4
         if: always()
         with:
           name: yamllint-logfile
2 changes: 1 addition & 1 deletion requirements-github.txt
@@ -8,7 +8,7 @@ xarray>=2022.6.0
 seaborn>=0.12.2
 hvplot>=0.8.2
 nbconvert>=6.5.4
-bokeh<3.5.0,>=3.4.0
+bokeh<3.6.0,>=3.5.0
 geopandas>=0.13.2
 geoviews>=1.10.0
 nbsite
22 changes: 11 additions & 11 deletions src/eva/data/data_collections.py
@@ -65,9 +65,9 @@ def create_or_add_to_collection(self, collection_name, collection, concat_dimens
         """
 
         # If time_series collection name must also be time_series
-        if self.time_series and collection_name != 'time_series':
-            self.logger.abort('In create_or_add_to_collection: time_series collection must ' +
-                              'be \'time_series\'')
+        if self.time_series and 'time_series' not in collection_name:
+            self.logger.abort('In create_or_add_to_collection: time_series collection must ' +
+                              'have name containing \'time_series\'')
 
         # Collections should only be xarray datasets
         if not isinstance(collection, Dataset):
@@ -158,9 +158,9 @@ def add_variable_to_collection(self, collection_name, group_name, variable_name,
         """
 
         # If time_series collection name must also be time_series
-        if self.time_series and collection_name != 'time_series':
-            self.logger.abort('In add_variable_to_collection: time_series collection must ' +
-                              'be \'time_series\'')
+        if self.time_series and 'time_series' not in collection_name:
+            self.logger.abort('In add_variable_to_collection: time_series collection must ' +
+                              'have name containing \'time_series\'')
 
         # Assert that new variable is an xarray Dataarray
         if not isinstance(variable, DataArray):
@@ -211,9 +211,9 @@ def get_variable_data_array(self, collection_name, group_name, variable_name,
         """
 
         # If time_series collection name must also be time_series
-        if self.time_series and collection_name != 'time_series':
-            self.logger.abort('In get_variable_data_array: time_series collection must ' +
-                              'be \'time_series\'')
+        if self.time_series and 'time_series' not in collection_name:
+            self.logger.abort('In get_variable_data_array: time_series collection must ' +
+                              'have name containing \'time_series\'')
 
         group_variable_name = group_name + '::' + variable_name
         data_array = self._collections[collection_name][group_variable_name]
@@ -293,9 +293,9 @@ def get_variable_data(self, collection_name, group_name, variable_name,
         """
 
         # If time_series collection name must also be time_series
-        if self.time_series and collection_name != 'time_series':
+        if self.time_series and 'time_series' not in collection_name:
             self.logger.abort('In get_variable_data: time_series collection must ' +
-                              'be \'time_series\'')
+                              'have name containing \'time_series\'')
 
         variable_array = self.get_variable_data_array(collection_name, group_name, variable_name,
                                                       channels, levels, datatypes)
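The four edits above apply the same relaxation in each accessor: in time series mode a collection's name now only has to contain `'time_series'` rather than equal it, which is what permits per-collection names such as `control_time_series` and `experiment_time_series`. A standalone sketch of the relaxed check (a hypothetical helper, not the `DataCollections` API itself):

```python
def check_time_series_name(collection_name: str, time_series: bool) -> None:
    # Relaxed rule: the name must contain, not equal, 'time_series'
    if time_series and 'time_series' not in collection_name:
        raise ValueError(f"time_series collection '{collection_name}' must "
                         "have a name containing 'time_series'")

# 'experiment_time_series' passes the new substring check; under the old
# equality check only the literal name 'time_series' was accepted.
check_time_series_name('experiment_time_series', time_series=True)
```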
173 changes: 96 additions & 77 deletions src/eva/eva_driver.py
@@ -12,6 +12,7 @@
 from datetime import datetime
 import argparse
 import os
+from collections import defaultdict
 
 from eva.utilities.config import get
 from eva.utilities.logger import Logger
@@ -94,85 +95,103 @@ def read_transform_time_series(logger, timing, eva_dict, data_collections):
     None
     """
 
-    # Check for required keys
-    # -----------------------
-    required_keys = [
-        'begin_date',
-        'final_date',
-        'interval',
-        'collection',
-        'variables',
-    ]
-    for key in required_keys:
-        logger.assert_abort(key in eva_dict['time_series'], 'If running Eva in time series ' +
-                            f'mode the time series config must contain "{key}"')
-
-    # Write message that this is a time series run
-    logger.info('This instance of Eva is being used to accumulate a time series.')
-
-    # Optionally suppress the display of the collection
-    suppress_collection_display = get(eva_dict, logger, 'suppress_collection_display', False)
-
-    # Get the datasets configuration
-    time_series_config = eva_dict['time_series']
-
-    # Extract the dates of the time series
-    begin_date = time_series_config['begin_date']
-    final_date = time_series_config['final_date']
-    interval = time_series_config['interval']
-
-    # Convert begin and end dates from ISO strings to datetime objects
-    begin_date = datetime.fromisoformat(begin_date)
-    final_date = datetime.fromisoformat(final_date)
-
-    # Convert interval ISO string to timedelta object
-    interval = iso_duration_to_timedelta(logger, interval)
-
-    # Make list of dates from begin to end with interval
-    dates = []
-    date = begin_date
-    count = 0
-    while date <= final_date:
-        dates.append(date)
-        date += interval
-        count += 1
-        # Abort if count hits one million
-        logger.assert_abort(count < 1e6, 'You are planning to read more than one million ' +
-                            'time steps. This is likely an error. Please check your configuration.')
-
-    # Get the datasets configuration
-    datasets_config = get(eva_dict, logger, 'datasets')
-
-    # Assert that datasets_config is the same length as dates
-    logger.assert_abort(len(datasets_config) == len(dates), 'When running in time series mode ' +
-                        'the number of datasets must be the same as the number of dates.')
-
-    # Loop over datasets reading each one in turn, internally appending the data_collections
-    for ind, dataset_config in enumerate(datasets_config):
-
-        # Create a temporary collection for this time step
-        data_collections_tmp = DataCollections()
-
-        # Prepare diagnostic data
-        logger.info('Running data driver')
-        timing.start('DataDriverExecute')
-        data_driver(dataset_config, data_collections_tmp, timing, logger)
-        timing.stop('DataDriverExecute')
-
-        # Perform any transforms on the fly
-        if 'transforms' in eva_dict:
-            logger.info(f'Running transform driver')
-            timing.start('TransformDriverExecute')
-            transform_driver(eva_dict, data_collections_tmp, timing, logger)
-            timing.stop('TransformDriverExecute')
-
-        # Collapse data into time series
-        collapse_collection_to_time_series(logger, ind, dates, time_series_config, data_collections,
-                                           data_collections_tmp)
-
-    if not suppress_collection_display:
-        logger.info('Computing of Eva time series complete: status of collection:')
-        data_collections.display_collections()
+    # Iterate through list of time series dictionaries
+    for time_series_config in eva_dict['time_series']:
+
+        # Check for required keys
+        # -----------------------
+        required_keys = [
+            'begin_date',
+            'final_date',
+            'interval',
+            'collection',
+            'variables',
+        ]
+        for key in required_keys:
+            logger.assert_abort(key in time_series_config, 'If running Eva in time series ' +
+                                f'mode the time series config must contain "{key}"')
+
+        # Write message that this is a time series run
+        logger.info('This instance of Eva is being used to accumulate a time series.')
+
+        # Optionally suppress the display of the collection
+        suppress_collection_display = get(eva_dict, logger, 'suppress_collection_display', False)
+
+        # Extract the dates of the time series
+        begin_date = time_series_config['begin_date']
+        final_date = time_series_config['final_date']
+        interval = time_series_config['interval']
+
+        # Convert begin and end dates from ISO strings to datetime objects
+        begin_date = datetime.fromisoformat(begin_date)
+        final_date = datetime.fromisoformat(final_date)
+
+        # Convert interval ISO string to timedelta object
+        interval = iso_duration_to_timedelta(logger, interval)
+
+        # Make list of dates from begin to end with interval
+        dates = []
+        date = begin_date
+        count = 0
+        while date <= final_date:
+            dates.append(date)
+            date += interval
+            count += 1
+            # Abort if count hits one million
+            logger.assert_abort(count < 1e6, 'You are planning to read more than one million ' +
+                                'time steps. This is likely an error. Please check your ' +
+                                'configuration.')
+
+        # Get all datasets configuration
+        all_datasets = get(eva_dict, logger, 'datasets')
+
+        # Find all dataset_configs with collection name
+        datasets_config = []
+        for dataset in all_datasets:
+            if dataset['name'] == time_series_config["collection"]:
+                datasets_config.append(dataset)
+
+        # Save transforms to transform_dict based on collection
+        transform_dict = defaultdict(list)
+        if 'transforms' in eva_dict:
+            for transform in get(eva_dict, logger, 'transforms'):
+                # Get collection name
+                name = transform['new name'].split('::')[0]
+                if name == time_series_config['collection']:
+                    transform_dict['transforms'].append(transform)
+
+        # Assert that datasets_config is the same length as dates
+        logger.assert_abort(len(datasets_config) == len(dates), 'When running in time ' +
+                            'series mode the number of datasets must be the same as the ' +
+                            'number of dates.')
+
+        # Loop over datasets reading each one in turn, internally appending the data_collections
+        for ind, dataset_config in enumerate(datasets_config):
+
+            # Create a temporary collection for this time step
+            data_collections_tmp = DataCollections()
+
+            # Prepare diagnostic data
+            logger.info('Running data driver')
+            timing.start('DataDriverExecute')
+            data_driver(dataset_config, data_collections_tmp, timing, logger)
+            timing.stop('DataDriverExecute')
+
+            # Perform any transforms on the fly
+            if transform_dict:
+                logger.info(f'Running transform driver')
+                timing.start('TransformDriverExecute')
+                transform_driver(transform_dict, data_collections_tmp, timing, logger)
+                timing.stop('TransformDriverExecute')
+
+            # Collapse data into time series
+            date = dates[ind]
+            collapse_collection_to_time_series(logger, ind, date, time_series_config,
+                                               data_collections, data_collections_tmp)
+
+        if not suppress_collection_display:
+            logger.info('Computing of Eva time series complete: status of collection:')
+            data_collections.display_collections()
 
 
 # --------------------------------------------------------------------------------------------------
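The heart of the change is the restructured loop above: rather than reading one global `time_series` block, the driver now iterates over a list of time series configs and, for each one, keeps only the datasets and the transforms whose collection matches. A standalone sketch of that filtering step, with illustrative dictionaries rather than the real Eva objects:

```python
from collections import defaultdict

# Illustrative parsed YAML: two dataset entries per collection, one transform each.
eva_dict = {
    'datasets': [{'name': 'control'}, {'name': 'control'},
                 {'name': 'experiment'}, {'name': 'experiment'}],
    'transforms': [
        {'transform': 'arithmetic', 'new name': 'control::ObsValueMinusHofx::${variable}'},
        {'transform': 'arithmetic', 'new name': 'experiment::ObsValueMinusHofx::${variable}'},
    ],
    'time_series': [{'collection': 'control'}, {'collection': 'experiment'}],
}

for time_series_config in eva_dict['time_series']:
    collection = time_series_config['collection']

    # Keep only the datasets belonging to this time series' collection
    datasets_config = [d for d in eva_dict['datasets'] if d['name'] == collection]

    # Keep only the transforms whose output collection (text before '::') matches
    transform_dict = defaultdict(list)
    for transform in eva_dict.get('transforms', []):
        if transform['new name'].split('::')[0] == collection:
            transform_dict['transforms'].append(transform)

    print(collection, len(datasets_config), len(transform_dict['transforms']))
    # -> control 2 1, then experiment 2 1
```

Passing `transform_dict` (rather than the full `eva_dict`) to `transform_driver` is what confines each transform to its own collection's temporary data.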
119 changes: 119 additions & 0 deletions src/eva/tests/config/testIodaObsSpaceAmsuaN19_Multiple_TimeSeries.yaml
@@ -0,0 +1,119 @@
suppress_collection_display: false
datasets:
  - name: control
    type: IodaObsSpace
    filenames:
      - ${data_input_path}/ctrl_amsua_n19.20230726T030000Z.nc4
    channels: 3,8
    groups:
      - name: ObsValue
        variables:
          - brightnessTemperature
      - name: GsiHofXBc
      - name: hofx0
      - name: MetaData
      - name: oman
  - name: control
    type: IodaObsSpace
    filenames:
      - ${data_input_path}/ctrl_amsua_n19.20230726T090000Z.nc4
    channels: 3,8
    groups:
      - name: ObsValue
        variables:
          - brightnessTemperature
      - name: GsiHofXBc
      - name: hofx0
      - name: MetaData
      - name: oman
  - name: experiment
    type: IodaObsSpace
    filenames:
      - ${data_input_path}/exp_amsua_n19.20230726T030000Z.nc4
    channels: 3,8
    groups:
      - name: ObsValue
        variables:
          - brightnessTemperature
      - name: GsiHofXBc
      - name: hofx0
      - name: MetaData
      - name: oman
  - name: experiment
    type: IodaObsSpace
    filenames:
      - ${data_input_path}/exp_amsua_n19.20230726T090000Z.nc4
    channels: 3,8
    groups:
      - name: ObsValue
        variables:
          - brightnessTemperature
      - name: GsiHofXBc
      - name: hofx0
      - name: MetaData
      - name: oman
transforms:
  - transform: arithmetic
    new name: control::ObsValueMinusHofx::${variable}
    equals: control::ObsValue::${variable}-control::hofx0::${variable}
    for:
      variable: &id001
        - brightnessTemperature
  - transform: arithmetic
    new name: experiment::ObsValueMinusHofx::${variable}
    equals: experiment::ObsValue::${variable}-experiment::hofx0::${variable}
    for:
      variable: *id001
time_series:
  - begin_date: '2023-07-26T03:00:00'
    final_date: '2023-07-26T09:00:00'
    interval: PT6H
    collection: experiment
    variables:
      - ObsValueMinusHofx::brightnessTemperature
    aggregation_methods:
      - mean
    dimension: Location
  - begin_date: '2023-07-26T03:00:00'
    final_date: '2023-07-26T09:00:00'
    interval: PT6H
    collection: control
    variables:
      - ObsValueMinusHofx::brightnessTemperature
    aggregation_methods:
      - mean
    dimension: Location
graphics:
  plotting_backend: Emcpy
  figure_list:
    - figure:
        layout:
          - 1
          - 1
        title: Mean OmB | AMSU-A NOAA-19 | Ch 3 | ObsValueMinusHofx::brightnessTemperature
        output name: time_series/amsua_n19/brightnessTemperature_mean/3/time_series_compare_omb.png
      plots:
        - add_xlabel: Datetime
          add_ylabel: JEDI h(x)
          add_grid: null
          add_legend:
            loc: upper left
          layers:
            - type: LinePlot
              x:
                variable: control_time_series::MetaData::Dates
              y:
                variable: control_time_series::ObsValueMinusHofx::brightnessTemperature_mean
              channel: 3
              markersize: 5
              color: black
              label: Observation minus h(x) - ctrl
            - type: LinePlot
              x:
                variable: experiment_time_series::MetaData::Dates
              y:
                variable: experiment_time_series::ObsValueMinusHofx::brightnessTemperature_mean
              channel: 3
              markersize: 5
              color: blue
              label: Observation minus h(x) - exp
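The two `time_series` entries above each span two analysis cycles (03Z and 09Z, PT6H apart), which is why each collection has exactly two dataset entries. A quick sketch of the date expansion the driver performs for this config, with a plain `timedelta` standing in for Eva's `iso_duration_to_timedelta`:

```python
from datetime import datetime, timedelta

begin_date = datetime.fromisoformat('2023-07-26T03:00:00')
final_date = datetime.fromisoformat('2023-07-26T09:00:00')
interval = timedelta(hours=6)  # stands in for iso_duration_to_timedelta(logger, 'PT6H')

# Same loop as read_transform_time_series: one date per dataset in the collection
dates = []
date = begin_date
while date <= final_date:
    dates.append(date)
    date += interval

print(len(dates), dates)  # 2 dates -> matches the two dataset entries per collection
```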