From a40e6510588b8d72b171cc10074d4f5794f81b9e Mon Sep 17 00:00:00 2001
From: Akira Sewnath
Date: Fri, 19 Apr 2024 10:11:40 -0400
Subject: [PATCH] Adding new reader and lat/lon match transform (#184)

This adds a new reader to handle geoval files, with the idea that it can be
generalized to other types of files. A new lat/lon match transform was also
added, building on work that @danholdaway had developed.

List of changes:
- New method in `data_collections.py` to retrieve a collection
- Generic `GeovalSpace` reader
- `latlon_match` transform to match lat/lon coordinates from one collection to another
- `GeovalSpace` test yaml and some files for testing

Resolves #177
---
 requirements-github.txt                       |   2 +-
 src/eva/data/data_collections.py              |   5 +
 src/eva/data/geoval_space.py                  | 109 ++++++++++++++++++
 src/eva/tests/config/testGeovalSpace.yaml     | 100 ++++++++++++++++
 ...ob.PT6H.amsua_n19.2021-12-11T21:00:00Z.nc4 |   3 +
 ...ob.PT6H.amsua_n19.2021-12-11T21:00:00Z.nc4 |   3 +
 ...ofx.amsua_n19-geovals.20211211T210000Z.nc4 |   3 +
 .../swell-hofx.amsua_n19.20211211T210000Z.nc4 |   3 +
 src/eva/transforms/latlon_match.py            |  88 ++++++++++++++
 9 files changed, 315 insertions(+), 1 deletion(-)
 create mode 100644 src/eva/data/geoval_space.py
 create mode 100644 src/eva/tests/config/testGeovalSpace.yaml
 create mode 100644 src/eva/tests/data/ncdiag.x0048v2-geovals.ob.PT6H.amsua_n19.2021-12-11T21:00:00Z.nc4
 create mode 100644 src/eva/tests/data/ncdiag.x0048v2.ob.PT6H.amsua_n19.2021-12-11T21:00:00Z.nc4
 create mode 100644 src/eva/tests/data/swell-hofx.amsua_n19-geovals.20211211T210000Z.nc4
 create mode 100644 src/eva/tests/data/swell-hofx.amsua_n19.20211211T210000Z.nc4
 create mode 100644 src/eva/transforms/latlon_match.py

diff --git a/requirements-github.txt b/requirements-github.txt
index 881fcd31..2ea4f668 100644
--- a/requirements-github.txt
+++ b/requirements-github.txt
@@ -8,7 +8,7 @@ xarray>=2022.6.0
 seaborn>=0.12.2
 hvplot>=0.8.2
 nbconvert>=6.5.4
-bokeh>=3.1.1
+bokeh<3.5.0,>=3.4.0
 geopandas>=0.13.2
 geoviews>=1.10.0
 nbsite
diff --git a/src/eva/data/data_collections.py b/src/eva/data/data_collections.py
index 514f9e77..20d32109 100644
--- a/src/eva/data/data_collections.py
+++ b/src/eva/data/data_collections.py
@@ -169,6 +169,11 @@ def add_variable_to_collection(self, collection_name, group_name, variable_name,
 
     # ----------------------------------------------------------------------------------------------
 
+    def get_data_collection(self, collection_name):
+        return self._collections[collection_name]
+
+    # ----------------------------------------------------------------------------------------------
+
     def get_variable_data_array(self, collection_name, group_name, variable_name, channels=None,
                                 levels=None, datatypes=None):
 
diff --git a/src/eva/data/geoval_space.py b/src/eva/data/geoval_space.py
new file mode 100644
index 00000000..27c6bd28
--- /dev/null
+++ b/src/eva/data/geoval_space.py
@@ -0,0 +1,109 @@
+# (C) Copyright 2024 NOAA/NWS/EMC
+
+# (C) Copyright 2024 United States Government as represented by the Administrator of the
+# National Aeronautics and Space Administration. All Rights Reserved.
+
+# This software is licensed under the terms of the Apache Licence Version 2.0
+# which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
+
+# --------------------------------------------------------------------------------------------------
+
+import os
+import netCDF4 as nc
+import numpy as np
+from xarray import Dataset, open_dataset
+from eva.utilities.config import get
+from eva.data.eva_dataset_base import EvaDatasetBase
+from eva.utilities.utils import parse_channel_list
+
+
+class GeovalSpace(EvaDatasetBase):
+
+    """
+    A class for handling geoval files
+    """
+
+    def execute(self, dataset_config, data_collections, timing):
+
+        """
+        Executes the processing of the geoval dataset.
+
+        Args:
+            dataset_config (dict): Configuration dictionary for the dataset.
+            data_collections (DataCollections): Object for managing data collections.
+            timing (Timing): Timing object for tracking execution time.
+        """
+
+        # Set the collection name
+        # -----------------------
+        collection_name = get(dataset_config, self.logger, 'name')
+
+        # Get missing value threshold
+        # ---------------------------
+        threshold = float(get(dataset_config, self.logger, 'missing_value_threshold', 1.0e30))
+
+        # Get levels to plot profiles
+        # ---------------------------
+        levels_str_or_list = get(dataset_config, self.logger, 'levels', [])
+
+        # Convert levels to list
+        levels = []
+        if levels_str_or_list:
+            levels = parse_channel_list(levels_str_or_list, self.logger)
+
+        # Filename to be used for reads
+        # -----------------------------
+        data_filename = get(dataset_config, self.logger, 'data_file')
+
+        # Get instrument name
+        instr_name = get(dataset_config, self.logger, 'instrument_name')
+
+        # Open instrument file as an xarray dataset
+        instr_ds = open_dataset(data_filename)
+
+        # Enforce that a variables list exists, do not default to all variables
+        variables = get(dataset_config, self.logger, 'variables')
+        if not variables:
+            self.logger.abort('A variables list needs to be defined in the config file.')
+        vars_to_remove = list(set(list(instr_ds.keys())) - set(variables))
+        instr_ds = instr_ds.drop_vars(vars_to_remove)
+
+        # Rename variables and nval dimension
+        rename_dict = {}
+        rename_dims_dict = {}
+        for v in variables:
+            # Retrieve dimension names
+            dims = instr_ds[v].dims
+            if np.size(dims) > 1:
+                rename_dims_dict[dims[1]] = 'Level'
+            rename_dict[v] = f'{instr_name}::{v}'
+        instr_ds = instr_ds.rename(rename_dict)
+        instr_ds = instr_ds.rename_dims(rename_dims_dict)
+
+        # Add the dataset to the collections
+        data_collections.create_or_add_to_collection(collection_name, instr_ds)
+
+        # Nan out unphysical values
+        data_collections.nan_float_values_outside_threshold(threshold)
+
+        # Display the contents of the collections for helping the user with making plots
+        data_collections.display_collections()
+
+    def generate_default_config(self, filenames, collection_name):
+
+        """
+        Generate a default configuration for the dataset.
+
+        This method generates a default configuration for the dataset based on the provided
+        filenames and collection name. It can be used as a starting point for creating a
+        configuration for the dataset.
+
+        Args:
+            filenames: Filenames or file paths relevant to the dataset.
+            collection_name (str): Name of the collection for the dataset.
+
+        Returns:
+            dict: A dictionary representing the default configuration for the dataset.
+        """
+
+        pass
diff --git a/src/eva/tests/config/testGeovalSpace.yaml b/src/eva/tests/config/testGeovalSpace.yaml
new file mode 100644
index 00000000..9e0c1cd9
--- /dev/null
+++ b/src/eva/tests/config/testGeovalSpace.yaml
@@ -0,0 +1,100 @@
+datasets:
+
+  - name: exp_geovals_with_lvls
+    type: GeovalSpace
+    data_file: ${data_input_path}/swell-hofx.amsua_n19-geovals.20211211T210000Z.nc4
+    levels: &exp_levels 33,60
+    instrument_name: amsua_n19
+    variables: &exp_vars_with_lvls ['mole_fraction_of_carbon_dioxide_in_air']
+
+  - name: exp_geovals
+    type: GeovalSpace
+    data_file: ${data_input_path}/swell-hofx.amsua_n19-geovals.20211211T210000Z.nc4
+    instrument_name: amsua_n19
+    variables: &exp_vars ['vegetation_area_fraction', 'leaf_area_index']
+
+  - name: exp_latlon
+    type: IodaObsSpace
+    filenames:
+      - ${data_input_path}/swell-hofx.amsua_n19.20211211T210000Z.nc4
+    groups:
+      - name: MetaData
+
+  - name: ctrl_geovals_with_lvls
+    type: GeovalSpace
+    data_file: ${data_input_path}/ncdiag.x0048v2-geovals.ob.PT6H.amsua_n19.2021-12-11T21:00:00Z.nc4
+    levels: &ctrl_levels 33,60
+    instrument_name: amsua_n19
+    variables: &ctrl_vars_with_lvls ['mole_fraction_of_carbon_dioxide_in_air']
+
+  - name: ctrl_geovals
+    type: GeovalSpace
+    data_file: ${data_input_path}/ncdiag.x0048v2-geovals.ob.PT6H.amsua_n19.2021-12-11T21:00:00Z.nc4
+    instrument_name: amsua_n19
+    variables: &ctrl_vars ['vegetation_area_fraction', 'leaf_area_index']
+
+  - name: ctrl_latlon
+    type: IodaObsSpace
+    filenames:
+      - ${data_input_path}/ncdiag.x0048v2.ob.PT6H.amsua_n19.2021-12-11T21:00:00Z.nc4
+    groups:
+      - name: MetaData
+
+transforms:
+
+  - transform: latlon_match
+    new_collection_name: ctrl_geovals_matched_index
+    base_latlon: ctrl_latlon
+    match_base_latlon_to: exp_latlon
+    base_collection: ctrl_geovals::amsua_n19::${variable}
+    for:
+      variable: *ctrl_vars
+
+  - transform: latlon_match
+    new_collection_name: ctrl_geovals_with_lvls_matched_index
+    base_latlon: ctrl_latlon
+    match_base_latlon_to: exp_latlon
+    base_collection: ctrl_geovals_with_lvls::amsua_n19::${variable}
+    for:
+      variable: *ctrl_vars_with_lvls
+
+  - transform: arithmetic
+    new name: exp_geovals::amsua_n19::exp_minus_ctrl_${variable}
+    equals: exp_geovals::amsua_n19::${variable}-ctrl_geovals_matched_index::amsua_n19::${variable}
+    for:
+      variable: *exp_vars
+
+graphics:
+
+  plotting_backend: Emcpy
+  figure_list:
+
+    - batch figure:
+        variables: *exp_vars
+      dynamic options:
+        - type: vminvmaxcmap
+          data variable: exp_geovals::amsua_n19::exp_minus_ctrl_${variable}
+      figure:
+        figure size: [20,10]
+        layout: [1,1]
+        title: 'JEDI - GSI | AMSU-A NOAA-19 | Geoval | ${variable}'
+        output name: map_plots/geovals/amsua_n19/${variable}/observations_amsua_n19_${variable}.png
+      plots:
+        - mapping:
+            projection: plcarr
+            domain: global
+          add_map_features: ['coastline']
+          add_colorbar:
+            label: '${variable}'
+          layers:
+          - type: MapScatter
+            longitude:
+              variable: exp_latlon::MetaData::longitude
+            latitude:
+              variable: exp_latlon::MetaData::latitude
+            data:
+              variable: exp_geovals::amsua_n19::exp_minus_ctrl_${variable}
+            markersize: 2
+            cmap: ${dynamic_cmap}
+            vmin: ${dynamic_vmin}
+            vmax: ${dynamic_vmax}
diff --git a/src/eva/tests/data/ncdiag.x0048v2-geovals.ob.PT6H.amsua_n19.2021-12-11T21:00:00Z.nc4 b/src/eva/tests/data/ncdiag.x0048v2-geovals.ob.PT6H.amsua_n19.2021-12-11T21:00:00Z.nc4
new file mode 100644
index 00000000..75ff02c6
--- /dev/null
+++ b/src/eva/tests/data/ncdiag.x0048v2-geovals.ob.PT6H.amsua_n19.2021-12-11T21:00:00Z.nc4
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4e45612c2316c187aa1e47319b367860781875b9a4f9856e5af567588e3bb602
+size 16948017
diff --git a/src/eva/tests/data/ncdiag.x0048v2.ob.PT6H.amsua_n19.2021-12-11T21:00:00Z.nc4 b/src/eva/tests/data/ncdiag.x0048v2.ob.PT6H.amsua_n19.2021-12-11T21:00:00Z.nc4
new file mode 100644
index 00000000..230bf334
--- /dev/null
+++ b/src/eva/tests/data/ncdiag.x0048v2.ob.PT6H.amsua_n19.2021-12-11T21:00:00Z.nc4
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:33b4da4c1c3f96e48ba8e6966302e7873ae5bd0e9a1334bccebb6d6bf66ea7b6
+size 2291303
diff --git a/src/eva/tests/data/swell-hofx.amsua_n19-geovals.20211211T210000Z.nc4 b/src/eva/tests/data/swell-hofx.amsua_n19-geovals.20211211T210000Z.nc4
new file mode 100644
index 00000000..89773b42
--- /dev/null
+++ b/src/eva/tests/data/swell-hofx.amsua_n19-geovals.20211211T210000Z.nc4
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e823689cdc33713b3db16bbc8eb2d6979e0f0bd116717e5ad6ca496c3cbabbbf
+size 16737844
diff --git a/src/eva/tests/data/swell-hofx.amsua_n19.20211211T210000Z.nc4 b/src/eva/tests/data/swell-hofx.amsua_n19.20211211T210000Z.nc4
new file mode 100644
index 00000000..e3ef411a
--- /dev/null
+++ b/src/eva/tests/data/swell-hofx.amsua_n19.20211211T210000Z.nc4
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8ebaac0dd57bfa071b12692a5798f32a0a342c63d5c18050bb32883718376cd3
+size 14615231
diff --git a/src/eva/transforms/latlon_match.py b/src/eva/transforms/latlon_match.py
new file mode 100644
index 00000000..f06057a0
--- /dev/null
+++ b/src/eva/transforms/latlon_match.py
@@ -0,0 +1,88 @@
+# (C) Copyright 2024 NOAA/NWS/EMC
+
+# (C) Copyright 2024 United States Government as represented by the Administrator of the
+# National Aeronautics and Space Administration. All Rights Reserved.
+
+# This software is licensed under the terms of the Apache Licence Version 2.0
+# which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
+
+import numpy as np
+from xarray import Dataset, DataArray
+from eva.utilities.config import get
+from eva.utilities.logger import Logger
+from eva.transforms.transform_utils import parse_for_dict, split_collectiongroupvariable
+
+
+def latlon_match(config, data_collections):
+
+    """
+    Applies the lat/lon match transform to a given collection.
+
+    Args:
+        config (dict): A configuration dictionary containing transformation parameters.
+        data_collections (DataCollections): An instance of the DataCollections class containing
+                                            input data.
+
+    Returns:
+        None
+
+    This function applies lat/lon matching to variables in the base collection. A new collection
+    with matched variables is added to the data collection.
+
+    base_collection: collection to perform the lat/lon matching on
+    base_latlon: the collection with lat/lon coordinates corresponding to the base collection
+    match_base_latlon_to: the collection with lat/lon coordinates corresponding to what you want to
+                          match the base latlon to.
+
+    """
+
+    # Create a logger
+    logger = Logger('LatLonMatchTransform')
+
+    # Parse the for dictionary
+    _, _, variables = parse_for_dict(config, logger)
+
+    # Parse config for names
+    base_collection = get(config, logger, 'base_collection')
+    base_latlon_name = get(config, logger, 'base_latlon')
+    match_latlon_name = get(config, logger, 'match_base_latlon_to')
+
+    # Extract collection and group
+    cgv = split_collectiongroupvariable(logger, base_collection)
+
+    # Retrieve collections using collection names
+    base_lat = data_collections.get_variable_data_array(base_latlon_name, 'MetaData',
+                                                        'latitude').to_numpy()
+    base_lon = data_collections.get_variable_data_array(base_latlon_name, 'MetaData',
+                                                        'longitude').to_numpy()
+    match_lat = data_collections.get_variable_data_array(match_latlon_name, 'MetaData',
+                                                         'latitude').to_numpy()
+    match_lon = data_collections.get_variable_data_array(match_latlon_name, 'MetaData',
+                                                         'longitude').to_numpy()
+
+    # Find matching index for each match point (this can be updated using dask)
+    matching_index = []
+    for i in range(len(match_lat)):
+        matching_index.append((abs(base_lat - match_lat[i]) +
+                               abs(base_lon - match_lon[i])).argmin())
+
+    # Retrieve data collection from data collections
+    match_ds = data_collections.get_data_collection(cgv[0])
+
+    # Loop through the variables and update each data array
+    update_ds_list = []
+    for variable in variables:
+        var_array = data_collections.get_variable_data_array(cgv[0], cgv[1], variable)
+        var_values = var_array.values
+
+        # Index data array with matching_index and then save to new collection
+        var_values = var_values[matching_index]
+        var_array.values = var_values
+        match_ds[f'{cgv[1]}::{variable}'] = var_array
+
+    # Get new collection name
+    new_collection_name = get(config, logger, 'new_collection_name')
+
+    # Add new collection to data collections
+    data_collections.create_or_add_to_collection(new_collection_name, match_ds)
+    match_ds.close()
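
Note (illustration only, not part of the patch): the core of `latlon_match` is the nearest-neighbour pairing computed in the loop above, which, for each point of the `match_base_latlon_to` collection, picks the base point minimising |dlat| + |dlon|. Below is a minimal, self-contained sketch of that criterion written as a vectorised NumPy expression; the coordinate arrays are made up for illustration and none of these names exist in the eva repository.

import numpy as np

# Hypothetical stand-ins for the MetaData latitude/longitude of the base and
# match collections (three points each).
base_lat = np.array([10.0, 20.0, 30.0])
base_lon = np.array([100.0, 110.0, 120.0])
match_lat = np.array([19.5, 30.2, 10.1])
match_lon = np.array([110.3, 119.8, 100.2])

# For every match point, find the base index with the smallest |dlat| + |dlon|.
# Same criterion as the per-point loop in latlon_match.py, but broadcast over
# all (match, base) pairs at once.
dist = (np.abs(base_lat[None, :] - match_lat[:, None])
        + np.abs(base_lon[None, :] - match_lon[:, None]))
matching_index = dist.argmin(axis=1)

print(matching_index)  # -> [1 2 0]: base rows reordered into the match ordering

Indexing a base variable with `var_values[matching_index]` then lines its rows up with the match collection, which is what lets the arithmetic transform in the test yaml subtract the control and experiment collections point by point.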