From a40e6510588b8d72b171cc10074d4f5794f81b9e Mon Sep 17 00:00:00 2001
From: Akira Sewnath
Date: Fri, 19 Apr 2024 10:11:40 -0400
Subject: [PATCH] Adding new reader and lat/lon match transform (#184)

This adds a new reader to handle geoval files, with the idea that it can be
generalized to other types of files. A new lat/lon match transform was also
added, building on work that @danholdaway had developed.

List of changes:
- New method in `data_collections.py` to retrieve a collection
- Generic `GeovalSpace` reader
- `latlon_match` transform to match lat/lon coordinates from one collection to another
- `GeovalSpace` test yaml and some files for testing

Resolves #177
---
 requirements-github.txt                       |   2 +-
 src/eva/data/data_collections.py              |   5 +
 src/eva/data/geoval_space.py                  | 109 ++++++++++++++++++
 src/eva/tests/config/testGeovalSpace.yaml     | 100 ++++++++++++++++
 ...ob.PT6H.amsua_n19.2021-12-11T21:00:00Z.nc4 |   3 +
 ...ob.PT6H.amsua_n19.2021-12-11T21:00:00Z.nc4 |   3 +
 ...ofx.amsua_n19-geovals.20211211T210000Z.nc4 |   3 +
 .../swell-hofx.amsua_n19.20211211T210000Z.nc4 |   3 +
 src/eva/transforms/latlon_match.py            |  88 ++++++++++++++
 9 files changed, 315 insertions(+), 1 deletion(-)
 create mode 100644 src/eva/data/geoval_space.py
 create mode 100644 src/eva/tests/config/testGeovalSpace.yaml
 create mode 100644 src/eva/tests/data/ncdiag.x0048v2-geovals.ob.PT6H.amsua_n19.2021-12-11T21:00:00Z.nc4
 create mode 100644 src/eva/tests/data/ncdiag.x0048v2.ob.PT6H.amsua_n19.2021-12-11T21:00:00Z.nc4
 create mode 100644 src/eva/tests/data/swell-hofx.amsua_n19-geovals.20211211T210000Z.nc4
 create mode 100644 src/eva/tests/data/swell-hofx.amsua_n19.20211211T210000Z.nc4
 create mode 100644 src/eva/transforms/latlon_match.py

diff --git a/requirements-github.txt b/requirements-github.txt
index 881fcd31..2ea4f668 100644
--- a/requirements-github.txt
+++ b/requirements-github.txt
@@ -8,7 +8,7 @@ xarray>=2022.6.0
 seaborn>=0.12.2
 hvplot>=0.8.2
 nbconvert>=6.5.4
-bokeh>=3.1.1
+bokeh<3.5.0,>=3.4.0
 geopandas>=0.13.2
 geoviews>=1.10.0
 nbsite
diff --git a/src/eva/data/data_collections.py b/src/eva/data/data_collections.py
index 514f9e77..20d32109 100644
--- a/src/eva/data/data_collections.py
+++ b/src/eva/data/data_collections.py
@@ -169,6 +169,11 @@ def add_variable_to_collection(self, collection_name, group_name, variable_name,
 
     # ----------------------------------------------------------------------------------------------
 
+    def get_data_collection(self, collection_name):
+        return self._collections[collection_name]
+
+    # ----------------------------------------------------------------------------------------------
+
     def get_variable_data_array(self, collection_name, group_name, variable_name, channels=None,
                                 levels=None, datatypes=None):
 
diff --git a/src/eva/data/geoval_space.py b/src/eva/data/geoval_space.py
new file mode 100644
index 00000000..27c6bd28
--- /dev/null
+++ b/src/eva/data/geoval_space.py
@@ -0,0 +1,109 @@
+# (C) Copyright 2024 NOAA/NWS/EMC
+
+# (C) Copyright 2024 United States Government as represented by the Administrator of the
+# National Aeronautics and Space Administration. All Rights Reserved.
+
+# This software is licensed under the terms of the Apache Licence Version 2.0
+# which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
+
+# --------------------------------------------------------------------------------------------------
+
+import os
+import netCDF4 as nc
+import numpy as np
+from xarray import Dataset, open_dataset
+from eva.utilities.config import get
+from eva.data.eva_dataset_base import EvaDatasetBase
+from eva.utilities.utils import parse_channel_list
+
+
+class GeovalSpace(EvaDatasetBase):
+
+    """
+    A class for handling geoval files
+    """
+
+    def execute(self, dataset_config, data_collections, timing):
+
+        """
+        Executes the processing of the geoval dataset.
+
+        Args:
+            dataset_config (dict): Configuration dictionary for the dataset.
+            data_collections (DataCollections): Object for managing data collections.
+            timing (Timing): Timing object for tracking execution time.
+        """
+
+        # Set the collection name
+        # -----------------------
+        collection_name = get(dataset_config, self.logger, 'name')
+
+        # Get missing value threshold
+        # ---------------------------
+        threshold = float(get(dataset_config, self.logger, 'missing_value_threshold', 1.0e30))
+
+        # Get levels to plot profiles
+        # ---------------------------
+        levels_str_or_list = get(dataset_config, self.logger, 'levels', [])
+
+        # Convert levels to list
+        levels = []
+        if levels_str_or_list:
+            levels = parse_channel_list(levels_str_or_list, self.logger)
+
+        # Filename to be used for reads
+        # -----------------------------
+        data_filename = get(dataset_config, self.logger, 'data_file')
+
+        # Get instrument name
+        instr_name = get(dataset_config, self.logger, 'instrument_name')
+
+        # Open instrument file as an xarray dataset
+        instr_ds = open_dataset(data_filename)
+
+        # Enforce that a variables list exists, do not default to all variables
+        variables = get(dataset_config, self.logger, 'variables')
+        if not variables:
+            self.logger.abort('A variables list needs to be defined in the config file.')
+        vars_to_remove = list(set(list(instr_ds.keys())) - set(variables))
+        instr_ds = instr_ds.drop_vars(vars_to_remove)
+
+        # Rename variables and nval dimension
+        rename_dict = {}
+        rename_dims_dict = {}
+        for v in variables:
+            # Retrieve dimension names
+            dims = instr_ds[v].dims
+            if np.size(dims) > 1:
+                rename_dims_dict[dims[1]] = 'Level'
+            rename_dict[v] = f'{instr_name}::{v}'
+        instr_ds = instr_ds.rename(rename_dict)
+        instr_ds = instr_ds.rename_dims(rename_dims_dict)
+
+        # Add the dataset to the collections
+        data_collections.create_or_add_to_collection(collection_name, instr_ds)
+
+        # Nan out unphysical values
+        data_collections.nan_float_values_outside_threshold(threshold)
+
+        # Display the contents of the collections for helping the user with making plots
+        data_collections.display_collections()
+
+    def generate_default_config(self, filenames, collection_name):
+
+        """
+        Generate a default configuration for the dataset.
+
+        This method generates a default configuration for the dataset based on the provided
+        filenames and collection name. It can be used as a starting point for creating a
+        configuration for the dataset.
+
+        Args:
+            filenames: Filenames or file paths relevant to the dataset.
+            collection_name (str): Name of the collection for the dataset.
+
+        Returns:
+            dict: A dictionary representing the default configuration for the dataset.
+        """
+
+        pass
diff --git a/src/eva/tests/config/testGeovalSpace.yaml b/src/eva/tests/config/testGeovalSpace.yaml
new file mode 100644
index 00000000..9e0c1cd9
--- /dev/null
+++ b/src/eva/tests/config/testGeovalSpace.yaml
@@ -0,0 +1,100 @@
+datasets:
+
+  - name: exp_geovals_with_lvls
+    type: GeovalSpace
+    data_file: ${data_input_path}/swell-hofx.amsua_n19-geovals.20211211T210000Z.nc4
+    levels: &exp_levels 33,60
+    instrument_name: amsua_n19
+    variables: &exp_vars_with_lvls ['mole_fraction_of_carbon_dioxide_in_air']
+
+  - name: exp_geovals
+    type: GeovalSpace
+    data_file: ${data_input_path}/swell-hofx.amsua_n19-geovals.20211211T210000Z.nc4
+    instrument_name: amsua_n19
+    variables: &exp_vars ['vegetation_area_fraction', 'leaf_area_index']
+
+  - name: exp_latlon
+    type: IodaObsSpace
+    filenames:
+      - ${data_input_path}/swell-hofx.amsua_n19.20211211T210000Z.nc4
+    groups:
+      - name: MetaData
+
+  - name: ctrl_geovals_with_lvls
+    type: GeovalSpace
+    data_file: ${data_input_path}/ncdiag.x0048v2-geovals.ob.PT6H.amsua_n19.2021-12-11T21:00:00Z.nc4
+    levels: &ctrl_levels 33,60
+    instrument_name: amsua_n19
+    variables: &ctrl_vars_with_lvls ['mole_fraction_of_carbon_dioxide_in_air']
+
+  - name: ctrl_geovals
+    type: GeovalSpace
+    data_file: ${data_input_path}/ncdiag.x0048v2-geovals.ob.PT6H.amsua_n19.2021-12-11T21:00:00Z.nc4
+    instrument_name: amsua_n19
+    variables: &ctrl_vars ['vegetation_area_fraction', 'leaf_area_index']
+
+  - name: ctrl_latlon
+    type: IodaObsSpace
+    filenames:
+      - ${data_input_path}/ncdiag.x0048v2.ob.PT6H.amsua_n19.2021-12-11T21:00:00Z.nc4
+    groups:
+      - name: MetaData
+
+transforms:
+
+  - transform: latlon_match
+    new_collection_name: ctrl_geovals_matched_index
+    base_latlon: ctrl_latlon
+    match_base_latlon_to: exp_latlon
+    base_collection: ctrl_geovals::amsua_n19::${variable}
+    for:
+      variable: *ctrl_vars
+
+  - transform: latlon_match
+    new_collection_name: ctrl_geovals_with_lvls_matched_index
+    base_latlon: ctrl_latlon
+    match_base_latlon_to: exp_latlon
+    base_collection: ctrl_geovals_with_lvls::amsua_n19::${variable}
+    for:
+      variable: *ctrl_vars_with_lvls
+
+  - transform: arithmetic
+    new name: exp_geovals::amsua_n19::exp_minus_ctrl_${variable}
+    equals: exp_geovals::amsua_n19::${variable}-ctrl_geovals_matched_index::amsua_n19::${variable}
+    for:
+      variable: *exp_vars
+
+graphics:
+
+  plotting_backend: Emcpy
+  figure_list:
+
+    - batch figure:
+        variables: *exp_vars
+      dynamic options:
+        - type: vminvmaxcmap
+          data variable: exp_geovals::amsua_n19::exp_minus_ctrl_${variable}
+      figure:
+        figure size: [20,10]
+        layout: [1,1]
+        title: 'JEDI - GSI | AMSU-A NOAA-19 | Geoval | ${variable}'
+        output name: map_plots/geovals/amsua_n19/${variable}/observations_amsua_n19_${variable}.png
+      plots:
+        - mapping:
+            projection: plcarr
+            domain: global
+          add_map_features: ['coastline']
+          add_colorbar:
+            label: '${variable}'
+          layers:
+          - type: MapScatter
+            longitude:
+              variable: exp_latlon::MetaData::longitude
+            latitude:
+              variable: exp_latlon::MetaData::latitude
+            data:
+              variable: exp_geovals::amsua_n19::exp_minus_ctrl_${variable}
+            markersize: 2
+            cmap: ${dynamic_cmap}
+            vmin: ${dynamic_vmin}
+            vmax: ${dynamic_vmax}
diff --git a/src/eva/tests/data/ncdiag.x0048v2-geovals.ob.PT6H.amsua_n19.2021-12-11T21:00:00Z.nc4 b/src/eva/tests/data/ncdiag.x0048v2-geovals.ob.PT6H.amsua_n19.2021-12-11T21:00:00Z.nc4
new file mode 100644
index 00000000..75ff02c6
--- /dev/null
+++ b/src/eva/tests/data/ncdiag.x0048v2-geovals.ob.PT6H.amsua_n19.2021-12-11T21:00:00Z.nc4
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4e45612c2316c187aa1e47319b367860781875b9a4f9856e5af567588e3bb602
+size 16948017
diff --git a/src/eva/tests/data/ncdiag.x0048v2.ob.PT6H.amsua_n19.2021-12-11T21:00:00Z.nc4 b/src/eva/tests/data/ncdiag.x0048v2.ob.PT6H.amsua_n19.2021-12-11T21:00:00Z.nc4
new file mode 100644
index 00000000..230bf334
--- /dev/null
+++ b/src/eva/tests/data/ncdiag.x0048v2.ob.PT6H.amsua_n19.2021-12-11T21:00:00Z.nc4
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:33b4da4c1c3f96e48ba8e6966302e7873ae5bd0e9a1334bccebb6d6bf66ea7b6
+size 2291303
diff --git a/src/eva/tests/data/swell-hofx.amsua_n19-geovals.20211211T210000Z.nc4 b/src/eva/tests/data/swell-hofx.amsua_n19-geovals.20211211T210000Z.nc4
new file mode 100644
index 00000000..89773b42
--- /dev/null
+++ b/src/eva/tests/data/swell-hofx.amsua_n19-geovals.20211211T210000Z.nc4
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e823689cdc33713b3db16bbc8eb2d6979e0f0bd116717e5ad6ca496c3cbabbbf
+size 16737844
diff --git a/src/eva/tests/data/swell-hofx.amsua_n19.20211211T210000Z.nc4 b/src/eva/tests/data/swell-hofx.amsua_n19.20211211T210000Z.nc4
new file mode 100644
index 00000000..e3ef411a
--- /dev/null
+++ b/src/eva/tests/data/swell-hofx.amsua_n19.20211211T210000Z.nc4
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8ebaac0dd57bfa071b12692a5798f32a0a342c63d5c18050bb32883718376cd3
+size 14615231
diff --git a/src/eva/transforms/latlon_match.py b/src/eva/transforms/latlon_match.py
new file mode 100644
index 00000000..f06057a0
--- /dev/null
+++ b/src/eva/transforms/latlon_match.py
@@ -0,0 +1,88 @@
+# (C) Copyright 2024 NOAA/NWS/EMC
+
+# (C) Copyright 2024 United States Government as represented by the Administrator of the
+# National Aeronautics and Space Administration. All Rights Reserved.
+
+# This software is licensed under the terms of the Apache Licence Version 2.0
+# which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
+
+import numpy as np
+from xarray import Dataset, DataArray
+from eva.utilities.config import get
+from eva.utilities.logger import Logger
+from eva.transforms.transform_utils import parse_for_dict, split_collectiongroupvariable
+
+
+def latlon_match(config, data_collections):
+
+    """
+    Applies the lat/lon match transform to a given collection.
+
+    Args:
+        config (dict): A configuration dictionary containing transformation parameters.
+        data_collections (DataCollections): An instance of the DataCollections class containing
+                                            input data.
+
+    Returns:
+        None
+
+    This function applies lat/lon matching to variables in the base collection. A new collection
+    with matched variables is added to the data collection.
+
+    base_collection: collection to perform the lat/lon matching on
+    base_latlon: the collection with lat/lon coordinates corresponding to the base collection
+    match_base_latlon_to: the collection with lat/lon coordinates corresponding to what you want to
+                          match the base latlon to.
+
+    """
+
+    # Create a logger
+    logger = Logger('LatLonMatchTransform')
+
+    # Parse the for dictionary
+    _, _, variables = parse_for_dict(config, logger)
+
+    # Parse config for names
+    base_collection = get(config, logger, 'base_collection')
+    base_latlon_name = get(config, logger, 'base_latlon')
+    match_latlon_name = get(config, logger, 'match_base_latlon_to')
+
+    # Extract collection and group
+    cgv = split_collectiongroupvariable(logger, base_collection)
+
+    # Retrieve collections using collection names
+    base_lat = data_collections.get_variable_data_array(base_latlon_name, 'MetaData',
+                                                        'latitude').to_numpy()
+    base_lon = data_collections.get_variable_data_array(base_latlon_name, 'MetaData',
+                                                        'longitude').to_numpy()
+    match_lat = data_collections.get_variable_data_array(match_latlon_name, 'MetaData',
+                                                         'latitude').to_numpy()
+    match_lon = data_collections.get_variable_data_array(match_latlon_name, 'MetaData',
+                                                         'longitude').to_numpy()
+
+    # Find matching index for each match point (this can be updated using dask)
+    matching_index = []
+    for i in range(len(match_lat)):
+        matching_index.append((abs(base_lat - match_lat[i]) +
+                               abs(base_lon - match_lon[i])).argmin())
+
+    # Retrieve data collection from data collections
+    match_ds = data_collections.get_data_collection(cgv[0])
+
+    # Loop through the variables and update each data array
+    update_ds_list = []
+    for variable in variables:
+        var_array = data_collections.get_variable_data_array(cgv[0], cgv[1], variable)
+        var_values = var_array.values
+
+        # Index data array with matching_index and then save to new collection
+        var_values = var_values[matching_index]
+        var_array.values = var_values
+        match_ds[f'{cgv[1]}::{variable}'] = var_array
+
+    # Get new collection name
+    new_collection_name = get(config, logger, 'new_collection_name')
+
+    # Add new collection to data collections
+    data_collections.create_or_add_to_collection(new_collection_name, match_ds)
+    match_ds.close()
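
Note (illustration only, not part of the patch): the core of `latlon_match` is the nearest-neighbour pairing computed in the loop above, which, for each point of the `match_base_latlon_to` collection, picks the base point minimising |dlat| + |dlon|. Below is a minimal, self-contained sketch of that criterion written as a vectorised NumPy expression; the coordinate arrays are made up for illustration and none of these names exist in the eva repository.

import numpy as np

# Hypothetical stand-ins for the MetaData latitude/longitude of the base and
# match collections (three points each).
base_lat = np.array([10.0, 20.0, 30.0])
base_lon = np.array([100.0, 110.0, 120.0])
match_lat = np.array([19.5, 30.2, 10.1])
match_lon = np.array([110.3, 119.8, 100.2])

# For every match point, find the base index with the smallest |dlat| + |dlon|.
# Same criterion as the per-point loop in latlon_match.py, but broadcast over
# all (match, base) pairs at once.
dist = (np.abs(base_lat[None, :] - match_lat[:, None])
        + np.abs(base_lon[None, :] - match_lon[:, None]))
matching_index = dist.argmin(axis=1)

print(matching_index)  # -> [1 2 0]: base rows reordered into the match ordering

Indexing a base variable with `var_values[matching_index]` then lines its rows up with the match collection, which is what lets the arithmetic transform in the test yaml subtract the control and experiment collections point by point.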