From b14e19db11f925b72c3ec6956d046b47c899b21c Mon Sep 17 00:00:00 2001 From: Ed Safford <62339196+EdwardSafford-NOAA@users.noreply.github.com> Date: Tue, 2 Jan 2024 14:36:13 -0500 Subject: [PATCH] Extend eva to support legacy ConMon time/vert data (#169) * Ref #167 Add support for batch processing on DataType/dtype. * Ref #167 Fix datatype use with station data. * Ref #167 Add update for conmon vert plots. * Ref #167 Fix typo. --- src/eva/data/data_collections.py | 31 +++- src/eva/data/mon_data_space.py | 155 +++++++++++++----- .../batch/base/diagnostics/line_plot.py | 13 +- .../batch/base/plot_tools/dynamic_config.py | 5 +- .../batch/base/plot_tools/figure_driver.py | 9 +- .../batch/emcpy/diagnostics/line_plot.py | 13 +- 6 files changed, 165 insertions(+), 61 deletions(-) diff --git a/src/eva/data/data_collections.py b/src/eva/data/data_collections.py index 28326ef5..514f9e77 100644 --- a/src/eva/data/data_collections.py +++ b/src/eva/data/data_collections.py @@ -170,7 +170,7 @@ def add_variable_to_collection(self, collection_name, group_name, variable_name, # ---------------------------------------------------------------------------------------------- def get_variable_data_array(self, collection_name, group_name, variable_name, - channels=None, levels=None): + channels=None, levels=None, datatypes=None): """ Retrieve a specific variable (as a DataArray) from a collection. @@ -181,18 +181,21 @@ def get_variable_data_array(self, collection_name, group_name, variable_name, variable_name (str): Name of the variable. channels (int or list[int]): Indices of channels to select (optional). levels (int or list[int]): Indices of levels to select (optional). + datatypes (str or list[str]): Indices of data types to select (optional). Returns: DataArray: The selected variable as an xarray DataArray. Raises: - ValueError: If channels are provided but the 'Channel' dimension is missing. + ValueError: If channels, levels, or datatypes are provided but the + corresponding 'Channel', 'Level', or 'DataType' dimension + is missing. 
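+
+        Example (illustrative; the collection, group, variable, and data type names
+        below are hypothetical):
+
+            da = data_collections.get_variable_data_array('conmon_time', 'ObsGroup',
+                                                          'count', datatypes='t120_00')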
""" group_variable_name = group_name + '::' + variable_name data_array = self._collections[collection_name][group_variable_name] - if channels is None and levels is None: + if channels is None and levels is None and datatypes is None: return data_array if channels is not None: @@ -201,6 +204,7 @@ def get_variable_data_array(self, collection_name, group_name, variable_name, if 'Channel' not in list(self._collections[collection_name].dims): self.logger.abort(f'In get_variable_data_array channels is provided but ' + f'Channel is not a dimension in Dataset') + # Make sure it is a list channels_sel = [] channels_sel.append(channels) @@ -229,10 +233,26 @@ def get_variable_data_array(self, collection_name, group_name, variable_name, self.logger.abort('In get_variable_data_array levels is neither none ' + 'nor a list of integers') + elif datatypes is not None: + if isinstance(datatypes, str) or not any(not isinstance(dt, str) for dt in datatypes): + # DataType must be a dimension if it will be used for selection + if 'DataType' not in list(self._collections[collection_name].dims): + self.logger.abort(f'In get_variable_data_array levels is provided but ' + + f'DataType is not a dimension in Dataset') + datatypes_sel = [] + datatypes_sel.append(datatypes) + + # Create a new DataArray with the requested datatypes + data_array_dtypes = data_array.sel(DataType=datatypes_sel) + return data_array_dtypes + else: + self.logger.abort('In get_variable_data_array datatype is neither none ' + + 'nor a list of strings') + # ---------------------------------------------------------------------------------------------- def get_variable_data(self, collection_name, group_name, variable_name, - channels=None, levels=None): + channels=None, levels=None, datatypes=None): """ Retrieve the data of a specific variable from a collection. @@ -243,13 +263,14 @@ def get_variable_data(self, collection_name, group_name, variable_name, variable_name (str): Name of the variable. channels (int or list[int]): Indices of channels to select (optional). levels (int or list[int]): Indices of levels to select (optional). + datatypes (str or list[str]): Indices of data types to select (optional). Returns: ndarray: The selected variable data as a NumPy array. """ variable_array = self.get_variable_data_array(collection_name, group_name, variable_name, - channels, levels) + channels, levels, datatypes) # Extract the actual data array variable_data = variable_array.data diff --git a/src/eva/data/mon_data_space.py b/src/eva/data/mon_data_space.py index 3a393b61..51f1ffad 100644 --- a/src/eva/data/mon_data_space.py +++ b/src/eva/data/mon_data_space.py @@ -30,6 +30,15 @@ class MonDataSpace(EvaDatasetBase): A class for handling MonDataSpace dataset configuration and processing. 
""" + # index values for specific control files + level_iuse_ozn = 7 + channel_iuse_rad = 7 + channel_num_rad = 4 + type_con = 1 + datatype_con = 5 + subtype_con = 7 + regions_rad_ang = 5 + def execute(self, dataset_config, data_collections, timing): """ @@ -57,13 +66,13 @@ def execute(self, dataset_config, data_collections, timing): dims_arr = [] if self.is_stn_data(control_file[0]): - coords, dims, attribs, nvars, vars, scanpo, levs_dict, chans_dict = ( - self.get_stn_ctl_dict(control_file[0])) + coords, dims, attribs, nvars, vars, scanpo, levs_dict, chans_dict, datatype_dict = ( + self.get_stn_ctl_dict(control_file[0])) ndims_used = 2 dims_arr = ['xdef', 'ydef', 'zdef'] stn_data = True else: - coords, dims, attribs, nvars, vars, scanpo, levs_dict, chans_dict = ( + coords, dims, attribs, nvars, vars, scanpo, levs_dict, chans_dict, datatype_dict = ( self.get_ctl_dict(control_file[0])) ndims_used, dims_arr = self.get_ndims_used(dims) @@ -90,7 +99,8 @@ def execute(self, dataset_config, data_collections, timing): # Set coordinate ranges # --------------------- channo = chans_dict["chan_nums"] if chans_dict is not None else None - x_range, y_range, z_range = self.get_dim_ranges(coords, dims, channo) + datatypes = datatype_dict["datatype"] if datatype_dict is not None else None + x_range, y_range, z_range = self.get_dim_ranges(coords, dims, channo, datatypes) # Get missing value threshold # --------------------------- @@ -121,7 +131,7 @@ def execute(self, dataset_config, data_collections, timing): # create dataset from file contents timestep_ds = None timestep_ds = self.load_dset(vars, nvars, coords, darr, dims, ndims_used, - dims_arr, x_range, y_range, z_range, channo, cyc_darr) + dims_arr, x_range, y_range, z_range, cyc_darr) if attribs['sat']: timestep_ds.attrs['satellite'] = attribs['sat'] @@ -165,7 +175,8 @@ def execute(self, dataset_config, data_collections, timing): # Conditionally add channel, level, scan, and iteration related variables # ----------------------------------------------------------------------- iterations = x_range if 'Iteration' in coords.values() else None - ds = self.loadConditionalItems(ds, chans_dict, levs_dict, scanpo, iterations) + ds = self.loadConditionalItems(ds, chans_dict, levs_dict, + datatype_dict, scanpo, iterations) # Rename variables with group rename_dict = {} @@ -299,13 +310,14 @@ def get_ctl_dict(self, control_file): list: List of scan positions. dict: Dictionary containing channel information. dict: Dictionary containing level information. + dict: Dictionary containing datatype information. """ coords = {'xdef': None, 'ydef': None, 'zdef': None} coord_list = [] dims = {'xdef': 0, 'ydef': 0, 'zdef': 0} dim_list = [] - attribs = {'sensor': None, 'sat': None, 'dtype': None} + attribs = {'sensor': None, 'sat': None, 'datatype': None} vars = [] nvars = 0 chans_dict = None @@ -318,13 +330,18 @@ def get_ctl_dict(self, control_file): levs = [] level_assim = [] level_nassim = [] + datatype_dict = None + datatype = [] + datatype_assim = [] coord_dict = { 'channel': 'Channel', 'scan': 'Scan', 'pressure': 'Level', + 'vertical': 'Level', 'region': 'Region', - 'iter': 'Iteration' + 'iter': 'Iteration', + 'data type': 'DataType' } with open(control_file, 'r') as fp: @@ -334,6 +351,7 @@ def get_ctl_dict(self, control_file): # Locate the coordinates using coord_dict. There will be 1-3 # coordinates specified as XDEF, YDEF, and ZDEF. 
                 for item in list(coord_dict.keys()):
+
                     if 'DEF' in line and item in line:
                         coord_list.append(coord_dict[item])
@@ -375,8 +393,14 @@ def get_ctl_dict(self, control_file):
                     attribs['sensor'] = strs[1]
                     attribs['sat'] = strs[2]
 
                 if line.find('dtype station') != -1 or line.find('DTYPE station') != -1:
-                    attribs['dtype'] = 'station'
+                    attribs['datatype'] = 'station'
+
+                if line.find('subtype') != -1:
+                    strs = line.split()
+                    datatype.append(strs[self.type_con] + strs[self.datatype_con] + '_' +
+                                    strs[self.subtype_con])
+                    datatype_assim.append(strs[9])
 
                 # Note we need to extract the actual channel numbers.  We have the
                 # number of channels via the xdef line, but they are not necessarily
@@ -388,11 +412,11 @@ def get_ctl_dict(self, control_file):
                 if line.find('channel=') != -1:
 
                     strs = line.split()
                     if strs[4].isdigit():
-                        channo.append(int(strs[4]))
-                        if strs[7] == '1':
-                            chan_assim.append(int(strs[4]))
-                        if strs[7] == '-1':
-                            chan_nassim.append(int(strs[4]))
+                        channo.append(int(strs[self.channel_num_rad]))
+                        if strs[self.channel_iuse_rad] == '1':
+                            chan_assim.append(int(strs[self.channel_num_rad]))
+                        if strs[self.channel_iuse_rad] == '-1':
+                            chan_nassim.append(int(strs[self.channel_num_rad]))
 
                 if line.find('level=') != -1:
@@ -400,17 +424,23 @@ def get_ctl_dict(self, control_file):
                     strs = line.split()
                     tlev = strs[2].replace(',', '')
                     if tlev.isdigit():
                         levs.append(int(tlev))
-                    if strs[7] == '1':
-                        level_assim.append(int(tlev))
-                    if strs[7] == '-1':
-                        level_nassim.append(int(tlev))
+
+                    # Ozn control files carry the assim flag on the Level definition
+                    # lines.  Con control files also define levels, but their assim
+                    # flag appears on the datatype lines, not the Level lines.
+                    #
+                    if len(strs) > self.level_iuse_ozn:
+                        if strs[self.level_iuse_ozn] == '1':
+                            level_assim.append(int(tlev))
+                        if strs[self.level_iuse_ozn] == '-1':
+                            level_nassim.append(int(tlev))
             # The list of variables is at the end of the file between the lines
             # "vars" and "end vars".
             start = len(lines) - (nvars + 1)
             for x in range(start, start + nvars):
                 strs = lines[x].split()
-                vars.append(strs[-1])
+                vars.append(strs[0])
 
             # Ignore any coordinates in the control file that have a value of 1.
             used = 0
@@ -442,15 +472,22 @@ def get_ctl_dict(self, control_file):
                           'chans_nassim': chan_nassim}
 
         if 'Level' in coords.values():
-            for x in range(len(level_assim), len(levs)):
-                level_assim.append(0)
-            for x in range(len(level_nassim), len(levs)):
-                level_nassim.append(0)
-            levs_dict = {'levels': levs,
-                         'levels_assim': level_assim,
-                         'levels_nassim': level_nassim}
+            levs_dict = {'levels': levs}
+
+            if len(level_assim) > 0 or len(level_nassim) > 0:
+                for x in range(len(level_assim), len(levs)):
+                    level_assim.append(0)
+                for x in range(len(level_nassim), len(levs)):
+                    level_nassim.append(0)
+                levs_dict['levels_assim'] = level_assim
+                levs_dict['levels_nassim'] = level_nassim
+
+        if 'DataType' in coords.values():
+            datatype_dict = {'datatype': datatype,
+                             'assim': datatype_assim}
+
 
         fp.close()
-        return coords, dims, attribs, nvars, vars, scanpo, levs_dict, chans_dict
+        return coords, dims, attribs, nvars, vars, scanpo, levs_dict, chans_dict, datatype_dict
 
     # ----------------------------------------------------------------------------------------------
 
@@ -471,12 +508,13 @@ def get_stn_ctl_dict(self, control_file):
             list: List of scan positions.
            dict: Dictionary containing channel information.
            dict: Dictionary containing level information.
+            dict: Dictionary containing datatype information.
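+
+        Note:
+            Station control files are not expected to declare a 'data type' axis, so
+            the datatype dictionary is normally None; it is returned to keep the
+            signature consistent with get_ctl_dict.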
""" coords = {'xdef': 'Level', 'ydef': 'Nobs', 'zdef': None} dims = {'xdef': 0, 'ydef': 0, 'zdef': 0} dim_list = [] - attribs = {'sensor': None, 'sat': None, 'dtype': 'station'} + attribs = {'sensor': None, 'sat': None, 'datatype': 'station'} vars = [] nvars = 0 chans_dict = None @@ -486,13 +524,17 @@ def get_stn_ctl_dict(self, control_file): lev_vals = [] level_assim = [] level_nassim = [] + datatype_dict = None + datatype = [] + datatype_assim = [] coord_dict = { 'channel': 'Channel', 'scan': 'Scan', 'pressure': 'Level', 'region': 'Region', - 'iter': 'Iteration' + 'iter': 'Iteration', + 'data type': 'DataType' } with open(control_file, 'r') as fp: @@ -554,8 +596,12 @@ def get_stn_ctl_dict(self, control_file): 'levels_nassim': level_nassim} dims['xdef'] = len(levs) + if 'DataType' in coords.values(): + datatype_dict = {'datatype': datatype, + 'assim': datatype_assim} + fp.close() - return coords, dims, attribs, nvars, vars, scanpo, levs_dict, chans_dict + return coords, dims, attribs, nvars, vars, scanpo, levs_dict, chans_dict, datatype_dict # ---------------------------------------------------------------------------------------------- @@ -598,7 +644,7 @@ def read_ieee(self, file_name, coords, dims, ndims_used, dims_arr, nvars, vars, load_data = False if ndims_used == 1: # MinMon - rtn_array = np.empty((0, dims[dims_arr[0]]), dtype=float) + rtn_array = np.empty((0, dims[dims_arr[0]]), datatype=float) if not load_data: zarray = np.zeros((dims[dims_arr[0]]), float) dimensions = [dims[dims_arr[0]]] @@ -609,7 +655,7 @@ def read_ieee(self, file_name, coords, dims, ndims_used, dims_arr, nvars, vars, zarray = np.zeros((dims[dims_arr[0]], dims[dims_arr[1]]), float) dimensions = [dims[dims_arr[1]], dims[dims_arr[0]]] - if ndims_used == 3: # RadMon angle + if ndims_used == 3: # RadMon angle, ConMon time/vert rtn_array = np.empty((0, dims[dims_arr[0]], dims[dims_arr[1]], dims[dims_arr[2]]), float) zarray = np.zeros((dims[dims_arr[0]], dims[dims_arr[1]], @@ -627,7 +673,7 @@ def read_ieee(self, file_name, coords, dims, ndims_used, dims_arr, nvars, vars, tarr = zarray else: mylist = [] - for z in range(5): + for z in range(dims['zdef']): arr = f.read_reals(dtype=np.dtype('>f4')).reshape(dims['ydef'], dims['xdef']) mylist.append(np.transpose(arr)) @@ -635,6 +681,7 @@ def read_ieee(self, file_name, coords, dims, ndims_used, dims_arr, nvars, vars, else: tarr = zarray + rtn_array = np.append(rtn_array, [tarr], axis=0) else: # ndims_used == 1|2 @@ -768,7 +815,7 @@ def var_to_np_array(self, dims, ndims_used, dims_arr, var): # ---------------------------------------------------------------------------------------------- - def get_dim_ranges(self, coords, dims, channo): + def get_dim_ranges(self, coords, dims, channo, datatypes): """ Get the valid ranges for each dimension based on the specified coordinates and channel @@ -778,6 +825,7 @@ def get_dim_ranges(self, coords, dims, channo): coords (dict): Dictionary of coordinates. dims (dict): Dictionary of dimension sizes. channo (list): List of channel numbers. + datatypes (list): List of data types. Returns: numpy.ndarray or None: Valid x coordinate range or None. @@ -795,7 +843,12 @@ def get_dim_ranges(self, coords, dims, channo): # - The z coordinate is never used for channel. 
if dims['xdef'] > 1: - x_range = channo if coords['xdef'] == 'Channel' else np.arange(1, dims['xdef']+1) + if coords['xdef'] == 'Channel': + x_range = channo + elif coords['xdef'] == 'DataType': + x_range = datatypes + else: + x_range = np.arange(1, dims['xdef']+1) if dims['ydef'] > 1: y_range = channo if coords['ydef'] == 'Channel' else np.arange(1, dims['ydef']+1) @@ -842,7 +895,7 @@ def get_ndims_used(self, dims): # ---------------------------------------------------------------------------------------------- def load_dset(self, vars, nvars, coords, darr, dims, ndims_used, - dims_arr, x_range, y_range, z_range, channo, cyc_darr=None): + dims_arr, x_range, y_range, z_range, cyc_darr=None): """ Create a dataset from various components. @@ -859,7 +912,6 @@ def load_dset(self, vars, nvars, coords, darr, dims, ndims_used, y_range (numpy.ndarray or None): Valid y coordinate range. z_range (numpy.ndarray or None): Valid z coordinate range. cyc_darr (numpy.ndarray): Numpy array of cycle data. - channo (list): List of channel numbers. Returns: xarray.Dataset: Created dataset. @@ -904,9 +956,6 @@ def load_dset(self, vars, nvars, coords, darr, dims, ndims_used, coords[dims_arr[2]]: z_range } - if 'Channel' in coords.values(): - d.update({"Channel": {"dims": ("Channel"), "data": channo}}) - new_ds = Dataset.from_dict(d) rtn_ds = new_ds if rtn_ds is None else rtn_ds.merge(new_ds) @@ -948,7 +997,8 @@ def load_dset(self, vars, nvars, coords, darr, dims, ndims_used, # ---------------------------------------------------------------------------------------------- - def loadConditionalItems(self, dataset, chans_dict, levs_dict, scanpo, iterations=None): + def loadConditionalItems(self, dataset, chans_dict, levs_dict, datatype_dict, + scanpo, iterations=None): """ Add channel, level, and scan related variables to the dataset. @@ -973,13 +1023,28 @@ def loadConditionalItems(self, dataset, chans_dict, levs_dict, scanpo, iteration dataset['chan_nassim'] = (['Channel'], chans_dict["chans_nassim"]) if levs_dict is not None: - dataset['level'] = (['Level'], levs_dict["levels"]) + + # If datatype_dict is available then level needs to include that dimension + # for potential batch processing by DataType (conmon vert plots). 
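+            # Each data type row repeats the same list of level values, so 'level'
+            # becomes a (DataType, Level) shaped array.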
+ if datatype_dict is not None: + combined_list = [] + for x in datatype_dict['datatype']: + combined_list.append(levs_dict['levels']) + dataset['level'] = (['DataType', 'Level'], combined_list) + else: + dataset['level'] = (['Level'], levs_dict["levels"]) dataset['level_yaxis_z'] = (['Level'], [0.0]*len(levs_dict["levels"])) - if len(levs_dict["levels_assim"]) > 0: + if 'levels_assim' in levs_dict: dataset['level_assim'] = (['Level'], levs_dict["levels_assim"]) - if len(levs_dict["levels_nassim"]) > 0: + if 'levels_nassim' in levs_dict: dataset['level_nassim'] = (['Level'], levs_dict["levels_nassim"]) + if 'levels_value' in levs_dict: + dataset['level_value'] = (['Level'], levs_dict["levels_value"]) + + if datatype_dict is not None: + dataset['datatype'] = (['DataType'], datatype_dict['datatype']) + dataset['datatype_assim'] = (['DataType'], datatype_dict['assim']) if scanpo is not None: nscan = dataset.dims.get('Scan') diff --git a/src/eva/plotting/batch/base/diagnostics/line_plot.py b/src/eva/plotting/batch/base/diagnostics/line_plot.py index 8b8c2d14..2d733783 100644 --- a/src/eva/plotting/batch/base/diagnostics/line_plot.py +++ b/src/eva/plotting/batch/base/diagnostics/line_plot.py @@ -58,16 +58,21 @@ def __init__(self, config, logger, dataobj): logger.abort('In Scatter comparison the first variable \'var1\' does not appear to ' + 'be in the required format of collection::group::variable.') - # Optionally get the channel|level to plot + # Optionally get the channel|level|datatype to plot channel = None if 'channel' in config: channel = config.get('channel') level = None if 'level' in config: level = config.get('level') - - xdata = dataobj.get_variable_data(var0_cgv[0], var0_cgv[1], var0_cgv[2], channel, level) - ydata = dataobj.get_variable_data(var1_cgv[0], var1_cgv[1], var1_cgv[2], channel, level) + datatype = None + if 'datatype' in config: + datatype = config.get('datatype') + + xdata = dataobj.get_variable_data(var0_cgv[0], var0_cgv[1], + var0_cgv[2], channel, level, datatype) + ydata = dataobj.get_variable_data(var1_cgv[0], var1_cgv[1], + var1_cgv[2], channel, level, datatype) # see if we need to slice data xdata = slice_var_from_str(config['x'], xdata, logger) diff --git a/src/eva/plotting/batch/base/plot_tools/dynamic_config.py b/src/eva/plotting/batch/base/plot_tools/dynamic_config.py index ffe24508..4df75fd2 100644 --- a/src/eva/plotting/batch/base/plot_tools/dynamic_config.py +++ b/src/eva/plotting/batch/base/plot_tools/dynamic_config.py @@ -62,15 +62,16 @@ def vminvmaxcmap(logger, option_dict, plots_dict, data_collections): # is kept percentage_capture = option_dict.get('percentage capture', 100) - # Optionally the data might have a channel or level. + # Optionally the data might have a channel, level, or datatype. 
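+        # ('datatype' selects along the DataType dimension used for ConMon data; its
+        # value is one of the data type labels read from the control file.)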
channel = option_dict.get('channel', None) level = option_dict.get('level', None) + datatype = option_dict.get('datatype', None) # Get the data variable to use for determining cbar limits varname = option_dict.get('data variable') varname_cgv = varname.split('::') datavar = data_collections.get_variable_data(varname_cgv[0], varname_cgv[1], - varname_cgv[2], channel, level) + varname_cgv[2], channel, level, datatype) # Reorder the data array datavar = np.sort(datavar) diff --git a/src/eva/plotting/batch/base/plot_tools/figure_driver.py b/src/eva/plotting/batch/base/plot_tools/figure_driver.py index 28464d38..5a7d1bfc 100644 --- a/src/eva/plotting/batch/base/plot_tools/figure_driver.py +++ b/src/eva/plotting/batch/base/plot_tools/figure_driver.py @@ -98,7 +98,7 @@ def figure_driver(config, data_collections, timing, logger): step_var_name = 'channel' title_fill = ' Ch. ' - # Get list of levels, conditionally override step variables + # Get list of levels, load step variables levels_str_or_list = batch_conf.get('levels', []) levels = parse_channel_list(levels_str_or_list, logger) if levels: @@ -106,6 +106,13 @@ def figure_driver(config, data_collections, timing, logger): step_var_name = 'level' title_fill = ' Lev. ' + # Get list of datatypes and load step variables + datatypes = batch_conf.get('datatypes', []) + if datatypes: + step_vars = datatypes + step_var_name = 'datatype' + title_fill = ' Dtype. ' + # Set some fake values to ensure the loops are entered if not variables: logger.abort("Batch Figure must provide variables, even if with channels") diff --git a/src/eva/plotting/batch/emcpy/diagnostics/line_plot.py b/src/eva/plotting/batch/emcpy/diagnostics/line_plot.py index 1738cf8b..8c2f5175 100644 --- a/src/eva/plotting/batch/emcpy/diagnostics/line_plot.py +++ b/src/eva/plotting/batch/emcpy/diagnostics/line_plot.py @@ -58,16 +58,21 @@ def __init__(self, config, logger, dataobj): logger.abort('In Scatter comparison the first variable \'var1\' does not appear to ' + 'be in the required format of collection::group::variable.') - # Optionally get the channel|level to plot + # Optionally get the channel|level|datatype to plot channel = None if 'channel' in config: channel = config.get('channel') level = None if 'level' in config: level = config.get('level') + datatype = None + if 'datatype' in config: + datatype = config.get('datatype') - xdata = dataobj.get_variable_data(var0_cgv[0], var0_cgv[1], var0_cgv[2], channel, level) - ydata = dataobj.get_variable_data(var1_cgv[0], var1_cgv[1], var1_cgv[2], channel, level) + xdata = dataobj.get_variable_data(var0_cgv[0], var0_cgv[1], var0_cgv[2], + channel, level, datatype) + ydata = dataobj.get_variable_data(var1_cgv[0], var1_cgv[1], var1_cgv[2], + channel, level, datatype) # see if we need to slice data xdata = slice_var_from_str(config['x'], xdata, logger) @@ -97,7 +102,7 @@ def __init__(self, config, logger, dataobj): 'batch', 'emcpy', 'defaults', 'line_plot.yaml')) config = get_schema(layer_schema, config, logger) - delvars = ['x', 'y', 'type', 'schema', 'channel', 'level'] + delvars = ['x', 'y', 'type', 'schema', 'channel', 'level', 'datatype'] for d in delvars: config.pop(d, None) self.plotobj = update_object(self.plotobj, config, logger)