diff --git a/README.md b/README.md
deleted file mode 100644
index ac91ec2..0000000
--- a/README.md
+++ /dev/null
@@ -1,2 +0,0 @@
-# rnr-score-harvester
-Repo to house any code necessary to pull diagnostics files or rmse scores related to ufs-rnr experiments and push the output to desired location and in desired format
diff --git a/src/score_hv/__init__.py b/src/score_hv/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/data/gridcell-area_noaa-ufs-gefsv13replay-pds_bfg_control_1536x768_20231116.nc b/src/score_hv/data/gridcell-area_noaa-ufs-gefsv13replay-pds_bfg_control_1536x768_20231116.nc
similarity index 100%
rename from data/gridcell-area_noaa-ufs-gefsv13replay-pds_bfg_control_1536x768_20231116.nc
rename to src/score_hv/data/gridcell-area_noaa-ufs-gefsv13replay-pds_bfg_control_1536x768_20231116.nc
diff --git a/src/score_hv/harvesters/__init__.py b/src/score_hv/harvesters/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/src/score_hv/harvesters/daily_bfg.py b/src/score_hv/harvesters/daily_bfg.py
old mode 100755
new mode 100644
index c3130b2..ffd2fac
--- a/src/score_hv/harvesters/daily_bfg.py
+++ b/src/score_hv/harvesters/daily_bfg.py
@@ -2,38 +2,57 @@
 import os
 import sys
-from pathlib import Path
-
+import ast
+import xarray as xr
 import numpy as np
+import cftime
+from pathlib import Path
 from netCDF4 import MFDataset
-import xarray as xr
 from datetime import datetime as dt
-import cftime
 from collections import namedtuple
 from dataclasses import dataclass
 from dataclasses import field
-
-from score_hv.config_base import ConfigInterface
+from score_hv.config_base import ConfigInterface
+from score_hv.stats_utils import VarStatsCatalog
+from score_hv.region_utils import GeoRegionsCatalog
+from score_hv.mask_utils import MaskCatalog
 
 HARVESTER_NAME = 'daily_bfg'
 VALID_STATISTICS = ('mean', 'variance', 'minimum', 'maximum')
-
+VALID_REGION_BOUND_KEYS = ('north_lat', 'south_lat', 'west_long', 'east_long')
+DEFAULT_REGION = {'global': {'north_lat': 90.0, 'south_lat': -90.0, 'west_long': 0.0, 'east_long': 360.0}}
+"""
+The following variables are automatically masked when they are read in.
+The soil variables are soill4, soilm, soilt4 and tg3.
+This mask is different from the user-requested mask.
+"""
+VARIABLES_TO_MASK = ['soill4', 'soilm', 'soilt4', 'tg3']
+VALID_MASKS = ['land', 'ocean', 'sea', 'ice']
 """Variables of interest that come from the background forecast
 data. Commented out variables can be uncommented to generate gridcell
 weighted statistics but are in development and are currently not fully
 supported.
 """
-VALID_VARIABLES = (#'icetk', # sea ice thickness (m)
-                   #'lhtfl_ave', # surface latent heat flux (W m^-2)
-                   #'prate_ave', # surface precip rate (mm weq. s^-1)
-                   'prateb_ave', # bucket surface precip rate (mm weq. s^-1)
+VALID_VARIABLES = (
+    #'icetk', # sea ice thickness (m)
+    'lhtfl_ave', # surface latent heat flux (W/m**2)
+    'shtfl_ave', # surface sensible heat flux (W/m**2)
+    'dlwrf_ave', # surface downward longwave flux (W/m**2)
+    'dswrf_ave', # averaged surface downward shortwave flux (W/m**2)
+    'ulwrf_ave', # surface upward longwave flux (W/m**2)
+    'uswrf_ave', # averaged surface upward shortwave flux (W/m**2)
+    'netrf_avetoa', # top of atmosphere net radiative flux (SW and LW) (W/m**2)
+    'netef_ave', # surface energy balance (W/m**2)
+    'prate_ave', # surface precip rate (mm weq. s^-1)
     #'pressfc', # surface pressure (Pa)
     #'snod', # surface snow depth (m)
-    #'soil4', # liquid soil moisture at layer-4 (?)
-    #'soilm', # total column soil moisture content (mm weq.)
-    #'soilt4', # soil temperature unknown layer 4 (K)
-    #'tg3', # deep soil temperature (K)
-    #'tmp2m', # 2m (surface air) temperature (K)
-    #'tmpsfc', # surface temperature (K)
+    'soill4', # liquid soil moisture at layer-4 (?)
+    'soilm', # total column soil moisture content (mm weq.)
+    'soilt4', # soil temperature unknown layer 4 (K)
+    'tg3', # deep soil temperature (K)
+    'tmp2m', # 2m (surface air) temperature (K)
+    'tmpsfc', # surface temperature (K)
+    'ulwrf_avetoa', # top of atmos upward longwave flux (W m^-2)
     #'weasd', # surface snow water equivalent (mm weq.)
     )
@@ -43,12 +62,172 @@
                            'value',
                            'units',
                            'mediantime',
-                           'longname'])
+                           'longname',
+                           'surface_mask',
+                           'regions'])
 
 def get_gridcell_area_data_path():
-    return os.path.join(Path(__file__).parent.parent.parent.parent.resolve(),
+    return os.path.join(Path(__file__).parent.parent.resolve(),
                         'data',
                         'gridcell-area' +
                         '_noaa-ufs-gefsv13replay-pds' +
                         '_bfg_control_1536x768_20231116.nc')
+
+def get_median_cftime(xr_dataset):
+    """xr_dataset is a netCDF bfg dataset whose time stamps represent
+    end points of the time period.
+    """
+    temporal_endpoints = np.array([cftime.date2num(time, \
+                                   'hours since 1951-01-01 00:00:00') \
+                                   for time in xr_dataset['time']])
+    num_times = len(temporal_endpoints)
+    if num_times > 1:
+        """ we can estimate the time step only if there is more than one
+        timestamp
+        """
+        temporal_midpoints = temporal_endpoints - np.gradient(temporal_endpoints)/2.
+
+        median_cftime = cftime.num2date(np.median(temporal_midpoints), \
+                                        'hours since 1951-01-01 00:00:00')
+    else:
+        median_cftime = cftime.num2date(np.median(temporal_endpoints), \
+                                        'hours since 1951-01-01 00:00:00')
+    return(median_cftime)
+
+def check_variable_exists(var_name, dataset_variable_names):
+    """
+    Checks that a field exists in the xarray dataset. If the variable
+    does not exist, a KeyError is raised naming the missing variable.
+    Parameters:
+    var_name: The name of the variable to check for.
+    dataset_variable_names: A list of variable names that are
+                            in the input dataset.
+    """
+    if var_name not in dataset_variable_names:
+        raise KeyError(f"Missing required field in the dataset: {var_name}")
+
+def calculate_surface_energy_balance(xr_dataset, dataset_variable_names):
+    """
+    This method calculates the derived variable netef_ave.
+    The required fields are:
+    dswrf_ave : averaged surface downward shortwave flux
+    dlwrf_ave : surface downward longwave flux
+    ulwrf_ave : surface upward longwave flux
+    uswrf_ave : averaged surface upward shortwave flux
+    shtfl_ave : surface sensible heat flux
+    lhtfl_ave : surface latent heat flux
+    netef_ave = dswrf_ave + dlwrf_ave - ulwrf_ave - uswrf_ave - shtfl_ave - lhtfl_ave
+    Parameter:
+    xr_dataset - The entire bfg data set given by the user in the harvested
+                 data list.
+    Return - The calculated surface energy balance variable.
+ """ + required_variables = ['dswrf_ave', 'dlwrf_ave', 'ulwrf_ave', 'uswrf_ave', 'shtfl_ave', 'lhtfl_ave'] + for var_name in required_variables: + check_variable_exists(var_name,dataset_variable_names) + + dswrf=xr_dataset['dswrf_ave'] + dlwrf=xr_dataset['dlwrf_ave'] + ulwrf=xr_dataset['ulwrf_ave'] + uswrf=xr_dataset['uswrf_ave'] + shtfl=xr_dataset['shtfl_ave'] + lhtfl=xr_dataset['lhtfl_ave'] + netef_ave_data = dswrf + dlwrf - ulwrf - uswrf - shtfl - lhtfl + return(netef_ave_data) + +def calculate_toa_radative_flux(xr_dataset,dataset_variable_names): + """ + This method calculates the derived variable netef_ave. + The variable name netrf_avetoa referes to the top of the + atmosphere (TOA) net radiative energy flux. + The required fields are: + dswrf_avetoa:averaged surface downward shortwave flux + uswrf_avetoa:averaged surface upward shortwave flux + ulwrf_avetoa:surface upward longwave flux + netrf_avetoa = dswrf_avetoa - uswrf_avetoa - ulwrf_avetoa + Parameter: + xr_dataset - The entire bfg data set given by the user in the harvested + data list. + Return - The calculated the top of atmosphere radiative energy flux. + """ + required_variables = ['dswrf_avetoa', 'uswrf_avetoa', 'ulwrf_avetoa'] + for var_name in required_variables: + check_variable_exists(var_name,dataset_variable_names) + + dswrf=xr_dataset['dswrf_avetoa'] + uswrf=xr_dataset['uswrf_avetoa'] + ulwrf=xr_dataset['ulwrf_avetoa'] + netrf_avetoa=dswrf - uswrf - ulwrf + return(netrf_avetoa) + + +def check_array_dimensions(region_variable,region_weights): + """ + This method makes sure the dimensions numbers of grid_ny + and grid_ny are the same after subsetting the region_variable and + region weights into the region requested by the user. + If they are not the same we need to exit the program. + The calculation of the statistics will fail. + Parameters: + region_variable - The variable data from the bfg data files + that corresponds to the region requested bye + the user. + region_weights - The weights from the weight files that corresponds to + the region requested by the user. + Return - nothing is returned. + """ + vardims = region_variable.dims + for dim in vardims: + if dim in region_variable.coords and dim in region_weights.coords: + var_coord_values = region_variable.coords[dim].size + weight_coord_values = region_weights.coords[dim].size + try: + assert var_coord_values == weight_coord_values + except AssertionError: + msg = (f"Mismatch in coordinate sizes for dimension '{dim}':\n" + f"region_variable '{dim}' size: {var_coord_values}\n" + f"region_weights '{dim}' size: {weight_coord_values}") + raise ValueError(msg) + + +def calculate_and_normalize_solid_angle(sum_global_weights,region_weights): + """ + This method calculates the solid angle for the regional weights + and normalizes them. + Parameters: + sum_global_weights - The sum_of_global_weights is is the sum of the + variable "area" in the gridcell area data file. + region_weights - The region weights are the weights the + gridcell_area data file that corresponds to the region + requested by the user. The region weights come in as + an xarray Dataset. + Return - The normalized weights based on the solid angle is returned. 
+ """ + region_weights = region_weights.to_array() + sum_region_weights = region_weights.sum() + """ + The line region_solid_angle calculates the sum of weights for a region, + adjusted by the ratio of the region's weights to the + sum of global weights, and then scaled by the + solid angle of a sphere (which is 4 * np.pi) + """ + region_solid_angle = (sum_region_weights / sum_global_weights) * 4 * np.pi + """ + Calculate the solid angle for each weight in the region. + Make sure the calculated region solid angle matches the sum of + the individual weights' solid angles. If it does not we need to + exit with an error. + """ + region_solid_angle_sum_of_weights = (region_weights / sum_global_weights) * 4 * np.pi + if not np.isclose(region_solid_angle, region_solid_angle_sum_of_weights.sum()): + raise ValueError(f"The region solid angle {region_solid_angle}" + f"does not match the sum of the weights") + + # Normalize the weights based on the solid angle sum. + normalized_weights = region_solid_angle_sum_of_weights / region_solid_angle_sum_of_weights.sum() + return(normalized_weights) + + @dataclass class DailyBFGConfig(ConfigInterface): @@ -60,9 +239,11 @@ def __post_init__(self): def set_config(self): """ function to set configuration variables from given dictionary """ - self.harvest_filenames = self.config_data.get('filenames') + self.harvest_filenames=self.config_data.get('filenames') self.set_stats() self.set_variables() + self.set_surface_mask() + self.set_regions() def set_variables(self): """ @@ -73,14 +254,36 @@ def set_variables(self): for var in self.variables: if var not in VALID_VARIABLES: msg = ("'%s' is not a supported " - "variable to harvest from the background forecast data. " + "variablee to harvest from the background forecast data. " "Please reconfigure the input dictionary using only the " "following variables: %r" % (var, VALID_VARIABLES)) raise KeyError(msg) - + def get_stats(self): ''' return list of all stat types based on harvest_config ''' return self.stats + + def get_masks(self): + return self.surface_mask + + def set_surface_mask(self): + self.surface_mask = self.config_data.get('surface_mask') + if self.surface_mask != None: + for mask in self.surface_mask: + if mask not in VALID_MASKS: + msg = f'The mask: {mask} is not a supported mask. ' \ + f'Valid masks are: "land, ocean ,sea, ice.' 
+                    raise ValueError(msg)
+
+    def get_regions(self):
+        ''' return list of regions based on harvest config'''
+        return self.regions
+
+    def set_regions(self):
+        self.regions = self.config_data.get('regions')
+        if self.regions is None:
+            self.regions = DEFAULT_REGION
+            print("Setting a default global region ", self.regions)
 
     def set_stats(self):
         """
@@ -94,7 +297,7 @@ def set_stats(self):
                        "Please reconfigure the input dictionary using only the "
                        "following statistics: %r" % (stat, VALID_STATISTICS))
                 raise KeyError(msg)
 
     def get_variables(self):
         ''' return list of all variable types based on harvest_config'''
         return self.variables
@@ -107,7 +310,7 @@ class DailyBFGHv(object):
     Parameters:
     -----------
     config: DailyBFGConfig object containing information used to determine
             which variable to parse the log file
     Methods:
     --------
     get_data: parse descriptive statistics from log files based on input
@@ -116,7 +319,7 @@ class DailyBFGHv(object):
         returns a list of tuples containing specific data
     """
     config: DailyBFGConfig = field(default_factory=DailyBFGConfig)
 
     def get_data(self):
         """ Harvests requested statistics and variables from background
             forecast data and returns harvested_data, a list of HarvestData
@@ -137,74 +340,163 @@ def get_data(self):
                                        combine='nested',
                                        concat_dim='time',
                                        decode_times=True)
-
+        # Get the list of variable names in the dataset
+        dataset_variable_names = list(xr_dataset.data_vars.keys())
+        latitudes = xr_dataset['grid_yt']
+        longitudes = xr_dataset['grid_xt']
         gridcell_area_data = xr.open_dataset(get_gridcell_area_data_path())
         gridcell_area_weights = (gridcell_area_data.variables['area'])
-
-        temporal_endpoints = np.array([cftime.date2num(time,
-            'hours since 1951-01-01 00:00:00') for time in xr_dataset['time']])
-
-        if len(self.config.harvest_filenames) > 1:
-            """ can estimate the time step only if there're more than 1
-            timestamps
-            """
-            temporal_midpoints = temporal_endpoints - np.gradient(temporal_endpoints)/2.
-
-            median_cftime = cftime.num2date(np.median(temporal_midpoints),
-                                            'hours since 1951-01-01 00:00:00')
-        else:
-            median_cftime = cftime.num2date(np.median(temporal_endpoints),
-                                            'hours since 1951-01-01 00:00:00')
+        """
+        This sum is needed for the calculation of the weighted sum of the
+        regional variable data.
+        """
+        sum_global_weights = gridcell_area_data['area'].sum()
+        median_cftime = get_median_cftime(xr_dataset)
+
+        """
+        Get the statistics requested by the user.
+        Instantiate the stats class in stats_utils.py and initialize the
+        temporal_means list. The temporal means computed here are passed
+        to the VarStatsCatalog class, which calculates and returns the
+        requested statistics.
+        """
+        stats_list = self.config.get_stats()
+        var_stats_instance = VarStatsCatalog(stats_list)
+
+        """
+        We will always have at least one region, so we can instantiate the
+        region catalog. The regions were gathered in the set_regions method
+        above in the DailyBFGConfig class and are defined in the
+        self.config.regions dictionary.
+        """
+        regions_catalog = GeoRegionsCatalog(xr_dataset)
+        regions_catalog.add_user_region(self.config.regions)
+
+        """
+        Here we initialize the MaskCatalog class. The sotyp variable
+        in the dataset is used to distinguish between land, ocean and ice:
+        ocean = sotyp has values of 0
+        sea = sotyp has values of 0
+        ice = sotyp has values of 16
+        land = sotyp has values between ocean and ice.
+ """ + soil_type = 'sotyp' + check_variable_exists(soil_type,dataset_variable_names) + soil_type_variable = xr_dataset['sotyp'] + mask_instance = MaskCatalog(self.config.surface_mask,soil_type_variable) for i, variable in enumerate(self.config.get_variables()): - """ The first nested loop iterates through each requested variable + """ + The first nested loop iterates through each requesed variable. + """ + namelist=self.config.get_variables() + var_name=namelist[i] + if var_name == "netef_ave": + variable_data = calculate_surface_energy_balance(xr_dataset,dataset_variable_names) + longname = "surface energy balance" + units = "W/m**2" + elif var_name == "netrf_avetoa": + variable_data = calculate_toa_radative_flux(xr_dataset,dataset_variable_names) + longname =" Top of atmosphere net radiative flux" + units = "W/m**2" + else: + check_variable_exists(var_name,dataset_variable_names) + variable_data = xr_dataset[variable] + if var_name in VARIABLES_TO_MASK: + variable_data = mask_instance.initial_mask_of_variable(var_name,variable_data) + + variable_data = mask_instance.replace_bad_values_with_nan(variable_data) + if 'long_name' in variable_data.attrs: + longname=variable_data.attrs['long_name'] + else: + longname="None" + if 'units' in variable_data.attrs: + units=variable_data.attrs['units'] + else: + units="None" + """ - variable_data = xr_dataset[variable] - temporal_means = np.ma.masked_invalid(variable_data.mean(dim='time', - skipna=True)) - if 'long_name' in variable_data.attrs: - longname = variable_data.attrs['long_name'] - else: - longname = "None" - if 'units' in variable_data.attrs: - units = variable_data.attrs['units'] - else: - units = "None" - - expected_value, sumweights = np.ma.average(temporal_means, - weights=gridcell_area_weights, - returned=True) - assert sumweights >= 0.999 * 4. * np.pi - assert sumweights <= 1.001 * 4. * np.pi - + The temporal means is a list which will hold the + temporal means calculated in this function. + The temporal means are passed into the stats_util + class in order to calculate the statistics requested + by the user. + """ + temporal_means = [] + var_stats_instance.clear_requested_statistics() + for iregion in range(len(self.config.regions)): + region_variable_data = regions_catalog.get_region_data(iregion,variable_data) + region_weights = regions_catalog.get_region_data(iregion,gridcell_area_data) + check_array_dimensions(region_variable_data,region_weights) + normalized_weights = calculate_and_normalize_solid_angle(sum_global_weights,region_weights) + + if self.config.surface_mask != None: + """ + Subset the soil_type_variable into the region requested by the user. + """ + region_mask = regions_catalog.get_region_data(iregion,soil_type_variable) + check_array_dimensions(region_variable_data,region_mask) + """ + Delete the values in the region_mask that are not wanted by the user. + """ + masked_array = mask_instance.user_mask_array(region_mask) + masked_array_firsttime = masked_array.isel(time=0) + """ + Adding the drop=False keeps the original xarray.DataArray structure. That is + it keeps the original dimension names and metadata. Where the dimension names + are time,grid_yt,grid_xt. + NOTE:region_variable_data is of type xarray.core.dataarray.DataArray + landmasks_array is of type xarray.core.dataarray.DataArray. 
+                    normalized_weights is of type xarray.core.dataarray.DataArray.
+                    """
+                    region_variable_data = region_variable_data.where(masked_array, drop=False)
+                    normalized_weights = normalized_weights.where(masked_array_firsttime, drop=False)
+
+                # Compute the mean over the time dimension.
+                value = region_variable_data.mean(dim='time', skipna=True)
+                temporal_means.append(value)
+                normalized_weights = normalized_weights.squeeze(axis=0)
+                """
+                Here we pass the normalized_weights and the temporal mean into
+                the stats class.
+                Note: we had to "squeeze" the normalized_weights down to just
+                the grid_yt and grid_xt dimensions to get rid of the time
+                dimension. This is done because the temporal_means array does
+                not have the time dimension. The stats class will give an error
+                when calculating the weighted mean and the variance of the data
+                if the dimensions of the normalized weights and temporal mean
+                are different.
+                """
+                var_stats_instance.calculate_requested_statistics(normalized_weights, value)
+
             for j, statistic in enumerate(self.config.get_stats()):
                 """ The second nested loop iterates through each requested
-                statistic
-                """
+                statistic, and through regions if the user has requested
+                geographical regions.
+                """
                 if statistic == 'mean':
-                    value = expected_value
+                    value = var_stats_instance.weighted_averages
                 elif statistic == 'variance':
-                    value = -expected_value**2 + np.ma.sum(
-                        temporal_means**2 *
-                        (gridcell_area_weights /
-                         gridcell_area_weights.sum()))
-
+                    value = var_stats_instance.variances
+
                 elif statistic == 'maximum':
-                    value = np.ma.max(temporal_means)
-
+                    value = var_stats_instance.maximum
+
                 elif statistic == 'minimum':
-                    value = np.ma.min(temporal_means)
-
+                    value = var_stats_instance.minimum
+
                 harvested_data.append(HarvestedData(
-                    self.config.harvest_filenames,
-                    statistic,
-                    variable,
-                    np.float32(value),
-                    units,
-                    dt.fromisoformat(median_cftime.isoformat()),
-                    longname))
-
+                    self.config.harvest_filenames,
+                    statistic,
+                    variable,
+                    np.float32(value),
+                    units,
+                    dt.fromisoformat(median_cftime.isoformat()),
+                    longname,
+                    self.config.surface_mask,
+                    self.config.regions))
+
         gridcell_area_data.close()
         xr_dataset.close()
-        return harvested_data
\ No newline at end of file
+        return harvested_data
diff --git a/src/score_hv/mask_utils.py b/src/score_hv/mask_utils.py
new file mode 100644
index 0000000..642423c
--- /dev/null
+++ b/src/score_hv/mask_utils.py
@@ -0,0 +1,103 @@
+"""
+Copyright 2024 NOAA
+All rights reserved.
+
+Collection of methods to work with masking of user requested variables
+"""
+import sys, os
+import numpy as np
+import xarray as xr
+
+VALID_MASKS = ['land', 'ocean', 'sea', 'ice']
+
+class MaskCatalog:
+    def __init__(self, user_mask_value, soil_type_values):
+        """
+        Here we initialize the MaskCatalog class.
+        """
+        self.name = None
+        self.user_mask = user_mask_value
+        """The variable in the data set that is used for masking is
+        "sotyp". The sotyp array values are passed in from the
+        calling routine, where they are read from the data set.
+        ocean = sotyp has values of 0
+        sea = sotyp has values of 0
+        ice = sotyp has values of 16
+        land = sotyp has values between ocean and ice.
+        """
+        self.data_mask = soil_type_values
+
+    def initial_mask_of_variable(self, var_name, variable_data):
+        """
+        This method does the initial masking of special variables.
+        The land variables that are masked initially are:
+        soill4, soilm, soilt4 and tg3.
+        It uses the sotyp (soil type) variable in the data set.
+        Parameters:
+        var_name - The variable name.
+        variable_data - The initial variable data with no masking.
+        return - The variable data is returned with the ocean and ice
+                 values set to missing.
+        """
+
+        self.name = var_name
+        """
+        The values of 0 and 16 in the sotyp variable are used to delete
+        values over ocean and ice.
+        """
+        masked_data = variable_data.where((self.data_mask != 0) & (self.data_mask != 16))
+        return(masked_data)
+
+    def replace_bad_values_with_nan(self, variable_data):
+        """
+        Check for _FillValue or missing_value in the variable data.
+        This method will replace the missing or fill values with
+        NaN.
+        Parameters:
+        variable_data - The variable field data. variable_data is of
+                        type xarray.core.dataarray.DataArray.
+        Return - The variable field data with NaNs where the
+                 grid values are missing or fill values. The returned
+                 masked_variable is of type
+                 xarray.core.dataarray.DataArray.
+        """
+        fill_value = variable_data.encoding.get('_FillValue', None)
+        missing_value = variable_data.encoding.get('missing_value', None)
+
+        """
+        Create a combined mask for fill values, missing values and
+        non-finite values.
+        """
+        if fill_value is not None and missing_value is not None:
+            mask = (variable_data != fill_value) & (variable_data != missing_value)
+        elif fill_value is not None:
+            mask = variable_data != fill_value
+        elif missing_value is not None:
+            mask = variable_data != missing_value
+        else:
+            mask = np.isfinite(variable_data)
+
+        # Apply the mask
+        masked_variable = variable_data.where(mask, np.nan)
+        return(masked_variable)
+
+
+    def user_mask_array(self, region_mask):
+        """
+        The user has requested a mask. Here we keep only the
+        type of data that the user wants.
+        Parameters:
+        region_mask - This is the sotyp variable read in from the
+                      bfg data file.
+        Return - The mask array is returned. This will contain
+                 boolean values: True for grid points we want and False
+                 for grid points we do not want.
+        """
+        # Per the sotyp conventions noted above: ocean/sea = 0, ice = 16,
+        # land = everything else.
+        if 'land' in self.user_mask:
+            masked_array = ~region_mask.isin([0, 16])
+        elif 'ocean' in self.user_mask or 'sea' in self.user_mask:
+            masked_array = region_mask.isin([0])
+        elif 'ice' in self.user_mask:
+            masked_array = region_mask.isin([16])
+        return(masked_array)
+
diff --git a/src/score_hv/region_utils.py b/src/score_hv/region_utils.py
new file mode 100644
index 0000000..bf00a83
--- /dev/null
+++ b/src/score_hv/region_utils.py
@@ -0,0 +1,242 @@
+"""
+Copyright 2024 NOAA
+All rights reserved.
+
+Collection of methods to work with user requested
+geographic regions.
+"""
+import numpy as np
+import xarray as xr
+"""
+The class GeoRegionsCatalog has functions to allow the user to request
+specific geographical regions.
+"""
+
+"""
+Here we define default latitude and longitude ranges:
+min lat is -90 and max lat is 90.
+NOTE: Our longitude on the BFG files goes from 0 to 360 degrees east,
+circular, so west_long is 0 and east_long is 360.
+"""
+NORTH_LAT = 90
+SOUTH_LAT = -90
+WEST_LONG = 0
+EAST_LONG = 360
+
+class GeoRegionsCatalog:
+    def __init__(self, dataset):
+        """
+        Here we initialize the catalog of regions.
+        Parameter: dataset - a dataset that has been
+        opened with xarray.
+        """
+        self.name = []
+        self.north_lat = []
+        self.south_lat = []
+        self.west_long = []
+        self.east_long = []
+        self.region_indices = []
+        self.latitude_values = dataset['grid_yt'].values
+        self.longitude_values = dataset['grid_xt'].values
+
+    def test_user_latitudes(self, north_lat, south_lat):
+        """
+        This method tests to make sure the latitude values
+        entered by the user are within the bounds of -90 to 90.
+ """ + if south_lat < -90 or north_lat > 90: + msg = f'southern latitude must be greater than -90. and less than 90. '\ + f'south_lat: {south_lat}' + raise ValueError(msg) + + if north_lat < -90 or north_lat > 90: + msg = f'northern latitude must be greater than -90 and less than 90. ' \ + f'north_lat: {north_lat}' + raise ValueError(msg) + + if south_lat > north_lat: + msg = f'southern latitude must be less than northern latitude ' \ + f'south_lat: {south_lat}, north_lat: {north_lat}' + raise ValueError(msg) + + def test_user_longitudes(self,west_long,east_long): + """ + This method tests to make sure the longitude values + entered by the user are within the bounds of 0 to 360. + """ + if east_long < 0 or east_long > 360: + msg = f'east longitude must be greater than 0 and less than 360 '\ + f'east_long: {east_long}' + raise ValueError(msg) + + if west_long < 0 or west_long > 360: + msg = f'west longitude must be greater than 0 and less than 360 '\ + f'west_long: {west_lon}' + raise ValueError(msg) + + + def add_user_region(self,dictionary): + """ + Parameters: + dictionary - The dictionary should contain the following keys. + name - name of the region. + north_lat - northern most latitude. + south_lat - southern most latitude. + west_long - westmost longitude. + east_long - eastmost longitude. + If the dictionay is empty we exit with a message. + If the region name key word is missing then we exit with a message. + If either the latitude or the longitude keys are missing we supply + a default. + """ + if not dictionary: + msg = f'The dictionary passed in to add_user_region is empty' + raise ValueError(msg) + + for region_name, input_dictionary in dictionary.items(): + if not region_name: + msg = 'No region name was given. Please enter a name for your region.' + raise ValueError(msg) + self.name.append(region_name) + + required_keywords = {'north_lat','south_lat','west_long','east_long'} + missing_keys = required_keywords - set(input_dictionary.keys()) + if "north_lat" in missing_keys: + print("north_lat is missing. Using the default of north latitude = 90") + north_lat = NORTH_LAT + else: + north_lat = input_dictionary.get("north_lat") + + if "south_lat" in missing_keys: + print("south_lat is missing. Using the default of south latitude = -90") + south_lat = SOUTH_LAT + else: + south_lat = input_dictionary.get("south_lat") + + if "west_long" in missing_keys: + print("west_long is missing. Using the default of west longitude = 0") + west_long = WEST_LONG + else: + west_long = input_dictionary.get("west_long") + + if "east_long" in missing_keys: + print("east_long is missing. Using the default of east longitude = 360") + east_long = EAST_LONG + else: + east_long = input_dictionary.get("east_long") + + self.test_user_latitudes(north_lat,south_lat) + self.north_lat.append(north_lat) + self.south_lat.append(south_lat) + + self.test_user_longitudes(west_long,east_long) + self.west_long.append(west_long) + self.east_long.append(east_long) + + def get_region_indices(self,region_index): + """ + This method calculates the indices in the latitude values and + longitude values that are passed in from the calling function. + The latitude and longitude values assigned in the + method, add_user_region, above are used to determine the + indices. + Parameters: + region_index - The number of the region that the user + has defined by the key word-name. There + can be multiple regions. + returns - The lat_indices and lon_indices. These lists + return the start and end indices as tuples. 
+ """ + num_long = len(self.longitude_values) + num_lat = len(self.latitude_values) + step_size = self.longitude_values[num_long-1]/num_long + + # Find latitude indices within the region. + name = self.name[region_index] + + north_lat = self.north_lat[region_index] + south_lat = self.south_lat[region_index] + latitude_indices = [index for index, lat in enumerate(self.latitude_values) if south_lat <= lat <= north_lat ] + if latitude_indices: + lat_start_index = latitude_indices[0] + lat_end_index = latitude_indices[-1] + else: + msg=f"No latitude values were found within the specified range of {south_lat} and {max_lat}." + raise KeyError(msg) + + + """Find longitue indices within the region. + We have two cases to consider here. + Ranges that do not cross the Prime Meridian and ranges that do cross the + prime meridian. Our longitude array is 0 to 360 degrees east circular. + """ + east_long = self.east_long[region_index] + west_long = self.west_long[region_index] + if east_long <= west_long: + #region crosses the prime meridian. So we need to get two separate regions. + long1_start_index = int(west_long/step_size) + long1_end_index = num_long - 1 + long2_start_index = 0 + long2_end_index = int(east_long/step_size) + else: + long1_start_index = int(west_long / step_size) + long1_end_index = int(east_long / step_size) + long2_start_index = -999 + long2_end_index = -999 + return (lat_start_index,lat_end_index,long1_start_index,long1_end_index,long2_start_index,long2_end_index) + + def get_region_data(self,region_index,data): + """ + This method returns the data which corresponds to the user selected + region depending on the region indicies. The region is a subset of the + original variable data. + The data.isel is a xarray method that selects a range of indices + along the grid_yt and grid_xt dimension. + Parameters: + region_index - The number of the region to be sub-regioned.. + data - The full grid variable data to be sub-regioned based on the + start and ending indices returned from get_region_indices. + return - The variable data sub-setted into the region. 
+ """ + lat_start_index,lat_end_index,long1_start_index,long1_end_index,long2_start_index, \ + long2_end_index = self.get_region_indices(region_index) + + # Check dimensions of the specific variable in the dataset + var_dims = data.dims + if 'time' in var_dims: + if long2_start_index != -999: + region1_data = data.isel(time=slice(None), + grid_yt=slice(lat_start_index, lat_end_index + 1), + grid_xt=slice(long1_start_index, long1_end_index)) + + region2_data = data.isel(time=slice(None), + grid_yt=slice(lat_start_index, lat_end_index + 1), + grid_xt=slice(long2_start_index, long2_end_index)) + region_data = xr.concat([region1_data,region2_data],dim='grid_xt') + + else: + region_data = data.isel(time=slice(None), + grid_yt=slice(lat_start_index, lat_end_index + 1), + grid_xt=slice(long1_start_index, long1_end_index)) + else: + if long2_start_index != -999: + region1_data = data.isel(grid_yt=slice(lat_start_index, lat_end_index + 1), + grid_xt=slice(long1_start_index,long1_end_index)) + + region2_data = data.isel(grid_yt=slice(lat_start_index, lat_end_index + 1), + grid_xt=slice(long2_start_index,long2_end_index)) + + region_data = xr.concat([region1_data,region2_data],dim='grid_xt') + else: + region_data = data.isel(grid_yt=slice(lat_start_index, lat_end_index + 1), + grid_xt=slice(long1_start_index,long1_end_index)) + return(region_data) + + diff --git a/src/score_hv/stats_utils.py b/src/score_hv/stats_utils.py new file mode 100644 index 0000000..03287fa --- /dev/null +++ b/src/score_hv/stats_utils.py @@ -0,0 +1,133 @@ +""" +Copyright 2024 NOAA +All rights reserved. + +Methods to calculate gridcell area weighted statistics +The functions generally take the following two input variables: + xarray_variable - xarray variable data + gridcell_area_data - xarray variable containing gridcell areas, in the + same shape as xarray_variable +""" +import sys,os +import xarray as xr +import numpy as np +import pytest +import pdb +from netCDF4 import Dataset +from pathlib import Path + +class VarStatsCatalog : + """ + This allows for the instantiation of the var_stats class. The + requested list of statistics that the user wants is passed into this + method and is used in the calculate_requested_statistics method + below to calculate the desired statistics. + """ + def __init__(self,stats_list): + self.weighted_averages=[] + self.variances=[] + self.minimum=[] + self.maximum=[] + self.stats=stats_list + + def clear_requested_statistics(self): + self.weighted_averages=[] + self.variances=[] + self.minimum=[] + self.maximum=[] + + def calculate_requested_statistics(self,weights,temporal_mean): + """This function calls the methods to calculate the statistics + requested by the user. If the weights and/or temporal_mean + arrays come in as xr.DataArrays, we convert the weights and + temporal_mean arrays to NumPy arrays for the + calculations. + Parameters: + weights: The weights are the normalized weights + based on the solid angle calculation in the + calling routine. + temporal_mean:the temporal means array that is calculated in + the calling function. + Return:Nothing is returned. 
+ """ + + if isinstance(weights,xr.DataArray): + weights = weights.values + if isinstance(temporal_mean , xr.DataArray): + temporal_mean = temporal_mean.values + + for stat in self.stats: + if stat=="mean": + self.calculate_weighted_average(weights,temporal_mean) + if stat=='variance': + self.calculate_var_variance(weights,temporal_mean) + if stat=='minimum': + self.find_minimum_value(temporal_mean) + if stat=='maximum': + self.find_maximum_value(temporal_mean) + + def calculate_weighted_average(self,weights,temporal_mean): + """This method calculates a weighted average + for the variable data passed in from + the calling function calcuate_requested_statistics. + Parameters: + weights: The weights are the normalized weights + based on the solid angle calculation in the + calling routine. This is a NumPy array. + temporal_mean:the temporal means array that is calculated in + the calling function. This is a NumPy array. + """ + weighted_sum = np.nansum(weights * temporal_mean) + value = weighted_sum.item() + self.weighted_averages.append(value) + + def calculate_var_variance(self,weights,temporal_mean): + """ + This method returns the gridcell weighted variance of the requested variables using + the following formula: + variance = sum_R( w_i * (x_i - xbar)^2 ), + where sum_R represents the summation for each value x_i over the region of + interest R with normalized gridcell area weights w_i and weighted mean xbar + Parameters: + weights: The weights are the normalized weights + based on the solid angle calculation in the + calling routine. + temporal_mean:the temporal means array that is calculated in + the calling function. + """ + + # Ensure the weights and temporal_mean have compatible shapes + try: + assert weights.shape == temporal_mean.shape + except AssertionError: + msg = f'Assertion failed: The shape of the weights array {weights.shape}' \ + f'is not equal to the shape of the temporal mean array {temporal_mean.shape}' + raise ValueError(msg) + + # Calculate the weighted mean (xbar) for the two-dimensional array + weighted_sum = np.nansum(weights * temporal_mean) + sum_of_weights = np.nansum(weights) + weighted_average = weighted_sum / sum_of_weights + value = np.nansum(weights * (temporal_mean - weighted_average) ** 2) / sum_of_weights + self.variances.append(value) + + def find_minimum_value(self,temporal_mean): + """ + This method finds the minimum values of the temporal_means array + Parameters: + temporal_means:The array of means for the requested variable that + is calculated in the calling script. + Return:Nothing is returned. + """ + self.minimum.append(np.nanmin(temporal_mean)) + + def find_maximum_value(self,temporal_mean): + """ + This method finds the maximum value of the temporal_means array. + Parameters: + temporal_means:The array of means for the requested variable that is + calcuated in the calling script. + Return:Nothing is returned. 
+ """ + self.maximum.append(np.nanmax(temporal_mean)) + diff --git a/tests/configs/netcdf_harvester_config__valid.yaml b/tests/configs/netcdf_harvester_config__valid.yaml new file mode 100644 index 0000000..cd47b38 --- /dev/null +++ b/tests/configs/netcdf_harvester_config__valid.yaml @@ -0,0 +1,31 @@ +elevation_unit: plevs +file_meta: + cycletime: 2015-12-01 00:00:00 + filename_format_str: innov_stats.metric.%Y%m%d%H.nc + filepath_format_str: /Users/sfredrick/adam/score-hv/tests/data/ +harvester_name: innov_stats_netcdf +metrics: +- temperature +- spechumid +- uvwind +output_format: pandas_dataframe +regions: + equatorial: + lat_max: 5.0 + lat_min: -5.0 + global: + lat_max: 90.0 + lat_min: -90.0 + north_hemis: + lat_max: 60.0 + lat_min: 20.0 + south_hemis: + lat_max: -20.0 + lat_min: -60.0 + tropics: + lat_max: 20.0 + lat_min: -20.0 +stats: +- bias +- count +- rmsd diff --git a/tests/data/bfg_1994010100_fhr09_prateb_control.nc b/tests/data/bfg_1994010100_fhr09_prateb_control.nc deleted file mode 100644 index 2ada765..0000000 Binary files a/tests/data/bfg_1994010100_fhr09_prateb_control.nc and /dev/null differ diff --git a/tests/data/bfg_1994010100_fhr09_soil_control.nc b/tests/data/bfg_1994010100_fhr09_soil_control.nc new file mode 100644 index 0000000..3376e6e Binary files /dev/null and b/tests/data/bfg_1994010100_fhr09_soil_control.nc differ diff --git a/tests/data/bfg_1994010106_fhr06_prateb_control.nc b/tests/data/bfg_1994010106_fhr06_prateb_control.nc deleted file mode 100644 index 03edc23..0000000 Binary files a/tests/data/bfg_1994010106_fhr06_prateb_control.nc and /dev/null differ diff --git a/tests/data/bfg_1994010106_fhr06_soil_control.nc b/tests/data/bfg_1994010106_fhr06_soil_control.nc new file mode 100644 index 0000000..fec1e48 Binary files /dev/null and b/tests/data/bfg_1994010106_fhr06_soil_control.nc differ diff --git a/tests/data/bfg_1994010106_fhr09_prateb_control.nc b/tests/data/bfg_1994010106_fhr09_prateb_control.nc deleted file mode 100644 index 570e890..0000000 Binary files a/tests/data/bfg_1994010106_fhr09_prateb_control.nc and /dev/null differ diff --git a/tests/data/bfg_1994010106_fhr09_soil_control.nc b/tests/data/bfg_1994010106_fhr09_soil_control.nc new file mode 100644 index 0000000..c9972b3 Binary files /dev/null and b/tests/data/bfg_1994010106_fhr09_soil_control.nc differ diff --git a/tests/data/bfg_1994010112_fhr06_prateb_control.nc b/tests/data/bfg_1994010112_fhr06_prateb_control.nc deleted file mode 100644 index 917aebe..0000000 Binary files a/tests/data/bfg_1994010112_fhr06_prateb_control.nc and /dev/null differ diff --git a/tests/data/bfg_1994010112_fhr06_soil_control.nc b/tests/data/bfg_1994010112_fhr06_soil_control.nc new file mode 100644 index 0000000..51a8b4d Binary files /dev/null and b/tests/data/bfg_1994010112_fhr06_soil_control.nc differ diff --git a/tests/data/bfg_1994010112_fhr09_prateb_control.nc b/tests/data/bfg_1994010112_fhr09_prateb_control.nc deleted file mode 100644 index 011a56d..0000000 Binary files a/tests/data/bfg_1994010112_fhr09_prateb_control.nc and /dev/null differ diff --git a/tests/data/bfg_1994010112_fhr09_soil_control.nc b/tests/data/bfg_1994010112_fhr09_soil_control.nc new file mode 100644 index 0000000..d74ee56 Binary files /dev/null and b/tests/data/bfg_1994010112_fhr09_soil_control.nc differ diff --git a/tests/data/bfg_1994010118_fhr06_prateb_control.nc b/tests/data/bfg_1994010118_fhr06_prateb_control.nc deleted file mode 100644 index cdc475a..0000000 Binary files a/tests/data/bfg_1994010118_fhr06_prateb_control.nc and 
/dev/null differ diff --git a/tests/data/bfg_1994010118_fhr06_soil_control.nc b/tests/data/bfg_1994010118_fhr06_soil_control.nc new file mode 100644 index 0000000..cf5f573 Binary files /dev/null and b/tests/data/bfg_1994010118_fhr06_soil_control.nc differ diff --git a/tests/data/bfg_1994010118_fhr09_prateb_control.nc b/tests/data/bfg_1994010118_fhr09_prateb_control.nc deleted file mode 100644 index b883faf..0000000 Binary files a/tests/data/bfg_1994010118_fhr09_prateb_control.nc and /dev/null differ diff --git a/tests/data/bfg_1994010118_fhr09_soil_control.nc b/tests/data/bfg_1994010118_fhr09_soil_control.nc new file mode 100644 index 0000000..94b67e0 Binary files /dev/null and b/tests/data/bfg_1994010118_fhr09_soil_control.nc differ diff --git a/tests/data/bfg_1994010200_fhr06_prateb_control.nc b/tests/data/bfg_1994010200_fhr06_prateb_control.nc deleted file mode 100644 index 467c6df..0000000 Binary files a/tests/data/bfg_1994010200_fhr06_prateb_control.nc and /dev/null differ diff --git a/tests/data/bfg_1994010200_fhr06_soil_control.nc b/tests/data/bfg_1994010200_fhr06_soil_control.nc new file mode 100644 index 0000000..8bdaf6e Binary files /dev/null and b/tests/data/bfg_1994010200_fhr06_soil_control.nc differ diff --git a/tests/data/innov_stats.spechumid.2015120106.nc b/tests/data/innov_stats.spechumid.2015120106.nc deleted file mode 100644 index d14e41f..0000000 Binary files a/tests/data/innov_stats.spechumid.2015120106.nc and /dev/null differ diff --git a/tests/data/innov_stats.temperature.2015120106.nc b/tests/data/innov_stats.temperature.2015120106.nc deleted file mode 100644 index 85a7fb4..0000000 Binary files a/tests/data/innov_stats.temperature.2015120106.nc and /dev/null differ diff --git a/tests/data/innov_stats.uvwind.2015120106.nc b/tests/data/innov_stats.uvwind.2015120106.nc deleted file mode 100644 index da75757..0000000 Binary files a/tests/data/innov_stats.uvwind.2015120106.nc and /dev/null differ diff --git a/tests/test_harvester_daily_bfg_prateb.py b/tests/test_harvester_daily_bfg_prateb.py deleted file mode 100755 index 41aebc0..0000000 --- a/tests/test_harvester_daily_bfg_prateb.py +++ /dev/null @@ -1,227 +0,0 @@ -#!/usr/bin/env python - -import os -import sys -from pathlib import Path - -import numpy as np -from datetime import datetime -import pytest -import yaml -from netCDF4 import Dataset - -from score_hv import hv_registry -from score_hv.harvester_base import harvest -from score_hv.yaml_utils import YamlLoader -from score_hv.harvesters.innov_netcdf import Region, InnovStatsCfg - -TEST_DATA_FILE_NAMES = ['bfg_1994010100_fhr09_prateb_control.nc', - 'bfg_1994010106_fhr06_prateb_control.nc', - 'bfg_1994010106_fhr09_prateb_control.nc', - 'bfg_1994010112_fhr06_prateb_control.nc', - 'bfg_1994010112_fhr09_prateb_control.nc', - 'bfg_1994010118_fhr06_prateb_control.nc', - 'bfg_1994010118_fhr09_prateb_control.nc', - 'bfg_1994010200_fhr06_prateb_control.nc'] - -DATA_DIR = os.path.join(Path(__file__).parent.parent.resolve(), 'data') -GRIDCELL_AREA_DATA_PATH = os.path.join(DATA_DIR, - 'gridcell-area' + - '_noaa-ufs-gefsv13replay-pds' + - '_bfg_control_1536x768_20231116.nc') - -CONFIGS_DIR = 'configs' -PYTEST_CALLING_DIR = Path(__file__).parent.resolve() -TEST_DATA_PATH = os.path.join(PYTEST_CALLING_DIR, 'data') -BFG_PATH = [os.path.join(TEST_DATA_PATH, - file_name) for file_name in TEST_DATA_FILE_NAMES] - -VALID_CONFIG_DICT = {'harvester_name': hv_registry.DAILY_BFG, - 'filenames' : BFG_PATH, - 'statistic': ['mean', 'variance', 'minimum', 'maximum'], - 'variable': 
['prateb_ave']} - -def test_gridcell_area_conservation(tolerance=0.001): - - gridcell_area_data = Dataset(GRIDCELL_AREA_DATA_PATH) - - assert gridcell_area_data['area'].units == 'steradian' - - sum_gridcell_area = np.sum(gridcell_area_data.variables['area']) - - assert sum_gridcell_area < (1 + tolerance) * 4 * np.pi - assert sum_gridcell_area > (1 - tolerance) * 4 * np.pi - - gridcell_area_data.close() - -def test_variable_names(): - data1 = harvest(VALID_CONFIG_DICT) - assert data1[0].variable == 'prateb_ave' - -def test_global_mean_values(tolerance=0.001): - """The value of 3.117e-05 is the mean value of the global means - calculated from eight forecast files: - - bfg_1994010100_fhr09_prateb_control.nc - bfg_1994010106_fhr06_prateb_control.nc - bfg_1994010106_fhr09_prateb_control.nc - bfg_1994010112_fhr06_prateb_control.nc - bfg_1994010112_fhr09_prateb_control.nc - bfg_1994010118_fhr06_prateb_control.nc - bfg_1994010118_fhr09_prateb_control.nc - bfg_1994010200_fhr06_prateb_control.nc - - When averaged together, these files represent a 24 hour mean. The - average value hard-coded in this test was calculated from - these forecast files using a separate python code. - """ - data1 = harvest(VALID_CONFIG_DICT) - global_mean = 3.1173840683271906e-05 - assert data1[0].value <= (1 + tolerance) * global_mean - assert data1[0].value >= (1 - tolerance) * global_mean - -def test_global_mean_values2(tolerance=0.001): - """Opens each background Netcdf file using the - netCDF4 library function Dataset and computes the expected value - of the provided variable. In this case prateb_ave. - """ - data1 = harvest(VALID_CONFIG_DICT) - - gridcell_area_data = Dataset(GRIDCELL_AREA_DATA_PATH) - norm_weights = gridcell_area_data.variables['area'][:] / np.sum( - gridcell_area_data.variables['area'][:]) - - summation = np.ma.zeros(gridcell_area_data.variables['area'].shape) - for file_count, data_file in enumerate(BFG_PATH): - test_rootgrp = Dataset(data_file) - - summation += test_rootgrp.variables[VALID_CONFIG_DICT['variable'][0]][0] - - test_rootgrp.close() - - temporal_mean = summation / (file_count + 1) - global_mean = np.ma.sum(norm_weights * temporal_mean) - - for i, harvested_tuple in enumerate(data1): - if harvested_tuple.statistic == 'mean': - assert global_mean <= (1 + tolerance) * harvested_tuple.value - assert global_mean >= (1 - tolerance) * harvested_tuple.value - - gridcell_area_data.close() - -def test_gridcell_variance(tolerance=0.001): - """Opens each background Netcdf file using the - netCDF4 library function Dataset and computes the variance - of the provided variable. In this case prateb_ave. 
- """ - data1 = harvest(VALID_CONFIG_DICT) - - gridcell_area_data = Dataset(GRIDCELL_AREA_DATA_PATH) - norm_weights = gridcell_area_data.variables['area'][:] / np.sum( - gridcell_area_data.variables['area'][:]) - - summation = np.ma.zeros(gridcell_area_data.variables['area'].shape) - for file_count, data_file in enumerate(BFG_PATH): - test_rootgrp = Dataset(data_file) - - summation += test_rootgrp.variables[VALID_CONFIG_DICT['variable'][0]][0] - - test_rootgrp.close() - - temporal_mean = summation / (file_count + 1) - - global_mean = np.ma.sum(norm_weights * temporal_mean) - variance = np.ma.sum((temporal_mean - global_mean)**2 * norm_weights) - - for i, harvested_tuple in enumerate(data1): - if harvested_tuple.statistic == 'variance': - assert variance <= (1 + tolerance) * harvested_tuple.value - assert variance >= (1 - tolerance) * harvested_tuple.value - - gridcell_area_data.close() - -def test_gridcell_min_max(tolerance=0.001): - """Opens each background Netcdf file using the - netCDF4 library function Dataset and computes the maximum - of the provided variable. In this case prateb_ave. - """ - data1 = harvest(VALID_CONFIG_DICT) - - gridcell_area_data = Dataset(GRIDCELL_AREA_DATA_PATH) - - summation = np.ma.zeros(gridcell_area_data.variables['area'].shape) - for file_count, data_file in enumerate(BFG_PATH): - test_rootgrp = Dataset(data_file) - - summation += test_rootgrp.variables[VALID_CONFIG_DICT['variable'][0]][0] - - test_rootgrp.close() - - temporal_mean = summation / (file_count + 1) - minimum = np.ma.min(temporal_mean) - maximum = np.ma.max(temporal_mean) - - """The following offline min and max were calculated from an external - python code - """ - offline_min = 0.0 - offline_max = 0.0043600933 - for i, harvested_tuple in enumerate(data1): - if harvested_tuple.statistic == 'maximum': - assert maximum <= (1 + tolerance) * harvested_tuple.value - assert maximum >= (1 - tolerance) * harvested_tuple.value - - assert offline_max <= (1 + tolerance) * harvested_tuple.value - assert offline_max >= (1 - tolerance) * harvested_tuple.value - - - elif harvested_tuple.statistic == 'minimum': - assert minimum <= (1 + tolerance) * harvested_tuple.value - assert minimum >= (1 - tolerance) * harvested_tuple.value - - assert offline_min <= (1 + tolerance) * harvested_tuple.value - assert offline_min >= (1 - tolerance) * harvested_tuple.value - - gridcell_area_data.close() - -def test_units(): - data1 = harvest(VALID_CONFIG_DICT) - assert data1[0].units == "kg/m**2/s" - -def test_cycletime(): - """ The hard coded datetimestr 1994-01-01 12:00:00 - is the median midpoint time of the filenames defined above in the - BFG_PATH. 
We have to convert this into a datetime object in order - to compare this string to what is returned by - global_bucket_precip_ave.py - """ - data1 = harvest(VALID_CONFIG_DICT) - expected_datetime = datetime.strptime("1994-01-01 12:00:00", - "%Y-%m-%d %H:%M:%S") - assert data1[0].mediantime == expected_datetime - -def test_longname(): - data1 = harvest(VALID_CONFIG_DICT) - var_longname = "bucket surface precipitation rate" - assert data1[0].longname == var_longname - -def test_precip_harvester(): - data1 = harvest(VALID_CONFIG_DICT) - assert type(data1) is list - assert len(data1) > 0 - assert data1[0].filenames==BFG_PATH - -def main(): - test_gridcell_area_conservation() - test_precip_harvester() - test_variable_names() - test_units() - test_global_mean_values() - test_global_mean_values2() - test_gridcell_variance() - test_gridcell_min_max() - test_cycletime() - test_longname() - -if __name__=='__main__': - main() \ No newline at end of file diff --git a/tests/test_harvester_global_soil_moisture.py b/tests/test_harvester_global_soil_moisture.py new file mode 100755 index 0000000..a5b8431 --- /dev/null +++ b/tests/test_harvester_global_soil_moisture.py @@ -0,0 +1,184 @@ +#!/usr/bin/env python + +import os +import sys +from pathlib import Path + +import numpy as np +from datetime import datetime +import pytest +import yaml +from netCDF4 import Dataset + +from score_hv import hv_registry +from score_hv.harvester_base import harvest +from score_hv.yaml_utils import YamlLoader +from score_hv.harvesters.innov_netcdf import Region, InnovStatsCfg + +TEST_DATA_FILE_NAMES = ['bfg_1994010100_fhr09_soil_control.nc', + 'bfg_1994010106_fhr06_soil_control.nc', + 'bfg_1994010106_fhr09_soil_control.nc', + 'bfg_1994010112_fhr06_soil_control.nc', + 'bfg_1994010112_fhr09_soil_control.nc', + 'bfg_1994010118_fhr06_soil_control.nc', + 'bfg_1994010118_fhr09_soil_control.nc', + 'bfg_1994010200_fhr06_soil_control.nc'] + +DATA_DIR = os.path.join(Path(__file__).parent.parent.resolve(),'src','score_hv','data') +GRIDCELL_AREA_DATA_PATH = os.path.join(DATA_DIR, + 'gridcell-area' + + '_noaa-ufs-gefsv13replay-pds' + + '_bfg_control_1536x768_20231116.nc') +CONFIGS_DIR = 'configs' +PYTEST_CALLING_DIR = Path(__file__).parent.resolve() +TEST_DATA_PATH = os.path.join(PYTEST_CALLING_DIR, 'data') +BFG_PATH = [os.path.join(TEST_DATA_PATH, + file_name) for file_name in TEST_DATA_FILE_NAMES] + +VALID_CONFIG_DICT = {'harvester_name': hv_registry.DAILY_BFG, + 'filenames' : BFG_PATH, + 'statistic': ['mean','variance', 'minimum', 'maximum'], + 'variable': ['soill4','soilm'], + } + +def test_gridcell_area_conservation(tolerance=0.001): + + gridcell_area_data = Dataset(GRIDCELL_AREA_DATA_PATH) + assert gridcell_area_data['area'].units == 'steradian' + sum_gridcell_area = np.sum(gridcell_area_data.variables['area']) + assert sum_gridcell_area < (1 + tolerance) * 4 * np.pi + assert sum_gridcell_area > (1 - tolerance) * 4 * np.pi + gridcell_area_data.close() + +def test_variable_names(): + """Here we are testing two variables. The daily_bfg harvester + should return values for both variables at once. 
+ """ + expected_variables = ['soill4', 'soilm'] + assert VALID_CONFIG_DICT['variable'] == expected_variables + +def test_global_mean_values(tolerance=0.001): + """ + The values of the calculated_means list were + calculated from these eight forecast files: + + bfg_1994010100_fhr09_soil_control.nc + bfg_1994010106_fhr06_soil_control.nc + bfg_1994010106_fhr09_soil_control.nc + bfg_1994010112_fhr06_soil_control.nc + bfg_1994010112_fhr09_soil_control.nc + bfg_1994010118_fhr06_soil_control.nc + bfg_1994010118_fhr09_soil_control.nc + bfg_1994010200_fhr06_soil_control.nc + + When averaged together, these files represent a 24 hour mean. The + average values hard-coded in this test was calculated from + forecast files using a separate python code. + """ + data1 = harvest(VALID_CONFIG_DICT) + + for item in data1: + if item.variable == 'soill4' and item.statistic == 'mean': + calculated_means = [0.0782377246355602] + assert calculated_means <= (1 + tolerance) * item.value + assert calculated_means >= (1 - tolerance) * item.value + elif item.variable == 'soilm' and item.statistic == 'mean': + calculated_means = 159.0458158728542 + assert calculated_means <= (1 + tolerance) * item.value + assert calculated_means >= (1 - tolerance) * item.value + +def test_gridcell_variance(tolerance=0.001): + """ + The values of the calculated_variances list were calculated + from the forecast files listed above in a separate python script. + """ + data1 = harvest(VALID_CONFIG_DICT) + + for item in data1: + if item.variable == 'soill4' and item.statistic == 'variance': + calculated_variances = 0.014143652677176984 + assert calculated_variances <= (1 + tolerance) * item.value + assert calculated_variances >= (1 - tolerance) * item.value + elif item.variable == 'soilm' and item.statistic == 'variance': + calculated_variances = 54875.85889320354 + assert calculated_variances <= (1 + tolerance) * item.value + assert calculated_variances >= (1 - tolerance) * item.value + +def test_gridcell_min_max(tolerance=0.001): + data1 = harvest(VALID_CONFIG_DICT) + + for item in data1: + if item.variable == 'soill4' and item.statistic == 'minimum': + calculated_min = 1e-05 + assert calculated_min <= (1 + tolerance) * item.value + assert calculated_min >= (1 - tolerance) * item.value + + elif item.variable == 'soill4' and item.statistic == 'maximum': + calculated_max = 0.46713507 + assert calculated_max <= (1 + tolerance) * item.value + assert calculated_max >= (1 - tolerance) * item.value + + elif item.variable == 'soilm' and item.statistic == 'minimum': + calculated_min = 89.12119 + assert calculated_min <= (1 + tolerance) * item.value + assert calculated_min >= (1 - tolerance) * item.value + + elif item.variable == 'soilm' and item.statistic == 'maximum': + calculated_max = 922.9723 + assert calculated_max <= (1 + tolerance) * item.value + assert calculated_max >= (1 - tolerance) * item.value + +def test_units(): + variable_dictionary = {} + data1 = harvest(VALID_CONFIG_DICT) + + for item in data1: + if item.variable == 'soill4': + expected_units = 'xxx' + assert expected_units == item.units + elif item.variable == 'soilm': + expected_units = 'kg/m**2' + assert expected_units == item.units + +def test_cycletime(): + """ The hard coded datetimestr 1994-01-01 12:00:00 + is the median midpoint time of the filenames defined above in the + BFG_PATH. 
+    We have to convert it into a datetime object in order
+    to compare this string to what is returned by
+    the daily_bfg harvester.
+    """
+    data1 = harvest(VALID_CONFIG_DICT)
+    expected_datetime = datetime.strptime("1994-01-01 12:00:00",
+                                          "%Y-%m-%d %H:%M:%S")
+    assert data1[0].mediantime == expected_datetime
+
+def test_longname():
+    data1 = harvest(VALID_CONFIG_DICT)
+
+    for item in data1:
+        if item.variable == 'soilt4':
+            expected_longname = 'soil temperature unknown layer 4'
+            assert expected_longname == item.longname
+        elif item.variable == 'tg3':
+            expected_longname = 'deep soil temperature'
+            assert expected_longname == item.longname
+
+def test_soil_moisture_level4_harvester():
+    data1 = harvest(VALID_CONFIG_DICT)
+    assert type(data1) is list
+    assert len(data1) > 0
+    assert data1[0].filenames == BFG_PATH
+
+def main():
+    test_gridcell_area_conservation()
+    test_soil_moisture_level4_harvester()
+    test_variable_names()
+    test_units()
+    test_global_mean_values()
+    test_gridcell_variance()
+    test_gridcell_min_max()
+    test_cycletime()
+    test_longname()
+
+if __name__=='__main__':
+    main()
diff --git a/tests/test_harvester_global_soil_temperature.py b/tests/test_harvester_global_soil_temperature.py
new file mode 100755
index 0000000..d22b412
--- /dev/null
+++ b/tests/test_harvester_global_soil_temperature.py
@@ -0,0 +1,184 @@
+#!/usr/bin/env python
+
+import os
+import sys
+from pathlib import Path
+
+import numpy as np
+from datetime import datetime
+import pytest
+import yaml
+from netCDF4 import Dataset
+
+from score_hv import hv_registry
+from score_hv.harvester_base import harvest
+from score_hv.yaml_utils import YamlLoader
+from score_hv.harvesters.innov_netcdf import Region, InnovStatsCfg
+
+TEST_DATA_FILE_NAMES = ['bfg_1994010100_fhr09_soil_control.nc',
+                        'bfg_1994010106_fhr06_soil_control.nc',
+                        'bfg_1994010106_fhr09_soil_control.nc',
+                        'bfg_1994010112_fhr06_soil_control.nc',
+                        'bfg_1994010112_fhr09_soil_control.nc',
+                        'bfg_1994010118_fhr06_soil_control.nc',
+                        'bfg_1994010118_fhr09_soil_control.nc',
+                        'bfg_1994010200_fhr06_soil_control.nc']
+
+DATA_DIR = os.path.join(Path(__file__).parent.parent.resolve(), 'src', 'score_hv', 'data')
+GRIDCELL_AREA_DATA_PATH = os.path.join(DATA_DIR,
+                                       'gridcell-area' +
+                                       '_noaa-ufs-gefsv13replay-pds' +
+                                       '_bfg_control_1536x768_20231116.nc')
+CONFIGS_DIR = 'configs'
+PYTEST_CALLING_DIR = Path(__file__).parent.resolve()
+TEST_DATA_PATH = os.path.join(PYTEST_CALLING_DIR, 'data')
+BFG_PATH = [os.path.join(TEST_DATA_PATH,
+                         file_name) for file_name in TEST_DATA_FILE_NAMES]
+
+VALID_CONFIG_DICT = {'harvester_name': hv_registry.DAILY_BFG,
+                     'filenames': BFG_PATH,
+                     'statistic': ['mean', 'variance', 'minimum', 'maximum'],
+                     'variable': ['soilt4', 'tg3'],
+                     }
+
+def test_gridcell_area_conservation(tolerance=0.001):
+
+    gridcell_area_data = Dataset(GRIDCELL_AREA_DATA_PATH)
+    assert gridcell_area_data['area'].units == 'steradian'
+    sum_gridcell_area = np.sum(gridcell_area_data.variables['area'])
+    assert sum_gridcell_area < (1 + tolerance) * 4 * np.pi
+    assert sum_gridcell_area > (1 - tolerance) * 4 * np.pi
+    gridcell_area_data.close()
+
+def test_variable_names():
+    """Here we are testing two variables. The daily_bfg harvester
+    should return values for both variables at once.
+ """ + expected_variables = ['soilt4', 'tg3'] + assert VALID_CONFIG_DICT['variable'] == expected_variables + +def test_global_mean_values(tolerance=0.001): + """ + The values of the calculated_means list were + calculated from these eight forecast files: + + bfg_1994010100_fhr09_soil_control.nc + bfg_1994010106_fhr06_soil_control.nc + bfg_1994010106_fhr09_soil_control.nc + bfg_1994010112_fhr06_soil_control.nc + bfg_1994010112_fhr09_soil_control.nc + bfg_1994010118_fhr06_soil_control.nc + bfg_1994010118_fhr09_soil_control.nc + bfg_1994010200_fhr06_soil_control.nc + + When averaged together, these files represent a 24 hour mean. The + average values hard-coded in this test was calculated from + forecast files using a separate python code. + """ + data1 = harvest(VALID_CONFIG_DICT) + + for item in data1: + if item.variable == 'soilt4' and item.statistic == 'mean': + calculated_means = 77.3765814142164 + assert calculated_means <= (1 + tolerance) * item.value + assert calculated_means >= (1 - tolerance) * item.value + elif item.variable == 'tg3' and item.statistic == 'mean': + calculated_means = 77.40945179743201 + assert calculated_means <= (1 + tolerance) * item.value + assert calculated_means >= (1 - tolerance) * item.value + +def test_gridcell_variance(tolerance=0.001): + """ + The values of the calculated_variances list were calculated + from the forecast files listed above in a separate python script. + """ + data1 = harvest(VALID_CONFIG_DICT) + + for item in data1: + if item.variable == 'soilt4' and item.statistic == 'variance': + calculated_variances = 11907.888288429804 + assert calculated_variances <= (1 + tolerance) * item.value + assert calculated_variances >= (1 - tolerance) * item.value + elif item.variable == 'tg3' and item.statistic == 'variance': + calculated_variances = 11911.783810514964 + assert calculated_variances <= (1 + tolerance) * item.value + assert calculated_variances >= (1 - tolerance) * item.value + +def test_gridcell_min_max(tolerance=0.001): + data1 = harvest(VALID_CONFIG_DICT) + + for item in data1: + if item.variable == 'soilt4' and item.statistic == 'minimum': + calculated_min = 240.58295 + assert calculated_min <= (1 + tolerance) * item.value + assert calculated_min >= (1 - tolerance) * item.value + + elif item.variable == 'soilt4' and item.statistic == 'maximum': + calculated_max = 308.44702 + assert calculated_max <= (1 + tolerance) * item.value + assert calculated_max >= (1 - tolerance) * item.value + + elif item.variable == 'tg3' and item.statistic == 'minimum': + calculated_min = 250.65329 + assert calculated_min <= (1 + tolerance) * item.value + assert calculated_min >= (1 - tolerance) * item.value + + elif item.variable == 'tg3' and item.statistic == 'maximum': + calculated_max = 302.39206 + assert calculated_max <= (1 + tolerance) * item.value + assert calculated_max >= (1 - tolerance) * item.value + +def test_units(): + variable_dictionary = {} + data1 = harvest(VALID_CONFIG_DICT) + + for item in data1: + if item.variable == 'soill4': + expected_units = 'xxx' + assert expected_units == item.units + elif item.variable == 'soilm': + expected_units = 'kg/m**2' + assert expected_units == item.units + +def test_cycletime(): + """ The hard coded datetimestr 1994-01-01 12:00:00 + is the median midpoint time of the filenames defined above in the + BFG_PATH. 
We have to convert this into a datetime object in order + to compare this string to what is returned by + global_bucket_precip_ave.py + """ + data1 = harvest(VALID_CONFIG_DICT) + expected_datetime = datetime.strptime("1994-01-01 12:00:00", + "%Y-%m-%d %H:%M:%S") + assert data1[0].mediantime == expected_datetime + +def test_longname(): + data1 = harvest(VALID_CONFIG_DICT) + + for item in data1: + if item.variable == 'soilt4': + expected_longname = 'soil temperature unknown layer 4' + assert expected_longname == item.longname + elif item.variable == 'tg3': + expected_longname = 'deep soil temperature' + assert expected_longname == item.longname + +def test_soil_moisture_level4_harvester(): + data1 = harvest(VALID_CONFIG_DICT) + assert type(data1) is list + assert len(data1) > 0 + assert data1[0].filenames==BFG_PATH + +def main(): + test_gridcell_area_conservation() + test_soil_moisture_level4_harvester() + test_variable_names() + test_units() + test_global_mean_values() + test_gridcell_variance() + test_gridcell_min_max() + test_cycletime() + test_longname() + +if __name__=='__main__': + main() diff --git a/tests/test_harvester_netcdf_temperature.py b/tests/test_harvester_netcdf_temperature.py deleted file mode 100644 index 330d01d..0000000 --- a/tests/test_harvester_netcdf_temperature.py +++ /dev/null @@ -1,118 +0,0 @@ -""" -Copyright 2022 NOAA -All rights reserved. - -Unit tests for io_utils - -""" -import os -import pathlib -import pytest -import yaml -from datetime import datetime - -from score_hv import hv_registry -from score_hv.harvester_base import harvest -from score_hv.yaml_utils import YamlLoader -from score_hv.harvesters.innov_netcdf import Region, InnovStatsCfg - - -PYTEST_CALLING_DIR = pathlib.Path(__file__).parent.resolve() -NETCDF_HARVESTER_CONFIG__VALID = 'netcdf_harvester_config__valid.yaml' - -DATA_DIR = 'data' -CONFIGS_DIR = 'configs' - -file_path_innov_stats = os.path.join( - PYTEST_CALLING_DIR, - DATA_DIR -) - -VALID_CONFIG_DICT = { - 'harvester_name': hv_registry.INNOV_NETCDF, - 'file_meta': { - 'filepath_format_str': '%s/' % file_path_innov_stats, - 'filename_format_str': 'innov_stats.metric.%Y%m%d%H.nc', - 'cycletime': datetime(2015,12,1,0) - }, - 'stats': ['bias', 'count', 'rmsd'], - 'metrics': ['temperature','spechumid','uvwind'], - 'elevation_unit': 'plevs', - 'regions': { - 'equatorial': { - 'lat_min': -5.0, - 'lat_max': 5.0 - }, - 'global': { - 'lat_min': -90.0, - 'lat_max': 90.0 - }, - 'north_hemis': { - 'lat_min': 20.0, - 'lat_max': 60.0 - }, - 'tropics': { - 'lat_min': -20.0, - 'lat_max': 20.0 - }, - 'south_hemis': { - 'lat_min': -60.0, - 'lat_max': -20.0 - }, - 'south_hemis': { - 'lat_min': -60.0, - 'lat_max': -20.0 - }, - }, - 'output_format': hv_registry.PANDAS_DATAFRAME -} - - -def test_netcdf_harvester_config(): - """ - Test basic harvester configuration - """ - conf_yaml_fn = os.path.join( - PYTEST_CALLING_DIR, - CONFIGS_DIR, - NETCDF_HARVESTER_CONFIG__VALID - ) - - with open(conf_yaml_fn, 'w', encoding='utf8') as file: - documents = yaml.dump(VALID_CONFIG_DICT, file) - print(f'conf_dict: {conf_yaml_fn}, documents: {documents}') - - data1 = harvest(VALID_CONFIG_DICT) - harvest_dict = YamlLoader(conf_yaml_fn).load()[0] - data2 = harvest(conf_yaml_fn) - - assert len(data1) == len(data2) - - print(f'dataframe column names: {data2.columns.values.tolist()}') - - print(f'harvested {len(data1)} records using config: {VALID_CONFIG_DICT}') - print(f'harvested {len(data2)} records using config: {harvest_dict}') - print(f'harvested data type: {type(data2)}') - 
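Each of these modules exposes main() for direct execution, but the suite is normally collected by pytest. A small sketch of the programmatic equivalent, assuming pytest is installed and the bfg test data files are present under tests/data:

import pytest

# Run one test module quietly; the path is relative to the repository root.
pytest.main(['-q', 'tests/test_harvester_global_soil_temperature.py'])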
diff --git a/tests/test_harvester_netcdf_temperature.py b/tests/test_harvester_netcdf_temperature.py
deleted file mode 100644
index 330d01d..0000000
--- a/tests/test_harvester_netcdf_temperature.py
+++ /dev/null
@@ -1,118 +0,0 @@
-"""
-Copyright 2022 NOAA
-All rights reserved.
-
-Unit tests for io_utils
-
-"""
-import os
-import pathlib
-import pytest
-import yaml
-from datetime import datetime
-
-from score_hv import hv_registry
-from score_hv.harvester_base import harvest
-from score_hv.yaml_utils import YamlLoader
-from score_hv.harvesters.innov_netcdf import Region, InnovStatsCfg
-
-
-PYTEST_CALLING_DIR = pathlib.Path(__file__).parent.resolve()
-NETCDF_HARVESTER_CONFIG__VALID = 'netcdf_harvester_config__valid.yaml'
-
-DATA_DIR = 'data'
-CONFIGS_DIR = 'configs'
-
-file_path_innov_stats = os.path.join(
-    PYTEST_CALLING_DIR,
-    DATA_DIR
-)
-
-VALID_CONFIG_DICT = {
-    'harvester_name': hv_registry.INNOV_NETCDF,
-    'file_meta': {
-        'filepath_format_str': '%s/' % file_path_innov_stats,
-        'filename_format_str': 'innov_stats.metric.%Y%m%d%H.nc',
-        'cycletime': datetime(2015,12,1,0)
-    },
-    'stats': ['bias', 'count', 'rmsd'],
-    'metrics': ['temperature','spechumid','uvwind'],
-    'elevation_unit': 'plevs',
-    'regions': {
-        'equatorial': {
-            'lat_min': -5.0,
-            'lat_max': 5.0
-        },
-        'global': {
-            'lat_min': -90.0,
-            'lat_max': 90.0
-        },
-        'north_hemis': {
-            'lat_min': 20.0,
-            'lat_max': 60.0
-        },
-        'tropics': {
-            'lat_min': -20.0,
-            'lat_max': 20.0
-        },
-        'south_hemis': {
-            'lat_min': -60.0,
-            'lat_max': -20.0
-        },
-        'south_hemis': {
-            'lat_min': -60.0,
-            'lat_max': -20.0
-        },
-    },
-    'output_format': hv_registry.PANDAS_DATAFRAME
-}
-
-
-def test_netcdf_harvester_config():
-    """
-    Test basic harvester configuration
-    """
-    conf_yaml_fn = os.path.join(
-        PYTEST_CALLING_DIR,
-        CONFIGS_DIR,
-        NETCDF_HARVESTER_CONFIG__VALID
-    )
-
-    with open(conf_yaml_fn, 'w', encoding='utf8') as file:
-        documents = yaml.dump(VALID_CONFIG_DICT, file)
-        print(f'conf_dict: {conf_yaml_fn}, documents: {documents}')
-
-    data1 = harvest(VALID_CONFIG_DICT)
-    harvest_dict = YamlLoader(conf_yaml_fn).load()[0]
-    data2 = harvest(conf_yaml_fn)
-
-    assert len(data1) == len(data2)
-
-    print(f'dataframe column names: {data2.columns.values.tolist()}')
-
-    print(f'harvested {len(data1)} records using config: {VALID_CONFIG_DICT}')
-    print(f'harvested {len(data2)} records using config: {harvest_dict}')
-    print(f'harvested data type: {type(data2)}')
-    print(f'harvested data: {data2}')
-
-    os.remove(conf_yaml_fn)
-
-
-def test_netcdf_harvester_region_config():
-    """
-    Test region class
-    """
-
-    region = Region('test_region', -10.0, 10.0)
-    with pytest.raises(ValueError):
-        region = Region(5, -10, 10)
-    with pytest.raises(ValueError):
-        region = Region({}, -10, 10)
-    with pytest.raises(ValueError):
-        region = Region([], -10, 10)
-    with pytest.raises(ValueError):
-        region = Region('test_region', -10, 10)
-    with pytest.raises(ValueError):
-        region = Region('test_region', -10, [])
-    with pytest.raises(ValueError):
-        region = Region('test_region', -10, {})
diff --git a/tests/test_harvester_regional_soil_moisture.py b/tests/test_harvester_regional_soil_moisture.py
new file mode 100755
index 0000000..f560d79
--- /dev/null
+++ b/tests/test_harvester_regional_soil_moisture.py
@@ -0,0 +1,202 @@
+#!/usr/bin/env python
+
+import os
+import sys
+from pathlib import Path
+
+import numpy as np
+from datetime import datetime
+import pytest
+import yaml
+from netCDF4 import Dataset
+
+from score_hv import hv_registry
+from score_hv.harvester_base import harvest
+from score_hv.yaml_utils import YamlLoader
+from score_hv.harvesters.innov_netcdf import Region, InnovStatsCfg
+
+TEST_DATA_FILE_NAMES = ['bfg_1994010100_fhr09_soil_control.nc',
+                        'bfg_1994010106_fhr06_soil_control.nc',
+                        'bfg_1994010106_fhr09_soil_control.nc',
+                        'bfg_1994010112_fhr06_soil_control.nc',
+                        'bfg_1994010112_fhr09_soil_control.nc',
+                        'bfg_1994010118_fhr06_soil_control.nc',
+                        'bfg_1994010118_fhr09_soil_control.nc',
+                        'bfg_1994010200_fhr06_soil_control.nc']
+
+DATA_DIR = os.path.join(Path(__file__).parent.parent.resolve(),
+                        'src', 'score_hv', 'data')
+GRIDCELL_AREA_DATA_PATH = os.path.join(DATA_DIR,
+                                       'gridcell-area' +
+                                       '_noaa-ufs-gefsv13replay-pds' +
+                                       '_bfg_control_1536x768_20231116.nc')
+CONFIGS_DIR = 'configs'
+PYTEST_CALLING_DIR = Path(__file__).parent.resolve()
+TEST_DATA_PATH = os.path.join(PYTEST_CALLING_DIR, 'data')
+BFG_PATH = [os.path.join(TEST_DATA_PATH,
+                         file_name) for file_name in TEST_DATA_FILE_NAMES]
+
+VALID_CONFIG_DICT = {'harvester_name': hv_registry.DAILY_BFG,
+                     'filenames': BFG_PATH,
+                     'statistic': ['mean', 'variance', 'minimum', 'maximum'],
+                     'variable': ['soill4', 'soilm'],
+                     'regions': {
+                         'north_hemi': {'north_lat': 90.0, 'south_lat': 0.0, 'west_long': 0.0, 'east_long': 360.0},
+                         'south_hemi': {'north_lat': 0.0, 'south_lat': -90.0, 'west_long': 0.0, 'east_long': 360.0},
+                         'eastern_hemis': {'north_lat': 90.0, 'south_lat': -90.0, 'west_long': 0.0, 'east_long': 180.0},
+                         'western_hemis': {'north_lat': 90.0, 'south_lat': -90.0, 'west_long': 180.0, 'east_long': 360.0},
+                     }
+                    }
+
+def test_gridcell_area_conservation(tolerance=0.001):
+
+    gridcell_area_data = Dataset(GRIDCELL_AREA_DATA_PATH)
+    assert gridcell_area_data['area'].units == 'steradian'
+    sum_gridcell_area = np.sum(gridcell_area_data.variables['area'])
+    assert sum_gridcell_area < (1 + tolerance) * 4 * np.pi
+    assert sum_gridcell_area > (1 - tolerance) * 4 * np.pi
+    gridcell_area_data.close()
+
+def test_variable_names():
+    """Here we are testing two variables. The daily_bfg harvester
+    should return values for both variables at once.
+ """ + expected_variables = ['soill4', 'soilm'] + assert VALID_CONFIG_DICT['variable'] == expected_variables + +def test_global_mean_values(tolerance=0.001): + """ + The values of the calculated_means list were + calculated from these eight forecast files: + + bfg_1994010100_fhr09_soil_control.nc + bfg_1994010106_fhr06_soil_control.nc + bfg_1994010106_fhr09_soil_control.nc + bfg_1994010112_fhr06_soil_control.nc + bfg_1994010112_fhr09_soil_control.nc + bfg_1994010118_fhr06_soil_control.nc + bfg_1994010118_fhr09_soil_control.nc + bfg_1994010200_fhr06_soil_control.nc + + When averaged together, these files represent a 24 hour mean. The + average values hard-coded in this test was calculated from + forecast files using a separate python code. + + In this test there are four regions. The daily_bfg harvester will return + the values of all four regions at once. + """ + data1 = harvest(VALID_CONFIG_DICT) + + for item in data1: + if item.variable == 'soill4' and item.statistic == 'mean': + calculated_means = [0.11134281856307063, 0.04513263070804976, 0.10126455296065043, 0.05521089631046988] + for index in range(len(calculated_means)): + assert calculated_means[index] <= (1 + tolerance) * item.value[index] + assert calculated_means[index] >= (1 - tolerance) * item.value[index] + elif item.variable == 'soilm' and item.statistic == 'mean': + calculated_means = [232.69920751701292, 85.39242422869546, 205.6628787788818, 112.42875296682669] + for index in range(len(calculated_means)): + assert calculated_means[index] <= (1 + tolerance) * item.value[index] + assert calculated_means[index] >= (1 - tolerance) * item.value[index] + +def test_gridcell_variance(tolerance=0.001): + """ + The values of the calculated_variances list were calculated + from the forecast files listed above in a separate python script. 
+ """ + data1 = harvest(VALID_CONFIG_DICT) + + for item in data1: + if item.variable == 'soill4' and item.statistic == 'variance': + calculated_variances = [0.014560371053031877, 0.011259243705960105, 0.014589256931596429, 0.012456589848810333] + for index in range(len(calculated_variances)): + assert calculated_variances[index] <= (1 + tolerance) * item.value[index] + assert calculated_variances[index] >= (1 - tolerance) * item.value[index] + elif item.variable == 'soilm' and item.statistic == 'variance': + calculated_variances = [56077.7004913776, 40892.99653052055, 56345.750697882184, 48327.96961675843] + for index in range(len(calculated_variances)): + assert calculated_variances[index] <= (1 + tolerance) * item.value[index] + assert calculated_variances[index] >= (1 - tolerance) * item.value[index] + +def test_gridcell_min_max(tolerance=0.001): + data1 = harvest(VALID_CONFIG_DICT) + + for item in data1: + if item.variable == 'soill4' and item.statistic == 'minimum': + calculated_min = [1e-05, 0.044400092, 0.009126244, 1e-05] + for index in range(len(calculated_min)): + assert calculated_min[index] <= (1 + tolerance) * item.value[index] + assert calculated_min[index] >= (1 - tolerance) * item.value[index] + + elif item.variable == 'soill4' and item.statistic == 'maximum': + calculated_max = [0.46617407, 0.46713507, 0.46713507, 0.46617407] + for index in range(len(calculated_min)): + assert calculated_max[index] <= (1 + tolerance) * item.value[index] + assert calculated_max[index] >= (1 - tolerance) * item.value[index] + + elif item.variable == 'soilm' and item.statistic == 'minimum': + calculated_min = [93.802376, 89.12119, 108.01729, 89.12119] + for index in range(len(calculated_min)): + assert calculated_min[index] <= (1 + tolerance) * item.value[index] + assert calculated_min[index] >= (1 - tolerance) * item.value[index] + + elif item.variable == 'soilm' and item.statistic == 'maximum': + calculated_max = [913.89874, 922.9723, 922.9723, 922.3684] + for index in range(len(calculated_max)): + assert calculated_max[index] <= (1 + tolerance) * item.value[index] + assert calculated_max[index] >= (1 - tolerance) * item.value[index] + +def test_units(): + variable_dictionary = {} + data1 = harvest(VALID_CONFIG_DICT) + + for item in data1: + if item.variable == 'soill4': + expected_units = 'xxx' + assert expected_units == item.units + elif item.variable == 'soilm': + expected_units = 'kg/m**2' + assert expected_units == item.units + +def test_cycletime(): + """ The hard coded datetimestr 1994-01-01 12:00:00 + is the median midpoint time of the filenames defined above in the + BFG_PATH. 
+    It is converted into a datetime object so that it can be compared
+    to the mediantime returned by the daily_bfg harvester.
+    """
+    data1 = harvest(VALID_CONFIG_DICT)
+    expected_datetime = datetime.strptime("1994-01-01 12:00:00",
+                                          "%Y-%m-%d %H:%M:%S")
+    assert data1[0].mediantime == expected_datetime
+
+def test_longname():
+    data1 = harvest(VALID_CONFIG_DICT)
+
+    # Only soill4 and soilm are requested in VALID_CONFIG_DICT; rather
+    # than checking longnames for variables that are never harvested
+    # here, verify that every harvested record carries a non-empty
+    # longname string.
+    for item in data1:
+        assert isinstance(item.longname, str)
+        assert len(item.longname) > 0
+
+def test_soil_moisture_level4_harvester():
+    data1 = harvest(VALID_CONFIG_DICT)
+    assert type(data1) is list
+    assert len(data1) > 0
+    assert data1[0].filenames == BFG_PATH
+
+def main():
+    test_gridcell_area_conservation()
+    test_soil_moisture_level4_harvester()
+    test_variable_names()
+    test_units()
+    test_global_mean_values()
+    test_gridcell_variance()
+    test_gridcell_min_max()
+    test_cycletime()
+    test_longname()
+
+if __name__ == '__main__':
+    main()
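The regional tests compare item.value element by element against four hard-coded values, which relies on the harvester returning one value per configured region in the order the regions appear in the config dict. A hypothetical helper (label_regional_values is not part of this diff) that makes that pairing explicit when debugging:

def label_regional_values(region_names, values):
    # Pair each region name with its harvested value, relying on the
    # order-preservation assumption described above.
    return dict(zip(region_names, values))

# Example with the soill4 regional means from test_global_mean_values:
print(label_regional_values(
    ['north_hemi', 'south_hemi', 'eastern_hemis', 'western_hemis'],
    [0.11134281856307063, 0.04513263070804976,
     0.10126455296065043, 0.05521089631046988]))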
diff --git a/tests/test_harvester_regional_soil_temperature.py b/tests/test_harvester_regional_soil_temperature.py
new file mode 100755
index 0000000..3371053
--- /dev/null
+++ b/tests/test_harvester_regional_soil_temperature.py
@@ -0,0 +1,202 @@
+#!/usr/bin/env python
+
+import os
+import sys
+from pathlib import Path
+
+import numpy as np
+from datetime import datetime
+import pytest
+import yaml
+from netCDF4 import Dataset
+
+from score_hv import hv_registry
+from score_hv.harvester_base import harvest
+from score_hv.yaml_utils import YamlLoader
+from score_hv.harvesters.innov_netcdf import Region, InnovStatsCfg
+
+TEST_DATA_FILE_NAMES = ['bfg_1994010100_fhr09_soil_control.nc',
+                        'bfg_1994010106_fhr06_soil_control.nc',
+                        'bfg_1994010106_fhr09_soil_control.nc',
+                        'bfg_1994010112_fhr06_soil_control.nc',
+                        'bfg_1994010112_fhr09_soil_control.nc',
+                        'bfg_1994010118_fhr06_soil_control.nc',
+                        'bfg_1994010118_fhr09_soil_control.nc',
+                        'bfg_1994010200_fhr06_soil_control.nc']
+
+DATA_DIR = os.path.join(Path(__file__).parent.parent.resolve(),
+                        'src', 'score_hv', 'data')
+GRIDCELL_AREA_DATA_PATH = os.path.join(DATA_DIR,
+                                       'gridcell-area' +
+                                       '_noaa-ufs-gefsv13replay-pds' +
+                                       '_bfg_control_1536x768_20231116.nc')
+CONFIGS_DIR = 'configs'
+PYTEST_CALLING_DIR = Path(__file__).parent.resolve()
+TEST_DATA_PATH = os.path.join(PYTEST_CALLING_DIR, 'data')
+BFG_PATH = [os.path.join(TEST_DATA_PATH,
+                         file_name) for file_name in TEST_DATA_FILE_NAMES]
+
+VALID_CONFIG_DICT = {'harvester_name': hv_registry.DAILY_BFG,
+                     'filenames': BFG_PATH,
+                     'statistic': ['mean', 'variance', 'minimum', 'maximum'],
+                     'variable': ['soilt4', 'tg3'],
+                     'regions': {
+                         'north_hemi': {'north_lat': 90.0, 'south_lat': 0.0, 'west_long': 0.0, 'east_long': 360.0},
+                         'south_hemi': {'north_lat': 0.0, 'south_lat': -90.0, 'west_long': 0.0, 'east_long': 360.0},
+                         'eastern_hemis': {'north_lat': 90.0, 'south_lat': -90.0, 'west_long': 0.0, 'east_long': 180.0},
+                         'western_hemis': {'north_lat': 90.0, 'south_lat': -90.0, 'west_long': 180.0, 'east_long': 360.0},
+                     }
+                    }
+
+def test_gridcell_area_conservation(tolerance=0.001):
+
+    gridcell_area_data = Dataset(GRIDCELL_AREA_DATA_PATH)
+    assert gridcell_area_data['area'].units == 'steradian'
+    sum_gridcell_area = np.sum(gridcell_area_data.variables['area'])
+    assert sum_gridcell_area < (1 + tolerance) * 4 * np.pi
+    assert sum_gridcell_area > (1 - tolerance) * 4 * np.pi
+    gridcell_area_data.close()
+
+def test_variable_names():
+    """Here we are testing two variables. The daily_bfg harvester
+    should return values for both variables at once.
+    """
+    expected_variables = ['soilt4', 'tg3']
+    assert VALID_CONFIG_DICT['variable'] == expected_variables
+
+def test_global_mean_values(tolerance=0.001):
+    """
+    The calculated_means values below were computed from these
+    eight forecast files:
+
+    bfg_1994010100_fhr09_soil_control.nc
+    bfg_1994010106_fhr06_soil_control.nc
+    bfg_1994010106_fhr09_soil_control.nc
+    bfg_1994010112_fhr06_soil_control.nc
+    bfg_1994010112_fhr09_soil_control.nc
+    bfg_1994010118_fhr06_soil_control.nc
+    bfg_1994010118_fhr09_soil_control.nc
+    bfg_1994010200_fhr06_soil_control.nc
+
+    When averaged together, these files represent a 24 hour mean. The
+    average values hard-coded in this test were calculated from the
+    forecast files using a separate Python script.
+
+    In this test there are four regions; the daily_bfg harvester
+    returns the values for all four regions at once.
+    """
+    data1 = harvest(VALID_CONFIG_DICT)
+
+    for item in data1:
+        if item.variable == 'soilt4' and item.statistic == 'mean':
+            calculated_means = [112.60353787828686, 42.149624950145956,
+                                101.11860562851737, 53.63455719991545]
+            for index in range(len(calculated_means)):
+                assert calculated_means[index] <= (1 + tolerance) * item.value[index]
+                assert calculated_means[index] >= (1 - tolerance) * item.value[index]
+        elif item.variable == 'tg3' and item.statistic == 'mean':
+            calculated_means = [113.15909397838503, 41.659809616478945,
+                                101.13745915934136, 53.681444435522614]
+            for index in range(len(calculated_means)):
+                assert calculated_means[index] <= (1 + tolerance) * item.value[index]
+                assert calculated_means[index] >= (1 - tolerance) * item.value[index]
+
+def test_gridcell_variance(tolerance=0.001):
+    """
+    The calculated_variances values were computed from the forecast
+    files listed above in a separate Python script.
+    """
+    data1 = harvest(VALID_CONFIG_DICT)
+
+    for item in data1:
+        if item.variable == 'soilt4' and item.statistic == 'variance':
+            calculated_variances = [11653.339922043157, 9262.338575055992,
+                                    12271.96061131527, 10198.287477989317]
+            for index in range(len(calculated_variances)):
+                assert calculated_variances[index] <= (1 + tolerance) * item.value[index]
+                assert calculated_variances[index] >= (1 - tolerance) * item.value[index]
+        elif item.variable == 'tg3' and item.statistic == 'variance':
+            calculated_variances = [11768.367926290819, 9047.996504014442,
+                                    12266.536131610563, 10213.705627241196]
+            for index in range(len(calculated_variances)):
+                assert calculated_variances[index] <= (1 + tolerance) * item.value[index]
+                assert calculated_variances[index] >= (1 - tolerance) * item.value[index]
+
+def test_gridcell_min_max(tolerance=0.001):
+    data1 = harvest(VALID_CONFIG_DICT)
+
+    for item in data1:
+        if item.variable == 'soilt4' and item.statistic == 'minimum':
+            calculated_min = [240.58295, 272.74335, 242.01396, 240.58295]
+            for index in range(len(calculated_min)):
+                assert calculated_min[index] <= (1 + tolerance) * item.value[index]
+                assert calculated_min[index] >= (1 - tolerance) * item.value[index]
+
+        elif item.variable == 'soilt4' and item.statistic == 'maximum':
+            calculated_max = [308.44702, 308.445, 308.44702, 307.2143]
+            for index in range(len(calculated_max)):
+                assert calculated_max[index] <= (1 + tolerance) * item.value[index]
+                assert calculated_max[index] >= (1 - tolerance) * item.value[index]
+
+        elif item.variable == 'tg3' and item.statistic == 'minimum':
+            calculated_min = [250.65329, 273.11252, 257.85712, 250.65329]
+            for index in range(len(calculated_min)):
+                assert calculated_min[index] <= (1 + tolerance) * item.value[index]
+                assert calculated_min[index] >= (1 - tolerance) * item.value[index]
+
+        elif item.variable == 'tg3' and item.statistic == 'maximum':
+            calculated_max = [302.39206, 301.11945, 302.39206, 302.35144]
+            for index in range(len(calculated_max)):
+                assert calculated_max[index] <= (1 + tolerance) * item.value[index]
+                assert calculated_max[index] >= (1 - tolerance) * item.value[index]
+
+def test_units():
+    data1 = harvest(VALID_CONFIG_DICT)
+
+    # The configured variables here are soilt4 and tg3; both are soil
+    # temperatures reported in kelvin.
+    for item in data1:
+        if item.variable == 'soilt4':
+            expected_units = 'K'
+            assert expected_units == item.units
+        elif item.variable == 'tg3':
+            expected_units = 'K'
+            assert expected_units == item.units
+
+def test_cycletime():
+    """The hard-coded datetime string 1994-01-01 12:00:00 is the
+    median midpoint time of the files listed above in BFG_PATH.
+    It is converted into a datetime object so that it can be compared
+    to the mediantime returned by the daily_bfg harvester.
+    """
+    data1 = harvest(VALID_CONFIG_DICT)
+    expected_datetime = datetime.strptime("1994-01-01 12:00:00",
+                                          "%Y-%m-%d %H:%M:%S")
+    assert data1[0].mediantime == expected_datetime
+
+def test_longname():
+    data1 = harvest(VALID_CONFIG_DICT)
+
+    for item in data1:
+        if item.variable == 'soilt4':
+            expected_longname = 'soil temperature unknown layer 4'
+            assert expected_longname == item.longname
+        elif item.variable == 'tg3':
+            expected_longname = 'deep soil temperature'
+            assert expected_longname == item.longname
+
+def test_soil_temperature_harvester():
+    data1 = harvest(VALID_CONFIG_DICT)
+    assert type(data1) is list
+    assert len(data1) > 0
+    assert data1[0].filenames == BFG_PATH
+
+def main():
+    test_gridcell_area_conservation()
+    test_soil_temperature_harvester()
+    test_variable_names()
+    test_units()
+    test_global_mean_values()
+    test_gridcell_variance()
+    test_gridcell_min_max()
+    test_cycletime()
+    test_longname()
+
+if __name__ == '__main__':
+    main()
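Taken together, the new test modules exercise the daily_bfg configuration schema end to end. A condensed usage sketch, assuming score_hv is installed and the listed bfg file exists; the single filename and region here are illustrative only:

from score_hv import hv_registry
from score_hv.harvester_base import harvest

config = {'harvester_name': hv_registry.DAILY_BFG,
          'filenames': ['tests/data/bfg_1994010100_fhr09_soil_control.nc'],
          'statistic': ['mean'],
          'variable': ['soilm'],
          'regions': {'north_hemi': {'north_lat': 90.0, 'south_lat': 0.0,
                                     'west_long': 0.0, 'east_long': 360.0}}}

for record in harvest(config):
    # Each record carries the variable, statistic, per-region values,
    # units, median time, and longname asserted in the tests above.
    print(record.variable, record.statistic, record.value, record.units)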