From 2c0332abb56dae4c182053b873a14b0d88f4ce27 Mon Sep 17 00:00:00 2001 From: George McCabe <23407799+georgemccabe@users.noreply.github.com> Date: Wed, 14 Jun 2023 12:05:34 -0600 Subject: [PATCH 01/18] update diff util and test that changes properly diff failling diffs, ci-run-diff --- .github/parm/use_case_groups.json | 4 +- metplus/util/diff_util.py | 422 +++++++++++++++++++----------- 2 files changed, 265 insertions(+), 161 deletions(-) diff --git a/.github/parm/use_case_groups.json b/.github/parm/use_case_groups.json index 0d5e34a9eb..a6fdaf3129 100644 --- a/.github/parm/use_case_groups.json +++ b/.github/parm/use_case_groups.json @@ -7,7 +7,7 @@ { "category": "met_tool_wrapper", "index_list": "30-58", - "run": false + "run": true }, { "category": "air_quality_and_comp", @@ -227,7 +227,7 @@ { "category": "tc_and_extra_tc", "index_list": "0-2", - "run": false + "run": true }, { "category": "tc_and_extra_tc", diff --git a/metplus/util/diff_util.py b/metplus/util/diff_util.py index 611732ab9b..a77f9df8c7 100755 --- a/metplus/util/diff_util.py +++ b/metplus/util/diff_util.py @@ -1,9 +1,13 @@ +#! /usr/bin/env python3 + +import sys import os import netCDF4 import filecmp import csv from PIL import Image, ImageChops import numpy +from numpy.core._exceptions import UFuncTypeError IMAGE_EXTENSIONS = [ '.jpg', @@ -37,6 +41,7 @@ # Note: Completing METplus issue #1873 could allow this to be set to 6 ROUNDING_PRECISION = 5 + def get_file_type(filepath): _, file_extension = os.path.splitext(filepath) @@ -55,7 +60,7 @@ def get_file_type(filepath): try: netCDF4.Dataset(filepath) return 'netcdf' - except: + except OSError: pass if file_extension in SKIP_EXTENSIONS: @@ -81,63 +86,69 @@ def compare_dir(dir_a, dir_b, debug=False, save_diff=False): return [result] diff_files = [] - for root, _, files in os.walk(dir_a): - # skip logs directories - if root.endswith('logs'): + for filepath_a in _get_files(dir_a): + filepath_b = filepath_a.replace(dir_a, dir_b) + print("\n# # # # # # # # # # # # # # # # # # # # # # # # # # " + "# # # #\n") + rel_path = filepath_a.replace(f'{dir_a}/', '') + print(f"COMPARING {rel_path}") + result = compare_files(filepath_a, + filepath_b, + debug=debug, + dir_a=dir_a, + dir_b=dir_b, + save_diff=save_diff) + + # no differences of skipped + if result is None or result is True: continue - for filename in files: - filepath_a = os.path.join(root, filename) + diff_files.append(result) - # skip directories - if not os.path.isfile(filepath_a): - continue + # loop through dir_b and report if any files are not found in dir_a + for filepath_b in _get_files(dir_b): + filepath_a = filepath_b.replace(dir_b, dir_a) + if os.path.exists(filepath_a): + continue + # check if missing file is actually diff file that was generated + diff_list = [item[3] for item in diff_files] + if filepath_b in diff_list: + continue + print(f"ERROR: File does not exist: {filepath_a}") + diff_files.append(('', filepath_b, 'file not found (new output)', '')) - # skip final conf file - if 'metplus_final.conf' in os.path.basename(filepath_a): - continue + print('::endgroup::') - filepath_b = filepath_a.replace(dir_a, dir_b) - print("\n# # # # # # # # # # # # # # # # # # # # # # # # # # " - "# # # #\n") - rel_path = filepath_a.replace(f'{dir_a}/', '') - print(f"COMPARING {rel_path}") - result = compare_files(filepath_a, - filepath_b, - debug=debug, - dir_a=dir_a, - dir_b=dir_b, - save_diff=save_diff) + _print_dir_summary(diff_files) + return diff_files - # no differences of skipped - if result is None or result is True: - continue - diff_files.append(result) +def _get_files(search_dir): + """!Generator to get all files in a directory. + Skips directories that end with 'logs' and files named metplus_final.conf - # loop through dir_b and report if any files are not found in dir_a - for root, _, files in os.walk(dir_b): + @param search_dir directory to search recursively + """ + for root, _, files in os.walk(search_dir): # skip logs directories if root.endswith('logs'): continue for filename in files: - filepath_b = os.path.join(root, filename) + filepath = os.path.join(root, filename) + + # skip directories + if not os.path.isfile(filepath): + continue # skip final conf file - if 'metplus_final.conf' in os.path.basename(filepath_b): + if 'metplus_final.conf' in os.path.basename(filepath): continue - filepath_a = filepath_b.replace(dir_b, dir_a) - if not os.path.exists(filepath_a): - # check if missing file is actually diff file that was generated - diff_list = [item[3] for item in diff_files] - if filepath_b in diff_list: - continue - print(f"ERROR: File does not exist: {filepath_a}") - diff_files.append(('', filepath_b, 'file not found (new output)', '')) + yield filepath - print('::endgroup::') + +def _print_dir_summary(diff_files): print("\n\n**************************************************\nSummary:\n") if diff_files: print("\nERROR: Some differences were found") @@ -151,7 +162,6 @@ def compare_dir(dir_a, dir_b, debug=False, save_diff=False): print("Finished comparing directories\n" "**************************************************\n\n") - return diff_files def compare_files(filepath_a, filepath_b, debug=False, dir_a=None, dir_b=None, @@ -178,47 +188,16 @@ def compare_files(filepath_a, filepath_b, debug=False, dir_a=None, dir_b=None, return filepath_a, filepath_b, file_type, '' if file_type == 'csv': - print('Comparing CSV') - if not compare_csv_files(filepath_a, filepath_b): - print(f'ERROR: CSV file differs: {filepath_b}') - return filepath_a, filepath_b, 'CSV diff', '' - - print("No differences in CSV files") - return True + return _handle_csv_files(filepath_a, filepath_b) if file_type == 'netcdf': - print("Comparing NetCDF") - if not nc_is_equal(filepath_a, filepath_b): - return filepath_a, filepath_b, 'NetCDF diff', '' - - print("No differences in NetCDF files") - return True + return _handle_netcdf_files(filepath_a, filepath_b) if file_type == 'pdf': - print("Comparing PDF as images") - diff_file = compare_pdf_as_images(filepath_a, filepath_b, - save_diff=save_diff) - if diff_file is True: - print("No differences in PDF files") - return True - - if diff_file is False: - diff_file = '' - - return filepath_a, filepath_b, 'PDF diff', diff_file + return _handle_pdf_files(filepath_a, filepath_b, save_diff) if file_type == 'image': - print("Comparing images") - diff_file = compare_image_files(filepath_a, filepath_b, - save_diff=save_diff) - if diff_file is True: - print("No differences in image files") - return True - - if diff_file is False: - diff_file = '' - - return filepath_a, filepath_b, 'Image diff', diff_file + return _handle_image_files(filepath_a, filepath_b, save_diff) # if not any of the above types, use diff to compare print("Comparing text files") @@ -236,6 +215,53 @@ def compare_files(filepath_a, filepath_b, debug=False, dir_a=None, dir_b=None, return True +def _handle_csv_files(filepath_a, filepath_b): + print('Comparing CSV') + if not compare_csv_files(filepath_a, filepath_b): + print(f'ERROR: CSV file differs: {filepath_b}') + return filepath_a, filepath_b, 'CSV diff', '' + + print("No differences in CSV files") + return True + + +def _handle_netcdf_files(filepath_a, filepath_b): + print("Comparing NetCDF") + if not nc_is_equal(filepath_a, filepath_b): + return filepath_a, filepath_b, 'NetCDF diff', '' + + print("No differences in NetCDF files") + return True + + +def _handle_pdf_files(filepath_a, filepath_b, save_diff): + print("Comparing PDF as images") + diff_file = compare_pdf_as_images(filepath_a, filepath_b, + save_diff=save_diff) + if diff_file is True: + print("No differences in PDF files") + return True + + if diff_file is False: + diff_file = '' + + return filepath_a, filepath_b, 'PDF diff', diff_file + + +def _handle_image_files(filepath_a, filepath_b, save_diff): + print("Comparing images") + diff_file = compare_image_files(filepath_a, filepath_b, + save_diff=save_diff) + if diff_file is True: + print("No differences in image files") + return True + + if diff_file is False: + diff_file = '' + + return filepath_a, filepath_b, 'Image diff', diff_file + + def compare_pdf_as_images(filepath_a, filepath_b, save_diff=False): try: from pdf2image import convert_from_path @@ -307,15 +333,20 @@ def compare_csv_files(filepath_a, filepath_b): lines_b = [] with open(filepath_a, 'r') as file_handle: - csv_read = csv.DictReader(file_handle, delimiter=',') - for row in csv_read: - lines_a.append(row) + lines_a.extend(csv.DictReader(file_handle, delimiter=',')) with open(filepath_b, 'r') as file_handle: - csv_read = csv.DictReader(file_handle, delimiter=',') - for row in csv_read: - lines_b.append(row) + lines_b.extend(csv.DictReader(file_handle, delimiter=',')) + # compare header values and number of lines + if not _compare_csv_lengths(lines_a, lines_b): + return False + + # compare each CSV column + return _compare_csv_columns(lines_a, lines_b) + + +def _compare_csv_lengths(lines_a, lines_b): keys_a = lines_a[0].keys() keys_b = lines_b[0].keys() # compare header columns and report error if they differ @@ -337,7 +368,11 @@ def compare_csv_files(filepath_a, filepath_b): f'than in OUTPUT ({len(lines_b)})') return False - # compare each CSV column + return True + + +def _compare_csv_columns(lines_a, lines_b): + keys_a = lines_a[0].keys() status = True for num, (line_a, line_b) in enumerate(zip(lines_a, lines_b), start=1): for key in keys_a: @@ -349,7 +384,7 @@ def compare_csv_files(filepath_a, filepath_b): # ROUNDING_PRECISION decimal places # METplus issue #1873 addresses the real problem try: - if is_equal_rounded(val_a, val_b): + if _is_equal_rounded(val_a, val_b): continue print(f"ERROR: Line {num} - {key} differs by " f"less than {ROUNDING_PRECISION} decimals: " @@ -364,7 +399,7 @@ def compare_csv_files(filepath_a, filepath_b): return status -def is_equal_rounded(value_a, value_b): +def _is_equal_rounded(value_a, value_b): if _truncate_float(value_a) == _truncate_float(value_b): return True if _round_float(value_a) == _round_float(value_b): @@ -425,10 +460,8 @@ def compare_txt_files(filepath_a, filepath_b, dir_a=None, dir_b=None): if is_stat_file: print("Comparing stat file") header_a = lines_a.pop(0).split()[1:] - header_b = lines_b.pop(0).split()[1:] else: header_a = None - header_b = None if len(lines_a) != len(lines_b): print(f"ERROR: Different number of lines in {filepath_b}") @@ -475,110 +508,181 @@ def diff_text_lines(lines_a, lines_b, compare_b = compare_b.replace(dir_b, dir_a) # check for differences - if compare_a != compare_b: - # if the diff is in a stat file, ignore the version number - if is_stat_file: - cols_a = compare_a.split()[1:] - cols_b = compare_b.split()[1:] - for col_a, col_b, label in zip(cols_a, cols_b, header_a): - if col_a != col_b: - if print_error: - print(f"ERROR: {label} differs:\n" - f" A: {col_a}\n B: {col_b}") - all_good = False - else: - if print_error: - print(f"ERROR: Line differs\n" - f" A: {compare_a}\n B: {compare_b}") + if compare_a == compare_b: + continue + + # if the diff is in a stat file, ignore the version number + if is_stat_file: + if not _diff_stat_line(compare_a, compare_b, header_a, print_error=print_error): all_good = False + continue + + if print_error: + print(f"ERROR: Line differs\n" + f" A: {compare_a}\n B: {compare_b}") + all_good = False + + return all_good + + +def _diff_stat_line(compare_a, compare_b, header_a, print_error=False): + """Compare values in .stat file. Ignore first column which contains MET + version number + + @param compare_a list of values in line A + @param compare_b list of values in line B + @param header_a list of header values in file A excluding MET version + @param print_error If True, print an error message if any value differs + """ + cols_a = compare_a.split()[1:] + cols_b = compare_b.split()[1:] + all_good = True + for col_a, col_b, label in zip(cols_a, cols_b, header_a): + if col_a == col_b: + continue + if print_error: + print(f"ERROR: {label} differs:\n" + f" A: {col_a}\n B: {col_b}") + all_good = False return all_good def nc_is_equal(file_a, file_b, fields=None, debug=False): """! Check if two NetCDF files have the same data - @param file_a first file to compare - @param file_b second file to compare - @param fields (Optional) list of fields to compare. If unset, compare - all fields - @returns True if all values in fields are equivalent, False if not + + @param file_a first file to compare + @param file_b second file to compare + @param fields (Optional) list of fields to compare. If unset, compare all + @param debug (optional) boolean to output more information about diff + @returns True if all values in fields are equivalent, False if not """ nc_a = netCDF4.Dataset(file_a) nc_b = netCDF4.Dataset(file_b) + # keep track of any differences that are found + is_equal = True + # if no fields are specified, get all of them if fields: - if not isinstance(fields, list): - field_list = [fields] - else: - field_list = fields + field_list = [fields] if not isinstance(fields, list) else fields else: a_fields = sorted(nc_a.variables.keys()) b_fields = sorted(nc_b.variables.keys()) + # fail if any fields exist in 1 file and not the other if a_fields != b_fields: print("ERROR: Field list differs between files\n" f" File_A: {a_fields}\n File_B:{b_fields}\n" f"Using File_A fields.") + is_equal = False field_list = a_fields # loop through fields, keeping track of any differences - is_equal = True - try: - for field in field_list: - var_a = nc_a.variables[field] - var_b = nc_b.variables[field] - - if debug: - print(f"Field: {field}") - print(f"Var_A:{var_a}\nVar_B:{var_b}") - print(f"Instance type: {type(var_a[0])}") - try: - values_a = var_a[:] - values_b = var_b[:] - values_diff = values_a - values_b - if (numpy.isnan(values_diff.min()) and - numpy.isnan(values_diff.max())): - print(f"WARNING: Variable {field} contains NaN values. " - "Cannot perform comparison.") - elif values_diff.min() != 0.0 or values_diff.max() != 0.0: - print(f"ERROR: Field ({field}) values differ\n" - f"Min diff: {values_diff.min()}, " - f"Max diff: {values_diff.max()}") - is_equal = False - # print indices that are not zero and count of diffs - if debug: - count = 0 - values_list = [j for sub in values_diff.tolist() - for j in sub] - for idx, val in enumerate(values_list): - if val != 0.0: - print(f"{idx}: {val}") - count += 1 - print(f"{count} / {idx+1} points differ") - - except: - # handle non-numeric fields - try: - if any(var_a[:].flatten() != var_b[:].flatten()): - print(f"ERROR: Field ({field}) values (non-numeric) " - "differ\n" - f" File_A: {var_a[:]}\n File_B: {var_b[:]}") - is_equal = False - except: - print("ERROR: Couldn't diff NetCDF files, need to update diff method") - is_equal = False + for field in field_list: + if not _nc_fields_are_equal(field, nc_a, nc_b, debug=debug): + is_equal = False + return is_equal + + +def _nc_fields_are_equal(field, nc_a, nc_b, debug=False): + """!Compare same field from 2 NetCDF files. + + @param field name of field to compare + @param nc_a first netCDF4.Dataset + @param nc_b first netCDF4.Dataset + @param debug (optional) boolean to output more information about diff + @returns True is fields are equal, False if fields are not equal or if + field is not found in one of the files + """ + try: + var_a = nc_a.variables[field] + var_b = nc_b.variables[field] except KeyError: print(f"ERROR: Field {field} not found") return False - return is_equal + if debug: + print(f"Field: {field}") + print(f"Var_A:{var_a}\nVar_B:{var_b}") + print(f"Instance type: {type(var_a[0])}") + + values_a = var_a[:] + values_b = var_b[:] + try: + values_diff = values_a - values_b + except (UFuncTypeError, TypeError): + # handle non-numeric fields + if not _all_values_are_equal(var_a, var_b): + print(f"ERROR: Field ({field}) values (non-numeric) " + "differ\n" + f" File_A: {var_a[:]}\n File_B: {var_b[:]}") + return False + + return True + + # if any NaN values in either data set, min and max of diff will be NaN + # compare each value + if numpy.isnan(values_diff.min()) and numpy.isnan(values_diff.max()): + print(f"Variable {field} contains NaN. Comparing each value...") + if not _all_values_are_equal(var_a, var_b): + print(f'ERROR: Some values differ in {field}') + return False + return True + + # consider all values equal is min and max diff are 0 + if not values_diff.min() and not values_diff.max(): + return True + + print(f"ERROR: Field ({field}) values differ\n" + f"Min diff: {values_diff.min()}, " + f"Max diff: {values_diff.max()}") + if debug: + # print indices that are not zero and count of diffs + _print_nc_field_diff_summary(values_diff) + + return False + + +def _print_nc_field_diff_summary(values_diff): + """!Print summary of NetCDF fields that differ. Prints the index of each + point that differs with the numeric difference between the points. + Also print number of points that differ and the total number of points. + + @param values_diff numpy array (possibly 2D) of differences + """ + count = 0 + values_list = [j for sub in values_diff.tolist() for j in sub] + for idx, val in enumerate(values_list): + if val != 0.0: + print(f"{idx}: {val}") + count += 1 + print(f"{count} / {idx + 1} points differ") + + +def _all_values_are_equal(var_a, var_b): + """!Compare each value to find differences. Handles case if both values + are NaN. + + @param var_a Numpy array + @param var_b Numpy array + @returns True if all values are equal, False otherwise + """ + for val_a, val_b in zip(var_a[:].flatten(), var_b[:].flatten()): + # continue to next value if both values are NaN + if numpy.isnan(val_a) and numpy.isnan(val_b): + continue + if val_a != val_b: + return False + return True if __name__ == '__main__': + if len(sys.argv) < 3: + print('ERROR: Must supply 2 directories to compare as arguments') + sys.exit(1) dir_a = sys.argv[1] dir_b = sys.argv[2] - if len(sys.argv) > 3: - save_diff = True + save_diff = len(sys.argv) > 3 compare_dir(dir_a, dir_b, debug=True, save_diff=save_diff) From 905b409bc16b5b8870b0202e8a820bb34e3f1735 Mon Sep 17 00:00:00 2001 From: George McCabe <23407799+georgemccabe@users.noreply.github.com> Date: Wed, 14 Jun 2023 13:44:38 -0600 Subject: [PATCH 02/18] turn off use case --- .github/parm/use_case_groups.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/parm/use_case_groups.json b/.github/parm/use_case_groups.json index a6fdaf3129..131e0eed72 100644 --- a/.github/parm/use_case_groups.json +++ b/.github/parm/use_case_groups.json @@ -137,7 +137,7 @@ { "category": "s2s", "index_list": "3", - "run": true + "run": false }, { "category": "s2s", From c2b6e3a37dc9d07307dc11560dc1705da2c6ae63 Mon Sep 17 00:00:00 2001 From: George McCabe <23407799+georgemccabe@users.noreply.github.com> Date: Wed, 14 Jun 2023 13:48:25 -0600 Subject: [PATCH 03/18] catch any exception that occurs in NetCDF file diff, ci-skip-unit-tests --- metplus/util/diff_util.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/metplus/util/diff_util.py b/metplus/util/diff_util.py index a77f9df8c7..680f85fa6b 100755 --- a/metplus/util/diff_util.py +++ b/metplus/util/diff_util.py @@ -669,12 +669,16 @@ def _all_values_are_equal(var_a, var_b): @param var_b Numpy array @returns True if all values are equal, False otherwise """ - for val_a, val_b in zip(var_a[:].flatten(), var_b[:].flatten()): - # continue to next value if both values are NaN - if numpy.isnan(val_a) and numpy.isnan(val_b): - continue - if val_a != val_b: - return False + try: + for val_a, val_b in zip(var_a[:].flatten(), var_b[:].flatten()): + # continue to next value if both values are NaN + if numpy.isnan(val_a) and numpy.isnan(val_b): + continue + if val_a != val_b: + return False + except Exception as err: + print(f'ERROR: Exception occurred: {err}') + return False return True From 5fdee80f510df17d7765521dec95cc2cd766765c Mon Sep 17 00:00:00 2001 From: George McCabe <23407799+georgemccabe@users.noreply.github.com> Date: Wed, 14 Jun 2023 14:10:10 -0600 Subject: [PATCH 04/18] catch any exception from file diff instead of just in NetCDF diff, ci-skip-unit-tests --- metplus/util/diff_util.py | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/metplus/util/diff_util.py b/metplus/util/diff_util.py index 680f85fa6b..be63dd034c 100755 --- a/metplus/util/diff_util.py +++ b/metplus/util/diff_util.py @@ -92,12 +92,16 @@ def compare_dir(dir_a, dir_b, debug=False, save_diff=False): "# # # #\n") rel_path = filepath_a.replace(f'{dir_a}/', '') print(f"COMPARING {rel_path}") - result = compare_files(filepath_a, - filepath_b, - debug=debug, - dir_a=dir_a, - dir_b=dir_b, - save_diff=save_diff) + try: + result = compare_files(filepath_a, + filepath_b, + debug=debug, + dir_a=dir_a, + dir_b=dir_b, + save_diff=save_diff) + except Exception as err: + print(f"ERROR: Exception occurred in diff logic: {err}") + result = filepath_a, filepath_b, 'Exception in diff logic', '' # no differences of skipped if result is None or result is True: @@ -669,16 +673,12 @@ def _all_values_are_equal(var_a, var_b): @param var_b Numpy array @returns True if all values are equal, False otherwise """ - try: - for val_a, val_b in zip(var_a[:].flatten(), var_b[:].flatten()): - # continue to next value if both values are NaN - if numpy.isnan(val_a) and numpy.isnan(val_b): - continue - if val_a != val_b: - return False - except Exception as err: - print(f'ERROR: Exception occurred: {err}') - return False + for val_a, val_b in zip(var_a[:].flatten(), var_b[:].flatten()): + # continue to next value if both values are NaN + if numpy.isnan(val_a) and numpy.isnan(val_b): + continue + if val_a != val_b: + return False return True From 21af4b24d45bc7c461d0892d6fc563411a825e9b Mon Sep 17 00:00:00 2001 From: George McCabe <23407799+georgemccabe@users.noreply.github.com> Date: Wed, 14 Jun 2023 14:12:58 -0600 Subject: [PATCH 05/18] forgot to trigger diff, ci-run-diff, ci-skip-unit-tests --- metplus/util/diff_util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/metplus/util/diff_util.py b/metplus/util/diff_util.py index be63dd034c..437302eb13 100755 --- a/metplus/util/diff_util.py +++ b/metplus/util/diff_util.py @@ -627,7 +627,7 @@ def _nc_fields_are_equal(field, nc_a, nc_b, debug=False): return True # if any NaN values in either data set, min and max of diff will be NaN - # compare each value + # compare each value if numpy.isnan(values_diff.min()) and numpy.isnan(values_diff.max()): print(f"Variable {field} contains NaN. Comparing each value...") if not _all_values_are_equal(var_a, var_b): From 2172134b869ec6f3bf78be7f07b78db5e61fa9b5 Mon Sep 17 00:00:00 2001 From: George McCabe <23407799+georgemccabe@users.noreply.github.com> Date: Wed, 14 Jun 2023 14:16:06 -0600 Subject: [PATCH 06/18] use pandas.isnull instead of numpy.isnan to prevent exceptions from occurring, ci-run-diff, ci-skip-unit-tests --- metplus/util/diff_util.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/metplus/util/diff_util.py b/metplus/util/diff_util.py index 437302eb13..f9ca1215f1 100755 --- a/metplus/util/diff_util.py +++ b/metplus/util/diff_util.py @@ -6,7 +6,7 @@ import filecmp import csv from PIL import Image, ImageChops -import numpy +from pandas import isnull from numpy.core._exceptions import UFuncTypeError IMAGE_EXTENSIONS = [ @@ -627,8 +627,8 @@ def _nc_fields_are_equal(field, nc_a, nc_b, debug=False): return True # if any NaN values in either data set, min and max of diff will be NaN - # compare each value - if numpy.isnan(values_diff.min()) and numpy.isnan(values_diff.max()): + # compare each value + if isnull(values_diff.min()) and isnull(values_diff.max()): print(f"Variable {field} contains NaN. Comparing each value...") if not _all_values_are_equal(var_a, var_b): print(f'ERROR: Some values differ in {field}') @@ -675,7 +675,7 @@ def _all_values_are_equal(var_a, var_b): """ for val_a, val_b in zip(var_a[:].flatten(), var_b[:].flatten()): # continue to next value if both values are NaN - if numpy.isnan(val_a) and numpy.isnan(val_b): + if isnull(val_a) and isnull(val_b): continue if val_a != val_b: return False From eefaa0c90eb71d0361941d7677f0a0d4efcd5cce Mon Sep 17 00:00:00 2001 From: George McCabe <23407799+georgemccabe@users.noreply.github.com> Date: Wed, 14 Jun 2023 14:41:49 -0600 Subject: [PATCH 07/18] capture failure in diff script, ci-run-diff, ci-skip-unit-tests --- .github/jobs/run_difference_tests.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/jobs/run_difference_tests.sh b/.github/jobs/run_difference_tests.sh index 49e7242ec8..333e0372e7 100755 --- a/.github/jobs/run_difference_tests.sh +++ b/.github/jobs/run_difference_tests.sh @@ -12,6 +12,7 @@ matrix_categories=$1 artifact_name=$2 .github/jobs/setup_and_run_diff.py ${matrix_categories} $artifact_name +ret=$? if [ "$( ls -A ${RUNNER_WORKSPACE}/diff)" ]; then echo "upload_diff=true" >> $GITHUB_OUTPUT @@ -21,3 +22,4 @@ if [ "$( ls -A ${RUNNER_WORKSPACE}/diff)" ]; then fi echo "upload_diff=false" >> $GITHUB_OUTPUT +return $ret \ No newline at end of file From 0201daa8df93dcfc75859ba62ce6728805cf40af Mon Sep 17 00:00:00 2001 From: George McCabe <23407799+georgemccabe@users.noreply.github.com> Date: Wed, 14 Jun 2023 14:46:22 -0600 Subject: [PATCH 08/18] ignore .idea directory that was auto-generated --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index a065a4e761..f62ca72792 100644 --- a/.gitignore +++ b/.gitignore @@ -88,6 +88,8 @@ ENV/ # pytest cache files *.pytest_cache* +.idea + # tilda files generated by emacs *~ From 194088d68f6e7bc4071e313619152903336648cc Mon Sep 17 00:00:00 2001 From: George McCabe <23407799+georgemccabe@users.noreply.github.com> Date: Wed, 14 Jun 2023 15:03:31 -0600 Subject: [PATCH 09/18] fix incorrect syntax, ci-run-diff, ci-skip-unit-tests --- .github/jobs/run_difference_tests.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/jobs/run_difference_tests.sh b/.github/jobs/run_difference_tests.sh index 333e0372e7..8112ad0ecf 100755 --- a/.github/jobs/run_difference_tests.sh +++ b/.github/jobs/run_difference_tests.sh @@ -22,4 +22,4 @@ if [ "$( ls -A ${RUNNER_WORKSPACE}/diff)" ]; then fi echo "upload_diff=false" >> $GITHUB_OUTPUT -return $ret \ No newline at end of file +exit $ret \ No newline at end of file From 8017d99c3d9c2471488300a225362490abaf7983 Mon Sep 17 00:00:00 2001 From: George McCabe <23407799+georgemccabe@users.noreply.github.com> Date: Wed, 14 Jun 2023 16:13:58 -0600 Subject: [PATCH 10/18] handle failing comparison when NetCDF values are stored as a string, ci-run-diff, ci-skip-unit-tests --- metplus/util/diff_util.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/metplus/util/diff_util.py b/metplus/util/diff_util.py index f9ca1215f1..ccaac6846e 100755 --- a/metplus/util/diff_util.py +++ b/metplus/util/diff_util.py @@ -673,6 +673,11 @@ def _all_values_are_equal(var_a, var_b): @param var_b Numpy array @returns True if all values are equal, False otherwise """ + # if the values are stored as a string, compare them with == + if isinstance(var_a[:], str) or isinstance(var_b[:], str): + return var_a[:] == var_b[:] + + # flatten the numpy.ndarray and compare each value for val_a, val_b in zip(var_a[:].flatten(), var_b[:].flatten()): # continue to next value if both values are NaN if isnull(val_a) and isnull(val_b): From d7981f218aed398d66b81faac35f065a3a5146b0 Mon Sep 17 00:00:00 2001 From: George McCabe <23407799+georgemccabe@users.noreply.github.com> Date: Wed, 14 Jun 2023 16:15:16 -0600 Subject: [PATCH 11/18] Preserve backwards compatibility by forcing TC_PAIRS_RUN_ONCE=False if LOOP_ORDER is still used and warn that it should be removed. Add debug log message to alert user that only the first init/valid time is being processed and how to change that if desired --- metplus/wrappers/tc_pairs_wrapper.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/metplus/wrappers/tc_pairs_wrapper.py b/metplus/wrappers/tc_pairs_wrapper.py index 61fad0a136..78377e4317 100755 --- a/metplus/wrappers/tc_pairs_wrapper.py +++ b/metplus/wrappers/tc_pairs_wrapper.py @@ -293,6 +293,22 @@ def create_c_dict(self): False) ) + # check for settings that cause differences moving from v4.1 to v5.0 + # warn and update run setting to preserve old behavior + if (self.config.has_option('config', 'LOOP_ORDER') and + self.config.getstr_nocheck('config', 'LOOP_ORDER') == 'times' and + not self.config.has_option('config', 'TC_PAIRS_RUN_ONCE')): + self.logger.warning( + 'LOOP_ORDER has been deprecated. LOOP_ORDER has been set to ' + '"times" and TC_PAIRS_RUN_ONCE is not set. ' + 'Forcing TC_PAIRS_RUN_ONCE=False to preserve behavior prior to ' + 'v5.0.0. Please remove LOOP_ORDER and set ' + 'TC_PAIRS_RUN_ONCE=False to preserve previous behavior and ' + 'remove this warning message.' + ) + c_dict['RUN_ONCE'] = False + return c_dict + # only run once if True c_dict['RUN_ONCE'] = self.config.getbool('config', 'TC_PAIRS_RUN_ONCE', @@ -318,6 +334,8 @@ def run_all_times(self): if not self.c_dict['RUN_ONCE']: return super().run_all_times() + self.logger.debug('Only processing first run time. Set ' + 'TC_PAIRS_RUN_ONCE=False to process all run times.') self.run_at_time(input_dict) return self.all_commands From c8a80b2bdd4c6c30322280dcf4eeceedb66fd62d Mon Sep 17 00:00:00 2001 From: George McCabe <23407799+georgemccabe@users.noreply.github.com> Date: Wed, 14 Jun 2023 16:17:38 -0600 Subject: [PATCH 12/18] add pandas to diff conda environment used for diff tests in GHA to properly check for null/NaN values --- internal/scripts/docker_env/scripts/diff_env.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/internal/scripts/docker_env/scripts/diff_env.sh b/internal/scripts/docker_env/scripts/diff_env.sh index 92a59c1470..bb89248e0c 100755 --- a/internal/scripts/docker_env/scripts/diff_env.sh +++ b/internal/scripts/docker_env/scripts/diff_env.sh @@ -2,10 +2,11 @@ ################################################################################ # Environment: diff.v5.1 -# Last Updated: 2023-01-31 (mccabe@ucar.edu) +# Last Updated: 2023-06-14 (mccabe@ucar.edu) # Notes: Adds packages needed to run differences tests to compare output to # truth data. # Python Packages: +# pandas # pillow==9.2.0 # pdf2image==1.16.0 # @@ -23,6 +24,7 @@ ENV_NAME=diff.${METPLUS_VERSION} BASE_ENV=netcdf4.${METPLUS_VERSION} conda create -y --clone ${BASE_ENV} --name ${ENV_NAME} +conda install -y --name ${ENV_NAME} -c conda-forge pandas conda install -y --name ${ENV_NAME} -c conda-forge pillow==9.2.0 apt install -y poppler-utils From 353fd152d9cf5374860b3bd3a1044f8d99e71d79 Mon Sep 17 00:00:00 2001 From: George McCabe <23407799+georgemccabe@users.noreply.github.com> Date: Wed, 14 Jun 2023 16:44:23 -0600 Subject: [PATCH 13/18] turn off use cases after tests succeeded --- .github/parm/use_case_groups.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/parm/use_case_groups.json b/.github/parm/use_case_groups.json index 131e0eed72..23ff852726 100644 --- a/.github/parm/use_case_groups.json +++ b/.github/parm/use_case_groups.json @@ -7,7 +7,7 @@ { "category": "met_tool_wrapper", "index_list": "30-58", - "run": true + "run": false }, { "category": "air_quality_and_comp", @@ -227,7 +227,7 @@ { "category": "tc_and_extra_tc", "index_list": "0-2", - "run": true + "run": false }, { "category": "tc_and_extra_tc", From 52f5cebef3b734986fd9a08195e740308d99f9f9 Mon Sep 17 00:00:00 2001 From: George McCabe <23407799+georgemccabe@users.noreply.github.com> Date: Thu, 15 Jun 2023 10:55:24 -0600 Subject: [PATCH 14/18] update apt-get to fix poppler-utils install error, add specific version for pandas --- internal/scripts/docker_env/scripts/diff_env.sh | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/internal/scripts/docker_env/scripts/diff_env.sh b/internal/scripts/docker_env/scripts/diff_env.sh index bb89248e0c..54544a38e8 100755 --- a/internal/scripts/docker_env/scripts/diff_env.sh +++ b/internal/scripts/docker_env/scripts/diff_env.sh @@ -6,7 +6,7 @@ # Notes: Adds packages needed to run differences tests to compare output to # truth data. # Python Packages: -# pandas +# pandas==2.0.2 # pillow==9.2.0 # pdf2image==1.16.0 # @@ -24,9 +24,10 @@ ENV_NAME=diff.${METPLUS_VERSION} BASE_ENV=netcdf4.${METPLUS_VERSION} conda create -y --clone ${BASE_ENV} --name ${ENV_NAME} -conda install -y --name ${ENV_NAME} -c conda-forge pandas +conda install -y --name ${ENV_NAME} -c conda-forge pandas==2.0.2 conda install -y --name ${ENV_NAME} -c conda-forge pillow==9.2.0 -apt install -y poppler-utils +apt-get update +apt-get install -y poppler-utils conda install -y --name ${ENV_NAME} -c conda-forge pdf2image==1.16.0 From 1421c18e5b01e37881c1a39fdfdbe0043327a762 Mon Sep 17 00:00:00 2001 From: George McCabe <23407799+georgemccabe@users.noreply.github.com> Date: Thu, 15 Jun 2023 11:07:20 -0600 Subject: [PATCH 15/18] run use case to test that diff.v5.1 conda env was created properly, ci-run-diff --- .github/parm/use_case_groups.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/parm/use_case_groups.json b/.github/parm/use_case_groups.json index 23ff852726..6bae411ea5 100644 --- a/.github/parm/use_case_groups.json +++ b/.github/parm/use_case_groups.json @@ -157,7 +157,7 @@ { "category": "s2s_mid_lat", "index_list": "0-2", - "run": false + "run": true }, { "category": "s2s_mid_lat", From b29a7acc67d80c9107aca78e6641a154365cb23a Mon Sep 17 00:00:00 2001 From: George McCabe <23407799+georgemccabe@users.noreply.github.com> Date: Thu, 15 Jun 2023 11:23:04 -0600 Subject: [PATCH 16/18] check if values are completely equal inside _is_equal_rounded function, call that function in NC value diff to prevent differences that are smaller than 5 decimal places --- metplus/util/diff_util.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/metplus/util/diff_util.py b/metplus/util/diff_util.py index ccaac6846e..14efccf47c 100755 --- a/metplus/util/diff_util.py +++ b/metplus/util/diff_util.py @@ -382,8 +382,6 @@ def _compare_csv_columns(lines_a, lines_b): for key in keys_a: val_a = line_a[key] val_b = line_b[key] - if val_a == val_b: - continue # prevent error if values are diffs are less than # ROUNDING_PRECISION decimal places # METplus issue #1873 addresses the real problem @@ -404,6 +402,8 @@ def _compare_csv_columns(lines_a, lines_b): def _is_equal_rounded(value_a, value_b): + if value_a == value_b: + return True if _truncate_float(value_a) == _truncate_float(value_b): return True if _round_float(value_a) == _round_float(value_b): @@ -682,7 +682,8 @@ def _all_values_are_equal(var_a, var_b): # continue to next value if both values are NaN if isnull(val_a) and isnull(val_b): continue - if val_a != val_b: + if not _is_equal_rounded(val_a, val_b): + print(f'val_a: {val_a}, val_b: {val_b}') return False return True From 2b9bea52580fca1621a94958bf18fa91728980db Mon Sep 17 00:00:00 2001 From: George McCabe <23407799+georgemccabe@users.noreply.github.com> Date: Thu, 15 Jun 2023 13:35:26 -0600 Subject: [PATCH 17/18] turn off all use case groups before re-opening PR --- .github/parm/use_case_groups.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/parm/use_case_groups.json b/.github/parm/use_case_groups.json index 6bae411ea5..23ff852726 100644 --- a/.github/parm/use_case_groups.json +++ b/.github/parm/use_case_groups.json @@ -157,7 +157,7 @@ { "category": "s2s_mid_lat", "index_list": "0-2", - "run": true + "run": false }, { "category": "s2s_mid_lat", From 0cbcff25c134e7379696a58e9d2e0693cd7d53db Mon Sep 17 00:00:00 2001 From: George McCabe <23407799+georgemccabe@users.noreply.github.com> Date: Thu, 15 Jun 2023 17:16:08 -0600 Subject: [PATCH 18/18] if both netcdf values are masked, consider them equal --- metplus/util/diff_util.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/metplus/util/diff_util.py b/metplus/util/diff_util.py index 14efccf47c..f26ba790f2 100755 --- a/metplus/util/diff_util.py +++ b/metplus/util/diff_util.py @@ -7,6 +7,7 @@ import csv from PIL import Image, ImageChops from pandas import isnull +from numpy.ma import is_masked from numpy.core._exceptions import UFuncTypeError IMAGE_EXTENSIONS = [ @@ -680,7 +681,7 @@ def _all_values_are_equal(var_a, var_b): # flatten the numpy.ndarray and compare each value for val_a, val_b in zip(var_a[:].flatten(), var_b[:].flatten()): # continue to next value if both values are NaN - if isnull(val_a) and isnull(val_b): + if (isnull(val_a) and isnull(val_b)) or (is_masked(val_a) and is_masked(val_b)): continue if not _is_equal_rounded(val_a, val_b): print(f'val_a: {val_a}, val_b: {val_b}')