From a51c83d811a3696f3ec39043e8dbebbfb798c451 Mon Sep 17 00:00:00 2001 From: "Michael Kavulich, Jr" Date: Tue, 21 Feb 2023 15:22:20 -0700 Subject: [PATCH 01/52] Fix error introduced to pythonized WE2E script --- tests/WE2E/run_WE2E_tests.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/WE2E/run_WE2E_tests.py b/tests/WE2E/run_WE2E_tests.py index 66df2e8205..a189301fb3 100755 --- a/tests/WE2E/run_WE2E_tests.py +++ b/tests/WE2E/run_WE2E_tests.py @@ -175,8 +175,8 @@ def run_we2e_tests(homedir, args) -> None: if 'verification' in test_cfg: logging.debug(test_cfg['verification']) - test_cfg['verification'] = check_task_verification(test_cfg,machine_defaults,config_defaults) - logging.debug(test_cfg['verification']) + test_cfg['verification'] = check_task_verification(test_cfg,machine_defaults,config_defaults) + logging.debug(test_cfg['verification']) logging.debug(f"Writing updated config.yaml for test {test_name}\nbased on specified command-line arguments:\n") logging.debug(cfg_to_yaml_str(test_cfg)) From 7e393a031cd6f7290373d238f20fcef48b8f4410 Mon Sep 17 00:00:00 2001 From: "Michael Kavulich, Jr" Date: Wed, 22 Feb 2023 22:43:36 +0000 Subject: [PATCH 02/52] Fix failing "specify_template_filenames" test --- .../wflow_features/config.specify_template_filenames.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/WE2E/test_configs/wflow_features/config.specify_template_filenames.yaml b/tests/WE2E/test_configs/wflow_features/config.specify_template_filenames.yaml index 462de85819..2c39bc388e 100644 --- a/tests/WE2E/test_configs/wflow_features/config.specify_template_filenames.yaml +++ b/tests/WE2E/test_configs/wflow_features/config.specify_template_filenames.yaml @@ -5,11 +5,11 @@ metadata: user: RUN_ENVIR: community workflow: - DATA_TABLE_TMPL_FN: data_table + DATA_TABLE_FN: data_table DIAG_TABLE_TMPL_FN: diag_table.FV3_GFS_v15p2 FIELD_TABLE_TMPL_FN: field_table.FV3_GFS_v15p2 - MODEL_CONFIG_TMPL_FN: model_configure - NEMS_CONFIG_TMPL_FN: nems.configure + MODEL_CONFIG_FN: model_configure + NEMS_CONFIG_FN: nems.configure CCPP_PHYS_SUITE: FV3_GFS_v15p2 PREDEF_GRID_NAME: RRFS_CONUS_25km DATE_FIRST_CYCL: '2019070100' From a63286de3ca77f2e45ef3b5beee9aacf18e96004 Mon Sep 17 00:00:00 2001 From: "Michael Kavulich, Jr" Date: Wed, 22 Feb 2023 23:13:45 +0000 Subject: [PATCH 03/52] Add logic and instructions for neatly interrupting and resuming the monitor script --- tests/WE2E/monitor_jobs.py | 4 ++++ tests/WE2E/run_WE2E_tests.py | 21 ++++++++++++++------- 2 files changed, 18 insertions(+), 7 deletions(-) diff --git a/tests/WE2E/monitor_jobs.py b/tests/WE2E/monitor_jobs.py index 8fbd4f2afb..3ef9ddd0c5 100755 --- a/tests/WE2E/monitor_jobs.py +++ b/tests/WE2E/monitor_jobs.py @@ -51,6 +51,7 @@ def monitor_jobs(expt_dict: dict, monitor_file: str = '', debug: bool = False) - write_monitor_file(monitor_file,expt_dict) logging.info(f'Setup complete; monitoring {len(expt_dict)} experiments') + logging.info('Use ctrl-c to pause job submission/monitoring') #Make a copy of experiment dictionary; will use this copy to monitor active experiments running_expts = expt_dict.copy() @@ -262,6 +263,9 @@ def setup_logging(logfile: str = "log.run_WE2E_tests", debug: bool = False) -> N try: monitor_jobs(expt_dict,args.yaml_file, args.debug) + except KeyboardInterrupt: + logging.info("\n\nUser interrupted monitor script; to resume monitoring jobs run:\n") + logging.info(f"{__file__} -y={args.yaml_file}\n") except: logging.exception( dedent( diff --git 
a/tests/WE2E/run_WE2E_tests.py b/tests/WE2E/run_WE2E_tests.py index a189301fb3..18abf54999 100755 --- a/tests/WE2E/run_WE2E_tests.py +++ b/tests/WE2E/run_WE2E_tests.py @@ -6,6 +6,7 @@ import argparse import logging from textwrap import dedent +from datetime import datetime sys.path.append("../../ush") @@ -17,7 +18,7 @@ from check_python_version import check_python_version -from monitor_jobs import monitor_jobs +from monitor_jobs import monitor_jobs, write_monitor_file def run_we2e_tests(homedir, args) -> None: @@ -205,12 +206,18 @@ def run_we2e_tests(homedir, args) -> None: if not args.use_cron_to_relaunch: logging.info("calling function that monitors jobs, prints summary") - monitor_file = monitor_jobs(monitor_yaml, debug=args.debug) - - logging.info("All experiments are complete") - logging.info(f"Summary of results available in {monitor_file}") - - + monitor_file = f'WE2E_tests_{datetime.now().strftime("%Y%m%d%H%M%S")}.yaml' + write_monitor_file(monitor_file,monitor_yaml) + try: + monitor_file = monitor_jobs(monitor_yaml, monitor_file=monitor_file, debug=args.debug) + except KeyboardInterrupt: + logging.info("\n\nUser interrupted monitor script; to resume monitoring jobs run:\n") + logging.info(f"./monitor_jobs.py -y={monitor_file}\n") + except: + raise + else: + logging.info("All experiments are complete") + logging.info(f"Summary of results available in {monitor_file}") From 5330ecf4ac58a778508b53e0b318d8b3c240d2b6 Mon Sep 17 00:00:00 2001 From: "Michael Kavulich, Jr" Date: Thu, 23 Feb 2023 03:43:19 +0000 Subject: [PATCH 04/52] Consolidate check_task_get_extrn_ics and check_task_get_extrn_lbcs into a single function: check_task_get_extrn_bcs --- tests/WE2E/run_WE2E_tests.py | 125 +++++++++++------------------------ 1 file changed, 38 insertions(+), 87 deletions(-) diff --git a/tests/WE2E/run_WE2E_tests.py b/tests/WE2E/run_WE2E_tests.py index 18abf54999..285e81b615 100755 --- a/tests/WE2E/run_WE2E_tests.py +++ b/tests/WE2E/run_WE2E_tests.py @@ -167,11 +167,11 @@ def run_we2e_tests(homedir, args) -> None: if 'task_get_extrn_ics' in test_cfg: logging.debug(test_cfg['task_get_extrn_ics']) - test_cfg['task_get_extrn_ics'] = check_task_get_extrn_ics(test_cfg,machine_defaults,config_defaults) + test_cfg['task_get_extrn_ics'] = check_task_get_extrn_bcs(test_cfg,machine_defaults,config_defaults,"ics") logging.debug(test_cfg['task_get_extrn_ics']) if 'task_get_extrn_lbcs' in test_cfg: logging.debug(test_cfg['task_get_extrn_lbcs']) - test_cfg['task_get_extrn_lbcs'] = check_task_get_extrn_lbcs(test_cfg,machine_defaults,config_defaults) + test_cfg['task_get_extrn_lbcs'] = check_task_get_extrn_bcs(test_cfg,machine_defaults,config_defaults,"lbcs") logging.debug(test_cfg['task_get_extrn_lbcs']) if 'verification' in test_cfg: @@ -218,6 +218,9 @@ def run_we2e_tests(homedir, args) -> None: else: logging.info("All experiments are complete") logging.info(f"Summary of results available in {monitor_file}") + else: + logging.info("All experiments have been generated; using cron to submit workflows") + logging.info("To view running experiments in cron try `crontab -l`") @@ -290,99 +293,47 @@ def check_test(test: str) -> str: return config -def check_task_get_extrn_ics(cfg: dict, mach: dict, dflt: dict) -> dict: +def check_task_get_extrn_bcs(cfg: dict, mach: dict, dflt: dict, ics_or_lbcs: str = "") -> dict: """ - Function for checking and updating various settings in task_get_extrn_ics section of test config yaml + Function for checking and updating various settings in task_get_extrn_ics or + 
task_get_extrn_lbcs section of test config yaml Args: cfg : Dictionary loaded from test config file mach : Dictionary loaded from machine settings file dflt : Dictionary loaded from default config file + ics_or_lbcs: Perform checks for ICs task or LBCs task + Returns: - cfg_ics : Updated dictionary for task_get_extrn_ics section of test config + cfg_bcs : Updated dictionary for task_get_extrn_[ics|lbcs] section of test config """ - #Make our lives easier by shortening some dictionary calls - cfg_ics = cfg['task_get_extrn_ics'] - - # If RUN_TASK_GET_EXTRN_ICS is explicitly set to false, do nothing and return - if 'workflow_switches' in cfg: - if 'RUN_TASK_GET_EXTRN_ICS' in cfg['workflow_switches']: - if cfg['workflow_switches']['RUN_TASK_GET_EXTRN_ICS'] is False: - return cfg_ics + if ics_or_lbcs not in ["lbcs", "ics"]: + raise ValueError(f"ics_or_lbcs must be set to 'lbcs' or 'ics'") - # If USE_USER_STAGED_EXTRN_FILES not specified or false, do nothing and return - if not cfg_ics.get('USE_USER_STAGED_EXTRN_FILES'): - logging.debug(f'USE_USER_STAGED_EXTRN_FILES not specified or False in task_get_extrn_ics section of config') - return cfg_ics - - # If EXTRN_MDL_SYSBASEDIR_ICS is "set_to_non_default_location_in_testing_script", replace with test value from machine file - if cfg_ics.get('EXTRN_MDL_SYSBASEDIR_ICS') == "set_to_non_default_location_in_testing_script": - if 'TEST_ALT_EXTRN_MDL_SYSBASEDIR_ICS' in mach['platform']: - if os.path.isdir(mach['platform']['TEST_ALT_EXTRN_MDL_SYSBASEDIR_ICS']): - raise FileNotFoundError(f"Non-default input file location TEST_ALT_EXTRN_MDL_SYSBASEDIR_ICS from machine file does not exist or is not a directory") - cfg_ics['EXTRN_MDL_SYSBASEDIR_ICS'] = mach['platform']['TEST_ALT_EXTRN_MDL_SYSBASEDIR_ICS'] - else: - raise KeyError(f"Non-default input file location TEST_ALT_EXTRN_MDL_SYSBASEDIR_ICS not set in machine file") - return cfg_ics - - # Because USE_USER_STAGED_EXTRN_FILES is true, only look on disk, and ensure the staged data directory exists - cfg['platform']['EXTRN_MDL_DATA_STORES'] = "disk" - if 'TEST_EXTRN_MDL_SOURCE_BASEDIR' not in mach['platform']: - raise KeyError("TEST_EXTRN_MDL_SOURCE_BASEDIR, the directory for staged test data,"\ - "has not been specified in the machine file for this platform") - if not os.path.isdir(mach['platform']['TEST_EXTRN_MDL_SOURCE_BASEDIR']): - raise FileNotFoundError(dedent(f"""The directory for staged test data specified in this platform's machine file - TEST_EXTRN_MDL_SOURCE_BASEDIR = {mach['platform']['TEST_EXTRN_MDL_SOURCE_BASEDIR']} - does not exist.""")) - - # Different input data types have different directory structures, so set the data directory accordingly - if cfg_ics['EXTRN_MDL_NAME_ICS'] == 'FV3GFS': - if 'FV3GFS_FILE_FMT_ICS' not in cfg_ics: - cfg_ics['FV3GFS_FILE_FMT_ICS'] = dflt['task_get_extrn_ics']['FV3GFS_FILE_FMT_ICS'] - cfg_ics['EXTRN_MDL_SOURCE_BASEDIR_ICS'] = f"{mach['platform']['TEST_EXTRN_MDL_SOURCE_BASEDIR']}/"\ - f"{cfg_ics['EXTRN_MDL_NAME_ICS']}/{cfg_ics['FV3GFS_FILE_FMT_ICS']}/${{yyyymmddhh}}" - else: - cfg_ics['EXTRN_MDL_SOURCE_BASEDIR_ICS'] = f"{mach['platform']['TEST_EXTRN_MDL_SOURCE_BASEDIR']}/"\ - f"{cfg_ics['EXTRN_MDL_NAME_ICS']}/${{yyyymmddhh}}" - - return cfg_ics - -def check_task_get_extrn_lbcs(cfg: dict, mach: dict, dflt: dict) -> dict: - """ - Function for checking and updating various settings in task_get_extrn_lbcs section of test config yaml - - Args: - cfg : Dictionary loaded from test config file - mach : Dictionary loaded from machine settings file - dflt : Dictionary 
loaded from default config file - Returns: - cfg_lbcs : Updated dictionary for task_get_extrn_lbcs section of test config - """ + I_OR_L = ics_or_lbcs.upper() #Make our lives easier by shortening some dictionary calls - cfg_lbcs = cfg['task_get_extrn_lbcs'] + cfg_bcs = cfg[f'task_get_extrn_{ics_or_lbcs}'] - # If RUN_TASK_GET_EXTRN_LBCS is explicitly set to false, do nothing and return - if 'workflow_switches' in cfg: - if 'RUN_TASK_GET_EXTRN_LBCS' in cfg['workflow_switches']: - if cfg['workflow_switches']['RUN_TASK_GET_EXTRN_LBCS'] is False: - return cfg_lbcs + # If RUN_TASK_GET_EXTRN_* is explicitly set to false, do nothing and return + if cfg.get('workflow_switches', {}).get(f'RUN_TASK_GET_EXTRN_{I_OR_L}', True) is False: + return cfg_bcs # If USE_USER_STAGED_EXTRN_FILES not specified or false, do nothing and return - if not cfg_lbcs.get('USE_USER_STAGED_EXTRN_FILES'): - logging.debug(f'USE_USER_STAGED_EXTRN_FILES not specified or False in task_get_extrn_lbcs section of config') - return cfg_lbcs - - # If EXTRN_MDL_SYSBASEDIR_LBCS is "set_to_non_default_location_in_testing_script", replace with test value from machine file - if cfg_lbcs.get('EXTRN_MDL_SYSBASEDIR_LBCS') == "set_to_non_default_location_in_testing_script": - if 'TEST_ALT_EXTRN_MDL_SYSBASEDIR_LBCS' in mach['platform']: - if os.path.isdir(mach['platform']['TEST_ALT_EXTRN_MDL_SYSBASEDIR_LBCS']): - raise FileNotFoundError(f"Non-default input file location TEST_ALT_EXTRN_MDL_SYSBASEDIR_LBCS from machine file does not exist or is not a directory") - cfg_lbcs['EXTRN_MDL_SYSBASEDIR_LBCS'] = mach['platform']['TEST_ALT_EXTRN_MDL_SYSBASEDIR_LBCS'] + if not cfg_bcs.get('USE_USER_STAGED_EXTRN_FILES'): + logging.debug(f'USE_USER_STAGED_EXTRN_FILES not specified or False in task_get_extrn_{ics_or_lbcs} section of config') + return cfg_bcs + + # If EXTRN_MDL_SYSBASEDIR_* is "set_to_non_default_location_in_testing_script", replace with test value from machine file + if cfg_bcs.get(f'EXTRN_MDL_SYSBASEDIR_{I_OR_L}') == "set_to_non_default_location_in_testing_script": + if f'TEST_ALT_EXTRN_MDL_SYSBASEDIR_{I_OR_L}' in mach['platform']: + if os.path.isdir(mach['platform'][f'TEST_ALT_EXTRN_MDL_SYSBASEDIR_{I_OR_L}']): + raise FileNotFoundError(f"Non-default input file location TEST_ALT_EXTRN_MDL_SYSBASEDIR_{I_OR_L} from machine file does not exist or is not a directory") + cfg_bcs[f'EXTRN_MDL_SYSBASEDIR_{I_OR_L}'] = mach['platform'][f'TEST_ALT_EXTRN_MDL_SYSBASEDIR_{I_OR_L}'] else: - raise KeyError(f"Non-default input file location TEST_ALT_EXTRN_MDL_SYSBASEDIR_LBCS not set in machine file") - return cfg_lbcs + raise KeyError(f"Non-default input file location TEST_ALT_EXTRN_MDL_SYSBASEDIR_{I_OR_L} not set in machine file") + return cfg_bcs # Because USE_USER_STAGED_EXTRN_FILES is true, only look on disk, and ensure the staged data directory exists cfg['platform']['EXTRN_MDL_DATA_STORES'] = "disk" @@ -395,16 +346,16 @@ def check_task_get_extrn_lbcs(cfg: dict, mach: dict, dflt: dict) -> dict: does not exist.""")) # Different input data types have different directory structures, so set the data directory accordingly - if cfg_lbcs['EXTRN_MDL_NAME_LBCS'] == 'FV3GFS': - if 'FV3GFS_FILE_FMT_LBCS' not in cfg_lbcs: - cfg_lbcs['FV3GFS_FILE_FMT_LBCS'] = dflt['task_get_extrn_lbcs']['FV3GFS_FILE_FMT_LBCS'] - cfg_lbcs['EXTRN_MDL_SOURCE_BASEDIR_LBCS'] = f"{mach['platform']['TEST_EXTRN_MDL_SOURCE_BASEDIR']}/"\ - f"{cfg_lbcs['EXTRN_MDL_NAME_LBCS']}/{cfg_lbcs['FV3GFS_FILE_FMT_LBCS']}/${{yyyymmddhh}}" + if cfg_bcs[f'EXTRN_MDL_NAME_{I_OR_L}'] == 'FV3GFS': + if 
f'FV3GFS_FILE_FMT_{I_OR_L}' not in cfg_bcs: + cfg_bcs[f'FV3GFS_FILE_FMT_{I_OR_L}'] = dflt[f'task_get_extrn_{ics_or_lbcs}'][f'FV3GFS_FILE_FMT_{I_OR_L}'] + cfg_bcs[f'EXTRN_MDL_SOURCE_BASEDIR_{I_OR_L}'] = f"{mach['platform']['TEST_EXTRN_MDL_SOURCE_BASEDIR']}/"\ + f"{cfg_bcs[f'EXTRN_MDL_NAME_{I_OR_L}']}/{cfg_bcs[f'FV3GFS_FILE_FMT_{I_OR_L}']}/${{yyyymmddhh}}" else: - cfg_lbcs['EXTRN_MDL_SOURCE_BASEDIR_LBCS'] = f"{mach['platform']['TEST_EXTRN_MDL_SOURCE_BASEDIR']}/"\ - f"{cfg_lbcs['EXTRN_MDL_NAME_LBCS']}/${{yyyymmddhh}}" + cfg_bcs[f'EXTRN_MDL_SOURCE_BASEDIR_{I_OR_L}'] = f"{mach['platform']['TEST_EXTRN_MDL_SOURCE_BASEDIR']}/"\ + f"{cfg_bcs[f'EXTRN_MDL_NAME_{I_OR_L}']}/${{yyyymmddhh}}" - return cfg_lbcs + return cfg_bcs def check_task_verification(cfg: dict, mach: dict, dflt: dict) -> dict: """ From e27dacca293977c7d3c2bf9a767eb12b0faa70a5 Mon Sep 17 00:00:00 2001 From: "Michael Kavulich, Jr" Date: Thu, 23 Feb 2023 06:21:50 +0000 Subject: [PATCH 05/52] Remove some unnecessary debug prints --- tests/WE2E/run_WE2E_tests.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/tests/WE2E/run_WE2E_tests.py b/tests/WE2E/run_WE2E_tests.py index 285e81b615..0916788831 100755 --- a/tests/WE2E/run_WE2E_tests.py +++ b/tests/WE2E/run_WE2E_tests.py @@ -166,18 +166,12 @@ def run_we2e_tests(homedir, args) -> None: logging.debug(f"Overwriting WE2E-test-specific settings for test \n{test_name}\n") if 'task_get_extrn_ics' in test_cfg: - logging.debug(test_cfg['task_get_extrn_ics']) test_cfg['task_get_extrn_ics'] = check_task_get_extrn_bcs(test_cfg,machine_defaults,config_defaults,"ics") - logging.debug(test_cfg['task_get_extrn_ics']) if 'task_get_extrn_lbcs' in test_cfg: - logging.debug(test_cfg['task_get_extrn_lbcs']) test_cfg['task_get_extrn_lbcs'] = check_task_get_extrn_bcs(test_cfg,machine_defaults,config_defaults,"lbcs") - logging.debug(test_cfg['task_get_extrn_lbcs']) if 'verification' in test_cfg: - logging.debug(test_cfg['verification']) test_cfg['verification'] = check_task_verification(test_cfg,machine_defaults,config_defaults) - logging.debug(test_cfg['verification']) logging.debug(f"Writing updated config.yaml for test {test_name}\nbased on specified command-line arguments:\n") logging.debug(cfg_to_yaml_str(test_cfg)) From 6dcc7c25f17b35ca380bd2128f9987474df96e52 Mon Sep 17 00:00:00 2001 From: "Michael Kavulich, Jr" Date: Thu, 23 Feb 2023 06:22:02 +0000 Subject: [PATCH 06/52] Initial version of job summary script --- tests/WE2E/job_summary.py | 109 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 109 insertions(+) create mode 100755 tests/WE2E/job_summary.py diff --git a/tests/WE2E/job_summary.py b/tests/WE2E/job_summary.py new file mode 100755 index 0000000000..32f3c8c9a3 --- /dev/null +++ b/tests/WE2E/job_summary.py @@ -0,0 +1,109 @@ +#!/usr/bin/env python3 + +import os +import sys +import argparse +import logging +import subprocess +import sqlite3 +import time +from textwrap import dedent +from datetime import datetime +from contextlib import closing + +sys.path.append("../../ush") + +from python_utils import ( + load_config_file, + cfg_to_yaml_str +) + +from check_python_version import check_python_version + +from monitor_jobs import update_expt_status + +def print_job_summary(expt_dict: dict, debug: bool = False): + """Function that creates a summary for the specified experiment + + Args: + expt_dict (dict): A dictionary containing the information needed to run + one or more experiments. 
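# The consolidation in patch 04 above replaces two near-identical functions with a single
# one parameterized by "ics"/"lbcs". Below is a minimal, self-contained sketch of that
# pattern, using hypothetical stand-in config dictionaries rather than the real SRW App
# defaults; the chained dict.get() calls treat a missing workflow switch as True, so only
# an explicit False skips the check.
def check_extrn_task(cfg: dict, ics_or_lbcs: str) -> dict:
    if ics_or_lbcs not in ("ics", "lbcs"):
        raise ValueError("ics_or_lbcs must be 'ics' or 'lbcs'")
    i_or_l = ics_or_lbcs.upper()
    task_cfg = cfg.get(f"task_get_extrn_{ics_or_lbcs}", {})
    # A missing switch defaults to True, so only an explicit False short-circuits
    if cfg.get("workflow_switches", {}).get(f"RUN_TASK_GET_EXTRN_{i_or_l}", True) is False:
        return task_cfg
    # Hypothetical default fill-in, loosely analogous to the dflt fallback in the real code
    task_cfg.setdefault(f"EXTRN_MDL_NAME_{i_or_l}", "FV3GFS")
    return task_cfg

if __name__ == "__main__":
    demo_cfg = {"workflow_switches": {"RUN_TASK_GET_EXTRN_LBCS": False},
                "task_get_extrn_ics": {}, "task_get_extrn_lbcs": {}}
    print(check_extrn_task(demo_cfg, "ics"))   # switch absent -> checks run, default filled
    print(check_extrn_task(demo_cfg, "lbcs"))  # explicit False -> returned untouched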
See example file monitor_jobs.yaml + debug (bool): [optional] Enable extra output for debugging + Returns: + None + """ + + # Perform initial setup for each experiment + for expt in expt_dict: + print("\n======================================") + print(f'Checking workflow status of experiment "{expt}" ...') + update_expt_status(expt_dict[expt],expt) + print(f"Workflow status: {expt_dict[expt]['status']}") + print("======================================") + +def create_expt_dict(expt_dir: str) -> dict: + """ + Function takes in a directory, searches that directory for subdirectories containing + experiments, and creates a skeleton dictionary that can be filled out by update_expt_status() + + Args: + expt_dir (str) : Experiment directory + Returns: + dict : Experiment dictionary + """ + contents = os.listdir(expt_dir) + + expt_dict=dict() + for item in contents: + # Look for FV3LAM_wflow.xml to indicate directories with experiments in them + if os.path.isfile(os.path.join(expt_dir, item, 'FV3LAM_wflow.xml')): + expt_dict[item] = dict() + expt_dict[item].update({"expt_dir": os.path.join(expt_dir,item)}) + expt_dict[item].update({"status": "CREATED"}) + + return expt_dict + + +def setup_logging(debug: bool = False) -> None: + """ + Sets up logging, printing high-priority (INFO and higher) messages to screen, and printing all + messages with detailed timing and routine info in the specified text file. + """ + logging.getLogger().setLevel(logging.DEBUG) + + console = logging.StreamHandler() + if debug: + console.setLevel(logging.DEBUG) + else: + console.setLevel(logging.INFO) + logging.getLogger().addHandler(console) + logging.debug("Logging set up successfully") + + +if __name__ == "__main__": + + check_python_version() + + #Parse arguments + parser = argparse.ArgumentParser(description="Script for creating a job summary printed to screen and a file, either from a yaml experiment file created by monitor_jobs() or from a provided directory of experiments\n") + + req = parser.add_mutually_exclusive_group(required=True) + req.add_argument('-y', '--yaml_file', type=str, help='YAML-format file specifying the information of jobs to be summarized; for an example file, see monitor_jobs.yaml') + req.add_argument('-e', '--expt_dir', type=str, help='The full path of an experiment directory, containing one or more subdirectories with UFS SRW App experiments in them') + parser.add_argument('-d', '--debug', action='store_true', help='Script will be run in debug mode with more verbose output') + + args = parser.parse_args() + + setup_logging(args.debug) + + # Set up dictionary of experiments + if args.expt_dir: + expt_dict = create_expt_dict(args.expt_dir) + elif args.yaml_file: + expt_dict = load_config_file(args.yaml_file) + else: + raise ValueError(f'Bad arguments; run {__file__} -h for more information') + + #Call main function + print_job_summary(expt_dict, args.debug) + From b5fb16cc5e2b3aefb804be377077497045dd3426 Mon Sep 17 00:00:00 2001 From: "Michael Kavulich, Jr" Date: Thu, 23 Feb 2023 21:52:53 +0000 Subject: [PATCH 07/52] Update experiment yamls to track task cores and walltime, update job_monitor to report core hours --- tests/WE2E/job_summary.py | 89 +++++++++++++++++++++++++++++++++----- tests/WE2E/monitor_jobs.py | 14 +++--- 2 files changed, 86 insertions(+), 17 deletions(-) diff --git a/tests/WE2E/job_summary.py b/tests/WE2E/job_summary.py index 32f3c8c9a3..a93107fa7c 100755 --- a/tests/WE2E/job_summary.py +++ b/tests/WE2E/job_summary.py @@ -4,6 +4,7 @@ import sys import argparse import 
logging +import re import subprocess import sqlite3 import time @@ -14,13 +15,17 @@ sys.path.append("../../ush") from python_utils import ( + cfg_to_yaml_str, + flatten_dict, load_config_file, - cfg_to_yaml_str + load_shell_config ) from check_python_version import check_python_version -from monitor_jobs import update_expt_status +from monitor_jobs import update_expt_status, write_monitor_file + +REPORT_WIDTH = 110 def print_job_summary(expt_dict: dict, debug: bool = False): """Function that creates a summary for the specified experiment @@ -33,13 +38,24 @@ def print_job_summary(expt_dict: dict, debug: bool = False): None """ - # Perform initial setup for each experiment + # Create summary table as list of strings + summary = [] + summary.append('-'*REPORT_WIDTH) + summary.append(f'Experiment name {" "*44} | Status | Core hours used ') + # Flag for tracking if "cores per node" is in dictionary + summary.append('-'*REPORT_WIDTH) for expt in expt_dict: - print("\n======================================") - print(f'Checking workflow status of experiment "{expt}" ...') - update_expt_status(expt_dict[expt],expt) - print(f"Workflow status: {expt_dict[expt]['status']}") - print("======================================") + status = expt_dict[expt]["status"] + ch = 0 + for task in expt_dict[expt]: + if "core_hours" in expt_dict[expt][task]: + ch += expt_dict[expt][task]["core_hours"] + summary.append(f'{expt[:60]:<60s} {status:^12s} {ch:^12.2f}') + + # Print summary to screen + for line in summary: + print(line) + def create_expt_dict(expt_dir: str) -> dict: """ @@ -56,13 +72,55 @@ def create_expt_dict(expt_dir: str) -> dict: expt_dict=dict() for item in contents: # Look for FV3LAM_wflow.xml to indicate directories with experiments in them - if os.path.isfile(os.path.join(expt_dir, item, 'FV3LAM_wflow.xml')): + fullpath = os.path.join(expt_dir, item) + if not os.path.isdir(fullpath): + continue + xmlfile = os.path.join(expt_dir, item, 'FV3LAM_wflow.xml') + if os.path.isfile(xmlfile): expt_dict[item] = dict() expt_dict[item].update({"expt_dir": os.path.join(expt_dir,item)}) expt_dict[item].update({"status": "CREATED"}) + else: + logging.debug(f'Skipping directory {item}, experiment XML file not found') + #Update the experiment dictionary + logging.info(f"Reading status of experiment {item}") + update_expt_status(expt_dict[item],item,True) + summary_file = f'job_summary_{datetime.now().strftime("%Y%m%d%H%M%S")}.yaml' - return expt_dict + return summary_file, expt_dict +def calculate_core_hours(expt_dict: dict) -> dict: + """ + Function takes in an experiment dictionary, reads the var_defns file for necessary information, + and calculates the core hours used by each task, updating expt_dict with this info + + Args: + expt_dict (dict) : Experiment dictionary + Returns: + dict : Experiment dictionary updated with core hours + """ + + for expt in expt_dict: + # Read variable definitions file + vardefs = load_shell_config(os.path.join(expt_dict[expt]["expt_dir"],"var_defns.sh")) + vdf = flatten_dict(vardefs) + cores_per_node = vdf["NCORES_PER_NODE"] + for task in expt_dict[expt]: + # Skip non-task entries + if task in ["expt_dir","status"]: + continue + # Cycle is last 12 characters, task name is rest (minus separating underscore) + taskname = task[:-13] + # Handle task names that have ensemble and/or fhr info appended with regex + print(taskname) + taskname = re.sub('_mem\d{3}', '', taskname) + taskname = re.sub('_f\d{3}', '', taskname) + print(taskname) + nnodes = vdf[f'NNODES_{taskname.upper()}'] + # 
Users are charged for full use of nodes, so core hours are CPN * nodes * time in hrs + core_hours = cores_per_node * nnodes * expt_dict[expt][task]['walltime'] / 3600 + expt_dict[expt][task]['core_hours'] = round(core_hours,2) + return expt_dict def setup_logging(debug: bool = False) -> None: """ @@ -80,6 +138,7 @@ def setup_logging(debug: bool = False) -> None: logging.debug("Logging set up successfully") + if __name__ == "__main__": check_python_version() @@ -96,14 +155,20 @@ def setup_logging(debug: bool = False) -> None: setup_logging(args.debug) + yaml_file = args.yaml_file + # Set up dictionary of experiments if args.expt_dir: - expt_dict = create_expt_dict(args.expt_dir) + yaml_file, expt_dict = create_expt_dict(args.expt_dir) elif args.yaml_file: expt_dict = load_config_file(args.yaml_file) else: raise ValueError(f'Bad arguments; run {__file__} -h for more information') - #Call main function + # Calculate core hours and update yaml + expt_dict = calculate_core_hours(expt_dict) + write_monitor_file(yaml_file,expt_dict) + + #Call function to print summary print_job_summary(expt_dict, args.debug) diff --git a/tests/WE2E/monitor_jobs.py b/tests/WE2E/monitor_jobs.py index 3ef9ddd0c5..4459e7d249 100755 --- a/tests/WE2E/monitor_jobs.py +++ b/tests/WE2E/monitor_jobs.py @@ -38,7 +38,7 @@ def monitor_jobs(expt_dict: dict, monitor_file: str = '', debug: bool = False) - # Write monitor_file, which will contain information on each monitored experiment if not monitor_file: monitor_file = f'monitor_jobs_{starttime.strftime("%Y%m%d%H%M%S")}.yaml' - logging.info(f"Writing information for all experiments to {monitor_file}") + logging.info(f"Writing information for all experiments to {monitor_file}") write_monitor_file(monitor_file,expt_dict) @@ -147,17 +147,21 @@ def update_expt_status(expt: dict, name: str, refresh: bool = False) -> dict: # of tuples containing the taskname, cycle, and state of each job respectively with closing(sqlite3.connect(rocoto_db)) as connection: with closing(connection.cursor()) as cur: - db = cur.execute('SELECT taskname,cycle,state from jobs').fetchall() + db = cur.execute('SELECT taskname,cycle,state,cores,duration from jobs').fetchall() except: logging.warning(f"Unable to read database {rocoto_db}\nCan not track experiment {name}") expt["status"] = "ERROR" return expt for task in db: - # For each entry from rocoto database, store that under a dictionary key named TASKNAME_CYCLE + # For each entry from rocoto database, store that task's info under a dictionary key named TASKNAME_CYCLE # Cycle comes from the database in Unix Time (seconds), so convert to human-readable cycle = datetime.utcfromtimestamp(task[1]).strftime('%Y%m%d%H%M') - expt[f"{task[0]}_{cycle}"] = task[2] + if f"{task[0]}_{cycle}" not in expt: + expt[f"{task[0]}_{cycle}"] = dict() + expt[f"{task[0]}_{cycle}"]["status"] = task[2] + expt[f"{task[0]}_{cycle}"]["cores"] = task[3] + expt[f"{task[0]}_{cycle}"]["walltime"] = task[4] #Run rocotorun again to get around rocotobqserver proliferation issue subprocess.run(rocotorun_cmd) @@ -167,7 +171,7 @@ def update_expt_status(expt: dict, name: str, refresh: bool = False) -> dict: # Skip non-task entries if task in ["expt_dir","status"]: continue - statuses.append(expt[task]) + statuses.append(expt[task]["status"]) if "DEAD" in statuses: still_live = ["RUNNING", "SUBMITTING", "QUEUED"] From e653076f95cff181d21a12dc97e172e618287bab Mon Sep 17 00:00:00 2001 From: "Michael Kavulich, Jr" Date: Thu, 23 Feb 2023 23:30:20 +0000 Subject: [PATCH 08/52] Print summary once 
all jobs are finished monitoring, add failsafe core hour calculation for missing NNODES variables, clean up formatting --- tests/WE2E/job_summary.py | 21 +++++++++++++-------- tests/WE2E/monitor_jobs.py | 8 ++++++++ 2 files changed, 21 insertions(+), 8 deletions(-) diff --git a/tests/WE2E/job_summary.py b/tests/WE2E/job_summary.py index a93107fa7c..7710ed00d6 100755 --- a/tests/WE2E/job_summary.py +++ b/tests/WE2E/job_summary.py @@ -25,7 +25,7 @@ from monitor_jobs import update_expt_status, write_monitor_file -REPORT_WIDTH = 110 +REPORT_WIDTH = 100 def print_job_summary(expt_dict: dict, debug: bool = False): """Function that creates a summary for the specified experiment @@ -41,7 +41,7 @@ def print_job_summary(expt_dict: dict, debug: bool = False): # Create summary table as list of strings summary = [] summary.append('-'*REPORT_WIDTH) - summary.append(f'Experiment name {" "*44} | Status | Core hours used ') + summary.append(f'Experiment name {" "*43} | Status | Core hours used ') # Flag for tracking if "cores per node" is in dictionary summary.append('-'*REPORT_WIDTH) for expt in expt_dict: @@ -50,7 +50,7 @@ def print_job_summary(expt_dict: dict, debug: bool = False): for task in expt_dict[expt]: if "core_hours" in expt_dict[expt][task]: ch += expt_dict[expt][task]["core_hours"] - summary.append(f'{expt[:60]:<60s} {status:^12s} {ch:^12.2f}') + summary.append(f'{expt[:60]:<60s} {status:<12s} {ch:>13.2f}') # Print summary to screen for line in summary: @@ -112,13 +112,18 @@ def calculate_core_hours(expt_dict: dict) -> dict: # Cycle is last 12 characters, task name is rest (minus separating underscore) taskname = task[:-13] # Handle task names that have ensemble and/or fhr info appended with regex - print(taskname) taskname = re.sub('_mem\d{3}', '', taskname) taskname = re.sub('_f\d{3}', '', taskname) - print(taskname) - nnodes = vdf[f'NNODES_{taskname.upper()}'] - # Users are charged for full use of nodes, so core hours are CPN * nodes * time in hrs - core_hours = cores_per_node * nnodes * expt_dict[expt][task]['walltime'] / 3600 + nnodes_var = f'NNODES_{taskname.upper()}' + if nnodes_var in vdf: + nnodes = vdf[nnodes_var] + # Users are charged for full use of nodes, so core hours are CPN * nodes * time in hrs + core_hours = cores_per_node * nnodes * expt_dict[expt][task]['walltime'] / 3600 + expt_dict[expt][task]['exact_count'] = True + else: + # If we can't find the number of nodes, assume full usage (may undercount) + core_hours = expt_dict[expt][task]['cores'] * expt_dict[expt][task]['walltime'] / 3600 + expt_dict[expt][task]['exact_count'] = False expt_dict[expt][task]['core_hours'] = round(core_hours,2) return expt_dict diff --git a/tests/WE2E/monitor_jobs.py b/tests/WE2E/monitor_jobs.py index 4459e7d249..4d5b8d647b 100755 --- a/tests/WE2E/monitor_jobs.py +++ b/tests/WE2E/monitor_jobs.py @@ -83,6 +83,14 @@ def monitor_jobs(expt_dict: dict, monitor_file: str = '', debug: bool = False) - total_walltime = endtime - starttime logging.info(f'All {len(expt_dict)} experiments finished in {str(total_walltime)}') + logging.info(f'Calculating core-hour usage and printing final summary') + + # Calculate core hours and update yaml + expt_dict = calculate_core_hours(expt_dict) + write_monitor_file(monitor_file,expt_dict) + + #Call function to print summary + print_job_summary(expt_dict, debug) return monitor_file From da15a9122ab589fa9a623e903dc61d23280dcc27 Mon Sep 17 00:00:00 2001 From: "Michael Kavulich, Jr" Date: Thu, 23 Feb 2023 23:44:23 +0000 Subject: [PATCH 09/52] Move some functions into a 
new utils.py file to avoid circular dependencies --- tests/WE2E/job_summary.py | 44 +----- tests/WE2E/monitor_jobs.py | 142 +---------------- tests/WE2E/utils.py | 303 +++++++++++++++++++++++++++++++++++++ 3 files changed, 312 insertions(+), 177 deletions(-) create mode 100755 tests/WE2E/utils.py diff --git a/tests/WE2E/job_summary.py b/tests/WE2E/job_summary.py index 7710ed00d6..e29e79f734 100755 --- a/tests/WE2E/job_summary.py +++ b/tests/WE2E/job_summary.py @@ -4,7 +4,6 @@ import sys import argparse import logging -import re import subprocess import sqlite3 import time @@ -16,14 +15,12 @@ from python_utils import ( cfg_to_yaml_str, - flatten_dict, - load_config_file, - load_shell_config + load_config_file ) from check_python_version import check_python_version -from monitor_jobs import update_expt_status, write_monitor_file +from utils import calculate_core_hours, create_expt_dict, update_expt_status, write_monitor_file REPORT_WIDTH = 100 @@ -89,43 +86,6 @@ def create_expt_dict(expt_dir: str) -> dict: return summary_file, expt_dict -def calculate_core_hours(expt_dict: dict) -> dict: - """ - Function takes in an experiment dictionary, reads the var_defns file for necessary information, - and calculates the core hours used by each task, updating expt_dict with this info - - Args: - expt_dict (dict) : Experiment dictionary - Returns: - dict : Experiment dictionary updated with core hours - """ - - for expt in expt_dict: - # Read variable definitions file - vardefs = load_shell_config(os.path.join(expt_dict[expt]["expt_dir"],"var_defns.sh")) - vdf = flatten_dict(vardefs) - cores_per_node = vdf["NCORES_PER_NODE"] - for task in expt_dict[expt]: - # Skip non-task entries - if task in ["expt_dir","status"]: - continue - # Cycle is last 12 characters, task name is rest (minus separating underscore) - taskname = task[:-13] - # Handle task names that have ensemble and/or fhr info appended with regex - taskname = re.sub('_mem\d{3}', '', taskname) - taskname = re.sub('_f\d{3}', '', taskname) - nnodes_var = f'NNODES_{taskname.upper()}' - if nnodes_var in vdf: - nnodes = vdf[nnodes_var] - # Users are charged for full use of nodes, so core hours are CPN * nodes * time in hrs - core_hours = cores_per_node * nnodes * expt_dict[expt][task]['walltime'] / 3600 - expt_dict[expt][task]['exact_count'] = True - else: - # If we can't find the number of nodes, assume full usage (may undercount) - core_hours = expt_dict[expt][task]['cores'] * expt_dict[expt][task]['walltime'] / 3600 - expt_dict[expt][task]['exact_count'] = False - expt_dict[expt][task]['core_hours'] = round(core_hours,2) - return expt_dict def setup_logging(debug: bool = False) -> None: """ diff --git a/tests/WE2E/monitor_jobs.py b/tests/WE2E/monitor_jobs.py index 4d5b8d647b..e383358748 100755 --- a/tests/WE2E/monitor_jobs.py +++ b/tests/WE2E/monitor_jobs.py @@ -1,5 +1,7 @@ #!/usr/bin/env python3 +import os +import re import sys import argparse import logging @@ -13,12 +15,16 @@ sys.path.append("../../ush") from python_utils import ( + cfg_to_yaml_str, + flatten_dict, load_config_file, - cfg_to_yaml_str + load_shell_config ) from check_python_version import check_python_version +from job_summary import print_job_summary +from utils import calculate_core_hours, write_monitor_file, update_expt_status def monitor_jobs(expt_dict: dict, monitor_file: str = '', debug: bool = False) -> str: """Function to monitor and run jobs for the specified experiment using Rocoto @@ -94,140 +100,6 @@ def monitor_jobs(expt_dict: dict, monitor_file: str = '', debug: 
bool = False) - return monitor_file -def update_expt_status(expt: dict, name: str, refresh: bool = False) -> dict: - """ - This function reads the dictionary showing the location of a given experiment, runs a - `rocotorun` command to update the experiment (running new jobs and updating the status of - previously submitted ones), and reads the rocoto database file to update the status of - each job for that experiment in the experiment dictionary. - - The function then and uses a simple set of rules to combine the statuses of every task - into a useful "status" for the whole experiment, and returns the updated experiment dictionary. - - Experiment "status" levels explained: - CREATED: The experiments have been created, but the monitor script has not yet processed them. - This is immediately overwritten at the beginning of the "monitor_jobs" function, so we - should never see this status in this function. Including just for completeness sake. - SUBMITTING: All jobs are in status SUBMITTING or SUCCEEDED. This is a normal state; we will - continue to monitor this experiment. - DYING: One or more tasks have died (status "DEAD"), so this experiment has had an error. - We will continue to monitor this experiment until all tasks are either status DEAD or - status SUCCEEDED (see next entry). - DEAD: One or more tasks are at status DEAD, and the rest are either DEAD or SUCCEEDED. We - will no longer monitor this experiment. - ERROR: One or more tasks are at status UNKNOWN, meaning that rocoto has failed to track the - job associated with that task. This will require manual intervention to solve, so we - will no longer monitor this experiment. - This status may also appear if we fail to read the rocoto database file. - RUNNING: One or more jobs are at status RUNNING, and the rest are either status QUEUED, SUBMITTED, - or SUCCEEDED. This is a normal state; we will continue to monitor this experiment. - QUEUED: One or more jobs are at status QUEUED, and some others may be at status SUBMITTED or - SUCCEEDED. - This is a normal state; we will continue to monitor this experiment. - SUCCEEDED: All jobs are status SUCCEEDED; we will monitor for one more cycle in case there are - unsubmitted jobs remaining. - COMPLETE:All jobs are status SUCCEEDED, and we have monitored this job for an additional cycle - to ensure there are no un-submitted jobs. We will no longer monitor this experiment. - - Args: - expt (dict): A dictionary containing the information for an individual experiment, as - described in the main monitor_jobs() function. - name (str): Name of the experiment; used for logging only - refresh (bool): If true, this flag will check an experiment status even if it is listed - as DEAD, ERROR, or COMPLETE. Used for initial checks for experiments - that may have been restarted. - Returns: - dict: The updated experiment dictionary. 
- """ - - #If we are no longer tracking this experiment, return unchanged - if (expt["status"] in ['DEAD','ERROR','COMPLETE']) and not refresh: - return expt - - # Update experiment, read rocoto database - rocoto_db = f"{expt['expt_dir']}/FV3LAM_wflow.db" - rocotorun_cmd = ["rocotorun", f"-w {expt['expt_dir']}/FV3LAM_wflow.xml", f"-d {rocoto_db}"] - subprocess.run(rocotorun_cmd) - - logging.debug(f"Reading database for experiment {name}, updating experiment dictionary") - try: - # This section of code queries the "job" table of the rocoto database, returning a list - # of tuples containing the taskname, cycle, and state of each job respectively - with closing(sqlite3.connect(rocoto_db)) as connection: - with closing(connection.cursor()) as cur: - db = cur.execute('SELECT taskname,cycle,state,cores,duration from jobs').fetchall() - except: - logging.warning(f"Unable to read database {rocoto_db}\nCan not track experiment {name}") - expt["status"] = "ERROR" - return expt - - for task in db: - # For each entry from rocoto database, store that task's info under a dictionary key named TASKNAME_CYCLE - # Cycle comes from the database in Unix Time (seconds), so convert to human-readable - cycle = datetime.utcfromtimestamp(task[1]).strftime('%Y%m%d%H%M') - if f"{task[0]}_{cycle}" not in expt: - expt[f"{task[0]}_{cycle}"] = dict() - expt[f"{task[0]}_{cycle}"]["status"] = task[2] - expt[f"{task[0]}_{cycle}"]["cores"] = task[3] - expt[f"{task[0]}_{cycle}"]["walltime"] = task[4] - - #Run rocotorun again to get around rocotobqserver proliferation issue - subprocess.run(rocotorun_cmd) - - statuses = list() - for task in expt: - # Skip non-task entries - if task in ["expt_dir","status"]: - continue - statuses.append(expt[task]["status"]) - - if "DEAD" in statuses: - still_live = ["RUNNING", "SUBMITTING", "QUEUED"] - if any(status in still_live for status in statuses): - logging.debug(f'DEAD job in experiment {name}; continuing to track until all jobs are complete') - expt["status"] = "DYING" - else: - expt["status"] = "DEAD" - return expt - - if "UNKNOWN" in statuses: - expt["status"] = "ERROR" - - if "RUNNING" in statuses: - expt["status"] = "RUNNING" - elif "QUEUED" in statuses: - expt["status"] = "QUEUED" - elif "SUBMITTING" in statuses: - expt["status"] = "SUBMITTING" - elif "SUCCEEDED" in statuses: - if expt["status"] == "SUCCEEDED": - expt["status"] = "COMPLETE" - else: - expt["status"] = "SUCCEEDED" - else: - logging.fatal("Some kind of horrible thing has happened") - raise ValueError(dedent(f"""Some kind of horrible thing has happened to the experiment status - for experiment {name} - status is {expt["status"]} - all task statuses are {statuses}""")) - - return expt - - -def write_monitor_file(monitor_file: str, expt_dict: dict): - try: - with open(monitor_file,"w") as f: - f.write("### WARNING ###\n") - f.write("### THIS FILE IS AUTO_GENERATED AND REGULARLY OVER-WRITTEN BY monitor_jobs.py\n") - f.write("### EDITS MAY RESULT IN MISBEHAVIOR OF EXPERIMENTS RUNNING\n") - f.writelines(cfg_to_yaml_str(expt_dict)) - except: - logging.fatal("\n********************************\n") - logging.fatal(f"WARNING WARNING WARNING\nFailure occurred while writing monitor file {monitor_file}") - logging.fatal("File may be corrupt or invalid for re-run!!") - logging.fatal("\n********************************\n") - raise - def setup_logging(logfile: str = "log.run_WE2E_tests", debug: bool = False) -> None: """ diff --git a/tests/WE2E/utils.py b/tests/WE2E/utils.py new file mode 100755 index 0000000000..0869608b29 --- 
/dev/null +++ b/tests/WE2E/utils.py @@ -0,0 +1,303 @@ +#!/usr/bin/env python3 +""" +A collection of utilities used by the various WE2E scripts +""" +import os +import re +import sys +import argparse +import logging +import subprocess +import sqlite3 +import time +from textwrap import dedent +from datetime import datetime +from contextlib import closing + +sys.path.append("../../ush") + +from python_utils import ( + cfg_to_yaml_str, + flatten_dict, + load_config_file, + load_shell_config +) + +from check_python_version import check_python_version + +REPORT_WIDTH = 100 + +def print_job_summary(expt_dict: dict, debug: bool = False): + """Function that creates a summary for the specified experiment + + Args: + expt_dict (dict): A dictionary containing the information needed to run + one or more experiments. See example file monitor_jobs.yaml + debug (bool): [optional] Enable extra output for debugging + Returns: + None + """ + + # Create summary table as list of strings + summary = [] + summary.append('-'*REPORT_WIDTH) + summary.append(f'Experiment name {" "*43} | Status | Core hours used ') + # Flag for tracking if "cores per node" is in dictionary + summary.append('-'*REPORT_WIDTH) + for expt in expt_dict: + status = expt_dict[expt]["status"] + ch = 0 + for task in expt_dict[expt]: + if "core_hours" in expt_dict[expt][task]: + ch += expt_dict[expt][task]["core_hours"] + summary.append(f'{expt[:60]:<60s} {status:<12s} {ch:>13.2f}') + + # Print summary to screen + for line in summary: + print(line) + + +def calculate_core_hours(expt_dict: dict) -> dict: + """ + Function takes in an experiment dictionary, reads the var_defns file for necessary information, + and calculates the core hours used by each task, updating expt_dict with this info + + Args: + expt_dict (dict) : Experiment dictionary + Returns: + dict : Experiment dictionary updated with core hours + """ + + for expt in expt_dict: + # Read variable definitions file + vardefs = load_shell_config(os.path.join(expt_dict[expt]["expt_dir"],"var_defns.sh")) + vdf = flatten_dict(vardefs) + cores_per_node = vdf["NCORES_PER_NODE"] + for task in expt_dict[expt]: + # Skip non-task entries + if task in ["expt_dir","status"]: + continue + # Cycle is last 12 characters, task name is rest (minus separating underscore) + taskname = task[:-13] + # Handle task names that have ensemble and/or fhr info appended with regex + taskname = re.sub('_mem\d{3}', '', taskname) + taskname = re.sub('_f\d{3}', '', taskname) + nnodes_var = f'NNODES_{taskname.upper()}' + if nnodes_var in vdf: + nnodes = vdf[nnodes_var] + # Users are charged for full use of nodes, so core hours are CPN * nodes * time in hrs + core_hours = cores_per_node * nnodes * expt_dict[expt][task]['walltime'] / 3600 + expt_dict[expt][task]['exact_count'] = True + else: + # If we can't find the number of nodes, assume full usage (may undercount) + core_hours = expt_dict[expt][task]['cores'] * expt_dict[expt][task]['walltime'] / 3600 + expt_dict[expt][task]['exact_count'] = False + expt_dict[expt][task]['core_hours'] = round(core_hours,2) + return expt_dict + +def create_expt_dict(expt_dir: str) -> dict: + """ + Function takes in a directory, searches that directory for subdirectories containing + experiments, and creates a skeleton dictionary that can be filled out by update_expt_status() + + Args: + expt_dir (str) : Experiment directory + Returns: + dict : Experiment dictionary + """ + contents = os.listdir(expt_dir) + + expt_dict=dict() + for item in contents: + # Look for FV3LAM_wflow.xml to 
indicate directories with experiments in them + fullpath = os.path.join(expt_dir, item) + if not os.path.isdir(fullpath): + continue + xmlfile = os.path.join(expt_dir, item, 'FV3LAM_wflow.xml') + if os.path.isfile(xmlfile): + expt_dict[item] = dict() + expt_dict[item].update({"expt_dir": os.path.join(expt_dir,item)}) + expt_dict[item].update({"status": "CREATED"}) + else: + logging.debug(f'Skipping directory {item}, experiment XML file not found') + #Update the experiment dictionary + logging.info(f"Reading status of experiment {item}") + update_expt_status(expt_dict[item],item,True) + summary_file = f'job_summary_{datetime.now().strftime("%Y%m%d%H%M%S")}.yaml' + + return summary_file, expt_dict + +def calculate_core_hours(expt_dict: dict) -> dict: + """ + Function takes in an experiment dictionary, reads the var_defns file for necessary information, + and calculates the core hours used by each task, updating expt_dict with this info + + Args: + expt_dict (dict) : Experiment dictionary + Returns: + dict : Experiment dictionary updated with core hours + """ + + for expt in expt_dict: + # Read variable definitions file + vardefs = load_shell_config(os.path.join(expt_dict[expt]["expt_dir"],"var_defns.sh")) + vdf = flatten_dict(vardefs) + cores_per_node = vdf["NCORES_PER_NODE"] + for task in expt_dict[expt]: + # Skip non-task entries + if task in ["expt_dir","status"]: + continue + # Cycle is last 12 characters, task name is rest (minus separating underscore) + taskname = task[:-13] + # Handle task names that have ensemble and/or fhr info appended with regex + taskname = re.sub('_mem\d{3}', '', taskname) + taskname = re.sub('_f\d{3}', '', taskname) + nnodes_var = f'NNODES_{taskname.upper()}' + if nnodes_var in vdf: + nnodes = vdf[nnodes_var] + # Users are charged for full use of nodes, so core hours are CPN * nodes * time in hrs + core_hours = cores_per_node * nnodes * expt_dict[expt][task]['walltime'] / 3600 + expt_dict[expt][task]['exact_count'] = True + else: + # If we can't find the number of nodes, assume full usage (may undercount) + core_hours = expt_dict[expt][task]['cores'] * expt_dict[expt][task]['walltime'] / 3600 + expt_dict[expt][task]['exact_count'] = False + expt_dict[expt][task]['core_hours'] = round(core_hours,2) + return expt_dict + + +def write_monitor_file(monitor_file: str, expt_dict: dict): + try: + with open(monitor_file,"w") as f: + f.write("### WARNING ###\n") + f.write("### THIS FILE IS AUTO_GENERATED AND REGULARLY OVER-WRITTEN BY WORKFKLOW SCRIPTS\n") + f.write("### EDITS MAY RESULT IN MISBEHAVIOR OF EXPERIMENTS RUNNING\n") + f.writelines(cfg_to_yaml_str(expt_dict)) + except: + logging.fatal("\n********************************\n") + logging.fatal(f"WARNING WARNING WARNING\nFailure occurred while writing monitor file {monitor_file}") + logging.fatal("File may be corrupt or invalid for re-run!!") + logging.fatal("\n********************************\n") + raise + + +def update_expt_status(expt: dict, name: str, refresh: bool = False) -> dict: + """ + This function reads the dictionary showing the location of a given experiment, runs a + `rocotorun` command to update the experiment (running new jobs and updating the status of + previously submitted ones), and reads the rocoto database file to update the status of + each job for that experiment in the experiment dictionary. + + The function then and uses a simple set of rules to combine the statuses of every task + into a useful "status" for the whole experiment, and returns the updated experiment dictionary. 
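# A small sketch of the core-hour bookkeeping used by calculate_core_hours() above: the
# task key ends in a 12-digit cycle (plus its separating underscore), ensemble-member and
# forecast-hour suffixes are stripped with regexes, and usage is charged for whole nodes.
# The task name and node/core counts below are made up purely for illustration.
import re

def core_hours(cores_per_node: int, nnodes: int, walltime_s: float) -> float:
    # Charged for full nodes: cores-per-node * nodes * hours
    return round(cores_per_node * nnodes * walltime_s / 3600, 2)

def base_task_name(task_key: str) -> str:
    # Drop the trailing _YYYYMMDDHHMM cycle, then any _memNNN / _fNNN suffixes
    name = task_key[:-13]
    name = re.sub(r"_mem\d{3}", "", name)
    return re.sub(r"_f\d{3}", "", name)

if __name__ == "__main__":
    print(base_task_name("run_post_mem001_f012_201907010000"))       # -> run_post
    print(core_hours(cores_per_node=40, nnodes=2, walltime_s=1800))  # -> 40.0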
+ + Experiment "status" levels explained: + CREATED: The experiments have been created, but the monitor script has not yet processed them. + This is immediately overwritten at the beginning of the "monitor_jobs" function, so we + should never see this status in this function. Including just for completeness sake. + SUBMITTING: All jobs are in status SUBMITTING or SUCCEEDED. This is a normal state; we will + continue to monitor this experiment. + DYING: One or more tasks have died (status "DEAD"), so this experiment has had an error. + We will continue to monitor this experiment until all tasks are either status DEAD or + status SUCCEEDED (see next entry). + DEAD: One or more tasks are at status DEAD, and the rest are either DEAD or SUCCEEDED. We + will no longer monitor this experiment. + ERROR: One or more tasks are at status UNKNOWN, meaning that rocoto has failed to track the + job associated with that task. This will require manual intervention to solve, so we + will no longer monitor this experiment. + This status may also appear if we fail to read the rocoto database file. + RUNNING: One or more jobs are at status RUNNING, and the rest are either status QUEUED, SUBMITTED, + or SUCCEEDED. This is a normal state; we will continue to monitor this experiment. + QUEUED: One or more jobs are at status QUEUED, and some others may be at status SUBMITTED or + SUCCEEDED. + This is a normal state; we will continue to monitor this experiment. + SUCCEEDED: All jobs are status SUCCEEDED; we will monitor for one more cycle in case there are + unsubmitted jobs remaining. + COMPLETE:All jobs are status SUCCEEDED, and we have monitored this job for an additional cycle + to ensure there are no un-submitted jobs. We will no longer monitor this experiment. + + Args: + expt (dict): A dictionary containing the information for an individual experiment, as + described in the main monitor_jobs() function. + name (str): Name of the experiment; used for logging only + refresh (bool): If true, this flag will check an experiment status even if it is listed + as DEAD, ERROR, or COMPLETE. Used for initial checks for experiments + that may have been restarted. + Returns: + dict: The updated experiment dictionary. 
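# The docstring above describes how per-task Rocoto states roll up into one experiment
# "status". A simplified sketch of that precedence ladder follows, assuming the same state
# names; the real update_expt_status() also does the rocotorun calls and database reads
# that are not shown here.
def combine_statuses(statuses: list, previous: str = "CREATED") -> str:
    """Roll individual Rocoto task states up into one experiment status."""
    if "DEAD" in statuses:
        still_live = {"RUNNING", "SUBMITTING", "QUEUED"}
        return "DYING" if any(s in still_live for s in statuses) else "DEAD"
    if "UNKNOWN" in statuses:
        return "ERROR"
    for state in ("RUNNING", "QUEUED", "SUBMITTING"):
        if state in statuses:
            return state
    if "SUCCEEDED" in statuses:
        # COMPLETE only after one extra monitoring pass with everything SUCCEEDED
        return "COMPLETE" if previous == "SUCCEEDED" else "SUCCEEDED"
    return "ERROR"

if __name__ == "__main__":
    print(combine_statuses(["SUCCEEDED", "RUNNING"]))             # RUNNING
    print(combine_statuses(["SUCCEEDED"], previous="SUCCEEDED"))  # COMPLETE
    print(combine_statuses(["DEAD", "QUEUED"]))                   # DYING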
+ """ + + #If we are no longer tracking this experiment, return unchanged + if (expt["status"] in ['DEAD','ERROR','COMPLETE']) and not refresh: + return expt + + # Update experiment, read rocoto database + rocoto_db = f"{expt['expt_dir']}/FV3LAM_wflow.db" + rocotorun_cmd = ["rocotorun", f"-w {expt['expt_dir']}/FV3LAM_wflow.xml", f"-d {rocoto_db}"] + subprocess.run(rocotorun_cmd) + + logging.debug(f"Reading database for experiment {name}, updating experiment dictionary") + try: + # This section of code queries the "job" table of the rocoto database, returning a list + # of tuples containing the taskname, cycle, and state of each job respectively + with closing(sqlite3.connect(rocoto_db)) as connection: + with closing(connection.cursor()) as cur: + db = cur.execute('SELECT taskname,cycle,state,cores,duration from jobs').fetchall() + except: + logging.warning(f"Unable to read database {rocoto_db}\nCan not track experiment {name}") + expt["status"] = "ERROR" + return expt + + for task in db: + # For each entry from rocoto database, store that task's info under a dictionary key named TASKNAME_CYCLE + # Cycle comes from the database in Unix Time (seconds), so convert to human-readable + cycle = datetime.utcfromtimestamp(task[1]).strftime('%Y%m%d%H%M') + if f"{task[0]}_{cycle}" not in expt: + expt[f"{task[0]}_{cycle}"] = dict() + expt[f"{task[0]}_{cycle}"]["status"] = task[2] + expt[f"{task[0]}_{cycle}"]["cores"] = task[3] + expt[f"{task[0]}_{cycle}"]["walltime"] = task[4] + + #Run rocotorun again to get around rocotobqserver proliferation issue + subprocess.run(rocotorun_cmd) + + statuses = list() + for task in expt: + # Skip non-task entries + if task in ["expt_dir","status"]: + continue + statuses.append(expt[task]["status"]) + + if "DEAD" in statuses: + still_live = ["RUNNING", "SUBMITTING", "QUEUED"] + if any(status in still_live for status in statuses): + logging.debug(f'DEAD job in experiment {name}; continuing to track until all jobs are complete') + expt["status"] = "DYING" + else: + expt["status"] = "DEAD" + return expt + + if "UNKNOWN" in statuses: + expt["status"] = "ERROR" + + if "RUNNING" in statuses: + expt["status"] = "RUNNING" + elif "QUEUED" in statuses: + expt["status"] = "QUEUED" + elif "SUBMITTING" in statuses: + expt["status"] = "SUBMITTING" + elif "SUCCEEDED" in statuses: + if expt["status"] == "SUCCEEDED": + expt["status"] = "COMPLETE" + else: + expt["status"] = "SUCCEEDED" + else: + logging.fatal("Some kind of horrible thing has happened") + raise ValueError(dedent(f"""Some kind of horrible thing has happened to the experiment status + for experiment {name} + status is {expt["status"]} + all task statuses are {statuses}""")) + + return expt + + From 3799562fbe0f7166b30da98aed6a6e76d33fc3f7 Mon Sep 17 00:00:00 2001 From: "Michael Kavulich, Jr" Date: Fri, 24 Feb 2023 03:51:31 +0000 Subject: [PATCH 10/52] Send rocotorun output to logging.debug, append logs rather than overwriting --- tests/WE2E/monitor_jobs.py | 2 +- tests/WE2E/run_WE2E_tests.py | 2 +- tests/WE2E/utils.py | 8 +++++--- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/tests/WE2E/monitor_jobs.py b/tests/WE2E/monitor_jobs.py index e383358748..1aea7f35c9 100755 --- a/tests/WE2E/monitor_jobs.py +++ b/tests/WE2E/monitor_jobs.py @@ -110,7 +110,7 @@ def setup_logging(logfile: str = "log.run_WE2E_tests", debug: bool = False) -> N formatter = logging.Formatter("%(name)-16s %(levelname)-8s %(message)s") - fh = logging.FileHandler(logfile, mode='w') + fh = logging.FileHandler(logfile, mode='a') 
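# A sketch of the Rocoto database read shown in update_expt_status() above: sqlite3 plus
# contextlib.closing() so the connection and cursor are released even if the query fails,
# with the Unix-time cycle converted into a TASKNAME_YYYYMMDDHHMM key. The database path
# is whatever experiment FV3LAM_wflow.db you point it at; nothing here is hard-coded to a
# particular platform.
import sqlite3
from contextlib import closing
from datetime import datetime

def read_rocoto_jobs(rocoto_db: str) -> dict:
    # closing() guarantees cleanup of the connection and cursor on any exception
    with closing(sqlite3.connect(rocoto_db)) as conn:
        with closing(conn.cursor()) as cur:
            rows = cur.execute(
                "SELECT taskname, cycle, state, cores, duration FROM jobs").fetchall()
    jobs = {}
    for taskname, cycle, state, cores, duration in rows:
        # Rocoto stores the cycle as Unix seconds; convert to a human-readable key
        key = f"{taskname}_{datetime.utcfromtimestamp(cycle).strftime('%Y%m%d%H%M')}"
        jobs[key] = {"status": state, "cores": cores, "walltime": duration}
    return jobs

# Example (path is hypothetical): jobs = read_rocoto_jobs("expt_dir/FV3LAM_wflow.db")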
fh.setLevel(logging.DEBUG) fh.setFormatter(formatter) logging.getLogger().addHandler(fh) diff --git a/tests/WE2E/run_WE2E_tests.py b/tests/WE2E/run_WE2E_tests.py index 0916788831..d51bbce728 100755 --- a/tests/WE2E/run_WE2E_tests.py +++ b/tests/WE2E/run_WE2E_tests.py @@ -407,7 +407,7 @@ def setup_logging(logfile: str = "log.run_WE2E_tests", debug: bool = False) -> N formatter = logging.Formatter("%(name)-16s %(levelname)-8s %(message)s") - fh = logging.FileHandler(logfile, mode='w') + fh = logging.FileHandler(logfile, mode='a') fh.setLevel(logging.DEBUG) fh.setFormatter(formatter) logging.getLogger().addHandler(fh) diff --git a/tests/WE2E/utils.py b/tests/WE2E/utils.py index 0869608b29..27aabf8a92 100755 --- a/tests/WE2E/utils.py +++ b/tests/WE2E/utils.py @@ -233,8 +233,9 @@ def update_expt_status(expt: dict, name: str, refresh: bool = False) -> dict: # Update experiment, read rocoto database rocoto_db = f"{expt['expt_dir']}/FV3LAM_wflow.db" - rocotorun_cmd = ["rocotorun", f"-w {expt['expt_dir']}/FV3LAM_wflow.xml", f"-d {rocoto_db}"] - subprocess.run(rocotorun_cmd) + rocotorun_cmd = ["rocotorun", f"-w {expt['expt_dir']}/FV3LAM_wflow.xml", f"-d {rocoto_db}", "-v 10"] + p = subprocess.run(rocotorun_cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True) + logging.debug(p.stdout) logging.debug(f"Reading database for experiment {name}, updating experiment dictionary") try: @@ -259,7 +260,8 @@ def update_expt_status(expt: dict, name: str, refresh: bool = False) -> dict: expt[f"{task[0]}_{cycle}"]["walltime"] = task[4] #Run rocotorun again to get around rocotobqserver proliferation issue - subprocess.run(rocotorun_cmd) + p = subprocess.run(rocotorun_cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True) + logging.debug(p.stdout) statuses = list() for task in expt: From 17bf55de24ba06d1dc2ee7f4dc24c17697aeab06 Mon Sep 17 00:00:00 2001 From: "Michael Kavulich, Jr" Date: Fri, 24 Feb 2023 05:43:25 +0000 Subject: [PATCH 11/52] Add ability to update experiments in parallel --- tests/WE2E/monitor_jobs.py | 30 ++++++++++++++++++++++-------- tests/WE2E/run_WE2E_tests.py | 7 +++++-- tests/WE2E/utils.py | 33 +++++++++++++++++++++++++++++++++ 3 files changed, 60 insertions(+), 10 deletions(-) diff --git a/tests/WE2E/monitor_jobs.py b/tests/WE2E/monitor_jobs.py index 1aea7f35c9..b4757d6e24 100755 --- a/tests/WE2E/monitor_jobs.py +++ b/tests/WE2E/monitor_jobs.py @@ -24,9 +24,9 @@ from check_python_version import check_python_version from job_summary import print_job_summary -from utils import calculate_core_hours, write_monitor_file, update_expt_status +from utils import calculate_core_hours, write_monitor_file, update_expt_status, update_expt_status_parallel -def monitor_jobs(expt_dict: dict, monitor_file: str = '', debug: bool = False) -> str: +def monitor_jobs(expt_dict: dict, monitor_file: str = '', procs: int = 1, debug: bool = False) -> str: """Function to monitor and run jobs for the specified experiment using Rocoto Args: @@ -50,9 +50,14 @@ def monitor_jobs(expt_dict: dict, monitor_file: str = '', debug: bool = False) - # Perform initial setup for each experiment logging.info("Checking tests available for monitoring...") - for expt in expt_dict: - logging.info(f"Starting experiment {expt} running") - expt_dict[expt] = update_expt_status(expt_dict[expt], expt, True) + + if procs > 1: + print(f'Starting experiments in parallel with {procs} processes') + expt_dict = update_expt_status_parallel(expt_dict, procs) + else: + for expt in expt_dict: + logging.info(f"Starting experiment 
{expt} running") + expt_dict[expt] = update_expt_status(expt_dict[expt], expt, True) write_monitor_file(monitor_file,expt_dict) @@ -65,8 +70,13 @@ def monitor_jobs(expt_dict: dict, monitor_file: str = '', debug: bool = False) - i = 0 while running_expts: i += 1 + if procs > 1: + expt_dict = update_expt_status_parallel(expt_dict, procs) + else: + for expt in running_expts.copy(): + expt_dict[expt] = update_expt_status(expt_dict[expt], expt) + for expt in running_expts.copy(): - expt_dict[expt] = update_expt_status(expt_dict[expt], expt) running_expts[expt] = expt_dict[expt] if running_expts[expt]["status"] in ['DEAD','ERROR','COMPLETE']: logging.info(f'Experiment {expt} is {running_expts[expt]["status"]}; will no longer monitor.') @@ -135,6 +145,7 @@ def setup_logging(logfile: str = "log.run_WE2E_tests", debug: bool = False) -> N parser = argparse.ArgumentParser(description="Script for monitoring and running jobs in a specified experiment, as specified in a yaml configuration file\n") parser.add_argument('-y', '--yaml_file', type=str, help='YAML-format file specifying the information of jobs to be run; for an example file, see monitor_jobs.yaml', required=True) + parser.add_argument('-p', '--procs', type=int, help='Run resource-heavy tasks (such as calls to rocotorun) in parallel, with provided number of parallel tasks', default=1) parser.add_argument('-d', '--debug', action='store_true', help='Script will be run in debug mode with more verbose output') args = parser.parse_args() @@ -143,13 +154,16 @@ def setup_logging(logfile: str = "log.run_WE2E_tests", debug: bool = False) -> N expt_dict = load_config_file(args.yaml_file) + if args.procs < 1: + raise ValueError('You can not have less than one parallel process; select a valid value for --procs') + #Call main function try: - monitor_jobs(expt_dict,args.yaml_file, args.debug) + monitor_jobs(expt_dict,args.yaml_file,args.procs,args.debug) except KeyboardInterrupt: logging.info("\n\nUser interrupted monitor script; to resume monitoring jobs run:\n") - logging.info(f"{__file__} -y={args.yaml_file}\n") + logging.info(f"{__file__} -y={args.yaml_file} -p={args.procs}\n") except: logging.exception( dedent( diff --git a/tests/WE2E/run_WE2E_tests.py b/tests/WE2E/run_WE2E_tests.py index d51bbce728..7b0dc1b145 100755 --- a/tests/WE2E/run_WE2E_tests.py +++ b/tests/WE2E/run_WE2E_tests.py @@ -203,10 +203,10 @@ def run_we2e_tests(homedir, args) -> None: monitor_file = f'WE2E_tests_{datetime.now().strftime("%Y%m%d%H%M%S")}.yaml' write_monitor_file(monitor_file,monitor_yaml) try: - monitor_file = monitor_jobs(monitor_yaml, monitor_file=monitor_file, debug=args.debug) + monitor_file = monitor_jobs(monitor_yaml, monitor_file=monitor_file, procs=args.procs, debug=args.debug) except KeyboardInterrupt: logging.info("\n\nUser interrupted monitor script; to resume monitoring jobs run:\n") - logging.info(f"./monitor_jobs.py -y={monitor_file}\n") + logging.info(f"./monitor_jobs.py -y={monitor_file} -p={args.procs}\n") except: raise else: @@ -448,6 +448,7 @@ def setup_logging(logfile: str = "log.run_WE2E_tests", debug: bool = False) -> N parser.add_argument('-c', '--compiler', type=str, help='Compiler used for building the app', default='intel') parser.add_argument('-d', '--debug', action='store_true', help='Script will be run in debug mode with more verbose output') parser.add_argument('-q', '--quiet', action='store_true', help='Suppress console output from workflow generation; this will help keep the screen uncluttered') + parser.add_argument('-p', '--procs', 
type=int, help='Run resource-heavy tasks (such as calls to rocotorun) in parallel, with provided number of parallel tasks', default=1)
     parser.add_argument('--modulefile', type=str, help='Modulefile used for building the app')
 
@@ -466,6 +467,8 @@ def setup_logging(logfile: str = "log.run_WE2E_tests", debug: bool = False) -> N
     #Set defaults that need other argument values
     if args.modulefile is None:
         args.modulefile = f'build_{args.machine.lower()}_{args.compiler}'
+    if args.procs < 1:
+        raise ValueError('You can not have less than one parallel process; select a valid value for --procs')
 
 
     #Call main function
diff --git a/tests/WE2E/utils.py b/tests/WE2E/utils.py
index 27aabf8a92..afb529837b 100755
--- a/tests/WE2E/utils.py
+++ b/tests/WE2E/utils.py
@@ -13,6 +13,7 @@
 from textwrap import dedent
 from datetime import datetime
 from contextlib import closing
+from multiprocessing import Pool
 
 sys.path.append("../../ush")
 
@@ -302,4 +303,36 @@ def update_expt_status(expt: dict, name: str, refresh: bool = False) -> dict:
 
     return expt
 
 
+def update_expt_status_parallel(expt_dict: dict, procs: int) -> dict:
+    """
+    This function updates an entire set of experiments in parallel, drastically speeding up
+    the process if given enough parallel processes. Given an experiment dictionary, it will
+    output the updated dictionary.
+
+    It parallelizes the call to update_expt_status across the given number of processes,
+    making use of the Python multiprocessing starmap functionality.
+
+    Args:
+        expt_dict (dict): A dictionary containing information for all experiments
+        procs (int): The number of parallel processes
+
+    Returns:
+        dict: The updated dictionary of experiment dictionaries
+    """
+    args = []
+    # Define a tuple of arguments to pass to starmap
+    for expt in expt_dict:
+        args.append( (expt_dict[expt],expt,True) )
+
+    # call update_expt_status() in parallel
+    with Pool(processes=procs) as pool:
+        output = pool.starmap(update_expt_status, args)
+
+    # Update dictionary with output from all calls to update_expt_status()
+    i = 0
+    for expt in expt_dict:
+        expt_dict[expt] = output[i]
+        i += 1
+
+    return expt_dict

From cabea582999fca9b119f1c27bda6499762613cf9 Mon Sep 17 00:00:00 2001
From: "Michael Kavulich, Jr" 
Date: Fri, 24 Feb 2023 13:20:47 -0700
Subject: [PATCH 12/52] Remove duplicate routine, add totals row to job summary

---
 tests/WE2E/job_summary.py | 33 ++-------------------------------
 tests/WE2E/utils.py       | 17 +++++++++++++++--
 2 files changed, 17 insertions(+), 33 deletions(-)

diff --git a/tests/WE2E/job_summary.py b/tests/WE2E/job_summary.py
index e29e79f734..3249057e3f 100755
--- a/tests/WE2E/job_summary.py
+++ b/tests/WE2E/job_summary.py
@@ -20,40 +20,11 @@
 
 from check_python_version import check_python_version
 
-from utils import calculate_core_hours, create_expt_dict, update_expt_status, write_monitor_file
+from utils import calculate_core_hours, create_expt_dict, update_expt_status, \
+                  print_job_summary, write_monitor_file
 
 REPORT_WIDTH = 100
 
-def print_job_summary(expt_dict: dict, debug: bool = False):
-    """Function that creates a summary for the specified experiment
-
-    Args:
-        expt_dict   (dict): A dictionary containing the information needed to run
-                            one or more experiments.
See example file monitor_jobs.yaml - debug (bool): [optional] Enable extra output for debugging - Returns: - None - """ - - # Create summary table as list of strings - summary = [] - summary.append('-'*REPORT_WIDTH) - summary.append(f'Experiment name {" "*43} | Status | Core hours used ') - # Flag for tracking if "cores per node" is in dictionary - summary.append('-'*REPORT_WIDTH) - for expt in expt_dict: - status = expt_dict[expt]["status"] - ch = 0 - for task in expt_dict[expt]: - if "core_hours" in expt_dict[expt][task]: - ch += expt_dict[expt][task]["core_hours"] - summary.append(f'{expt[:60]:<60s} {status:<12s} {ch:>13.2f}') - - # Print summary to screen - for line in summary: - print(line) - - def create_expt_dict(expt_dir: str) -> dict: """ Function takes in a directory, searches that directory for subdirectories containing diff --git a/tests/WE2E/utils.py b/tests/WE2E/utils.py index afb529837b..ca9c2d2b3e 100755 --- a/tests/WE2E/utils.py +++ b/tests/WE2E/utils.py @@ -45,13 +45,26 @@ def print_job_summary(expt_dict: dict, debug: bool = False): summary.append(f'Experiment name {" "*43} | Status | Core hours used ') # Flag for tracking if "cores per node" is in dictionary summary.append('-'*REPORT_WIDTH) + total_core_hours = 0 + statuses = [] for expt in expt_dict: - status = expt_dict[expt]["status"] + statuses.append(expt_dict[expt]["status"]) ch = 0 for task in expt_dict[expt]: if "core_hours" in expt_dict[expt][task]: ch += expt_dict[expt][task]["core_hours"] - summary.append(f'{expt[:60]:<60s} {status:<12s} {ch:>13.2f}') + summary.append(f'{expt[:60]:<60s} {statuses[-1]:<12s} {ch:>13.2f}') + total_core_hours += ch + if "ERROR" in statuses: + total_status = "ERROR" + elif "DEAD" in statuses: + total_status = "DEAD" + elif "COMPLETE" in statuses: + total_status = "COMPLETE" + else: + total_status = "UNKNOWN" + summary.append('-'*REPORT_WIDTH) + summary.append(f'Total {" "*54} {total_status:<12s} {total_core_hours:>13.2f}') # Print summary to screen for line in summary: From 523c3491f3cdbab0d0a9c8248d94c268d9eb5007 Mon Sep 17 00:00:00 2001 From: "Michael Kavulich, Jr" Date: Sat, 25 Feb 2023 03:30:34 +0000 Subject: [PATCH 13/52] More improvements - Remove another duplicate routine - Rename "job_summary" to "WE2E_summary" - Rename various auto-created yaml files to WE2E_tests_{TIME}.yaml for consistency - Write "summary_{TIME}.txt" containing full report on WE2E tests --- .../WE2E/{job_summary.py => WE2E_summary.py} | 37 +------------------ tests/WE2E/monitor_jobs.py | 6 +-- tests/WE2E/utils.py | 37 +++++++++++++++++-- 3 files changed, 38 insertions(+), 42 deletions(-) rename tests/WE2E/{job_summary.py => WE2E_summary.py} (63%) diff --git a/tests/WE2E/job_summary.py b/tests/WE2E/WE2E_summary.py similarity index 63% rename from tests/WE2E/job_summary.py rename to tests/WE2E/WE2E_summary.py index 3249057e3f..8e1a6113e2 100755 --- a/tests/WE2E/job_summary.py +++ b/tests/WE2E/WE2E_summary.py @@ -21,43 +21,10 @@ from check_python_version import check_python_version from utils import calculate_core_hours, create_expt_dict, update_expt_status, \ - print_job_summary, write_monitor_file + print_WE2E_summary, write_monitor_file REPORT_WIDTH = 100 -def create_expt_dict(expt_dir: str) -> dict: - """ - Function takes in a directory, searches that directory for subdirectories containing - experiments, and creates a skeleton dictionary that can be filled out by update_expt_status() - - Args: - expt_dir (str) : Experiment directory - Returns: - dict : Experiment dictionary - """ - contents = 
os.listdir(expt_dir) - - expt_dict=dict() - for item in contents: - # Look for FV3LAM_wflow.xml to indicate directories with experiments in them - fullpath = os.path.join(expt_dir, item) - if not os.path.isdir(fullpath): - continue - xmlfile = os.path.join(expt_dir, item, 'FV3LAM_wflow.xml') - if os.path.isfile(xmlfile): - expt_dict[item] = dict() - expt_dict[item].update({"expt_dir": os.path.join(expt_dir,item)}) - expt_dict[item].update({"status": "CREATED"}) - else: - logging.debug(f'Skipping directory {item}, experiment XML file not found') - #Update the experiment dictionary - logging.info(f"Reading status of experiment {item}") - update_expt_status(expt_dict[item],item,True) - summary_file = f'job_summary_{datetime.now().strftime("%Y%m%d%H%M%S")}.yaml' - - return summary_file, expt_dict - - def setup_logging(debug: bool = False) -> None: """ Sets up logging, printing high-priority (INFO and higher) messages to screen, and printing all @@ -106,5 +73,5 @@ def setup_logging(debug: bool = False) -> None: write_monitor_file(yaml_file,expt_dict) #Call function to print summary - print_job_summary(expt_dict, args.debug) + print_WE2E_summary(expt_dict, args.debug) diff --git a/tests/WE2E/monitor_jobs.py b/tests/WE2E/monitor_jobs.py index b4757d6e24..d815e12c80 100755 --- a/tests/WE2E/monitor_jobs.py +++ b/tests/WE2E/monitor_jobs.py @@ -23,7 +23,7 @@ from check_python_version import check_python_version -from job_summary import print_job_summary +from WE2E_summary import print_WE2E_summary from utils import calculate_core_hours, write_monitor_file, update_expt_status, update_expt_status_parallel def monitor_jobs(expt_dict: dict, monitor_file: str = '', procs: int = 1, debug: bool = False) -> str: @@ -43,7 +43,7 @@ def monitor_jobs(expt_dict: dict, monitor_file: str = '', procs: int = 1, debug: starttime = datetime.now() # Write monitor_file, which will contain information on each monitored experiment if not monitor_file: - monitor_file = f'monitor_jobs_{starttime.strftime("%Y%m%d%H%M%S")}.yaml' + monitor_file = f'WE2E_tests_{starttime.strftime("%Y%m%d%H%M%S")}.yaml' logging.info(f"Writing information for all experiments to {monitor_file}") write_monitor_file(monitor_file,expt_dict) @@ -106,7 +106,7 @@ def monitor_jobs(expt_dict: dict, monitor_file: str = '', procs: int = 1, debug: write_monitor_file(monitor_file,expt_dict) #Call function to print summary - print_job_summary(expt_dict, debug) + print_WE2E_summary(expt_dict, debug) return monitor_file diff --git a/tests/WE2E/utils.py b/tests/WE2E/utils.py index ca9c2d2b3e..ee8720aa00 100755 --- a/tests/WE2E/utils.py +++ b/tests/WE2E/utils.py @@ -28,7 +28,7 @@ REPORT_WIDTH = 100 -def print_job_summary(expt_dict: dict, debug: bool = False): +def print_WE2E_summary(expt_dict: dict, debug: bool = False): """Function that creates a summary for the specified experiment Args: @@ -43,16 +43,34 @@ def print_job_summary(expt_dict: dict, debug: bool = False): summary = [] summary.append('-'*REPORT_WIDTH) summary.append(f'Experiment name {" "*43} | Status | Core hours used ') - # Flag for tracking if "cores per node" is in dictionary summary.append('-'*REPORT_WIDTH) total_core_hours = 0 statuses = [] + expt_details = [] for expt in expt_dict: statuses.append(expt_dict[expt]["status"]) ch = 0 + expt_details.append('') + expt_details.append('-'*REPORT_WIDTH) + expt_details.append(f'Detailed summary of experiment {expt}') + expt_details.append(f'{" "*40} | Status | Walltime | Core hours used') + expt_details.append('-'*REPORT_WIDTH) + for task in 
expt_dict[expt]: + # Skip non-task entries + if task in ["expt_dir","status"]: + continue + status = expt_dict[expt][task]["status"] + walltime = expt_dict[expt][task]["walltime"] + expt_details.append(f'{task[:40]:<40s} {status:<12s} {walltime:>10.1f}') if "core_hours" in expt_dict[expt][task]: - ch += expt_dict[expt][task]["core_hours"] + task_ch = expt_dict[expt][task]["core_hours"] + ch += task_ch + expt_details[-1] = f'{expt_details[-1]} {task_ch:>13.2f}' + else: + expt_details[-1] = f'{expt_details[-1]} -' + expt_details.append('-'*REPORT_WIDTH) + expt_details.append(f'Total {" "*34} {statuses[-1]:<12s} {" "*11} {ch:>13.2f}') summary.append(f'{expt[:60]:<60s} {statuses[-1]:<12s} {ch:>13.2f}') total_core_hours += ch if "ERROR" in statuses: @@ -70,6 +88,16 @@ def print_job_summary(expt_dict: dict, debug: bool = False): for line in summary: print(line) + # Print summary and details to file + summary_file = f'WE2E_summary_{datetime.now().strftime("%Y%m%d%H%M%S")}.txt' + print(f"\nDetailed summary written to {summary_file}\n") + + with open(summary_file, 'w') as f: + for line in summary: + f.write(f"{line}\n") + f.write("\nDetailed summary of each experiment:\n") + for line in expt_details: + f.write(f"{line}\n") def calculate_core_hours(expt_dict: dict) -> dict: """ @@ -134,10 +162,11 @@ def create_expt_dict(expt_dir: str) -> dict: expt_dict[item].update({"status": "CREATED"}) else: logging.debug(f'Skipping directory {item}, experiment XML file not found') + continue #Update the experiment dictionary logging.info(f"Reading status of experiment {item}") update_expt_status(expt_dict[item],item,True) - summary_file = f'job_summary_{datetime.now().strftime("%Y%m%d%H%M%S")}.yaml' + summary_file = f'WE2E_tests_{datetime.now().strftime("%Y%m%d%H%M%S")}.yaml' return summary_file, expt_dict From 170a732d0ff0e0a4f069bec29fffb3ff19fe3c72 Mon Sep 17 00:00:00 2001 From: "Michael Kavulich, Jr" Date: Mon, 27 Feb 2023 07:22:14 +0000 Subject: [PATCH 14/52] Add final missing options: --opsroot, allows user to set NCO OPSROOT variable --print_test_details, allows user to print a pipe-delimited text file (test_details.txt) analogous to previous WE2E_test_info.csv file --- tests/WE2E/run_WE2E_tests.py | 16 +++++- tests/WE2E/utils.py | 95 ++++++++++++++++++++++++++++++++++++ 2 files changed, 109 insertions(+), 2 deletions(-) diff --git a/tests/WE2E/run_WE2E_tests.py b/tests/WE2E/run_WE2E_tests.py index 7b0dc1b145..3245e53e1d 100755 --- a/tests/WE2E/run_WE2E_tests.py +++ b/tests/WE2E/run_WE2E_tests.py @@ -19,7 +19,7 @@ from check_python_version import check_python_version from monitor_jobs import monitor_jobs, write_monitor_file - +from utils import print_test_details def run_we2e_tests(homedir, args) -> None: """Function to run the WE2E tests selected by the user @@ -144,6 +144,10 @@ def run_we2e_tests(homedir, args) -> None: if 'nco' not in test_cfg: test_cfg['nco'] = dict() test_cfg['nco'].update({"model_ver": "we2e"}) + if args.opsroot: + if 'nco' not in test_cfg: + test_cfg['nco'] = dict() + test_cfg['nco'].update({"OPSROOT": args.opsroot}) # if platform section was not in input config, initialize as empty dict if 'platform' not in test_cfg: test_cfg['platform'] = dict() @@ -163,6 +167,7 @@ def run_we2e_tests(homedir, args) -> None: if args.verbose_tests: test_cfg['workflow'].update({"VERBOSE": args.verbose_tests}) + logging.debug(f"Overwriting WE2E-test-specific settings for test \n{test_name}\n") if 'task_get_extrn_ics' in test_cfg: @@ -398,6 +403,7 @@ def check_task_verification(cfg: dict, mach: dict, 
dflt: dict) -> dict: return cfg_vx + def setup_logging(logfile: str = "log.run_WE2E_tests", debug: bool = False) -> None: """ Sets up logging, printing high-priority (INFO and higher) messages to screen, and printing all @@ -456,7 +462,9 @@ def setup_logging(logfile: str = "log.run_WE2E_tests", debug: bool = False) -> N parser.add_argument('--expt_basedir', type=str, help='Explicitly set EXPT_BASEDIR for all experiments') parser.add_argument('--exec_subdir', type=str, help='Explicitly set EXEC_SUBDIR for all experiments') parser.add_argument('--use_cron_to_relaunch', action='store_true', help='Explicitly set USE_CRON_TO_RELAUNCH for all experiments; this option disables the "monitor" script functionality') - parser.add_argument('--cron_relaunch_intvl_mnts', type=str, help='Overrides CRON_RELAUNCH_INTVL_MNTS for all experiments') + parser.add_argument('--cron_relaunch_intvl_mnts', type=int, help='Overrides CRON_RELAUNCH_INTVL_MNTS for all experiments') + parser.add_argument('--opsroot', type=str, help='If test is for NCO mode, sets OPSROOT (see config_defaults.yaml for details)') + parser.add_argument('--print_test_details', action='store_true', help='Create a "test_details.txt" file summarizing each test prior to starting experiment') parser.add_argument('--debug_tests', action='store_true', help='Explicitly set DEBUG=TRUE for all experiments') parser.add_argument('--verbose_tests', action='store_true', help='Explicitly set VERBOSE=TRUE for all experiments') @@ -470,6 +478,10 @@ def setup_logging(logfile: str = "log.run_WE2E_tests", debug: bool = False) -> N if args.procs < 1: raise ValueError('You can not have less than one parallel process; select a valid value for --procs') + # Print test details (if requested) + if args.print_test_details: + print_test_details("test_details.txt") + sys.exit() #Call main function try: diff --git a/tests/WE2E/utils.py b/tests/WE2E/utils.py index ee8720aa00..3fc5b54ef5 100755 --- a/tests/WE2E/utils.py +++ b/tests/WE2E/utils.py @@ -10,6 +10,7 @@ import subprocess import sqlite3 import time +import glob from textwrap import dedent from datetime import datetime from contextlib import closing @@ -378,3 +379,97 @@ def update_expt_status_parallel(expt_dict: dict, procs: int) -> dict: i += 1 return expt_dict + + + +def print_test_details(txtfile: str = "test_details.txt") -> None: + """Prints a pipe ( | ) delimited text file containing summaries of each test defined by a + config file in test_configs/* + + """ + + testfiles = glob.glob('test_configs/**/config*.yaml', recursive=True) + testdict = dict() + links = dict() + for testfile in testfiles: + pathname, filename = os.path.split(testfile) + testname = filename[7:-5] + dirname = os.path.basename(os.path.normpath(pathname)) + if os.path.islink(filename): + targettestfile = os.readlink(testfile) + targetfilename = os.path.basename(targettestfile) + targettestname = targetfilename[7:-5] + links[testname] = (testname, dirname, targettestname) + else: + testdict[testname] = load_config_file(testfile) + testdict[testname]["directory"] = dirname + + # For each found link, add its info to the appropriate test dictionary entry + for link in links: + testdict[link[2]]["alternate_name"] = link[0] + testdict[link[2]]["alternate_directory_name"] = link[1] + + # Print the file + with open(txtfile, 'w') as f: + # Field delimiter character + d = "\" | \"" + txt_output = ['"Test Name'] + txt_output.append(f'(Subdirectory){d}Alternate Test Names') + txt_output.append(f'(Subdirectories){d}Test Purpose/Description{d}Relative 
Cost of Running Dynamics') + txt_output.append(f'(1 corresponds to running a 6-hour forecast on the RRFS_CONUS_25km predefined grid using the default time step){d}PREDEF_GRID_NAME{d}CCPP_PHYS_SUITE{d}EXTRN_MDL_NAME_ICS{d}EXTRN_MDL_NAME_LBCS{d}DATE_FIRST_CYCL{d}DATE_LAST_CYCL{d}INCR_CYCL_FREQ{d}FCST_LEN_HRS{d}LBC_SPEC_INTVL_HRS{d}NUM_ENS_MEMBERS') + + for line in txt_output: + f.write(f"{line}\n") + for expt in testdict: + f.write(f"\"{expt}\n(") + f.write(f"{testdict[expt]['directory']}){d}") + if "alternate_name" in testdict[expt]: + f.write(f"{testdict[expt]['alternate_name']}\n({testdict[expt]['alternate_directory_name']}){d}") + else: + f.write(f"{d}\n") + desc = testdict[expt]['metadata']['description'].splitlines() + for line in desc[:-1]: + f.write(f" {line}\n") + f.write(f" {desc[-1]}") + f.write(f"{d}'0{d}'0") + f.write(f"{d}" + get_or_print_blank(testdict[expt],'workflow','PREDEF_GRID_NAME')) + f.write(f"{d}" + get_or_print_blank(testdict[expt],'workflow','CCPP_PHYS_SUITE')) + f.write(f"{d}" + get_or_print_blank(testdict[expt],'task_get_extrn_ics','EXTRN_MDL_NAME_ICS')) + f.write(f"{d}" + get_or_print_blank(testdict[expt],'task_get_extrn_lbcs','EXTRN_MDL_NAME_LBCS')) + f.write(f"{d}" + get_or_print_blank(testdict[expt],'workflow','DATE_FIRST_CYCL')) + f.write(f"{d}" + get_or_print_blank(testdict[expt],'workflow','DATE_LAST_CYCL')) + f.write(f"{d}" + get_or_print_blank(testdict[expt],'workflow','INCR_CYCL_FREQ')) + f.write(f"{d}" + get_or_print_blank(testdict[expt],'workflow','FCST_LEN_HRS')) + f.write(f"{d}" + get_or_print_blank(testdict[expt],'task_run_fcst','DT_ATMOS')) + f.write(f"{d}" + get_or_print_blank(testdict[expt],'task_get_extrn_lbcs','LBC_SPEC_INTVL_HRS')) + f.write(f"{d}" + get_or_print_blank(testdict[expt],'global','NUM_ENS_MEMBERS') + "\n") + +# f.write(f"{d}{testdict[expt]['workflow']['PREDEF_GRID_NAME']}") +# f.write(f"{d}{testdict[expt]['workflow']['CCPP_PHYS_SUITE']}") +# f.write(f"{d}{testdict[expt]['task_get_extrn_ics']['EXTRN_MDL_NAME_ICS']}") +# f.write(f"{d}{testdict[expt]['task_get_extrn_lbcs']['EXTRN_MDL_NAME_LBCS']}") +# f.write(f"{d}{testdict[expt]['workflow']['DATE_FIRST_CYCL']}") +# f.write(f"{d}{testdict[expt]['workflow']['DATE_LAST_CYCL']}") +# if "INCR_CYCL_FREQ" in testdict[expt]['workflow']: +# f.write(f"{d}{testdict[expt]['workflow']['INCR_CYCL_FREQ']}") +# else: +# f.write(f"{d}") +# f.write(f"{d}{testdict[expt]['workflow']['FCST_LEN_HRS']}") +# f.write(f"{d}{testdict[expt]['task_get_extrn_lbcs']['LBC_SPEC_INTVL_HRS']}") +# if "global" in testdict[expt]: +# if NUM_ENS_MEMBERS in testdict[expt]['global']: +# f.write(f"{d}{testdict[expt]['global']['NUM_ENS_MEMBERS']}") +# else: +# f.write(f"{d}") +# else: +# f.write(f"{d}") + +def get_or_print_blank(d,key1,key2): + if d.get(key1,{}).get(key2): + write = f"{d[key1][key2]}" + else: + write = "" + + return write + + From 90ab3b92f3f9ef7f532a6269b78440b3452c7edd Mon Sep 17 00:00:00 2001 From: "Michael Kavulich, Jr" Date: Wed, 1 Mar 2023 20:13:54 +0000 Subject: [PATCH 15/52] - Remove incorrectly left-in exit call - Remove incorrectly left-in commented code - Add documentation for get_or_print_blank() - Some suggested changes from pylint --- tests/WE2E/run_WE2E_tests.py | 23 ++++++++++++----------- tests/WE2E/utils.py | 35 +++++++++++++++-------------------- 2 files changed, 27 insertions(+), 31 deletions(-) diff --git a/tests/WE2E/run_WE2E_tests.py b/tests/WE2E/run_WE2E_tests.py index 3245e53e1d..0e96b6cfc5 100755 --- a/tests/WE2E/run_WE2E_tests.py +++ b/tests/WE2E/run_WE2E_tests.py @@ -65,11 +65,13 
@@ def run_we2e_tests(homedir, args) -> None: tests_to_check = [] for f in alltests: filename = os.path.basename(f) - # We just want the test namein this list, so cut out the "config." prefix and ".yaml" extension + # We just want the test name in this list, so cut out the + # "config." prefix and ".yaml" extension tests_to_check.append(filename[7:-5]) logging.debug(f"Will check all tests:\n{tests_to_check}") elif user_spec_tests[0] in ['fundamental', 'comprehensive']: - # I am writing this section of code under protest; we should use args.run_envir to check for run_envir-specific files! + # I am writing this section of code under protest; we should use args.run_envir to + # check for run_envir-specific files! prefix = f"machine_suites/{user_spec_tests[0]}" testfilename = f"{prefix}.{machine}.{args.compiler}.nco" if not os.path.isfile(testfilename): @@ -91,19 +93,21 @@ def run_we2e_tests(homedir, args) -> None: logging.debug(f'{testfilename} exists for this platform and run_envir has not been specified'\ 'Setting run_envir = {run_envir} for all tests') logging.debug(f"Reading test file: {testfilename}") - with open(testfilename) as f: + with open(testfilename, encoding="utf-8") as f: tests_to_check = [x.rstrip() for x in f] logging.debug(f"Will check {user_spec_tests[0]} tests:\n{tests_to_check}") else: - # If we have gotten this far then the only option left for user_spec_tests is a file containing test names + # If we have gotten this far then the only option left for user_spec_tests is a + # file containing test names logging.debug(f'Checking if {user_spec_tests} is a file containing test names') if os.path.isfile(user_spec_tests[0]): - with open(user_spec_tests[0]) as f: + with open(user_spec_tests[0], encoding="utf-8") as f: tests_to_check = [x.rstrip() for x in f] else: raise FileNotFoundError(dedent(f""" The specified 'tests' argument '{user_spec_tests}' - does not appear to be a valid test name, a valid test suite, or a file containing valid test names. + does not appear to be a valid test name, a valid test suite, or a file + containing valid test names. Check your inputs and try again. """)) @@ -180,7 +184,7 @@ def run_we2e_tests(homedir, args) -> None: logging.debug(f"Writing updated config.yaml for test {test_name}\nbased on specified command-line arguments:\n") logging.debug(cfg_to_yaml_str(test_cfg)) - with open(ushdir + "/config.yaml","w") as f: + with open(ushdir + "/config.yaml","w", encoding="utf-8") as f: f.writelines(cfg_to_yaml_str(test_cfg)) logging.info(f"Calling workflow generation function for test {test_name}\n") @@ -212,8 +216,6 @@ def run_we2e_tests(homedir, args) -> None: except KeyboardInterrupt: logging.info("\n\nUser interrupted monitor script; to resume monitoring jobs run:\n") logging.info(f"./monitor_jobs.py -y={monitor_file} -p={args.procs}\n") - except: - raise else: logging.info("All experiments are complete") logging.info(f"Summary of results available in {monitor_file}") @@ -380,7 +382,7 @@ def check_task_verification(cfg: dict, mach: dict, dflt: dict) -> dict: return cfg_vx # Attempt to obtain the values of RUN_TASK_RUN_FCST, WRITE_DO_POST, and RUN_TASK_RUN_POST - # from the test configuration dictionary. If not available there, get them from the default + # from the test configuration dictionary. If not available there, get them from the default # configuration dictionary. 
flags = {'RUN_TASK_RUN_FCST': False, 'WRITE_DOPOST': False, 'RUN_TASK_RUN_POST': False} for section in ['workflow_switches', 'task_run_fcst']: @@ -481,7 +483,6 @@ def setup_logging(logfile: str = "log.run_WE2E_tests", debug: bool = False) -> N # Print test details (if requested) if args.print_test_details: print_test_details("test_details.txt") - sys.exit() #Call main function try: diff --git a/tests/WE2E/utils.py b/tests/WE2E/utils.py index 3fc5b54ef5..34100a24d1 100755 --- a/tests/WE2E/utils.py +++ b/tests/WE2E/utils.py @@ -444,27 +444,22 @@ def print_test_details(txtfile: str = "test_details.txt") -> None: f.write(f"{d}" + get_or_print_blank(testdict[expt],'task_get_extrn_lbcs','LBC_SPEC_INTVL_HRS')) f.write(f"{d}" + get_or_print_blank(testdict[expt],'global','NUM_ENS_MEMBERS') + "\n") -# f.write(f"{d}{testdict[expt]['workflow']['PREDEF_GRID_NAME']}") -# f.write(f"{d}{testdict[expt]['workflow']['CCPP_PHYS_SUITE']}") -# f.write(f"{d}{testdict[expt]['task_get_extrn_ics']['EXTRN_MDL_NAME_ICS']}") -# f.write(f"{d}{testdict[expt]['task_get_extrn_lbcs']['EXTRN_MDL_NAME_LBCS']}") -# f.write(f"{d}{testdict[expt]['workflow']['DATE_FIRST_CYCL']}") -# f.write(f"{d}{testdict[expt]['workflow']['DATE_LAST_CYCL']}") -# if "INCR_CYCL_FREQ" in testdict[expt]['workflow']: -# f.write(f"{d}{testdict[expt]['workflow']['INCR_CYCL_FREQ']}") -# else: -# f.write(f"{d}") -# f.write(f"{d}{testdict[expt]['workflow']['FCST_LEN_HRS']}") -# f.write(f"{d}{testdict[expt]['task_get_extrn_lbcs']['LBC_SPEC_INTVL_HRS']}") -# if "global" in testdict[expt]: -# if NUM_ENS_MEMBERS in testdict[expt]['global']: -# f.write(f"{d}{testdict[expt]['global']['NUM_ENS_MEMBERS']}") -# else: -# f.write(f"{d}") -# else: -# f.write(f"{d}") - def get_or_print_blank(d,key1,key2): + """Function that checks the existence of keys in a nested dictionary in the form: + + dictionary[key1][key2] + + If dictionary[key1][key2] exists, return its value as a string. 
+ If either key1 or key2 do not exist, return an empty string + + Args: + d (dict) : Dictionary to check for keys + key1 (str) : The key for dictionary d + key2 (str) : The key for dictionary d[key1] + Returns: + write : A string containing the value of d[key1][key2] + """ + if d.get(key1,{}).get(key2): write = f"{d[key1][key2]}" else: From e14bc5b6b9463869a37af6c25f34e161d91ba7b5 Mon Sep 17 00:00:00 2001 From: "Michael Kavulich, Jr" Date: Wed, 1 Mar 2023 20:17:19 +0000 Subject: [PATCH 16/52] New way of specifying relative time tests without system calls to `date`, courtesy of Christina Holt --- .../config.get_from_NOMADS_ics_FV3GFS_lbcs_FV3GFS.yaml | 4 ++-- ush/python_utils/config_parser.py | 10 ++++++++++ 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/tests/WE2E/test_configs/wflow_features/config.get_from_NOMADS_ics_FV3GFS_lbcs_FV3GFS.yaml b/tests/WE2E/test_configs/wflow_features/config.get_from_NOMADS_ics_FV3GFS_lbcs_FV3GFS.yaml index 38fbbe5af6..3a704b3c22 100644 --- a/tests/WE2E/test_configs/wflow_features/config.get_from_NOMADS_ics_FV3GFS_lbcs_FV3GFS.yaml +++ b/tests/WE2E/test_configs/wflow_features/config.get_from_NOMADS_ics_FV3GFS_lbcs_FV3GFS.yaml @@ -10,8 +10,8 @@ platform: workflow: CCPP_PHYS_SUITE: FV3_GFS_2017_gfdlmp PREDEF_GRID_NAME: RRFS_CONUS_25km - DATE_FIRST_CYCL: $DATE_UTIL --utc --date="2 days ago" +%Y%m%d00 - DATE_LAST_CYCL: $DATE_UTIL --utc --date="2 days ago" +%Y%m%d00 + DATE_FIRST_CYCL: '{{ 2|days_ago }}' + DATE_LAST_CYCL: '{{ 2|days_ago }}' FCST_LEN_HRS: 6 PREEXISTING_DIR_METHOD: rename task_get_extrn_ics: diff --git a/ush/python_utils/config_parser.py b/ush/python_utils/config_parser.py index 6510af62eb..a66be884f2 100644 --- a/ush/python_utils/config_parser.py +++ b/ush/python_utils/config_parser.py @@ -15,6 +15,7 @@ """ import argparse +import datetime # # Note: Yaml maynot be available in which case we suppress @@ -97,6 +98,14 @@ def path_join(arg): return os.path.join(*arg) +def days_ago(arg): + """A filter for jinja2 that gives us a date string for x number of + days ago""" + + return (datetime.date.today() - + datetime.timedelta(days=arg)).strftime("%Y%m%d00") + + def extend_yaml(yaml_dict, full_dict=None): """ @@ -140,6 +149,7 @@ def extend_yaml(yaml_dict, full_dict=None): loader=jinja2.BaseLoader, undefined=jinja2.StrictUndefined ) j2env.filters["path_join"] = path_join + j2env.filters["days_ago"] = days_ago j2tmpl = j2env.from_string(template) try: # Fill in a template that has the appropriate variables From fe2d7633f226c5f93071d205459953d2e4392866 Mon Sep 17 00:00:00 2001 From: "Michael Kavulich, Jr" Date: Wed, 1 Mar 2023 20:18:27 +0000 Subject: [PATCH 17/52] Some needed changes to address problems with parallel mode - Pass "refresh" flag correctly in parallel mode: only for first pass through of tests list - Move second "rocotorun" call to immediately after for better chance of creating rocoto db file prior to attempting to read - Only mark experiment in "error" state if it was not created after the second pass through - Print warning message for the case where jobs are continuously not submitted, giving users info in case they mis-configured their experiment --- tests/WE2E/monitor_jobs.py | 2 +- tests/WE2E/utils.py | 45 ++++++++++++++++++++++++++++---------- 2 files changed, 35 insertions(+), 12 deletions(-) diff --git a/tests/WE2E/monitor_jobs.py b/tests/WE2E/monitor_jobs.py index d815e12c80..8c607e45e7 100755 --- a/tests/WE2E/monitor_jobs.py +++ b/tests/WE2E/monitor_jobs.py @@ -53,7 +53,7 @@ def monitor_jobs(expt_dict: dict, 
monitor_file: str = '', procs: int = 1, debug: if procs > 1: print(f'Starting experiments in parallel with {procs} processes') - expt_dict = update_expt_status_parallel(expt_dict, procs) + expt_dict = update_expt_status_parallel(expt_dict, procs, True) else: for expt in expt_dict: logging.info(f"Starting experiment {expt} running") diff --git a/tests/WE2E/utils.py b/tests/WE2E/utils.py index 34100a24d1..b56f231836 100755 --- a/tests/WE2E/utils.py +++ b/tests/WE2E/utils.py @@ -281,6 +281,10 @@ def update_expt_status(expt: dict, name: str, refresh: bool = False) -> dict: p = subprocess.run(rocotorun_cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True) logging.debug(p.stdout) + #Run rocotorun again to get around rocotobqserver proliferation issue + p = subprocess.run(rocotorun_cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True) + logging.debug(p.stdout) + logging.debug(f"Reading database for experiment {name}, updating experiment dictionary") try: # This section of code queries the "job" table of the rocoto database, returning a list @@ -289,9 +293,14 @@ def update_expt_status(expt: dict, name: str, refresh: bool = False) -> dict: with closing(connection.cursor()) as cur: db = cur.execute('SELECT taskname,cycle,state,cores,duration from jobs').fetchall() except: - logging.warning(f"Unable to read database {rocoto_db}\nCan not track experiment {name}") - expt["status"] = "ERROR" - return expt + # Some platforms (including Hera) can have a problem with rocoto jobs not submitting + # properly due to build-ups of background processes. This will resolve over time as + # rocotorun continues to be called, so let's only treat this as an error if we are + # past the first initial iteration of job submissions + if not refresh: + logging.warning(f"Unable to read database {rocoto_db}\nCan not track experiment {name}") + expt["status"] = "ERROR" + return expt for task in db: # For each entry from rocoto database, store that task's info under a dictionary key named TASKNAME_CYCLE @@ -303,10 +312,6 @@ def update_expt_status(expt: dict, name: str, refresh: bool = False) -> dict: expt[f"{task[0]}_{cycle}"]["cores"] = task[3] expt[f"{task[0]}_{cycle}"]["walltime"] = task[4] - #Run rocotorun again to get around rocotobqserver proliferation issue - p = subprocess.run(rocotorun_cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True) - logging.debug(p.stdout) - statuses = list() for task in expt: # Skip non-task entries @@ -337,16 +342,33 @@ def update_expt_status(expt: dict, name: str, refresh: bool = False) -> dict: expt["status"] = "COMPLETE" else: expt["status"] = "SUCCEEDED" + elif expt["status"] == "CREATED": + # Some platforms (including Hera) can have a problem with rocoto jobs not submitting + # properly due to build-ups of background processes. This will resolve over time as + # rocotorun continues to be called, so let's only print this warning message if we + # are past the first initial iteration of job submissions + if not refresh: + logging.warning(dedent( + """WARNING:Tasks have not yet been submitted for experiment {name}; + it could be that your jobs are being throttled at the system level. + + If you continue to see this message, there may be an error with your + experiment configuration, such as an incorrect queue or account number. + + You can use ctrl-c to pause this script and inspect log files. 
+ """)) + else: logging.fatal("Some kind of horrible thing has happened") - raise ValueError(dedent(f"""Some kind of horrible thing has happened to the experiment status + raise ValueError(dedent( + f"""Some kind of horrible thing has happened to the experiment status for experiment {name} status is {expt["status"]} all task statuses are {statuses}""")) return expt -def update_expt_status_parallel(expt_dict: dict, procs: int) -> dict: +def update_expt_status_parallel(expt_dict: dict, procs: int, refresh: bool = False) -> dict: """ This function updates an entire set of experiments in parallel, drastically speeding up the process if given enough parallel processes. Given an experiment dictionary, it will @@ -357,7 +379,8 @@ def update_expt_status_parallel(expt_dict: dict, procs: int) -> dict: Args: expt_dict (dict): A dictionary containing information for all experiments - procs (int): The number of parallel processes + procs (int): The number of parallel processes + refresh (bool): "Refresh" flag to pass to update_expt_status() Returns: dict: The updated dictionary of experiment dictionaries @@ -366,7 +389,7 @@ def update_expt_status_parallel(expt_dict: dict, procs: int) -> dict: args = [] # Define a tuple of arguments to pass to starmap for expt in expt_dict: - args.append( (expt_dict[expt],expt,True) ) + args.append( (expt_dict[expt],expt,refresh) ) # call update_expt_status() in parallel with Pool(processes=procs) as pool: From d5abe5317f55c31680a594b2644c6d9588b11473 Mon Sep 17 00:00:00 2001 From: "Michael Kavulich, Jr" Date: Thu, 2 Mar 2023 17:34:08 +0000 Subject: [PATCH 18/52] Add a final check using rocotostat to ensure that there are no un-submitted tasks remaining --- tests/WE2E/utils.py | 82 ++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 78 insertions(+), 4 deletions(-) diff --git a/tests/WE2E/utils.py b/tests/WE2E/utils.py index b56f231836..f4fbd5488e 100755 --- a/tests/WE2E/utils.py +++ b/tests/WE2E/utils.py @@ -338,10 +338,10 @@ def update_expt_status(expt: dict, name: str, refresh: bool = False) -> dict: elif "SUBMITTING" in statuses: expt["status"] = "SUBMITTING" elif "SUCCEEDED" in statuses: - if expt["status"] == "SUCCEEDED": - expt["status"] = "COMPLETE" - else: - expt["status"] = "SUCCEEDED" + # If all task statuses are "SUCCEEDED", set the experiment status to "SUCCEEDED". This + # will trigger a final check using rocotostat to make sure there are no remaining un- + # started tests. + expt["status"] = "SUCCEEDED" elif expt["status"] == "CREATED": # Some platforms (including Hera) can have a problem with rocoto jobs not submitting # properly due to build-ups of background processes. This will resolve over time as @@ -366,6 +366,11 @@ def update_expt_status(expt: dict, name: str, refresh: bool = False) -> dict: status is {expt["status"]} all task statuses are {statuses}""")) + # Final check for experiments where all tasks are "SUCCEEDED"; since the rocoto database does + # not include info on jobs that have not been submitted yet, use rocotostat to check that + # there are no un-submitted jobs remaining. 
+ expt = compare_rocotostat(expt,name) + return expt def update_expt_status_parallel(expt_dict: dict, procs: int, refresh: bool = False) -> dict: @@ -490,4 +495,73 @@ def get_or_print_blank(d,key1,key2): return write +def compare_rocotostat(expt_dict,name): + """Reads the dictionary showing the location of a given experiment, runs a `rocotostat` command + to get the full set of tasks for the experiment, and compares the two to see if there are any + unsubmitted tasks remaining. + """ + + # Call rocotostat and store output + rocoto_db = f"{expt_dict['expt_dir']}/FV3LAM_wflow.db" + rocotorun_cmd = ["rocotostat", f"-w {expt_dict['expt_dir']}/FV3LAM_wflow.xml", f"-d {rocoto_db}", "-v 10"] + p = subprocess.run(rocotorun_cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True) + rsout = p.stdout + + # Parse each line of rocotostat output, extracting relevant information + untracked_tasks = [] + for line in rsout.split('\n'): + # Skip blank lines and dividing lines of '=====...' + if not line: + continue + if line[0] == '=': + continue + line_array = line.split() + # Skip header lines + if line_array[0] == 'CYCLE': + continue + # We should now just have lines describing jobs, in the form: + # line_array = ['cycle','task','jobid','status','exit status','num tries','walltime'] + + # As defined in update_expt_status(), the "task names" in the dictionary are a combination + # of the task name and cycle + taskname = f'{line_array[1]}_{line_array[0]}' + # If we're already tracking this task, continue + if expt_dict.get(taskname): + continue + + # Otherwise, extract information into dictionary of untracked tasks + untracked_tasks.append(taskname) + + if untracked_tasks: + # We want to give this a couple loops before reporting that it is "stuck" + if expt_dict['status'] == 'SUCCEEDED': + expt_dict['status'] = 'STALLED' + elif expt_dict['status'] == 'STALLED': + expt_dict['status'] = 'STUCK' + elif expt_dict['status'] == 'STUCK': + msg = f"WARNING: For experiment {name}, there are some jobs that are not being submitted:" + for ut in untracked_tasks: + msg += ut + msg = msg + f"""WARNING: For experiment {name}, + there are some jobs that are not being submitted. + It could be that your jobs are being throttled at the system level, or + some task dependencies have not been met. + + If you continue to see this message, there may be an error with your + experiment configuration. + + You can use ctrl-c to pause this script and inspect log files. 
+ """ + logging.warning(dedent(msg)) + else: + logging.fatal("Some kind of horrible thing has happened") + raise ValueError(dedent( + f"""Some kind of horrible thing has happened to the experiment status + for experiment {name} + status is {expt["status"]} + untracked tasknames are {untracked_tasks}""")) + else: + expt_dict["status"] = "COMPLETE" + + return expt_dict From 6281c40be5fcf3da537b643b656460272e1724ea Mon Sep 17 00:00:00 2001 From: "Michael Kavulich, Jr" Date: Thu, 2 Mar 2023 17:57:14 +0000 Subject: [PATCH 19/52] Fix logic for jobs in DEAD or UNKNOWN status, add logic for "FAILED" jobs, fix incorrect variable in error message --- tests/WE2E/utils.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/tests/WE2E/utils.py b/tests/WE2E/utils.py index f4fbd5488e..0607d65ff4 100755 --- a/tests/WE2E/utils.py +++ b/tests/WE2E/utils.py @@ -320,22 +320,21 @@ def update_expt_status(expt: dict, name: str, refresh: bool = False) -> dict: statuses.append(expt[task]["status"]) if "DEAD" in statuses: - still_live = ["RUNNING", "SUBMITTING", "QUEUED"] + still_live = ["RUNNING", "SUBMITTING", "QUEUED", "FAILED"] if any(status in still_live for status in statuses): logging.debug(f'DEAD job in experiment {name}; continuing to track until all jobs are complete') expt["status"] = "DYING" else: expt["status"] = "DEAD" - return expt - - if "UNKNOWN" in statuses: + return expt + elif "UNKNOWN" in statuses: expt["status"] = "ERROR" - - if "RUNNING" in statuses: + elif "RUNNING" in statuses: expt["status"] = "RUNNING" elif "QUEUED" in statuses: expt["status"] = "QUEUED" - elif "SUBMITTING" in statuses: + elif "FAILED" in statuses or "SUBMITTING" in statuses: + # Job in "FAILED" status means it will be retried expt["status"] = "SUBMITTING" elif "SUCCEEDED" in statuses: # If all task statuses are "SUCCEEDED", set the experiment status to "SUCCEEDED". This @@ -559,7 +558,7 @@ def compare_rocotostat(expt_dict,name): raise ValueError(dedent( f"""Some kind of horrible thing has happened to the experiment status for experiment {name} - status is {expt["status"]} + status is {expt_dict["status"]} untracked tasknames are {untracked_tasks}""")) else: expt_dict["status"] = "COMPLETE" From dd02d0a1df57ebedc618b8601179ac79131433e1 Mon Sep 17 00:00:00 2001 From: "Michael Kavulich, Jr" Date: Thu, 2 Mar 2023 18:02:29 +0000 Subject: [PATCH 20/52] Remove UNKNOWN from the status check; according to rocoto source code, UNKNOWN jobs will be retried, so we don't want to mark these as ERROR --- tests/WE2E/utils.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/tests/WE2E/utils.py b/tests/WE2E/utils.py index 0607d65ff4..594e11e3de 100755 --- a/tests/WE2E/utils.py +++ b/tests/WE2E/utils.py @@ -246,9 +246,8 @@ def update_expt_status(expt: dict, name: str, refresh: bool = False) -> dict: status SUCCEEDED (see next entry). DEAD: One or more tasks are at status DEAD, and the rest are either DEAD or SUCCEEDED. We will no longer monitor this experiment. - ERROR: One or more tasks are at status UNKNOWN, meaning that rocoto has failed to track the - job associated with that task. This will require manual intervention to solve, so we - will no longer monitor this experiment. + ERROR: Could not read the rocoto database file. This will require manual intervention to + solve, so we will no longer monitor this experiment. This status may also appear if we fail to read the rocoto database file. 
RUNNING: One or more jobs are at status RUNNING, and the rest are either status QUEUED, SUBMITTED, or SUCCEEDED. This is a normal state; we will continue to monitor this experiment. @@ -327,8 +326,6 @@ def update_expt_status(expt: dict, name: str, refresh: bool = False) -> dict: else: expt["status"] = "DEAD" return expt - elif "UNKNOWN" in statuses: - expt["status"] = "ERROR" elif "RUNNING" in statuses: expt["status"] = "RUNNING" elif "QUEUED" in statuses: From 5185b0435ef5cde2d4f8fd1f913daa805c520a4b Mon Sep 17 00:00:00 2001 From: "Michael J. Kavulich, Jr" Date: Thu, 2 Mar 2023 21:39:06 +0000 Subject: [PATCH 21/52] Some more fixes from final testing - Set VX_FCST_INPUT_BASEDIR to null string if not set on platform (rather than failing) - Print message about updating experiment on first go around; this allows user to see progress with parallel option - Only call compare_rocotostat() if job may be finished or stuck --- tests/WE2E/run_WE2E_tests.py | 2 +- tests/WE2E/utils.py | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/tests/WE2E/run_WE2E_tests.py b/tests/WE2E/run_WE2E_tests.py index 0e96b6cfc5..3cb94856f9 100755 --- a/tests/WE2E/run_WE2E_tests.py +++ b/tests/WE2E/run_WE2E_tests.py @@ -401,7 +401,7 @@ def check_task_verification(cfg: dict, mach: dict, dflt: dict) -> dict: if 'TEST_VX_FCST_INPUT_BASEDIR' in mach['platform']: cfg_vx['VX_FCST_INPUT_BASEDIR'] = mach['platform']['TEST_VX_FCST_INPUT_BASEDIR'] else: - raise KeyError(f"Non-default forecast file location for verification (TEST_VX_FCST_INPUT_BASEDIR) not set in machine file") + cfg_vx['VX_FCST_INPUT_BASEDIR'] = '' return cfg_vx diff --git a/tests/WE2E/utils.py b/tests/WE2E/utils.py index 594e11e3de..8f612c787e 100755 --- a/tests/WE2E/utils.py +++ b/tests/WE2E/utils.py @@ -274,6 +274,8 @@ def update_expt_status(expt: dict, name: str, refresh: bool = False) -> dict: if (expt["status"] in ['DEAD','ERROR','COMPLETE']) and not refresh: return expt + if refresh: + logging.info(f"Updating database for experiment {name}") # Update experiment, read rocoto database rocoto_db = f"{expt['expt_dir']}/FV3LAM_wflow.db" rocotorun_cmd = ["rocotorun", f"-w {expt['expt_dir']}/FV3LAM_wflow.xml", f"-d {rocoto_db}", "-v 10"] @@ -365,7 +367,8 @@ def update_expt_status(expt: dict, name: str, refresh: bool = False) -> dict: # Final check for experiments where all tasks are "SUCCEEDED"; since the rocoto database does # not include info on jobs that have not been submitted yet, use rocotostat to check that # there are no un-submitted jobs remaining. - expt = compare_rocotostat(expt,name) + if expt["status"] in ["SUCCEEDED","STALLED","STUCK"]: + expt = compare_rocotostat(expt,name) return expt From dcc8c585ac89f3639ac927e25cdabe26fb6f1dd7 Mon Sep 17 00:00:00 2001 From: "Michael Kavulich, Jr" Date: Thu, 2 Mar 2023 15:18:03 -0700 Subject: [PATCH 22/52] Start updating documentation --- docs/UsersGuide/source/WE2Etests.rst | 186 ++++++++++++++++++--------- 1 file changed, 128 insertions(+), 58 deletions(-) diff --git a/docs/UsersGuide/source/WE2Etests.rst b/docs/UsersGuide/source/WE2Etests.rst index 4495357934..183ae956b3 100644 --- a/docs/UsersGuide/source/WE2Etests.rst +++ b/docs/UsersGuide/source/WE2Etests.rst @@ -3,7 +3,7 @@ ================================== Workflow End-to-End (WE2E) Tests ================================== -The SRW App contains a set of end-to-end tests that exercise various workflow configurations of the SRW App. 
These are referred to as workflow end-to-end (WE2E) tests because they all use the Rocoto workflow manager to run their individual workflows. The purpose of these tests is to ensure that new changes to the App do not break existing functionality and capabilities. +The SRW App contains a set of end-to-end tests that exercise various workflow configurations of the SRW App. These are referred to as workflow end-to-end (WE2E) tests because they all use the Rocoto workflow manager to run their individual workflows from start to finish. The purpose of these tests is to ensure that new changes to the App do not break existing functionality and capabilities. Note that the WE2E tests are not regression tests---they do not check whether current results are identical to previously established baselines. They also do @@ -34,13 +34,13 @@ The test configuration files for these categories are located in the following d ufs-srweather-app/tests/WE2E/test_configs/grids_extrn_mdls_suites_nco ufs-srweather-app/tests/WE2E/test_configs/wflow_features -The script to run the WE2E tests is named ``run_WE2E_tests.sh`` and is located in the directory ``ufs-srweather-app/tests/WE2E``. Each WE2E test has an associated configuration file named ``config.${test_name}.yaml``, where ``${test_name}`` is the name of the corresponding test. These configuration files are subsets of the full range of ``config.yaml`` experiment configuration options. (See :numref:`Chapter %s ` for all configurable options and :numref:`Section %s ` for information on configuring ``config.yaml``.) For each test, the ``run_WE2E_tests.sh`` script reads in the test configuration file and generates from it a complete ``config.yaml`` file. It then calls ``generate_FV3LAM_wflow.py``, which in turn reads in ``config.yaml`` and generates a new experiment for the test. The name of each experiment directory is set to that of the corresponding test, and a copy of ``config.yaml`` for each test is placed in its experiment directory. +The script to run the WE2E tests is named ``run_WE2E_tests.py`` and is located in the directory ``ufs-srweather-app/tests/WE2E``. Each WE2E test has an associated configuration file named ``config.${test_name}.yaml``, where ``${test_name}`` is the name of the corresponding test. These configuration files are subsets of the full range of ``config.yaml`` experiment configuration options. (See :numref:`Chapter %s ` for all configurable options and :numref:`Section %s ` for information on configuring ``config.yaml``.) For each test, the ``run_WE2E_tests.py`` script reads in the test configuration file and generates from it a complete ``config.yaml`` file. It then calls the ``generate_FV3LAM_wflow()`` function, which in turn reads in ``config.yaml`` and generates a new experiment for the test. The name of each experiment directory is set to that of the corresponding test, and a copy of ``config.yaml`` for each test is placed in its experiment directory. -Since ``run_WE2E_tests.sh`` calls ``generate_FV3LAM_wflow.py`` for each test, the -Python modules required for experiment generation must be loaded before ``run_WE2E_tests.sh`` +As with any other experiment within the App, the +Python modules required for experiment generation must be loaded before ``run_WE2E_tests.py`` can be called. See :numref:`Section %s ` for information on loading the Python -environment on supported platforms. Note also that ``run_WE2E_tests.sh`` assumes that all of -the executables have been built (see :numref:`Section %s `). 
If they have not, then ``run_WE2E_tests.sh`` will still generate the experiment directories, but the workflows will fail. +environment on supported platforms. Note also that ``run_WE2E_tests.py`` assumes that all of +the executables have been built (see :numref:`Section %s `). If they have not, then ``run_WE2E_tests.py`` will still generate the experiment directories, but the workflows will fail. Supported Tests =================== @@ -50,24 +50,111 @@ The full list of WE2E tests is extensive; it is not recommended to run all the t Running the WE2E Tests ================================ -Users may specify the set of tests to run by creating a text file, such as ``my_tests.txt``, which contains a list of the WE2E tests to run (one per line). Then, they pass the name of that file to ``run_WE2E_tests.sh``. For example, to run the tests ``custom_ESGgrid`` and ``grid_RRFS_CONUScompact_25km_ics_FV3GFS_lbcs_FV3GFS_suite_GFS_v16`` (from the ``wflow_features`` and ``grids_extrn_mdls_suites_community`` categories, respectively), users would enter the following commands from the ``WE2E`` working directory (``ufs-srweather-app/tests/WE2E/``): +Users may specify the set of tests to run in one of three ways. First, users can pass the name of a single test or list of tests to the script. Secondly, they can pass an option to run the ``fundamental`` or ``comprehensive`` suite of tests, or ``all`` tests (not recommended). Finally, users can create a text file, such as ``my_tests.txt``, which contains a list of the WE2E tests to run (one per line). Any one of these options can be passed to the ``run_WE2E_tests.py`` script via the ``--tests`` or ``-t`` option. + +For example, to run the tests ``custom_ESGgrid`` and ``grid_RRFS_CONUScompact_25km_ics_FV3GFS_lbcs_FV3GFS_suite_GFS_v16`` (from the ``wflow_features`` and ``grids_extrn_mdls_suites_community`` categories, respectively), users would enter the following commands from the ``WE2E`` working directory (``ufs-srweather-app/tests/WE2E/``): + +.. code-block:: console + + echo "custom_ESGgrid" > my_tests.txt + echo "grid_RRFS_CONUScompact_25km_ics_FV3GFS_lbcs_FV3GFS_suite_GFS_v16" >> my_tests.txt + +For each specified test, ``run_WE2E_tests.py`` will generate a new experiment directory and, by default, launch a second function ``monitor_jobs()`` that will continuously monitor active jobs, submit new jobs, and track the success or failure status of the experiment in a ``.yaml`` file. Finally, when all jobs have finished running (successfully or not), the function ``print_WE2E_summary()`` will print a summary of the jobs to screen, including the job's success or failure, timing information, and (if on an appropriately configured platform) the number of core hours used. An example run would look like this: + +.. code-block:: console + + $ ./run_WE2E_tests.py -t custom_ESGgrid -m hera -a gsd-fv3 --expt_basedir "test_set_01" -q + Checking that all tests are valid + Will run 1 tests: + /user/home/ufs-srweather-app/tests/WE2E/test_configs/wflow_features/config.custom_ESGgrid.yaml + Calling workflow generation function for test custom_ESGgrid + + Workflow for test custom_ESGgrid successfully generated in + /user/home/expt_dirs/test_set_01/custom_ESGgrid + + calling function that monitors jobs, prints summary + Writing information for all experiments to WE2E_tests_20230302214843.yaml + Checking tests available for monitoring... 
+ Starting experiment custom_ESGgrid running + Updating database for experiment custom_ESGgrid + Setup complete; monitoring 1 experiments + Use ctrl-c to pause job submission/monitoring + Experiment custom_ESGgrid is COMPLETE; will no longer monitor. + All 1 experiments finished in 0:13:50.851855 + Calculating core-hour usage and printing final summary + ---------------------------------------------------------------------------------------------------- + Experiment name | Status | Core hours used + ---------------------------------------------------------------------------------------------------- + custom_ESGgrid COMPLETE 35.92 + ---------------------------------------------------------------------------------------------------- + Total COMPLETE 35.92 + + Detailed summary written to WE2E_summary_20230302220233.txt + + All experiments are complete + Summary of results available in WE2E_tests_20230302214843.yaml + + +As the script runs, detailed debug output is written to the file ``log.run_WE2E_tests``. This can be useful for debugging if something goes wrong. You can also use the ``-d`` flag to print all this output to screen during the run, but this can get quite cluttered. + +The final job summary is written by the ``print_WE2E_summary()``; this prints a short summary of experiments to screen, and prints a more detailed summary of all jobs for all experiments in the indicated ``.txt`` file. + +.. code-block:: console + + $ cat WE2E_summary_20230302220233.txt + ---------------------------------------------------------------------------------------------------- + Experiment name | Status | Core hours used + ---------------------------------------------------------------------------------------------------- + custom_ESGgrid COMPLETE 35.92 + ---------------------------------------------------------------------------------------------------- + Total COMPLETE 35.92 + + Detailed summary of each experiment: + + ---------------------------------------------------------------------------------------------------- + Detailed summary of experiment custom_ESGgrid + | Status | Walltime | Core hours used + ---------------------------------------------------------------------------------------------------- + make_grid_201907010000 SUCCEEDED 12.0 0.13 + get_extrn_ics_201907010000 SUCCEEDED 7.0 0.08 + get_extrn_lbcs_201907010000 SUCCEEDED 6.0 0.07 + make_orog_201907010000 SUCCEEDED 62.0 0.69 + make_sfc_climo_201907010000 SUCCEEDED 41.0 0.91 + make_ics_201907010000 SUCCEEDED 180.0 8.00 + make_lbcs_201907010000 SUCCEEDED 228.0 10.13 + run_fcst_201907010000 SUCCEEDED 208.0 13.87 + run_post_f000_201907010000 SUCCEEDED 15.0 0.33 + run_post_f001_201907010000 SUCCEEDED 15.0 0.33 + run_post_f002_201907010000 SUCCEEDED 15.0 0.33 + run_post_f003_201907010000 SUCCEEDED 12.0 0.27 + run_post_f004_201907010000 SUCCEEDED 12.0 0.27 + run_post_f005_201907010000 SUCCEEDED 11.0 0.24 + run_post_f006_201907010000 SUCCEEDED 12.0 0.27 + ---------------------------------------------------------------------------------------------------- + Total COMPLETE 35.92 + + +One might have noticed the line during the experiment run that reads "Use ctrl-c to pause job submission/monitoring". The ``monitor_jobs()`` function (called automatically after all experiments are generated) is designed to be easily paused and re-started if necessary. If you wish to stop actively submitting jobs, simply quitting the script using "ctrl-c" will stop the function, and give a short message on how to continue the experiment. .. 
code-block:: console - cat > my_tests.txt - custom_ESGgrid - grid_RRFS_CONUScompact_25km_ics_FV3GFS_lbcs_FV3GFS_suite_GFS_v16 + Setup complete; monitoring 1 experiments + Use ctrl-c to pause job submission/monitoring + ^C -(and ``Ctrl + D`` to exit). For each test in ``my_tests.txt``, ``run_WE2E_tests.sh`` will generate a new experiment directory and, by default, create a new :term:`cron` job in the user's cron table that will (re)launch the workflow every 2 minutes. This cron job calls the workflow launch script (``launch_FV3LAM_wflow.sh``) until the workflow either completes successfully (i.e., all tasks SUCCEEDED) or fails (i.e., at least one task fails). The cron job is then removed from the user's cron table. -The examples below demonstrate several common ways that ``run_WE2E_tests.sh`` can be called with the ``my_tests.txt`` file above. These examples assume that the user has already built the SRW App and loaded the regional workflow as described in :numref:`Section %s `. + User interrupted monitor script; to resume monitoring jobs run: + + ./monitor_jobs.py -y=WE2E_tests_20230302214324.yaml -p=1 +The full list of options for any of these scripts can be found by using the ``-h`` flag. The examples below demonstrate several of the more common options for ``run_WE2E_tests.py``. These examples (as well as those above) assume that the user has already built the SRW App and loaded the appropriate Python environment as described in :numref:`Section %s `. #. To run the tests listed in ``my_tests.txt`` on Hera and charge the computational - resources used to the "rtrr" account, use: + resources used to the "rtrr" account: .. code-block:: - ./run_WE2E_tests.sh tests_file="my_tests.txt" machine="hera" account="rtrr" + ./run_WE2E_tests.py --tests=my_tests.txt --machine=hera --account=rtrr This will create the experiment subdirectories for the two sample WE2E tests in the directory ``${HOMEdir}/../expt_dirs``, where ``HOMEdir`` is the top-level directory for the ufs-srweather-app repository (usually set to something like ``/path/to/ufs-srweather-app``). Thus, the following two experiment directories will be created: @@ -76,73 +163,56 @@ The examples below demonstrate several common ways that ``run_WE2E_tests.sh`` ca ${HOMEdir}/../expt_dirs/custom_ESGgrid ${HOMEdir}/../expt_dirs/grid_RRFS_CONUScompact_25km_ics_FV3GFS_lbcs_FV3GFS_suite_GFS_v16 - In addition, by default, cron jobs will be added to the user's cron table to relaunch the workflows of these experiments every 2 minutes. + Once these experiment directories are created, the script will call the ``monitor_jobs()`` function. This function runs ``rocotorun`` in the background to monitor the status of jobs in each experiment directory, tracking them as they run and complete, and submitting new jobs when they are ready. The progress of ``monitor_jobs()`` is tracked in a file ``WE2E_tests_{datetime}.yaml``, where {datetime} is the date and time (in ``yyyymmddhhmmss`` format) that the file was created. -#. To change the frequency with which the cron relaunch jobs are submitted - from the default of 2 minutes to 1 minute, use: +#. Our second example will run the fundamental suite of tests on Hera, charging computational resources to the "gsd-fv3" account, and placing the experiment subdirectories in a subdirectory named ``test_set_01``: .. 
code-block:: + ./run_WE2E_tests.py -t fundamental -m hera -a gsd-fv3 --expt_basedir "test_set_01" -q - ./run_WE2E_tests.sh tests_file="my_tests.txt" machine="hera" account="rtrr" cron_relaunch_intvl_mnts="01" - -#. To disable use of cron (which implies that the worfkow for each test will have to be relaunched manually from within each experiment directory), use: + In this case, the full paths to the experiment directories will be: .. code-block:: - ./run_WE2E_tests.sh tests_file="my_tests.txt" machine="hera" account="rtrr" use_cron_to_relaunch="FALSE" - - In this case, the user will have to go into each test's experiment directory and either manually run the ``launch_FV3LAM_wflow.sh`` script or use the Rocoto commands described in :numref:`Chapter %s ` to (re)launch the workflow. Note that if using the Rocoto commands directly, the log file ``log.launch_FV3LAM_wflow`` will not be created; in this case, the status of the workflow can be checked using the ``rocotostat`` command (see :numref:`Section %s ` or :numref:`Section %s `). + ${HOMEdir}/../expt_dirs/test_set_01/grid_RRFS_CONUS_25km_ics_FV3GFS_lbcs_FV3GFS_suite_GFS_v15p2 + ${HOMEdir}/../expt_dirs/test_set_01/grid_RRFS_CONUS_25km_ics_FV3GFS_lbcs_FV3GFS_suite_GFS_v16 + ${HOMEdir}/../expt_dirs/test_set_01/grid_RRFS_CONUS_25km_ics_FV3GFS_lbcs_RAP_suite_HRRR + ${HOMEdir}/../expt_dirs/test_set_01/grid_RRFS_CONUS_25km_ics_GSMGFS_lbcs_GSMGFS_suite_GFS_v15p2 + ${HOMEdir}/../expt_dirs/test_set_01/grid_RRFS_CONUScompact_25km_ics_HRRR_lbcs_HRRR_suite_HRRR + ${HOMEdir}/../expt_dirs/test_set_01/grid_RRFS_CONUScompact_25km_ics_HRRR_lbcs_HRRR_suite_RRFS_v1beta + ${HOMEdir}/../expt_dirs/test_set_01/grid_RRFS_CONUScompact_25km_ics_HRRR_lbcs_RAP_suite_HRRR + ${HOMEdir}/../expt_dirs/test_set_01/grid_RRFS_CONUScompact_25km_ics_HRRR_lbcs_RAP_suite_RRFS_v1beta + ${HOMEdir}/../expt_dirs/test_set_01/nco_grid_RRFS_CONUScompact_25km_ics_HRRR_lbcs_RAP_suite_HRRR -#. To place the experiment subdirectories in a subdirectory named ``test_set_01`` under - ``${HOMEdir}/../expt_dirs`` (instead of immediately under ``expt_dirs``), use: - - .. code-block:: + The ``--expt_basedir`` option is useful for grouping various sets of tests. It can also be given a full path as an argument, which will place experiments in the given location. - ./run_WE2E_tests.sh tests_file="my_tests.txt" machine="hera" account="rtrr" expt_basedir="test_set_01" + The ``-q`` flag (as used in the first example shown above), is helpful for keeping the screen less cluttered; this will suppress the output from ``generate_FV3LAM_wflow()``, only printing important messages (warnings and errors) to screen. As always, this output will still be available in the ``log.run_WE2E_tests`` file. - In this case, the full paths to the experiment directories will be: +#. By default, the job monitoring and submission process is serial, using a single task. For test suites that contain many experiments, this means that the script may take a long time to return to a given experiment and submit the next job, due to the amount of time it takes for the ``rocotorun`` command to complete. In order to speed this process up, provided you have access to a node with the appropriate availability (e.g., submitting from a compute node), you can run the job monitoring processes in parallel using the ``-p`` option: .. 
code-block:: - ${HOMEdir}/../expt_dirs/test_set_01/custom_ESGgrid - ${HOMEdir}/../expt_dirs/test_set_01/grid_RRFS_CONUScompact_25km_ics_FV3GFS_lbcs_FV3GFS_suite_GFS_v16 + ./run_WE2E_tests.py -m=jet -a=gsd-fv3-dev -t=all -q -p 6 - This is useful for grouping various sets of tests. +Depending on your machine settings, this can reduce the time it takes to run all experiments substantially. -#. To use a test list file (again named ``my_tests.txt``) located in a custom location instead of in the same directory as ``run_WE2E_tests.sh`` and to have the experiment directories be placed in a specific, non-default location (e.g., ``/path/to/custom/expt_dirs``), use: +#. Our final example will run the single experiment "custom_ESGgrid" on Hera, charging computational resources to the "fv3lam" account. For this example, we submit the suite of tests using the legacy :term:`cron`-based system: - .. code-block:: +.. note:: - ./run_WE2E_tests.sh tests_file="/path/to/custom/location/my_tests.txt" machine="hera" account="rtrr" expt_basedir="/path/to/custom/expt_dirs" + This option is not recommended, as it does not work on some machines. -The full usage statement for ``run_WE2E_tests.sh`` is as follows: + .. code-block:: -.. code-block:: + ./run_WE2E_tests.py -t=custom_ESGgrid -m=hera -a=fv3lam --use_cron_to_relaunch --cron_relaunch_intvl_mnts=1 - ./run_WE2E_tests.sh \ - tests_file="..." \ - machine="..." \ - account="..." \ - [expt_basedir="..."] \ - [exec_subdir="..."] \ - [use_cron_to_relaunch="..."] \ - [cron_relaunch_intvl_mnts="..."] \ - [verbose="..."] \ - [generate_csv_file="..."] \ - [machine_file="..."] \ - [stmp="..."] \ - [ptmp="..."] \ - [compiler="..."] \ - [build_env_fn="..."] - -The arguments in brackets are optional. A complete description of these arguments can be -obtained by issuing: +The option ``--use_cron_to_relaunch`` means that, rather than calling the ``monitor_jobs()`` function, the ``generate_FV3LAM_wflow()`` function will create a new :term:`cron` job in the user's cron table that will launch the experiment with the workflow launch script (``launch_FV3LAM_wflow.sh``). By default this script is run every 2 minutes, but we have changed that to 1 minute with the ``--cron_relaunch_intvl_mnts=1`` argument. This script will run until the workflow either completes successfully (i.e., all tasks SUCCEEDED) or fails (i.e., at least one task fails). The cron job is then removed from the user's cron table. -.. code-block:: + .. code-block:: - ./run_WE2E_tests.sh --help + ./run_WE2E_tests.sh tests_file="my_tests.txt" machine="hera" account="rtrr" cron_relaunch_intvl_mnts="01" -from within the ``ufs-srweather-app/tests/WE2E`` directory. + In addition, by default, cron jobs will be added to the user's cron table to relaunch the workflows of these experiments every 2 minutes. .. 
_WE2ETestInfoFile: From ceff2d057669db5d4c01891e76f3679c4c8d59e7 Mon Sep 17 00:00:00 2001 From: "Michael Kavulich, Jr" Date: Mon, 6 Mar 2023 04:31:13 +0000 Subject: [PATCH 23/52] Finish implementation of "print_test_details()" - Add stand-alone script for calling function without running full test suite - Fix "calculate_cost" function and use it to calculate relative test cost as in previous implementation - Add entry for number of forecasts in output --- tests/WE2E/print_test_details.py | 25 +++++++++ tests/WE2E/utils.py | 23 +++++++- ush/calculate_cost.py | 93 +++++++++++++------------------- 3 files changed, 82 insertions(+), 59 deletions(-) create mode 100755 tests/WE2E/print_test_details.py diff --git a/tests/WE2E/print_test_details.py b/tests/WE2E/print_test_details.py new file mode 100755 index 0000000000..75b386ba0a --- /dev/null +++ b/tests/WE2E/print_test_details.py @@ -0,0 +1,25 @@ +#!/usr/bin/env python3 + +import argparse +import sys + +from utils import print_test_details + +sys.path.append("../../ush") + +from check_python_version import check_python_version + + +if __name__ == "__main__": + + #Parse arguments + parser = argparse.ArgumentParser(description="Script for parsing all test files in the test_configs/ directory, and printing a pipe-delimited summary file of the details of each test.\n") + + parser.add_argument('-o', '--output_file', type=str, help='File name for test details file', default='') + + args = parser.parse_args() + + if args.output_file: + print_test_details(args.output_file) + else: + print_test_details() diff --git a/tests/WE2E/utils.py b/tests/WE2E/utils.py index 8f612c787e..1972c146db 100755 --- a/tests/WE2E/utils.py +++ b/tests/WE2E/utils.py @@ -12,12 +12,13 @@ import time import glob from textwrap import dedent -from datetime import datetime +from datetime import datetime, timedelta from contextlib import closing from multiprocessing import Pool sys.path.append("../../ush") +from calculate_cost import calculate_cost from python_utils import ( cfg_to_yaml_str, flatten_dict, @@ -413,12 +414,18 @@ def print_test_details(txtfile: str = "test_details.txt") -> None: """Prints a pipe ( | ) delimited text file containing summaries of each test defined by a config file in test_configs/* + Args: + txtfile (str): File name for test details file """ testfiles = glob.glob('test_configs/**/config*.yaml', recursive=True) testdict = dict() links = dict() for testfile in testfiles: + # Calculate relative cost of test based on config settings using legacy script + cost_array = calculate_cost(testfile) + cost = cost_array[1] / cost_array[3] + #Decompose full file path into relevant bits pathname, filename = os.path.split(testfile) testname = filename[7:-5] dirname = os.path.basename(os.path.normpath(pathname)) @@ -430,6 +437,17 @@ def print_test_details(txtfile: str = "test_details.txt") -> None: else: testdict[testname] = load_config_file(testfile) testdict[testname]["directory"] = dirname + testdict[testname]["cost"] = cost + #Calculate number of forecasts for a cycling run + if testdict[testname]['workflow']["DATE_FIRST_CYCL"] != testdict[testname]['workflow']["DATE_LAST_CYCL"]: + begin = datetime.strptime(testdict[testname]['workflow']["DATE_FIRST_CYCL"], '%Y%m%d%H') + end = datetime.strptime(testdict[testname]['workflow']["DATE_LAST_CYCL"], '%Y%m%d%H') + diff = end - begin + diffh = diff.total_seconds() // 3600 + nf = diffh // testdict[testname]['workflow']["INCR_CYCL_FREQ"] + testdict[testname]["num_fcsts"] = nf + else: + testdict[testname]["num_fcsts"] = 1 # 
For each found link, add its info to the appropriate test dictionary entry for link in links: @@ -458,7 +476,8 @@ def print_test_details(txtfile: str = "test_details.txt") -> None: for line in desc[:-1]: f.write(f" {line}\n") f.write(f" {desc[-1]}") - f.write(f"{d}'0{d}'0") + #Write test relative cost and number of test forecasts (for cycling runs) + f.write(f"{d}'{round(testdict[expt]['cost'],2)}{d}'{round(testdict[expt]['num_fcsts'])}") f.write(f"{d}" + get_or_print_blank(testdict[expt],'workflow','PREDEF_GRID_NAME')) f.write(f"{d}" + get_or_print_blank(testdict[expt],'workflow','CCPP_PHYS_SUITE')) f.write(f"{d}" + get_or_print_blank(testdict[expt],'task_get_extrn_ics','EXTRN_MDL_NAME_ICS')) diff --git a/ush/calculate_cost.py b/ush/calculate_cost.py index 1abe729545..56e0ed9cf8 100755 --- a/ush/calculate_cost.py +++ b/ush/calculate_cost.py @@ -6,7 +6,6 @@ from python_utils import ( set_env_var, - import_vars, load_config_file, flatten_dict, ) @@ -17,88 +16,68 @@ def calculate_cost(config_fn): - global PREDEF_GRID_NAME, QUILTING, GRID_GEN_METHOD - - # import all environment variables - IMPORTS = [ - "PREDEF_GRID_NAME", - "QUILTING", - "GRID_GEN_METHOD", - "DT_ATMOS", - "LAYOUT_X", - "LAYOUT_Y", - "BLOCKSIZE", - ] - import_vars(env_vars=IMPORTS) - ushdir = os.path.dirname(os.path.abspath(__file__)) - # get grid config parameters (predefined or custom) - if PREDEF_GRID_NAME: - QUILTING = False + cfg_u = load_config_file(config_fn) + cfg_u = flatten_dict(cfg_u) + + if 'PREDEF_GRID_NAME' in cfg_u: params_dict = set_predef_grid_params( USHdir=ushdir, - grid_name=PREDEF_GRID_NAME, - quilting=QUILTING, + grid_name=cfg_u['PREDEF_GRID_NAME'], + quilting=True ) - for param, value in params_dict.items(): - if param in IMPORTS and globals()[param] is not None: - params_dict[param] = globals()[param] - import_vars(dictionary=params_dict) + + # merge cfg_u with defaults, duplicate keys in cfg_u will overwrite defaults + cfg = {**params_dict, **cfg_u} else: - cfg_u = load_config_file(config_fn) - cfg_u = flatten_dict(cfg_u) - import_vars(dictionary=cfg_u) + cfg = cfg_u # number of gridpoints (nx*ny) depends on grid generation method - if GRID_GEN_METHOD == "GFDLgrid": + if cfg['GRID_GEN_METHOD'] == "GFDLgrid": grid_params = set_gridparams_GFDLgrid( - lon_of_t6_ctr=GFDLgrid_LON_T6_CTR, - lat_of_t6_ctr=GFDLgrid_LAT_T6_CTR, - res_of_t6g=GFDLgrid_NUM_CELLS, - stretch_factor=GFDLgrid_STRETCH_FAC, - refine_ratio_t6g_to_t7g=GFDLgrid_REFINE_RATIO, - istart_of_t7_on_t6g=GFDLgrid_ISTART_OF_RGNL_DOM_ON_T6G, - iend_of_t7_on_t6g=GFDLgrid_IEND_OF_RGNL_DOM_ON_T6G, - jstart_of_t7_on_t6g=GFDLgrid_JSTART_OF_RGNL_DOM_ON_T6G, - jend_of_t7_on_t6g=GFDLgrid_JEND_OF_RGNL_DOM_ON_T6G, + lon_of_t6_ctr=cfg['GFDLgrid_LON_T6_CTR'], + lat_of_t6_ctr=cfg['GFDLgrid_LAT_T6_CTR'], + res_of_t6g=cfg['GFDLgrid_NUM_CELLS'], + stretch_factor=cfg['GFDLgrid_STRETCH_FAC'], + refine_ratio_t6g_to_t7g=cfg['GFDLgrid_REFINE_RATIO'], + istart_of_t7_on_t6g=cfg['GFDLgrid_ISTART_OF_RGNL_DOM_ON_T6G'], + iend_of_t7_on_t6g=cfg['GFDLgrid_IEND_OF_RGNL_DOM_ON_T6G'], + jstart_of_t7_on_t6g=cfg['GFDLgrid_JSTART_OF_RGNL_DOM_ON_T6G'], + jend_of_t7_on_t6g=cfg['GFDLgrid_JEND_OF_RGNL_DOM_ON_T6G'], run_envir="community", verbose=False, nh4=4, ) - elif GRID_GEN_METHOD == "ESGgrid": + elif cfg['GRID_GEN_METHOD'] == "ESGgrid": constants = load_config_file(os.path.join(ushdir, "constants.yaml")) grid_params = set_gridparams_ESGgrid( - lon_ctr=ESGgrid_LON_CTR, - lat_ctr=ESGgrid_LAT_CTR, - nx=ESGgrid_NX, - ny=ESGgrid_NY, - pazi=ESGgrid_PAZI, - 
halo_width=ESGgrid_WIDE_HALO_WIDTH, - delx=ESGgrid_DELX, - dely=ESGgrid_DELY, + lon_ctr=cfg['ESGgrid_LON_CTR'], + lat_ctr=cfg['ESGgrid_LAT_CTR'], + nx=cfg['ESGgrid_NX'], + ny=cfg['ESGgrid_NY'], + pazi=cfg['ESGgrid_PAZI'], + halo_width=cfg['ESGgrid_WIDE_HALO_WIDTH'], + delx=cfg['ESGgrid_DELX'], + dely=cfg['ESGgrid_DELY'], constants=constants["constants"], ) + else: + raise ValueError("GRID_GEN_METHOD is set to an invalid value") - NX = grid_params["NX"] - NY = grid_params["NY"] - cost = [DT_ATMOS, NX * NY] + cost = [cfg['DT_ATMOS'], grid_params["NX"] * grid_params["NY"] ] # reference grid (6-hour forecast on RRFS_CONUS_25km) PREDEF_GRID_NAME = "RRFS_CONUS_25km" - params_dict = set_predef_grid_params( - USHdir=os.path.dirname(os.path.abspath(__file__)), + refgrid = set_predef_grid_params( + USHdir=ushdir, grid_name=PREDEF_GRID_NAME, - quilting=QUILTING, + quilting=True, ) - for param, value in params_dict.items(): - if param in IMPORTS and globals()[param] is not None: - params_dict[param] = globals()[param] - import_vars(dictionary=params_dict) - cost.extend([DT_ATMOS, ESGgrid_NX * ESGgrid_NY]) + cost.extend([refgrid['DT_ATMOS'], refgrid['ESGgrid_NX'] * refgrid['ESGgrid_NY']]) return cost From 735f2023e3a1f56dbcd8aea1ac0a0f611844804d Mon Sep 17 00:00:00 2001 From: "Michael Kavulich, Jr" Date: Mon, 6 Mar 2023 05:00:23 +0000 Subject: [PATCH 24/52] For test details, rename everything to be more consistent with old filenames. Also, fix link detection problem, and use default filename when called from run script --- .../{print_test_details.py => print_test_info.py} | 6 +++--- tests/WE2E/run_WE2E_tests.py | 8 ++++---- tests/WE2E/utils.py | 11 ++++++----- 3 files changed, 13 insertions(+), 12 deletions(-) rename tests/WE2E/{print_test_details.py => print_test_info.py} (83%) diff --git a/tests/WE2E/print_test_details.py b/tests/WE2E/print_test_info.py similarity index 83% rename from tests/WE2E/print_test_details.py rename to tests/WE2E/print_test_info.py index 75b386ba0a..f3c13f52e2 100755 --- a/tests/WE2E/print_test_details.py +++ b/tests/WE2E/print_test_info.py @@ -3,7 +3,7 @@ import argparse import sys -from utils import print_test_details +from utils import print_test_info sys.path.append("../../ush") @@ -20,6 +20,6 @@ args = parser.parse_args() if args.output_file: - print_test_details(args.output_file) + print_test_info(args.output_file) else: - print_test_details() + print_test_info() diff --git a/tests/WE2E/run_WE2E_tests.py b/tests/WE2E/run_WE2E_tests.py index 3cb94856f9..72e748d212 100755 --- a/tests/WE2E/run_WE2E_tests.py +++ b/tests/WE2E/run_WE2E_tests.py @@ -19,7 +19,7 @@ from check_python_version import check_python_version from monitor_jobs import monitor_jobs, write_monitor_file -from utils import print_test_details +from utils import print_test_info def run_we2e_tests(homedir, args) -> None: """Function to run the WE2E tests selected by the user @@ -466,7 +466,7 @@ def setup_logging(logfile: str = "log.run_WE2E_tests", debug: bool = False) -> N parser.add_argument('--use_cron_to_relaunch', action='store_true', help='Explicitly set USE_CRON_TO_RELAUNCH for all experiments; this option disables the "monitor" script functionality') parser.add_argument('--cron_relaunch_intvl_mnts', type=int, help='Overrides CRON_RELAUNCH_INTVL_MNTS for all experiments') parser.add_argument('--opsroot', type=str, help='If test is for NCO mode, sets OPSROOT (see config_defaults.yaml for details)') - parser.add_argument('--print_test_details', action='store_true', help='Create a "test_details.txt" file 
summarizing each test prior to starting experiment') + parser.add_argument('--print_test_info', action='store_true', help='Create a "WE2E_test_info.txt" file summarizing each test prior to starting experiment') parser.add_argument('--debug_tests', action='store_true', help='Explicitly set DEBUG=TRUE for all experiments') parser.add_argument('--verbose_tests', action='store_true', help='Explicitly set VERBOSE=TRUE for all experiments') @@ -481,8 +481,8 @@ def setup_logging(logfile: str = "log.run_WE2E_tests", debug: bool = False) -> N raise ValueError('You can not have less than one parallel process; select a valid value for --procs') # Print test details (if requested) - if args.print_test_details: - print_test_details("test_details.txt") + if args.print_test_info: + print_test_info() #Call main function try: diff --git a/tests/WE2E/utils.py b/tests/WE2E/utils.py index 1972c146db..91aeb9a46d 100755 --- a/tests/WE2E/utils.py +++ b/tests/WE2E/utils.py @@ -410,7 +410,7 @@ def update_expt_status_parallel(expt_dict: dict, procs: int, refresh: bool = Fal -def print_test_details(txtfile: str = "test_details.txt") -> None: +def print_test_info(txtfile: str = "WE2E_test_info.txt") -> None: """Prints a pipe ( | ) delimited text file containing summaries of each test defined by a config file in test_configs/* @@ -429,7 +429,7 @@ def print_test_details(txtfile: str = "test_details.txt") -> None: pathname, filename = os.path.split(testfile) testname = filename[7:-5] dirname = os.path.basename(os.path.normpath(pathname)) - if os.path.islink(filename): + if os.path.islink(testfile): targettestfile = os.readlink(testfile) targetfilename = os.path.basename(targettestfile) targettestname = targetfilename[7:-5] @@ -450,7 +450,8 @@ def print_test_details(txtfile: str = "test_details.txt") -> None: testdict[testname]["num_fcsts"] = 1 # For each found link, add its info to the appropriate test dictionary entry - for link in links: + for key in links.keys(): + link = links[key] testdict[link[2]]["alternate_name"] = link[0] testdict[link[2]]["alternate_directory_name"] = link[1] @@ -474,8 +475,8 @@ def print_test_details(txtfile: str = "test_details.txt") -> None: f.write(f"{d}\n") desc = testdict[expt]['metadata']['description'].splitlines() for line in desc[:-1]: - f.write(f" {line}\n") - f.write(f" {desc[-1]}") + f.write(f" {line}\n") + f.write(f" {desc[-1]}") #Write test relative cost and number of test forecasts (for cycling runs) f.write(f"{d}'{round(testdict[expt]['cost'],2)}{d}'{round(testdict[expt]['num_fcsts'])}") f.write(f"{d}" + get_or_print_blank(testdict[expt],'workflow','PREDEF_GRID_NAME')) From 970e74ae3b0afad0b9f1f281d46546d7b0330841 Mon Sep 17 00:00:00 2001 From: "Michael Kavulich, Jr" Date: Sun, 5 Mar 2023 22:03:21 -0700 Subject: [PATCH 25/52] Continue updating documentation through first few sections of WE2E chapter --- docs/UsersGuide/source/WE2Etests.rst | 37 ++++------------------------ 1 file changed, 5 insertions(+), 32 deletions(-) diff --git a/docs/UsersGuide/source/WE2Etests.rst b/docs/UsersGuide/source/WE2Etests.rst index 183ae956b3..1acce16089 100644 --- a/docs/UsersGuide/source/WE2Etests.rst +++ b/docs/UsersGuide/source/WE2Etests.rst @@ -196,11 +196,11 @@ The full list of options for any of these scripts can be found by using the ``-h Depending on your machine settings, this can reduce the time it takes to run all experiments substantially. -#. Our final example will run the single experiment "custom_ESGgrid" on Hera, charging computational resources to the "fv3lam" account. 
For this example, we submit the suite of tests using the legacy :term:`cron`-based system: +#. This example will run the single experiment "custom_ESGgrid" on Hera, charging computational resources to the "fv3lam" account. For this example, we submit the suite of tests using the legacy :term:`cron`-based system: .. note:: - This option is not recommended, as it does not work on some machines. + This option is not recommended, as it does not work on some machines and can cause system bottlenecks on others. .. code-block:: @@ -208,23 +208,11 @@ Depending on your machine settings, this can reduce the time it takes to run all The option ``--use_cron_to_relaunch`` means that, rather than calling the ``monitor_jobs()`` function, the ``generate_FV3LAM_wflow()`` function will create a new :term:`cron` job in the user's cron table that will launch the experiment with the workflow launch script (``launch_FV3LAM_wflow.sh``). By default this script is run every 2 minutes, but we have changed that to 1 minute with the ``--cron_relaunch_intvl_mnts=1`` argument. This script will run until the workflow either completes successfully (i.e., all tasks SUCCEEDED) or fails (i.e., at least one task fails). The cron job is then removed from the user's cron table. - .. code-block:: - - ./run_WE2E_tests.sh tests_file="my_tests.txt" machine="hera" account="rtrr" cron_relaunch_intvl_mnts="01" - - In addition, by default, cron jobs will be added to the user's cron table to relaunch the workflows of these experiments every 2 minutes. +WE2E Test Information File +================================== -.. _WE2ETestInfoFile: - -The WE2E Test Information File -================================ -In addition to creating the WE2E tests' experiment directories and optionally creating -cron jobs to launch their workflows, the ``run_WE2E_tests.sh`` script generates a CSV (Comma-Separated Value) file named ``WE2E_test_info.csv`` that contains information -on the full set of WE2E tests. This file serves as a single location where relevant -information about the WE2E tests can be found. It can be imported into Google Sheets -using the "|" (pipe symbol) character as the custom field separator. If the user does *not* want ``run_WE2E_tests.sh`` to generate this CSV file the first time it runs, -this functionality can be explicitly disabled by including the ``generate_csv_file="FALSE"`` flag as an argument when running this script. +If the user wants to see consolidated test information, they can generate a file that can be imported into a spreadsheet program (Google Sheets, Microsoft Excel, etc.) that summarizes each test. This file, named ``WE2E_test_info.txt`` by default, is delimited by the ``|`` character, and can be created either by running the ``./print_test_info.py`` script, or by generating an experiment using ``./run_WE2E_tests.py`` with the ``--print_test_info`` flag. The rows of the file/sheet represent the full set of available tests (not just the ones to be run). The columns contain the following information (column titles are included in the CSV file): @@ -295,21 +283,6 @@ The rows of the file/sheet represent the full set of available tests (not just t | ``LBC_SPEC_INTVL_HRS`` | ``NUM_ENS_MEMBERS`` -Additional fields (columns) may be added to the CSV file in the future. - -Note that the CSV file is not part of the ``ufs-srweather-app`` repository and therefore is not tracked by the repository. 
The ``run_WE2E_tests.sh`` script will generate a CSV file if the ``generate_csv_file`` flag to this script has *not* explicitly been set to false and if either one of the following is true: - -#. The CSV file doesn't already exist. -#. The CSV file does exist, but changes have been made to one or more of the - category subdirectories (e.g., test configuration files modified, added, - or deleted) since the creation of the CSV file. - -Thus, unless the ``generate_csv_file`` flag is set to ``"FALSE"``, the -``run_WE2E_tests.sh`` will create a CSV file the first time it is run in a -fresh git clone of the SRW App. The ``generate_csv_file`` flag is provided -because the CSV file generation can be slow, so users may wish to skip this -step since it is not a necessary part of running the tests. - Checking Test Status ====================== From e75a78b3475a177adc3afe7ed2d59783dfd72b2b Mon Sep 17 00:00:00 2001 From: "Michael Kavulich, Jr" Date: Sun, 5 Mar 2023 22:21:31 -0700 Subject: [PATCH 26/52] Some more documentation updates --- docs/UsersGuide/source/ConfigWorkflow.rst | 2 +- docs/UsersGuide/source/WE2Etests.rst | 92 ++--------------------- 2 files changed, 7 insertions(+), 87 deletions(-) diff --git a/docs/UsersGuide/source/ConfigWorkflow.rst b/docs/UsersGuide/source/ConfigWorkflow.rst index 6d03dc0fdc..05de68b98b 100644 --- a/docs/UsersGuide/source/ConfigWorkflow.rst +++ b/docs/UsersGuide/source/ConfigWorkflow.rst @@ -174,7 +174,7 @@ METplus Parameters Test Directories ---------------------- -These directories are used only by the ``run_WE2E_tests.sh`` script, so they are not used unless the user runs a Workflow End-to-End (WE2E) test. Their function corresponds to the same variables without the ``TEST_`` prefix. Users typically should not modify these variables. For any alterations, the logic in the ``run_WE2E_tests.sh`` script would need to be adjusted accordingly. +These directories are used only by the ``run_WE2E_tests.py`` script, so they are not used unless the user runs a Workflow End-to-End (WE2E) test (see :numref:`Chapter %s `). Their function corresponds to the same variables without the ``TEST_`` prefix. Users typically should not modify these variables. For any alterations, the logic in the ``run_WE2E_tests.py`` script would need to be adjusted accordingly. ``TEST_EXTRN_MDL_SOURCE_BASEDIR``: (Default: "") This parameter allows testing of user-staged files in a known location on a given platform. This path contains a limited dataset and likely will not be useful for most user experiments. diff --git a/docs/UsersGuide/source/WE2Etests.rst b/docs/UsersGuide/source/WE2Etests.rst index 183ae956b3..1acce16089 100644 --- a/docs/UsersGuide/source/WE2Etests.rst +++ b/docs/UsersGuide/source/WE2Etests.rst @@ -168,6 +168,7 @@ The full list of options for any of these scripts can be found by using the ``-h #. Our second example will run the fundamental suite of tests on Hera, charging computational resources to the "gsd-fv3" account, and placing the experiment subdirectories in a subdirectory named ``test_set_01``: .. code-block:: + ./run_WE2E_tests.py -t fundamental -m hera -a gsd-fv3 --expt_basedir "test_set_01" -q In this case, the full paths to the experiment directories will be: @@ -194,7 +195,7 @@ The full list of options for any of these scripts can be found by using the ``-h ./run_WE2E_tests.py -m=jet -a=gsd-fv3-dev -t=all -q -p 6 -Depending on your machine settings, this can reduce the time it takes to run all experiments substantially. 
+ Depending on your machine settings, this can reduce the time it takes to run all experiments substantially. #. This example will run the single experiment "custom_ESGgrid" on Hera, charging computational resources to the "fv3lam" account. For this example, we submit the suite of tests using the legacy :term:`cron`-based system: @@ -209,6 +210,8 @@ Depending on your machine settings, this can reduce the time it takes to run all The option ``--use_cron_to_relaunch`` means that, rather than calling the ``monitor_jobs()`` function, the ``generate_FV3LAM_wflow()`` function will create a new :term:`cron` job in the user's cron table that will launch the experiment with the workflow launch script (``launch_FV3LAM_wflow.sh``). By default this script is run every 2 minutes, but we have changed that to 1 minute with the ``--cron_relaunch_intvl_mnts=1`` argument. This script will run until the workflow either completes successfully (i.e., all tasks SUCCEEDED) or fails (i.e., at least one task fails). The cron job is then removed from the user's cron table. +.. _WE2ETestInfoFile: + WE2E Test Information File ================================== @@ -284,65 +287,6 @@ The rows of the file/sheet represent the full set of available tests (not just t | ``NUM_ENS_MEMBERS`` -Checking Test Status -====================== -If :term:`cron` jobs are used to periodically relaunch the tests, the status of each test can be checked by viewing the end of the log file (``log.launch_FV3LAM_wflow``). Otherwise (or alternatively), the ``rocotorun``/``rocotostat`` combination of commands can be used. (See :numref:`Section %s ` for details.) - -The SRW App also provides the script ``get_expts_status.sh`` in the directory -``ufs-srweather-app/tests/WE2E``, which can be used to generate -a status summary for all tests in a given base directory. This script updates -the workflow status of each test by internally calling ``launch_FV3LAM_wflow.sh``. Then, it prints out the status of the various tests in the command prompt. It also creates -a status report file named ``expts_status_${create_date}.txt`` (where ``create_date`` -is a time stamp in ``YYYYMMDDHHmm`` format corresponding to the creation date/time -of the report) and places it in the experiment base directory. By default, this status file -contains the last 40 lines from the end of the ``log.launch_FV3LAM_wflow`` file. This number can be adjusted via the ``num_log_lines`` argument. These lines include the experiment status as well as the task status table generated by ``rocotostat`` so that, in case of failure, it is convenient to pinpoint the task that failed. -For details on the usage of ``get_expts_stats.sh``, issue the following command from the ``WE2E`` directory: - -.. code-block:: - - ./get_expts_status.sh --help - -Here is an example of how to call ``get_expts_status.sh`` from the ``WE2E`` directory: - -.. code-block:: console - - ./get_expts_status.sh expts_basedir=/path/to/expt_dirs/set01 - -The path for ``expts_basedir`` should be an absolute path. - -Here is an example of output from the ``get_expts_status.sh`` script: - -.. code-block:: console - - Checking for active experiment directories in the specified experiments - base directory (expts_basedir): - expts_basedir = "/path/to/expt_dirs/set01" - ... 
- - The number of active experiments found is: - num_expts = 2 - The list of experiments whose workflow status will be checked is: - 'custom_ESGgrid' - 'grid_RRFS_CONUScompact_25km_ics_FV3GFS_lbcs_FV3GFS_suite_GFS_v16' - - ====================================== - Checking workflow status of experiment "custom_ESGgrid" ... - Workflow status: SUCCESS - ====================================== - - ====================================== - Checking workflow status of experiment "grid_RRFS_CONUScompact_25km_ics_FV3GFS_lbcs_FV3GFS_suite_GFS_v16" ... - Workflow status: IN PROGRESS - ====================================== - - A status report has been created in: - expts_status_fp = "/path/to/expt_dirs/set01/expts_status_202204211440.txt" - - DONE. - -The "Workflow status" field of each test indicates the status of its workflow. -The values that this can take on are "SUCCESS", "FAILURE", and "IN PROGRESS". - Modifying the WE2E System ============================ This section describes various ways in which the WE2E testing system can be modified @@ -372,30 +316,6 @@ To add a new test named, e.g., ``new_test01``, to one of the existing test categ #. Edit the contents of ``config.new_test01.yaml`` by modifying existing experiment variable values and/or adding new variables such that the test runs with the intended configuration. -.. _AddNewCategory: - -Adding a New WE2E Test Category ------------------------------------ - -To create a new test category called, e.g., ``new_category``: - -#. In the directory ``ufs-srweather-app/tests/WE2E/test_configs``, create a new directory named ``new_category``. - -#. In the file ``get_WE2Etest_names_subdirs_descs.sh``, add the element ``"new_category"`` to the array ``category_subdirs``, which contains the list of categories/subdirectories in which to search for test configuration files. Thus, ``category_subdirs`` becomes: - - .. code-block:: console - - category_subdirs=( \ - "." \ - "grids_extrn_mdls_suites_community" \ - "grids_extrn_mdls_suites_nco" \ - "wflow_features" \ - "new_category" \ - ) - -New tests can now be added to ``new_category`` using the procedure described in :numref:`Section %s `. - - .. _CreateAltTestNames: Creating Alternate Names for a Test @@ -421,7 +341,7 @@ In this situation, the primary name for the test is ``grid_RRFS_CONUScompact_25k * A primary test can have more than one alternate test name (by having more than one symlink pointing to the test's configuration file). * The symlinks representing the alternate test names can be in the same or a different category directory. * The ``--relative`` flag makes the symlink relative (i.e., within/below the ``tests`` directory) so that it stays valid when copied to other locations. (Note, however, that this flag is platform-dependent and may not exist on some platforms.) - * To determine whether a test has one or more alternate names, a user can view the CSV file ``WE2E_test_info.csv`` generated by the ``run_WE2E_tests.sh`` script. Recall from :numref:`Section %s ` that column 1 of this CSV file contains the test's primary name (and its category) while column 2 contains any alternate names (and their categories). - * With this primary/alternate test naming convention, a user can list either the primary test name or one of the alternate test names in the experiments list file (e.g., ``my_tests.txt``) read in by ``run_WE2E_tests.sh``. 
If more than one name is listed for the same test (e.g., the primary name and and an alternate name, two alternate names, etc.), ``run_WE2E_tests.sh`` will exit with a warning message and will **not** run any tests. + * To determine whether a test has one or more alternate names, a user can view the file ``WE2E_test_info.txt`` as described in :numref:`Section %s `. + * With this primary/alternate test naming convention via symbolic links, if more than one name is listed for the same test (e.g., the primary name and an alternate name, two alternate names, etc.), ``run_WE2E_tests.py`` will only run the test once. From 7ccafde262faedfbff4592c957c887ce118947cc Mon Sep 17 00:00:00 2001 From: "Michael Kavulich, Jr" Date: Mon, 6 Mar 2023 05:25:45 +0000 Subject: [PATCH 27/52] Update gitignore for new/updated filenames --- .gitignore | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 6d4734c975..bc3eee8545 100644 --- a/.gitignore +++ b/.gitignore @@ -7,12 +7,14 @@ lib/ share/ modulefiles/extrn_comp_build/ sorc/*/ -tests/WE2E/WE2E_test_info.csv +tests/WE2E/WE2E_tests_*.yaml tests/WE2E/*.txt tests/WE2E/*.log +tests/WE2E/log.* ush/__pycache__/ ush/config.yaml ush/python_utils/__pycache__/ ush/*.swp *.swp +__pycache__ From c98f4fddfe79074d820a9d5c48cbddcbac63a5c9 Mon Sep 17 00:00:00 2001 From: "Michael Kavulich, Jr" Date: Mon, 6 Mar 2023 10:42:51 -0700 Subject: [PATCH 28/52] Documentation for WE2E_summary.py script --- docs/UsersGuide/source/WE2Etests.rst | 47 ++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/docs/UsersGuide/source/WE2Etests.rst b/docs/UsersGuide/source/WE2Etests.rst index 174224d162..ef5a7ead34 100644 --- a/docs/UsersGuide/source/WE2Etests.rst +++ b/docs/UsersGuide/source/WE2Etests.rst @@ -210,6 +210,53 @@ The full list of options for any of these scripts can be found by using the ``-h The option ``--use_cron_to_relaunch`` means that, rather than calling the ``monitor_jobs()`` function, the ``generate_FV3LAM_wflow()`` function will create a new :term:`cron` job in the user's cron table that will launch the experiment with the workflow launch script (``launch_FV3LAM_wflow.sh``). By default this script is run every 2 minutes, but we have changed that to 1 minute with the ``--cron_relaunch_intvl_mnts=1`` argument. This script will run until the workflow either completes successfully (i.e., all tasks SUCCEEDED) or fails (i.e., at least one task fails). The cron job is then removed from the user's cron table. +Checking test status and summary +================================= +By default, ``./run_WE2E_tests.py`` will actively monitor jobs, printing to screen when jobs are complete (either successfully or with a failure), and will print a summary file ``WE2E_summary_{datetime.now().strftime("%Y%m%d%H%M%S")}.txt``. +However, if the user is using the legacy crontab option, or would like to summarize one or more experiments that are either not complete or were not handled by the WE2E test scripts, this status/summary file can be generated manually using ``WE2E_summary.py``. +In this example, an experiment was generated using the crontab option, and has not yet finished running. +We use the ``-e`` option to point to the experiment directory and get the current status of the experiment: + + .. 
code-block:: + + ./WE2E_summary.py -e /user/home/PR_466/expt_dirs/ + Updating database for experiment grid_RRFS_CONUScompact_25km_ics_HRRR_lbcs_HRRR_suite_RRFS_v1beta + Updating database for experiment grid_RRFS_CONUS_25km_ics_GSMGFS_lbcs_GSMGFS_suite_GFS_v16 + Updating database for experiment grid_RRFS_CONUS_3km_ics_FV3GFS_lbcs_FV3GFS_suite_HRRR + Updating database for experiment specify_template_filenames + Updating database for experiment grid_RRFS_CONUScompact_25km_ics_HRRR_lbcs_RAP_suite_HRRR + Updating database for experiment grid_RRFS_CONUScompact_3km_ics_HRRR_lbcs_RAP_suite_RRFS_v1beta + Updating database for experiment grid_RRFS_CONUS_25km_ics_FV3GFS_lbcs_FV3GFS_suite_GFS_2017_gfdlmp_regional + Updating database for experiment grid_SUBCONUS_Ind_3km_ics_HRRR_lbcs_RAP_suite_HRRR + Updating database for experiment grid_RRFS_CONUS_3km_ics_FV3GFS_lbcs_FV3GFS_suite_GFS_v16 + Updating database for experiment grid_RRFS_SUBCONUS_3km_ics_FV3GFS_lbcs_FV3GFS_suite_GFS_v16 + Updating database for experiment specify_DOT_OR_USCORE + Updating database for experiment custom_GFDLgrid__GFDLgrid_USE_NUM_CELLS_IN_FILENAMES_eq_FALSE + Updating database for experiment grid_RRFS_CONUScompact_25km_ics_FV3GFS_lbcs_FV3GFS_suite_GFS_v16 + ---------------------------------------------------------------------------------------------------- + Experiment name | Status | Core hours used + ---------------------------------------------------------------------------------------------------- + grid_RRFS_CONUScompact_25km_ics_HRRR_lbcs_HRRR_suite_RRFS_v1 COMPLETE 49.72 + grid_RRFS_CONUS_25km_ics_GSMGFS_lbcs_GSMGFS_suite_GFS_v16 DYING 6.51 + grid_RRFS_CONUS_3km_ics_FV3GFS_lbcs_FV3GFS_suite_HRRR COMPLETE 411.84 + specify_template_filenames COMPLETE 17.36 + grid_RRFS_CONUScompact_25km_ics_HRRR_lbcs_RAP_suite_HRRR COMPLETE 16.03 + grid_RRFS_CONUScompact_3km_ics_HRRR_lbcs_RAP_suite_RRFS_v1be COMPLETE 318.55 + grid_RRFS_CONUS_25km_ics_FV3GFS_lbcs_FV3GFS_suite_GFS_2017_g COMPLETE 17.79 + grid_SUBCONUS_Ind_3km_ics_HRRR_lbcs_RAP_suite_HRRR COMPLETE 17.76 + grid_RRFS_CONUS_3km_ics_FV3GFS_lbcs_FV3GFS_suite_GFS_v16 RUNNING 0.00 + grid_RRFS_SUBCONUS_3km_ics_FV3GFS_lbcs_FV3GFS_suite_GFS_v16 RUNNING 0.00 + specify_DOT_OR_USCORE QUEUED 0.00 + custom_GFDLgrid__GFDLgrid_USE_NUM_CELLS_IN_FILENAMES_eq_FALS QUEUED 0.00 + grid_RRFS_CONUScompact_25km_ics_FV3GFS_lbcs_FV3GFS_suite_GFS QUEUED 0.00 + ---------------------------------------------------------------------------------------------------- + Total RUNNING 855.56 + + Detailed summary written to WE2E_summary_20230306173013.txt + +As with all python scripts in the App, additional options for this script can be viewed by calling with the ``-h`` argument. + + .. _WE2ETestInfoFile: WE2E Test Information File From b208a4beb78d3d8a92dbf5f04719fe90adf7ed5b Mon Sep 17 00:00:00 2001 From: "Michael Kavulich, Jr" Date: Mon, 6 Mar 2023 17:47:05 +0000 Subject: [PATCH 29/52] Revert behavior of rocotorun to only capture output if debug=True. This will greatly increase the speed of runs on most platforms, but will have the downside of not capturing all rocotorun output in log files, including job cards and other messages. 
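Roughly, the intended behavior is sketched below (an illustrative, simplified helper with a hypothetical name; the actual change is inlined in update_expt_status() in utils.py, as shown in the diff that follows):

    import logging
    import subprocess

    def run_rocotorun(expt_dir: str, rocoto_db: str, debug: bool = False) -> None:
        # Call rocotorun twice; the second call works around the rocotobqserver
        # proliferation issue noted in the existing code.
        if debug:
            # Debug mode: capture stdout/stderr so job cards and submit messages
            # end up in the log files (slower).
            cmd = ["rocotorun", f"-w {expt_dir}/FV3LAM_wflow.xml", f"-d {rocoto_db}", "-v 10"]
            for _ in range(2):
                p = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)
                logging.debug(p.stdout)
        else:
            # Default mode: do not capture output; rocotorun writes straight to the
            # terminal, which is much faster but leaves less detail in the logs.
            cmd = ["rocotorun", f"-w {expt_dir}/FV3LAM_wflow.xml", f"-d {rocoto_db}"]
            for _ in range(2):
                subprocess.run(cmd)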
--- tests/WE2E/monitor_jobs.py | 4 ++-- tests/WE2E/run_WE2E_tests.py | 6 +++--- tests/WE2E/utils.py | 33 +++++++++++++++++++++++---------- 3 files changed, 28 insertions(+), 15 deletions(-) diff --git a/tests/WE2E/monitor_jobs.py b/tests/WE2E/monitor_jobs.py index 8c607e45e7..cdd353681e 100755 --- a/tests/WE2E/monitor_jobs.py +++ b/tests/WE2E/monitor_jobs.py @@ -53,11 +53,11 @@ def monitor_jobs(expt_dict: dict, monitor_file: str = '', procs: int = 1, debug: if procs > 1: print(f'Starting experiments in parallel with {procs} processes') - expt_dict = update_expt_status_parallel(expt_dict, procs, True) + expt_dict = update_expt_status_parallel(expt_dict, procs, True, debug) else: for expt in expt_dict: logging.info(f"Starting experiment {expt} running") - expt_dict[expt] = update_expt_status(expt_dict[expt], expt, True) + expt_dict[expt] = update_expt_status(expt_dict[expt], expt, True, debug) write_monitor_file(monitor_file,expt_dict) diff --git a/tests/WE2E/run_WE2E_tests.py b/tests/WE2E/run_WE2E_tests.py index 72e748d212..3aa2e89c45 100755 --- a/tests/WE2E/run_WE2E_tests.py +++ b/tests/WE2E/run_WE2E_tests.py @@ -212,10 +212,11 @@ def run_we2e_tests(homedir, args) -> None: monitor_file = f'WE2E_tests_{datetime.now().strftime("%Y%m%d%H%M%S")}.yaml' write_monitor_file(monitor_file,monitor_yaml) try: - monitor_file = monitor_jobs(monitor_yaml, monitor_file=monitor_file, procs=args.procs, debug=args.debug) + monitor_file = monitor_jobs(monitor_yaml, monitor_file=monitor_file, procs=args.procs, + debug=args.debug) except KeyboardInterrupt: logging.info("\n\nUser interrupted monitor script; to resume monitoring jobs run:\n") - logging.info(f"./monitor_jobs.py -y={monitor_file} -p={args.procs}\n") + logging.info(f"./monitor_jobs.py -y={monitor_file} -p={args.procs} -d={args.debug}\n") else: logging.info("All experiments are complete") logging.info(f"Summary of results available in {monitor_file}") @@ -458,7 +459,6 @@ def setup_logging(logfile: str = "log.run_WE2E_tests", debug: bool = False) -> N parser.add_argument('-q', '--quiet', action='store_true', help='Suppress console output from workflow generation; this will help keep the screen uncluttered') parser.add_argument('-p', '--procs', type=int, help='Run resource-heavy tasks (such as calls to rocotorun) in parallel, with provided number of parallel tasks', default=1) - parser.add_argument('--modulefile', type=str, help='Modulefile used for building the app') parser.add_argument('--run_envir', type=str, help='Overrides RUN_ENVIR variable to a new value ( "nco" or "community" ) for all experiments', default='') parser.add_argument('--expt_basedir', type=str, help='Explicitly set EXPT_BASEDIR for all experiments') diff --git a/tests/WE2E/utils.py b/tests/WE2E/utils.py index 91aeb9a46d..fdf712b1ca 100755 --- a/tests/WE2E/utils.py +++ b/tests/WE2E/utils.py @@ -77,6 +77,10 @@ def print_WE2E_summary(expt_dict: dict, debug: bool = False): total_core_hours += ch if "ERROR" in statuses: total_status = "ERROR" + elif "RUNNING" in statuses: + total_status = "RUNNING" + elif "QUEUED" in statuses: + total_status = "QUEUED" elif "DEAD" in statuses: total_status = "DEAD" elif "COMPLETE" in statuses: @@ -166,8 +170,8 @@ def create_expt_dict(expt_dir: str) -> dict: logging.debug(f'Skipping directory {item}, experiment XML file not found') continue #Update the experiment dictionary - logging.info(f"Reading status of experiment {item}") - update_expt_status(expt_dict[item],item,True) + logging.debug(f"Reading status of experiment {item}") + 
update_expt_status(expt_dict[item],item,True,False) summary_file = f'WE2E_tests_{datetime.now().strftime("%Y%m%d%H%M%S")}.yaml' return summary_file, expt_dict @@ -226,7 +230,7 @@ def write_monitor_file(monitor_file: str, expt_dict: dict): raise -def update_expt_status(expt: dict, name: str, refresh: bool = False) -> dict: +def update_expt_status(expt: dict, name: str, refresh: bool = False, debug: bool = False) -> dict: """ This function reads the dictionary showing the location of a given experiment, runs a `rocotorun` command to update the experiment (running new jobs and updating the status of @@ -267,6 +271,9 @@ def update_expt_status(expt: dict, name: str, refresh: bool = False) -> dict: refresh (bool): If true, this flag will check an experiment status even if it is listed as DEAD, ERROR, or COMPLETE. Used for initial checks for experiments that may have been restarted. + debug (bool): Will capture all output from rocotorun. This will allow information such + as job cards and job submit messages to appear in the log files, but can + slow down the process drastically. Returns: dict: The updated experiment dictionary. """ @@ -279,13 +286,19 @@ def update_expt_status(expt: dict, name: str, refresh: bool = False) -> dict: logging.info(f"Updating database for experiment {name}") # Update experiment, read rocoto database rocoto_db = f"{expt['expt_dir']}/FV3LAM_wflow.db" - rocotorun_cmd = ["rocotorun", f"-w {expt['expt_dir']}/FV3LAM_wflow.xml", f"-d {rocoto_db}", "-v 10"] - p = subprocess.run(rocotorun_cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True) - logging.debug(p.stdout) - - #Run rocotorun again to get around rocotobqserver proliferation issue - p = subprocess.run(rocotorun_cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True) - logging.debug(p.stdout) + if debug: + rocotorun_cmd = ["rocotorun", f"-w {expt['expt_dir']}/FV3LAM_wflow.xml", f"-d {rocoto_db}", "-v 10"] + p = subprocess.run(rocotorun_cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True) + logging.debug(p.stdout) + + #Run rocotorun again to get around rocotobqserver proliferation issue + p = subprocess.run(rocotorun_cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True) + logging.debug(p.stdout) + else: + rocotorun_cmd = ["rocotorun", f"-w {expt['expt_dir']}/FV3LAM_wflow.xml", f"-d {rocoto_db}"] + subprocess.run(rocotorun_cmd) + #Run rocotorun again to get around rocotobqserver proliferation issue + subprocess.run(rocotorun_cmd) logging.debug(f"Reading database for experiment {name}, updating experiment dictionary") try: From baa8fcb7e234d8e8317bbee796b036c3de673fe8 Mon Sep 17 00:00:00 2001 From: "Michael Kavulich, Jr" Date: Mon, 6 Mar 2023 18:24:37 +0000 Subject: [PATCH 30/52] Fixes suggested by pylint --- tests/WE2E/WE2E_summary.py | 32 ++++++++---------- tests/WE2E/monitor_jobs.py | 35 ++++++++++--------- tests/WE2E/print_test_info.py | 11 +++--- tests/WE2E/run_WE2E_tests.py | 63 ++++++++++++++++++++++------------- tests/WE2E/utils.py | 26 +++++++-------- 5 files changed, 89 insertions(+), 78 deletions(-) diff --git a/tests/WE2E/WE2E_summary.py b/tests/WE2E/WE2E_summary.py index 8e1a6113e2..26d1ccd731 100755 --- a/tests/WE2E/WE2E_summary.py +++ b/tests/WE2E/WE2E_summary.py @@ -1,27 +1,16 @@ #!/usr/bin/env python3 -import os import sys import argparse import logging -import subprocess -import sqlite3 -import time -from textwrap import dedent -from datetime import datetime -from contextlib import closing sys.path.append("../../ush") -from python_utils import ( - 
cfg_to_yaml_str, - load_config_file -) +from python_utils import load_config_file from check_python_version import check_python_version -from utils import calculate_core_hours, create_expt_dict, update_expt_status, \ - print_WE2E_summary, write_monitor_file +from utils import calculate_core_hours, create_expt_dict, print_WE2E_summary, write_monitor_file REPORT_WIDTH = 100 @@ -47,12 +36,20 @@ def setup_logging(debug: bool = False) -> None: check_python_version() #Parse arguments - parser = argparse.ArgumentParser(description="Script for creating a job summary printed to screen and a file, either from a yaml experiment file created by monitor_jobs() or from a provided directory of experiments\n") + parser = argparse.ArgumentParser( + description="Script for creating a job summary printed to screen and a file, "\ + "either from a yaml experiment file created by monitor_jobs() or from a "\ + "provided directory of experiments\n") req = parser.add_mutually_exclusive_group(required=True) - req.add_argument('-y', '--yaml_file', type=str, help='YAML-format file specifying the information of jobs to be summarized; for an example file, see monitor_jobs.yaml') - req.add_argument('-e', '--expt_dir', type=str, help='The full path of an experiment directory, containing one or more subdirectories with UFS SRW App experiments in them') - parser.add_argument('-d', '--debug', action='store_true', help='Script will be run in debug mode with more verbose output') + req.add_argument('-y', '--yaml_file', type=str, + help='YAML-format file specifying the information of jobs to be summarized; '\ + 'for an example file, see monitor_jobs.yaml') + req.add_argument('-e', '--expt_dir', type=str, + help='The full path of an experiment directory, containing one or more '\ + 'subdirectories with UFS SRW App experiments in them') + parser.add_argument('-d', '--debug', action='store_true', + help='Script will be run in debug mode with more verbose output') args = parser.parse_args() @@ -74,4 +71,3 @@ def setup_logging(debug: bool = False) -> None: #Call function to print summary print_WE2E_summary(expt_dict, args.debug) - diff --git a/tests/WE2E/monitor_jobs.py b/tests/WE2E/monitor_jobs.py index cdd353681e..a5634b52d4 100755 --- a/tests/WE2E/monitor_jobs.py +++ b/tests/WE2E/monitor_jobs.py @@ -1,30 +1,21 @@ #!/usr/bin/env python3 -import os -import re import sys import argparse import logging -import subprocess -import sqlite3 import time from textwrap import dedent from datetime import datetime -from contextlib import closing sys.path.append("../../ush") -from python_utils import ( - cfg_to_yaml_str, - flatten_dict, - load_config_file, - load_shell_config -) +from python_utils import load_config_file from check_python_version import check_python_version from WE2E_summary import print_WE2E_summary -from utils import calculate_core_hours, write_monitor_file, update_expt_status, update_expt_status_parallel +from utils import calculate_core_hours, write_monitor_file, update_expt_status,\ + update_expt_status_parallel def monitor_jobs(expt_dict: dict, monitor_file: str = '', procs: int = 1, debug: bool = False) -> str: """Function to monitor and run jobs for the specified experiment using Rocoto @@ -79,7 +70,8 @@ def monitor_jobs(expt_dict: dict, monitor_file: str = '', procs: int = 1, debug: for expt in running_expts.copy(): running_expts[expt] = expt_dict[expt] if running_expts[expt]["status"] in ['DEAD','ERROR','COMPLETE']: - logging.info(f'Experiment {expt} is {running_expts[expt]["status"]}; will no longer monitor.') + 
logging.info(f'Experiment {expt} is {running_expts[expt]["status"]};'\ + 'will no longer monitor.') running_expts.pop(expt) continue logging.debug(f'Experiment {expt} status is {expt_dict[expt]["status"]}') @@ -142,11 +134,18 @@ def setup_logging(logfile: str = "log.run_WE2E_tests", debug: bool = False) -> N logfile='log.monitor_jobs' #Parse arguments - parser = argparse.ArgumentParser(description="Script for monitoring and running jobs in a specified experiment, as specified in a yaml configuration file\n") - - parser.add_argument('-y', '--yaml_file', type=str, help='YAML-format file specifying the information of jobs to be run; for an example file, see monitor_jobs.yaml', required=True) - parser.add_argument('-p', '--procs', type=int, help='Run resource-heavy tasks (such as calls to rocotorun) in parallel, with provided number of parallel tasks', default=1) - parser.add_argument('-d', '--debug', action='store_true', help='Script will be run in debug mode with more verbose output') + parser = argparse.ArgumentParser(description="Script for monitoring and running jobs in a "\ + "specified experiment, as specified in a yaml "\ + "configuration file\n") + + parser.add_argument('-y', '--yaml_file', type=str, + help='YAML-format file specifying the information of jobs to be run; '\ + 'for an example file, see monitor_jobs.yaml', required=True) + parser.add_argument('-p', '--procs', type=int, + help='Run resource-heavy tasks (such as calls to rocotorun) in parallel, '\ + 'with provided number of parallel tasks', default=1) + parser.add_argument('-d', '--debug', action='store_true', + help='Script will be run in debug mode with more verbose output') args = parser.parse_args() diff --git a/tests/WE2E/print_test_info.py b/tests/WE2E/print_test_info.py index f3c13f52e2..f2301bb690 100755 --- a/tests/WE2E/print_test_info.py +++ b/tests/WE2E/print_test_info.py @@ -7,15 +7,16 @@ sys.path.append("../../ush") -from check_python_version import check_python_version - - if __name__ == "__main__": #Parse arguments - parser = argparse.ArgumentParser(description="Script for parsing all test files in the test_configs/ directory, and printing a pipe-delimited summary file of the details of each test.\n") + parser = argparse.ArgumentParser( + description="Script for parsing all test files in the test_configs/ "\ + "directory, and printing a pipe-delimited summary file of the details of "\ + "each test.\n") - parser.add_argument('-o', '--output_file', type=str, help='File name for test details file', default='') + parser.add_argument('-o', '--output_file', type=str, + help='File name for test details file', default='') args = parser.parse_args() diff --git a/tests/WE2E/run_WE2E_tests.py b/tests/WE2E/run_WE2E_tests.py index 3aa2e89c45..8d51507689 100755 --- a/tests/WE2E/run_WE2E_tests.py +++ b/tests/WE2E/run_WE2E_tests.py @@ -85,12 +85,14 @@ def run_we2e_tests(homedir, args) -> None: else: if not run_envir: run_envir = 'community' - logging.debug(f'{testfilename} exists for this platform and run_envir has not been specified'\ + logging.debug(f'{testfilename} exists for this platform and run_envir'\ + 'has not been specified\n'\ 'Setting run_envir = {run_envir} for all tests') else: if not run_envir: run_envir = 'nco' - logging.debug(f'{testfilename} exists for this platform and run_envir has not been specified'\ + logging.debug(f'{testfilename} exists for this platform and run_envir has'\ + 'not been specified\n'\ 'Setting run_envir = {run_envir} for all tests') logging.debug(f"Reading test file: {testfilename}") 
with open(testfilename, encoding="utf-8") as f: @@ -175,14 +177,18 @@ def run_we2e_tests(homedir, args) -> None: logging.debug(f"Overwriting WE2E-test-specific settings for test \n{test_name}\n") if 'task_get_extrn_ics' in test_cfg: - test_cfg['task_get_extrn_ics'] = check_task_get_extrn_bcs(test_cfg,machine_defaults,config_defaults,"ics") + test_cfg['task_get_extrn_ics'] = check_task_get_extrn_bcs(test_cfg,machine_defaults, + config_defaults,"ics") if 'task_get_extrn_lbcs' in test_cfg: - test_cfg['task_get_extrn_lbcs'] = check_task_get_extrn_bcs(test_cfg,machine_defaults,config_defaults,"lbcs") + test_cfg['task_get_extrn_lbcs'] = check_task_get_extrn_bcs(test_cfg,machine_defaults, + config_defaults,"lbcs") if 'verification' in test_cfg: - test_cfg['verification'] = check_task_verification(test_cfg,machine_defaults,config_defaults) + test_cfg['verification'] = check_task_verification(test_cfg,machine_defaults, + config_defaults) - logging.debug(f"Writing updated config.yaml for test {test_name}\nbased on specified command-line arguments:\n") + logging.debug(f"Writing updated config.yaml for test {test_name}\n"\ + "based on specified command-line arguments:\n") logging.debug(cfg_to_yaml_str(test_cfg)) with open(ushdir + "/config.yaml","w", encoding="utf-8") as f: f.writelines(cfg_to_yaml_str(test_cfg)) @@ -191,7 +197,8 @@ def run_we2e_tests(homedir, args) -> None: if args.quiet: console_handler = logging.getLogger().handlers[1] console_handler.setLevel(logging.WARNING) - expt_dir = generate_FV3LAM_wflow(ushdir,logfile=f"{ushdir}/log.generate_FV3LAM_wflow",debug=args.debug) + expt_dir = generate_FV3LAM_wflow(ushdir,logfile=f"{ushdir}/log.generate_FV3LAM_wflow", + debug=args.debug) if args.quiet: if args.debug: console_handler.setLevel(logging.DEBUG) @@ -264,11 +271,12 @@ def check_tests(tests: list) -> list: if os.path.islink(testfile): if os.path.realpath(testfile) in tests_to_run: logging.warning(dedent(f"""WARNING: test file {testfile} is a symbolic link to a - test file ({os.path.realpath(testfile)}) that is also included in the - test list. Only the latter test will be run.""")) + test file ({os.path.realpath(testfile)}) that is also included in + the test list. Only the latter test will be run.""")) tests_to_run.remove(testfile) if len(tests_to_run) != len(set(tests_to_run)): - logging.warning("\nWARNING: Duplicate test names were found in list. Removing duplicates and continuing.\n") + logging.warning("\nWARNING: Duplicate test names were found in list. 
"\ + "Removing duplicates and continuing.\n") tests_to_run = list(set(tests_to_run)) return tests_to_run @@ -311,7 +319,7 @@ def check_task_get_extrn_bcs(cfg: dict, mach: dict, dflt: dict, ics_or_lbcs: str """ if ics_or_lbcs not in ["lbcs", "ics"]: - raise ValueError(f"ics_or_lbcs must be set to 'lbcs' or 'ics'") + raise ValueError("ics_or_lbcs must be set to 'lbcs' or 'ics'") I_OR_L = ics_or_lbcs.upper() @@ -327,35 +335,43 @@ def check_task_get_extrn_bcs(cfg: dict, mach: dict, dflt: dict, ics_or_lbcs: str logging.debug(f'USE_USER_STAGED_EXTRN_FILES not specified or False in task_get_extrn_{ics_or_lbcs} section of config') return cfg_bcs - # If EXTRN_MDL_SYSBASEDIR_* is "set_to_non_default_location_in_testing_script", replace with test value from machine file + # If EXTRN_MDL_SYSBASEDIR_* is "set_to_non_default_location_in_testing_script", replace with + # test value from machine file if cfg_bcs.get(f'EXTRN_MDL_SYSBASEDIR_{I_OR_L}') == "set_to_non_default_location_in_testing_script": if f'TEST_ALT_EXTRN_MDL_SYSBASEDIR_{I_OR_L}' in mach['platform']: if os.path.isdir(mach['platform'][f'TEST_ALT_EXTRN_MDL_SYSBASEDIR_{I_OR_L}']): raise FileNotFoundError(f"Non-default input file location TEST_ALT_EXTRN_MDL_SYSBASEDIR_{I_OR_L} from machine file does not exist or is not a directory") - cfg_bcs[f'EXTRN_MDL_SYSBASEDIR_{I_OR_L}'] = mach['platform'][f'TEST_ALT_EXTRN_MDL_SYSBASEDIR_{I_OR_L}'] + cfg_bcs[f'EXTRN_MDL_SYSBASEDIR_{I_OR_L}'] = \ + mach['platform'][f'TEST_ALT_EXTRN_MDL_SYSBASEDIR_{I_OR_L}'] else: - raise KeyError(f"Non-default input file location TEST_ALT_EXTRN_MDL_SYSBASEDIR_{I_OR_L} not set in machine file") + raise KeyError(f"Non-default input file location "\ + "TEST_ALT_EXTRN_MDL_SYSBASEDIR_{I_OR_L} not set in machine file") return cfg_bcs - # Because USE_USER_STAGED_EXTRN_FILES is true, only look on disk, and ensure the staged data directory exists + # Because USE_USER_STAGED_EXTRN_FILES is true, only look on disk, and ensure the staged data + # directory exists cfg['platform']['EXTRN_MDL_DATA_STORES'] = "disk" if 'TEST_EXTRN_MDL_SOURCE_BASEDIR' not in mach['platform']: raise KeyError("TEST_EXTRN_MDL_SOURCE_BASEDIR, the directory for staged test data,"\ "has not been specified in the machine file for this platform") if not os.path.isdir(mach['platform']['TEST_EXTRN_MDL_SOURCE_BASEDIR']): - raise FileNotFoundError(dedent(f"""The directory for staged test data specified in this platform's machine file + raise FileNotFoundError(dedent( + f"""The directory for staged test data specified in this platform's machine file TEST_EXTRN_MDL_SOURCE_BASEDIR = {mach['platform']['TEST_EXTRN_MDL_SOURCE_BASEDIR']} does not exist.""")) - # Different input data types have different directory structures, so set the data directory accordingly + # Different input data types have different directory structures; set data dir accordingly if cfg_bcs[f'EXTRN_MDL_NAME_{I_OR_L}'] == 'FV3GFS': if f'FV3GFS_FILE_FMT_{I_OR_L}' not in cfg_bcs: - cfg_bcs[f'FV3GFS_FILE_FMT_{I_OR_L}'] = dflt[f'task_get_extrn_{ics_or_lbcs}'][f'FV3GFS_FILE_FMT_{I_OR_L}'] - cfg_bcs[f'EXTRN_MDL_SOURCE_BASEDIR_{I_OR_L}'] = f"{mach['platform']['TEST_EXTRN_MDL_SOURCE_BASEDIR']}/"\ - f"{cfg_bcs[f'EXTRN_MDL_NAME_{I_OR_L}']}/{cfg_bcs[f'FV3GFS_FILE_FMT_{I_OR_L}']}/${{yyyymmddhh}}" + cfg_bcs[f'FV3GFS_FILE_FMT_{I_OR_L}'] = \ + dflt[f'task_get_extrn_{ics_or_lbcs}'][f'FV3GFS_FILE_FMT_{I_OR_L}'] + cfg_bcs[f'EXTRN_MDL_SOURCE_BASEDIR_{I_OR_L}'] = \ + f"{mach['platform']['TEST_EXTRN_MDL_SOURCE_BASEDIR']}/"\ + 
f"{cfg_bcs[f'EXTRN_MDL_NAME_{I_OR_L}']}/{cfg_bcs[f'FV3GFS_FILE_FMT_{I_OR_L}']}/${{yyyymmddhh}}" else: - cfg_bcs[f'EXTRN_MDL_SOURCE_BASEDIR_{I_OR_L}'] = f"{mach['platform']['TEST_EXTRN_MDL_SOURCE_BASEDIR']}/"\ - f"{cfg_bcs[f'EXTRN_MDL_NAME_{I_OR_L}']}/${{yyyymmddhh}}" + cfg_bcs[f'EXTRN_MDL_SOURCE_BASEDIR_{I_OR_L}'] = \ + f"{mach['platform']['TEST_EXTRN_MDL_SOURCE_BASEDIR']}/"\ + f"{cfg_bcs[f'EXTRN_MDL_NAME_{I_OR_L}']}/${{yyyymmddhh}}" return cfg_bcs @@ -443,7 +459,8 @@ def setup_logging(logfile: str = "log.run_WE2E_tests", debug: bool = False) -> N #Parse arguments parser = argparse.ArgumentParser(epilog="For more information about config arguments (denoted in CAPS), see ush/config_defaults.yaml\n") - optional = parser._action_groups.pop() # Create a group for optional arguments so they can be listed after required args + # Create a group for optional arguments so they can be listed after required args + optional = parser._action_groups.pop() required = parser.add_argument_group('required arguments') required.add_argument('-m', '--machine', type=str, help='Machine name; see ush/machine/ for valid values', required=True) diff --git a/tests/WE2E/utils.py b/tests/WE2E/utils.py index fdf712b1ca..01e12eded7 100755 --- a/tests/WE2E/utils.py +++ b/tests/WE2E/utils.py @@ -5,14 +5,12 @@ import os import re import sys -import argparse import logging import subprocess import sqlite3 -import time import glob from textwrap import dedent -from datetime import datetime, timedelta +from datetime import datetime from contextlib import closing from multiprocessing import Pool @@ -26,8 +24,6 @@ load_shell_config ) -from check_python_version import check_python_version - REPORT_WIDTH = 100 def print_WE2E_summary(expt_dict: dict, debug: bool = False): @@ -98,7 +94,7 @@ def print_WE2E_summary(expt_dict: dict, debug: bool = False): summary_file = f'WE2E_summary_{datetime.now().strftime("%Y%m%d%H%M%S")}.txt' print(f"\nDetailed summary written to {summary_file}\n") - with open(summary_file, 'w') as f: + with open(summary_file, 'w', encoding="utf-8") as f: for line in summary: f.write(f"{line}\n") f.write("\nDetailed summary of each experiment:\n") @@ -217,7 +213,7 @@ def calculate_core_hours(expt_dict: dict) -> dict: def write_monitor_file(monitor_file: str, expt_dict: dict): try: - with open(monitor_file,"w") as f: + with open(monitor_file,"w", encoding="utf-8") as f: f.write("### WARNING ###\n") f.write("### THIS FILE IS AUTO_GENERATED AND REGULARLY OVER-WRITTEN BY WORKFKLOW SCRIPTS\n") f.write("### EDITS MAY RESULT IN MISBEHAVIOR OF EXPERIMENTS RUNNING\n") @@ -369,7 +365,6 @@ def update_expt_status(expt: dict, name: str, refresh: bool = False, debug: bool You can use ctrl-c to pause this script and inspect log files. """)) - else: logging.fatal("Some kind of horrible thing has happened") raise ValueError(dedent( @@ -386,7 +381,7 @@ def update_expt_status(expt: dict, name: str, refresh: bool = False, debug: bool return expt -def update_expt_status_parallel(expt_dict: dict, procs: int, refresh: bool = False) -> dict: +def update_expt_status_parallel(expt_dict: dict, procs: int, refresh: bool = False, debug: bool = False) -> dict: """ This function updates an entire set of experiments in parallel, drastically speeding up the process if given enough parallel processes. 
Given an experiment dictionary, it will @@ -399,6 +394,9 @@ def update_expt_status_parallel(expt_dict: dict, procs: int, refresh: bool = Fal expt_dict (dict): A dictionary containing information for all experiments procs (int): The number of parallel processes refresh (bool): "Refresh" flag to pass to update_expt_status() + debug (bool): Will capture all output from rocotorun. This will allow information such + as job cards and job submit messages to appear in the log files, but can + slow down the process drastically. Returns: dict: The updated dictionary of experiment dictionaries @@ -407,7 +405,7 @@ def update_expt_status_parallel(expt_dict: dict, procs: int, refresh: bool = Fal args = [] # Define a tuple of arguments to pass to starmap for expt in expt_dict: - args.append( (expt_dict[expt],expt,refresh) ) + args.append( (expt_dict[expt],expt,refresh,debug) ) # call update_expt_status() in parallel with Pool(processes=procs) as pool: @@ -416,8 +414,8 @@ def update_expt_status_parallel(expt_dict: dict, procs: int, refresh: bool = Fal # Update dictionary with output from all calls to update_expt_status() i = 0 for expt in expt_dict: - expt_dict[expt] = output[i] - i += 1 + expt_dict[expt] = output[i] + i += 1 return expt_dict @@ -469,7 +467,7 @@ def print_test_info(txtfile: str = "WE2E_test_info.txt") -> None: testdict[link[2]]["alternate_directory_name"] = link[1] # Print the file - with open(txtfile, 'w') as f: + with open(txtfile, 'w', encoding="utf-8") as f: # Field delimiter character d = "\" | \"" txt_output = ['"Test Name'] @@ -539,7 +537,7 @@ def compare_rocotostat(expt_dict,name): p = subprocess.run(rocotorun_cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True) rsout = p.stdout - # Parse each line of rocotostat output, extracting relevant information + # Parse each line of rocotostat output, extracting relevant information untracked_tasks = [] for line in rsout.split('\n'): # Skip blank lines and dividing lines of '=====...' From b26623fdbad0bd65834c327060da76450f1d989e Mon Sep 17 00:00:00 2001 From: "Michael Kavulich, Jr" Date: Mon, 6 Mar 2023 18:28:58 +0000 Subject: [PATCH 31/52] The big moment: ditching the old shell version. --- tests/WE2E/create_WE2E_resource_summary.py | 187 -- .../WE2E/get_WE2Etest_names_subdirs_descs.sh | 1633 ----------------- tests/WE2E/get_expts_status.sh | 475 ----- tests/WE2E/run_WE2E_tests.sh | 1379 -------------- tests/WE2E/setup_WE2E_tests.sh | 18 +- 5 files changed, 9 insertions(+), 3683 deletions(-) delete mode 100644 tests/WE2E/create_WE2E_resource_summary.py delete mode 100755 tests/WE2E/get_WE2Etest_names_subdirs_descs.sh delete mode 100755 tests/WE2E/get_expts_status.sh delete mode 100755 tests/WE2E/run_WE2E_tests.sh diff --git a/tests/WE2E/create_WE2E_resource_summary.py b/tests/WE2E/create_WE2E_resource_summary.py deleted file mode 100644 index 5095a9fe69..0000000000 --- a/tests/WE2E/create_WE2E_resource_summary.py +++ /dev/null @@ -1,187 +0,0 @@ -''' -Generate a summary of resources used for the WE2E test suite. - -Examples: - - To print usage - - python create_WE2E_resource_summary.py - python create_WE2E_resource_summary.py -h - - To print a report for all the experiments in an experiment directory - - python create_WE2E_resource_summary.py -e /path/to/expt_dir - - To print a report for all the grid_* and nco_* experiments. - - python create_WE2E_resource_summary.py -e /path/to/expt_dir \ - -n 'grid*' 'nco*' - - To compute a total estimated cost for all experiments on instances that are - $0.15 per core hour. 
- - python create_WE2E_resource_summary.py -e /path/to/expt_dir -c $0.15 - -Information about the output summary. - - - The core hours are an underestimate in many cases. - - Multiple tries are not captured. - - The use of a portion of a node or instance is not known. If the whole node - is used, but isn't reflected in the core count, the cores are not counted. - Partition information is not stored in the database, so mapping to a given - node type becomes ambiguous. - - For example, jobs that request 4 nodes with 2 processors per node with an - --exclusive flag will underestimate the total core hour usage by a factor - of 20 when using a 40 processor node. - - - When computing cost per job, it will also provide an underestimate for the - reasons listed above. - - Only one cost will be applied across all jobs. Rocoto jobs do not store - partition information in the job table, so was not included as an option here. - -''' - -import argparse -import glob -import os -import sys -import sqlite3 - -REPORT_WIDTH = 110 - -def parse_args(argv): - - - ''' - Function maintains the arguments accepted by this script. Please see - Python's argparse documenation for more information about settings of each - argument. - ''' - - parser = argparse.ArgumentParser( - description="Generate a usage report for a set of SRW experiments." - ) - - parser.add_argument( - '-e', '--expt_path', - help='The path to the directory containing the experiment \ - directories', - ) - parser.add_argument( - '-n', '--expt_names', - default=['*'], - help='A list of experiments to generate the report for. Wildcards \ - accepted by glob.glob may be used. If not provided, a report will be \ - generated for all experiments in the expt_path that have a Rocoto \ - database', - nargs='*', - ) - - # Optional - parser.add_argument( - '-c', '--cost_per_core_hour', - help='Provide the cost per core hour for the instance type used. \ - Only supports homogenous clusters.', - type=float, - ) - - return parser.parse_args(argv) - -def get_workflow_info(db_path): - - ''' Given the path to a Rocoto database, return the total number of tasks, - core hours and wall time for the workflow. ''' - - con = sqlite3.connect(db_path) - cur = con.cursor() - - # jobs schema is: - # (id INTEGER PRIMARY KEY, jobid VARCHAR(64), taskname VARCHAR(64), cycle - # DATETIME, cores INTEGER, state VARCHAR(64), native_state VARCHAR[64], - # exit_status INTEGER, tries INTEGER, nunknowns INTEGER, duration REAL) - # - # an example: - # 5|66993580|make_sfc_climo|1597017600|48|SUCCEEDED|COMPLETED|0|1|0|83.0 - try: - cur.execute('SELECT cores, duration from jobs') - except sqlite3.OperationalError: - return 0, 0, 0 - - workflow_info = cur.fetchall() - - core_hours = 0 - wall_time = 0 - ntasks = 0 - for cores, duration in workflow_info: - core_hours += cores * duration / 3600 - wall_time += duration / 60 - ntasks += 1 - - return ntasks, core_hours, wall_time - - -def fetch_expt_summaries(expts): - - ''' Get the important information from the database of each experiment, and - return a list, sorted by experiment name. 
''' - - summaries = [] - for expt in expts: - test_name = expt.split('/')[-1] - db_path = os.path.join(expt, 'FV3LAM_wflow.db') - if not os.path.exists(db_path): - print(f'No FV3LAM_wflow.db exists for expt: {test_name}') - continue - ntasks, core_hours, wall_time = get_workflow_info(db_path) - summaries.append((test_name, ntasks, core_hours, wall_time)) - - return sorted(summaries) - -def generate_report(argv): - - ''' Given user arguments, print a summary of the requested experiments' - usage information, including cost (if requested). ''' - - cla = parse_args(argv) - - experiments = [] - for expt in cla.expt_names: - experiments.extend(glob.glob( - os.path.join(cla.expt_path, expt) - )) - - header = f'{" "*60} Core Hours | Run Time (mins)' - if cla.cost_per_core_hour: - header = f'{header} | Est. Cost ($) ' - - print('-'*REPORT_WIDTH) - print('-'*REPORT_WIDTH) - print(header) - print('-'*REPORT_WIDTH) - - total_ch = 0 - total_cost = 0 - for name, ntasks, ch, wt in fetch_expt_summaries(experiments): - line = f'{name[:60]:<60s} {ch:^12.2f} {wt:^20.1f}' - if cla.cost_per_core_hour: - cost = ch * cla.cost_per_core_hour - line = f'{line} ${cost:<.2f}' - total_cost += cost - total_ch += ch - print(line) - - print('-'*REPORT_WIDTH) - print(f'TOTAL CORE HOURS: {total_ch:6.2f}') - if cla.cost_per_core_hour: - print(f'TOTAL COST: ${cla.cost_per_core_hour * total_ch:6.2f}') - - print('*'*REPORT_WIDTH) - print('WARNING: This data reflects only the job information from the last', - 'logged try. It does not account for the use \n of an entire node, only', - 'the actual cores requested. It may provide an underestimate of true compute usage.') - print('*'*REPORT_WIDTH) - - -if __name__ == "__main__": - generate_report(sys.argv[1:]) diff --git a/tests/WE2E/get_WE2Etest_names_subdirs_descs.sh b/tests/WE2E/get_WE2Etest_names_subdirs_descs.sh deleted file mode 100755 index 2e7c312701..0000000000 --- a/tests/WE2E/get_WE2Etest_names_subdirs_descs.sh +++ /dev/null @@ -1,1633 +0,0 @@ -#!/bin/bash - -# -#----------------------------------------------------------------------- -# -# This file defines a function that gathers and returns information about -# the WE2E tests available in the WE2E testing system. This information -# consists of the test names, the category subdirectories in which the -# test configuration files are located (relative to a base directory), -# the test IDs, and the test descriptions. This function optionally -# also creates a CSV (Comma-Separated Value) file containing various -# pieces of information about each of the workflow end-to-end (WE2E) -# tests. These are described in more detail below. -# -# The function takes as inputs the following arguments: -# -# WE2Edir: -# Directory in which the WE2E testing system is located. This system -# consists of the main script for running WE2E tests, various auxiliary -# scripts, and the test configuration files. -# -# generate_csv_file: -# Flag that specifies whether or not a CSV (Comma-Separated Value) file -# containing information about the WE2E tests should be generated. -# -# verbose: -# Optional verbosity flag. Should be set to "TRUE" or "FALSE". Default -# is "FALSE". -# -# outvarname_test_configs_basedir: -# Name of output variable in which to return the base directory of the -# WE2E test configuration files. -# -# outvarname_test_names: -# Name of output array variable in which to return the names of the WE2E -# tests. 
-# -# outvarname_test_subdirs: -# Name of output array variable in which to return the category subdirectories -# in which the WE2E tests are located. -# -# outvarname_test_ids: -# Name of output array variable in which to return the IDs of the WE2E -# tests. -# -# outvarname_test_descs: -# Name of output array variable in which to return the descriptions of -# the WE2E tests. -# -# Note that any input argument that is not specified in the call to this -# function gets set to a null string in the body of the function. In -# particular, if any of the arguments that start with "outvarname_" -# (indicating that they specify the name of an output variable) are not -# set in the call, the values corresponding to those variables are not -# returned to the calling script or function. -# -# In order to gather information about the available WE2E tests, this -# function sets the local variable test_configs_basedir to the full path -# of the base directory in which the test configuration files (which may -# be ordinary files or symlinks) are located. It sets this as follows: -# -# test_configs_basedir="${WE2Edir}/test_configs" -# -# If the argument outvarname_test_configs_basedir is specified in the -# call to this function, then the value of test_configs_basedir will be -# returned to the calling script or function (in the variable specified -# by outvarname_test_configs_basedir). -# -# The WE2E test configuration files are located in subdirectories under -# the base directory. This function sets the names of these subdirectories -# in the local array category_subdirs. We refer to these as "category" -# subdirectories because they are used for clarity to group the tests -# into categories (instead of putting them all directly under the base -# directory). For example, one category of tests might be those that -# test workflow capabilities such as running multiple cycles and ensemble -# forecasts, another might be those that run various combinations of -# grids, physics suites, and external models for ICs/LBCs, etc. Note -# that if a new category subdirectory is added under test_configs_basedir, -# its name must be added below as a new element in category_subdirs; -# otherwise, this new subdirectory will not be searched for test -# configuration files. Note also that if one of the elements of -# category_subdirs is ".", then this function will also search directly -# under the base directory itself for test configuration files. -# -# Once test_configs_basedir and category_subdirs are set, this function -# searches the category subdirectories for WE2E test configuration files. -# In doing so, it assumes that any ordinary file or symlink in the category -# subdirectories having a name of the form -# -# config.${test_name}.sh -# -# is a test configuration file, and it takes the name of the corresponding -# test to be given by whatever test_name in the above file name happens -# to be. Here, by "ordinary" file we mean an item in the file system -# that is not a symlink (or a directory or other more exotic entity). -# Also, for simplicity, we require that any configuration file that is a -# symlink have a target that is an ordinary configuration file, i.e. not -# a symlink. -# -# We allow test configuration files to be symlinks in order to avoid the -# presence of identical configuration files with different names in the -# WE2E testing system. 
For example, assume there is a test named -# "test_grid1" that is used to test whether the forecast model can run -# on a grid named "grid1", and assume that the configuration file for -# this test is an ordinary file located in a category subdirectory named -# "grids" that contains tests for various grids. Then the full path to -# this configuration file will be -# -# ${test_configs_basedir}/grids/config.test_grid1.sh -# -# Now assume that there is another category subdirectory named "suites" -# that contains configuration files for tests that check whether the -# forecast model can run with various physics suites. Thus, in order to -# have a test that checks whether the forecast model can run successfully -# with a physics suite named "suite1", we might create an ordinary -# configuration file named "config.test_suite1.sh" in "suites" (so that -# the corresponding test name is "test_suite1"). Thus, the full path to -# this configuration file would be -# -# ${test_configs_basedir}/suites/config.test_suite1.sh -# -# Now if test "test_grid1" happens to use physics suite "suite1", then -# we may be able to use that test for testing both "grid1" and "suite1". -# However, we'd still want to have a configuration file in the "suites" -# subdirectory with a test name that makes it clear that the purpose of -# the test is to run using "suite1". Then, since the WE2E testing system -# allows configuration files to by symlinks, instead of copying -# "config.test_grid1.sh" from the "grids" to the "suites" subdirectory -# and renaming it to "config.test_suite1.sh" (which would create two -# identical ordinary configuration files), we could simply make -# "config.test_suite1.sh" in "suites" a symlink to "config.test_grid1.sh" -# in "grids", i.e. -# -# ${test_configs_basedir}/suites/config.test_suite1.sh -# --> ${test_configs_basedir}/grids/config.test_grid1.sh -# -# With this approach, there will be only one ordinary configuration file -# to maintain. Note that there may be more than one symlink pointing to -# the same ordinary configuration file. For example, there may be another -# category subdirectory named "wflow_features" containing tests for -# various workflow features. Then if the test "test_grid1" runs a test -# that, in addition to running the forecast model on "grid1" using the -# "suite1" physics suite also performs subhourly output, then a symlink -# named "config.test_subhourly.sh" can be created under "wflow_features" -# that points to the configuration file "config.test_grid1.sh", i.e. -# -# ${test_configs_basedir}/wflow_features/config.test_subhourly.sh -# --> ${test_configs_basedir}/grids/config.test_grid1.sh -# -# Since the WE2E testing system allows configuration files to be symlinks, -# the same WE2E test may be referred to via multiple test names -- the -# test name corresponding to the ordinary configuration file ("test_grid1" -# in the example above) and any one of the test names corresponding to -# any symlinks that have this ordinary file as their target ("test_suite1" -# and "test_subhourly" in the example above). Here, for clarity we will -# refer to the test name derived from the name of the ordinary configuration -# file as the "primary" test name, and we will refer to the test names -# dervied from the symlinks as the alternate test names. Since these -# test names all represent the same actual test, we also assign to each -# group of primary and alternate test names a single test ID. 
This is -# simply an integer that uniquely identifies each group of primary and -# alternate test names. -# -# For each configuration file (which may be an ordinary file or a symlink) -# found in the category subdirectories, this function saves in local -# arrays the following information about the WE2E files: -# -# 1) The list of all available WE2E test names, both primary and alternate. -# 2) The category subdirectories under the base directory test_configs_basedir -# in which the test configuration files corresponding to each test -# name are located. -# 3) The IDs corresponding to each of the test names. -# 4) The test descriptions (if outvarname_test_descs is specified in the -# call to this function or if generate_csv_file is or gets set to -# "TRUE"; see below). -# -# These local arrays are sorted in order of increasing test ID. Within -# each group of tests that have the same ID, the primary test name is -# listed first followed by zero or more alternate test names. Note also -# that to reduce confusion, we do not allow two or more configuration -# files of the same name anywere under test_configs_basedir (either -# representing the same actual test or different ones). In other words, -# the list of all test names that this function generates cannot contain -# any duplicate names (either primary or alternate). After assembling -# the full list of test names, this function checks for such duplicates -# and exits with an error message if any are found. -# -# The following input arguments to this function specify the names of -# the arrays in which each of the quantities listed above should be -# returned (to the calling script or function): -# -# outvarname_test_names -# outvarname_test_subdirs -# outvarname_test_ids -# outvarname_test_descs -# -# If any of these is not specified in the call to this function, then -# the corresponding quantity will not be returned to the calling script -# or function. -# -# The test descriptions are headers consisting of one or more bash-style -# comment lines at the top of each ordinary test configuraiton file. -# They are extracted from each such file and placed in a local array only -# if one or both of the following conditions are met: -# -# 1) The user explicitly asks for the descriptions to be returned by -# specifying in the call to this function the name of the array in -# which to return them (by setting a value for the argument -# outvarname_test_descs). -# 2) A CSV file summarizing the WE2E tests will be generated (see below) -# -# For convenience, this function can generate a CSV (comma-separated -# value) file containing information about the WE2E tests. If it does, -# the file will be placed in the main WE2E testing system directory -# specified by the input argument WE2Edir. The CSV file can be read -# into a spreadsheet in Google Sheets (or another similar tool) to get -# an overview of all the available WE2E tests. The rows of the CSV file -# correspond to the primary WE2E tests, and the columns correspond to -# the (primary) test name, alternate test names (if any), test description, -# number of times the test calls the forecast model, and values of various -# SRW App experiment variables for that test. -# -# A CSV file will be generated in the directory specified by WE2Edir if -# one or more of the following conditions hold: -# -# 1) The input argument generate_csv_file is set to "TRUE" in the call -# to this function. 
-# 2) The input argument generate_csv_file is not set in the call to this -# function, and a CSV file does not already exist. -# 3) The input argument generate_csv_file is not set in the call to this -# function, a CSV file already exists, and the modification time of -# at least one category subdirectory in category_subdirs is later -# than that of the CSV file, i.e. the existing CSV file needs to be -# updated because the test configuration files may have changed in -# some way. -# -# A CSV file is not generated if generate_csv_file is explicitly set to -# "FALSE" in the call to this function (regardless of whether or not a -# CSV file already exists). If a CSV file is generated, it is placed in -# the directory specified by the input argment WE2Edir, and it overwrites -# any existing copies of the file in that directory. The contents of -# each column of the CSV file are described below. -# -#----------------------------------------------------------------------- -# -function get_WE2Etest_names_subdirs_descs() { -# -#----------------------------------------------------------------------- -# -# Save current shell options (in a global array). Then set new options -# for this script or function. -# -#----------------------------------------------------------------------- -# - { save_shell_opts; . $USHdir/preamble.sh; } > /dev/null 2>&1 -# -#----------------------------------------------------------------------- -# -# Source constant files. -# -#----------------------------------------------------------------------- -# - source_config $USHdir/constants.yaml -# -#----------------------------------------------------------------------- -# -# Specify the set of valid argument names for this script or function. -# Then process the arguments provided to it on the command line (which -# should consist of a set of name-value pairs of the form arg1="value1", -# arg2="value2", etc). -# -#----------------------------------------------------------------------- -# - local valid_args=( \ - "WE2Edir" \ - "generate_csv_file" \ - "verbose" \ - "outvarname_test_configs_basedir" \ - "outvarname_test_names" \ - "outvarname_test_subdirs" \ - "outvarname_test_ids" \ - "outvarname_test_descs" \ - ) - process_args "valid_args" "$@" -# -#----------------------------------------------------------------------- -# -# For debugging purposes, print out values of arguments passed to this -# script. Note that these will be printed out only if VERBOSE is set to -# TRUE. -# -#----------------------------------------------------------------------- -# - print_input_args "valid_args" -# -#----------------------------------------------------------------------- -# -# Make the default value of "verbose" "FALSE". Then make sure "verbose" -# is set to a valid value. -# -#----------------------------------------------------------------------- -# - verbose=${verbose:-"FALSE"} - check_var_valid_value "verbose" "valid_vals_BOOLEAN" - verbose=$(boolify "$verbose") -# -#----------------------------------------------------------------------- -# -# Declare local variables. 
-# -#----------------------------------------------------------------------- -# - local abs_cost_ref \ - ac \ - all_items \ - alt_test_name \ - alt_test_names \ - alt_test_names_subdirs \ - alt_test_prim_test_names \ - alt_test_subdir \ - alt_test_subdirs \ - array_names_vars_to_extract \ - array_names_vars_to_extract_orig \ - category_subdirs \ - cmd \ - column_titles \ - config_fn \ - crnt_item \ - crnt_title \ - csv_delimiter \ - csv_fn \ - csv_fp \ - cwd \ - default_val \ - dt_atmos \ - fcst_len_hrs \ - get_test_descs \ - hash_or_null \ - i \ - ii \ - j \ - jp1 \ - k \ - line \ - mod_time_csv \ - mod_time_subdir \ - msg \ - nf \ - num_alt_tests \ - num_category_subdirs \ - num_cdates \ - num_cycles_per_day \ - num_days \ - num_fcsts \ - num_fcsts_orig \ - num_grid_pts \ - num_items \ - num_occurrences \ - num_prim_tests \ - num_tests \ - num_time_steps \ - num_vars_to_extract \ - prim_array_names_vars_to_extract \ - prim_test_descs \ - prim_test_dt_atmos \ - prim_test_ids \ - prim_test_name_subdir \ - prim_test_names \ - prim_test_num_fcsts \ - prim_test_rel_cost \ - prim_test_subdirs \ - rc \ - regex_search \ - rel_cost \ - row_content \ - sort_inds \ - stripped_line \ - subdir \ - subdir_fp \ - subdirs \ - target_dir \ - target_fn \ - target_fp \ - target_prim_test_name \ - target_rp \ - target_test_name_or_null \ - test_configs_basedir \ - test_desc \ - test_descs \ - test_descs_esc_sq \ - test_descs_orig \ - test_descs_str \ - test_id \ - test_id_next \ - test_ids \ - test_ids_and_inds \ - test_ids_and_inds_sorted \ - test_ids_orig \ - test_ids_str \ - test_name \ - test_name_or_null \ - test_names \ - test_names_orig \ - test_names_str \ - test_subdirs \ - test_subdirs_orig \ - test_subdirs_str \ - test_type \ - units \ - val \ - var_name \ - var_name_at \ - vars_to_extract - - local dta \ - nxny \ - dta_r \ - nxny_r -# -#----------------------------------------------------------------------- -# -# Set variables associated with the CSV (comma-separated value) file that -# this function may generate. The conditions under which such a file is -# generated are described above in the description of this function. -# -#----------------------------------------------------------------------- -# -# Set the name and full path to the CSV file. -# - csv_fn="WE2E_test_info.csv" - csv_fp="${WE2Edir}/${csv_fn}" -# -# If generate_csv_file is specified as an input argument in the call to -# this function, make sure that it is set to a valid value. -# - if [ ! -z "${generate_csv_file}" ]; then - - check_var_valid_value "generate_csv_file" "valid_vals_BOOLEAN" - generate_csv_file=$(boolify "${generate_csv_file}") -# -# If generate_csv_file was not specified as an input argument in the -# call to this function, then it will have been set above to a null -# string. In this case, if a CSV file doesn't already exsit, reset -# generate_csv_file to "TRUE" so that one will be generated. If a CSV -# file does exist, get its modification time so that later below, we can -# compare it to the modification times of the category subdirectories -# and determine whether a new CSV file needs to be generated. -# -# Note that the modification "times" obtained here and later below using -# the "stat" utility are the seconds elapsed between Epoch (which is a -# fixed point in time) and the last modification time of the specified -# file, not the dates/times at which the file was last modified. This -# is due to the use of the "--format=%Y" flag in the call to "stat". 
We -# choose these "seconds since Epoch" units because they make it easier -# to determine which of two files is younger/older (the one with the -# larger seconds-since-Epoch will be the more recently modified file.) -# - else - - if [ ! -f "${csv_fp}" ]; then - mod_time_csv="0" - generate_csv_file="TRUE" - else - mod_time_csv=$( stat --format=%Y "${csv_fp}" ) - fi - - fi - - if [ "${generate_csv_file}" = "TRUE" ]; then - print_info_msg " -Will generate a CSV (Comma Separated Value) file (csv_fp) containing -information on all WE2E tests: - csv_fp = \"${csv_fp}\"" - fi -# -#----------------------------------------------------------------------- -# -# Set the base directory containing the WE2E test configuration files -# (or, more precisely, containing the category subdirectories in which -# the configuration files are located). -# -#----------------------------------------------------------------------- -# - test_configs_basedir="${WE2Edir}/test_configs" -# -#----------------------------------------------------------------------- -# -# Set the array category_subdirs that specifies the subdirectories under -# test_configs_basedir in which to search for WE2E test configuration -# files. Note that if "." is included as one of the elements of this -# array, then the base directory itself will also be searched. -# -#----------------------------------------------------------------------- -# - category_subdirs=( \ - "." \ - "grids_extrn_mdls_suites_community" \ - "grids_extrn_mdls_suites_nco" \ - "release_SRW_v1" \ - "verification" \ - "wflow_features" \ - ) - num_category_subdirs="${#category_subdirs[@]}" - - orig_dir=$(pwd) -# -#----------------------------------------------------------------------- -# -# Loop over the category subdirectories under test_configs_basedir -# (possibly including the base directory itself). In each subdirectory, -# consider all items that have names of the form -# -# config.${test_name}.sh -# -# and that are either ordinary files (i.e. not symlinks) or are symlinks -# whose targets are ordinary files having names of the form above. For -# each item that is an ordinary file, save the corresponding primary test -# name, the category subdirectory in which the item is located, and the -# test ID in the arrays -# -# prim_test_names -# prim_test_subdirs -# prim_test_ids -# -# respectively. For each item that is a symlink to an ordinary file, -# save the alternate test name corresponding to the symlink name, the -# category subdirectory in which the symlink is located, and the test -# name derived from the name of the symlink's target (i.e. the primary -# test name that this alternate test name corresponds to) in the arrays -# -# alt_test_names -# alt_test_subdirs -# alt_test_prim_test_names -# -# respectively. -# -#----------------------------------------------------------------------- -# - prim_test_names=() - prim_test_ids=() - prim_test_subdirs=() - prim_test_num_fcsts=() - prim_test_dt_atmos=() - prim_test_rel_cost=() - - alt_test_names=() - alt_test_subdirs=() - alt_test_prim_test_names=() -# -# Initialize the counter that will be used to assign test IDs to the -# primary test names. This will be incremented below every time a new -# primary test name is found. Note that we do not yet assign IDs to the -# alternate test names. These will be assigned IDs later below that -# will be identical to the IDs of the primary thest names they correspond -# to. 
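The ID bookkeeping described above amounts to numbering the primary tests in the order they are discovered and letting every alternate name inherit the ID of the primary test its symlink resolves to. A brief Python sketch of that grouping, reusing the hypothetical test_grid1/test_suite1/test_subhourly names from the comments above (illustration only, not part of the shell script):

    # Primary tests are numbered sequentially as they are found.
    prim_test_ids = {name: i + 1 for i, name in enumerate(["test_grid1"])}
    # Alternate names map back to the primary test their symlink targets.
    alt_test_prim_test_names = {"test_suite1": "test_grid1", "test_subhourly": "test_grid1"}
    test_ids = dict(prim_test_ids)
    test_ids.update({alt: prim_test_ids[prim] for alt, prim in alt_test_prim_test_names.items()})
    # test_ids == {"test_grid1": 1, "test_suite1": 1, "test_subhourly": 1}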
-# - test_id="0" - - for (( i=0; i<=$((num_category_subdirs-1)); i++ )); do - - subdir="${category_subdirs[$i]}" - subdir_fp="${test_configs_basedir}/$subdir" -# -# If at this point in the code generate_csv_file is still set to a null -# string, it means that a CSV file containing information about the WE2E -# tests already exists. In this case, a new version of this file needs -# to be generated only if one or more of the category subdirectories -# have modification times that are later than that of the existing CSV -# file. Check for this condition and set generate_csv_file accordingly. -# Note that this if-statement will be executed at most once since it sets -# generate_csv_file to "TRUE", after which the test for entering the if- -# statement will be false. -# - if [ -z "${generate_csv_file}" ]; then - if [ -f "${subdir_fp}/*.yaml" ]; then - mod_time_subdir=$( stat --format=%Y "${subdir_fp}"/*.yaml | sort -n | tail -1 ) - else - mod_time_subdir="0" - fi - if [ "${mod_time_subdir}" -gt "${mod_time_csv}" ]; then - generate_csv_file="TRUE" - print_info_msg " -The current category subdirectory (subdir) has a modification time -(mod_time_subdir) that is later than the modification time (mod_time_csv) -of the existing CSV file (csv_fp) containing WE2E test information: - subdir = \"${subdir}\" - mod_time_subdir = \"${mod_time_subdir}\" (in units of seconds since Epoch) - mod_time_csv = \"${mod_time_csv}\" (in units of seconds since Epoch) - csv_fp = \"${csv_fp}\" -Thus, the CSV file must be updated. Setting generate_csv_file to \"TRUE\" -to generate a new CSV file: - generate_csv_file = \"${generate_csv_file}\"" - fi - fi -# -# Change location to the current category subdirectory. -# - cd_vrfy "${subdir_fp}" -# -# Get the contents of the current subdirectory. We consider each item -# that has a name of the form -# -# config.${test_name}.sh -# -# to be a WE2E test configuration file, and we take the name of the test -# to be whatever ${test_name} in the above expression corresponds to. -# We ignore all other items in the subdirectory. -# - all_items=( $(ls -1) ) - num_items="${#all_items[@]}" - for (( j=0; j<=$((num_items-1)); j++ )); do - - crnt_item="${all_items[$j]}" -# -# Try to extract the name of the test from the name of the current item -# and place the result in test_name_or_null. test_name_or_null will -# contain the name of the test only if the item has a name of the form -# "config.${test_name}.sh", in which case it will be equal to ${test_name}. -# Otherwise, it will be a null string. -# - regex_search="^config\.(.*)\.yaml$" - test_name_or_null=$( printf "%s\n" "${crnt_item}" | \ - sed -n -r -e "s/${regex_search}/\1/p" ) -# -#----------------------------------------------------------------------- -# -# Take further action for this item only if it has a name of the form -# above expected for a WE2E test configuration file, which will be the -# case only if test_name_or_null is not a null string. -# -#----------------------------------------------------------------------- -# - if [ ! -z "${test_name_or_null}" ]; then -# -#----------------------------------------------------------------------- -# -# Use bash's -h conditional operator to check whether the current item -# (which at this point is taken to be a test configuration file) is a -# symlink. If it is a symlink, the only type of entity we allow the -# target to be is an existing ordinary file. In particular, to keep the -# WE2E testing system simple, we do not allow the target to be a symlink. 
-# Of course, it also cannot be a directory or other exotic entity. Below, -# we check for these various possibilities and only allow the case of the -# target being an existing ordinary file. -# -#----------------------------------------------------------------------- -# - if [ -h "${crnt_item}" ]; then -# -# Extract the name of the test from the name of the symlink and append -# it to the array alt_test_names. Also, append the category subdirectory -# under test_configs_basedir in which the symlink is located to the array -# alt_test_subdirs. -# - alt_test_names+=("${test_name_or_null}") - alt_test_subdirs+=("$subdir") -# -# Get the full path to the target of the symlink without following targets -# that are themselves symlinks. The "readlink" utility without any flags -# (such as -f) can do this, but when -f is omitted, it returns a relative -# path. To convert that relative path to an absolute path without resolving -# symlinks, use the "realpath" utility with the -s flag. -# - target_rp=$( readlink "${crnt_item}" ) - target_fp=$( realpath -s "${target_rp}" ) -# -# Use bash's -h conditional operator to check whether the target itself -# is a symlink. For simplicity, this is not allowed. Thus, in this -# case, print out an error message and exit. -# - if [ -h "${target_fp}" ]; then - cwd="$(pwd)" - print_err_msg_exit "\ -The symlink (crnt_item) in the current directory (cwd) has a target -(target_fp) that is itself a symlink: - cwd = \"${cwd}\" - crnt_item = \"${crnt_item}\" - target_fp = \"${target_fp}\" -This is not allowed. Please ensure that the current item points to an -ordinary file (i.e. not a symlink) and rerun." - fi -# -# Now use bash's -f conditional operator to check whether the target is -# a "regular" file (as defined by bash). Note that this test will return -# false if the target is a directory or does not exist and true otherwise. -# Thus, the negation of this test applied to the target (i.e. ! -f) that -# we use below will be true if the target is not an existing file. In -# this case, we print out an error message and exit. -# -# Note also that the -f operator recursively follows a symlink passed to -# it as an argument. For this reason, we need to first perform the -h -# test above to check that the target (without resolving symlinks) is -# itself not a symlink. The -f test below does not help in this regard. -# - if [ ! -f "${target_fp}" ]; then - cwd="$(pwd)" - print_err_msg_exit "\ -The symlink (crnt_item) in the current directory (cwd) has a target -(target_fp) that is not an existing ordinary file: - cwd = \"${cwd}\" - crnt_item = \"${crnt_item}\" - target_fp = \"${target_fp}\" -This is probably because either the target doesn't exist or is a directory, -neither of which is allowed because the symlink must point to an ordinary -(i.e. non-symlink) WE2E test configuration file. Please either point the -symlink to such a file or remove it, then rerun." - fi -# -# Get the name of the directory in which the target is located. -# - target_dir=$( dirname "${target_fp}" ) -# -# Next, check whether the directory in which the target is located is -# under the base directory of the WE2E test configuration files (i.e. -# test_configs_basedir). 
We require that the target be located in one -# of the subdirectories under test_configs_basedir (or directly under -# test_configs_basedir itself) because we don't want to deal with tests -# that have configuration files that may be located anywhere in the file -# system; for simplicity, we want all configuration files to be placed -# somewhere under test_configs_basedir. -# -# Note that the bash parameter expansion ${var/search/replace} returns -# $var but with the first instance of "search" replaced by "replace" if -# the former is found in $var. Otherwise, it returns the original $var. -# If "replace" is omitted, then "search" is simply deleted. Thus, in -# the if-statement below, if ${target_dir/${test_configs_basedir}/} -# returns ${target_dir} without changes (in which case the test in the -# if-statment will evaluate to true), it means ${test_configs_basedir} -# was not found within ${target_dir}. That in turn means ${target_dir} -# is not a location under ${test_configs_basedir}. In this case, print -# out a warning and exit. -# - if [ "${target_dir}" = "${target_dir/${test_configs_basedir}/}" ]; then - cwd="$(pwd)" - print_err_msg_exit "\ -The symlink (crnt_item) in the current directory (cwd) has a target -(target_fp) located in a directory (target_dir) that is not somewhere -under the WE2E tests base directory (test_configs_basedir): - cwd = \"${cwd}\" - crnt_item = \"${crnt_item}\" - target_fp = \"${target_fp}\" - target_dir = \"${target_dir}\" - test_configs_basedir = \"${test_configs_basedir}\" -For clarity, we require all WE2E test configuration files to be located -somewhere under test_configs_basedir (either directly in this base -directory on in a subdirectory). Please correct and rerun." - fi -# -# Finally, check whether the name of the target file is in the expected -# format "config.${test_name}.sh" for a WE2E test configuration file. -# If not, print out a warning and exit. -# - target_fn=$( basename "${target_fp}" ) - target_test_name_or_null=$( printf "%s\n" "${target_fn}" | \ - sed -n -r -e "s/${regex_search}/\1/p" ) - if [ -z "${target_test_name_or_null}" ]; then - cwd="$(pwd)" - print_err_msg_exit "\ -The symlink (crnt_item) in the current directory (cwd) has a target -(target_fn; located in the directory target_dir) with a name that is -not in the form \"config.[test_name].sh\" expected for a WE2E test -configuration file: - cwd = \"${cwd}\" - crnt_item = \"${crnt_item}\" - target_dir = \"${target_dir}\" - target_fn = \"${target_fn}\" -Please either rename the target to have the form specified above or -remove the symlink, then rerun." - fi -# -# Now that all the checks above have succeeded, for later use save the -# name of the WE2E test that the target represents in the array -# alt_test_prim_test_names. -# - alt_test_prim_test_names+=("${target_test_name_or_null}") -# -#----------------------------------------------------------------------- -# -# If the current item is not a symlink... -# -#----------------------------------------------------------------------- -# - else -# -# Check if the current item is a "regular" file (as defined by bash) and -# thus not a directory or some other exotic entity. If it is a regular -# file, save the corresponding WE2E test name and category subdirectory -# in the arrays prim_test_names and prim_test_subdirs, respectively. -# Also, set its test ID and save it in the array prim_test_ids. If the -# current item is not a regular file, print out a warning and exit. 
-# - if [ -f "${crnt_item}" ]; then - prim_test_names+=("${test_name_or_null}") - prim_test_subdirs+=("${subdir}") - test_id=$((test_id+1)) - prim_test_ids+=("${test_id}") - else - cwd="$(pwd)" - print_err_msg_exit "\ -The item (crnt_item) in the current directory (cwd) is not a symlink, -but it is also not a \"regular\" file (i.e. it fails bash's -f conditional -operator): - cwd = \"${cwd}\" - crnt_item = \"${crnt_item}\" - [ -f "${crnt_item}" ] = $([ -f "${crnt_item}" ]) -This is probably because it is a directory. Please correct and rerun." - fi - - fi - - fi - - done - - done -# -# For later use, save the number of primary and alternate test names in -# variables. -# - num_prim_tests="${#prim_test_names[@]}" - num_alt_tests="${#alt_test_names[@]}" -# -# Change location back to original directory. -# - cd_vrfy "${orig_dir}" -# -#----------------------------------------------------------------------- -# -# Create the array test_names that contains both the primary and alternate -# test names found above (with the list of primary names first followed -# by the list of alternate names). Also, create the array test_subdirs -# that contains the category subdirectories corresponding to these test -# names. -# -#----------------------------------------------------------------------- -# - test_names=("${prim_test_names[@]}") - test_subdirs=("${prim_test_subdirs[@]}") - if [ "${num_alt_tests}" -gt "0" ]; then - test_names+=("${alt_test_names[@]:-}") - test_subdirs+=("${alt_test_subdirs[@]:-}") - fi -# -#----------------------------------------------------------------------- -# -# For simplicity, make sure that each test name (either primary or -# alternate) appears exactly once in the array test_names. This is -# equivalent to requiring that a test configuration file (ordinary file -# or symlink) corresponding to each name appear exactly once anywhere -# under the base directory test_configs_basedir. -# -#----------------------------------------------------------------------- -# - num_tests="${#test_names[@]}" - for (( i=0; i<=$((num_tests-1)); i++ )); do - - test_name="${test_names[$i]}" - - subdirs=() - num_occurrences=0 - for (( j=0; j<=$((num_tests-1)); j++ )); do - if [ "${test_names[$j]}" = "${test_name}" ]; then - num_occurrences=$((num_occurrences+1)) - subdirs+=("${test_subdirs[$j]}") - fi - done - - if [ "${num_occurrences}" -ne "1" ]; then - print_err_msg_exit "\ -There must be exactly one WE2E test configuration file (which may be a -ordinary file or a symlink) corresponding to each test name anywhere -under the base directory test_configs_basedir. However, the number of -configuration files (num_occurences) corresponding to the current test -name (test_name) is not 1: - test_configs_basedir = \"${test_configs_basedir}\" - test_name = \"${test_name}\" - num_occurrences = ${num_occurrences} -These configuration files all have the name - \"config.${test_name}.yaml\" -and are located in the following category subdirectories under -test_configs_basedir: - subdirs = ( $( printf "\"%s\" " "${subdirs[@]}" )) -Please rename or remove all but one of these configuration files so that -they correspond to unique test names and rerun." 
- fi - - done -# -#----------------------------------------------------------------------- -# -# If the input argument outvarname_test_descs is not set to a null string -# (meaning that the name of the array in which to return the WE2E test -# descriptions is specified in the call to this function), or if the flag -# generate_csv_file is set to "TRUE", we need to obtain the WE2E test -# descriptions from the test configuration files. In these cases, set -# the local variable get_test_descs to "TRUE". Otherwise, set it to -# "FALSE". -# -#----------------------------------------------------------------------- -# - get_test_descs="FALSE" - if [ ! -z "${outvarname_test_descs}" ] || \ - [ "${generate_csv_file}" = "TRUE" ]; then - get_test_descs="TRUE" - fi -# -#----------------------------------------------------------------------- -# -# If get_test_descs is set to "TRUE", loop through all the primary test -# names and extract from the configuration file of each the description -# of the test. This is assumed to be a section of (bash) comment lines -# at the top of the configuration file. Then append the test description -# to the array prim_test_descs. Note that we assume the first non-comment -# line at the top of the configuration file indicates the end of the test -# description header. -# -#----------------------------------------------------------------------- -# - if [ "${get_test_descs}" = "TRUE" ]; then -# -# Specify in "vars_to_extract" the list of experiment variables to extract -# from each test configuration file (and later to place in the CSV file). -# Recall that the rows of the CSV file correspond to the various WE2E -# tests, and the columns correspond to the test name, description, and -# experiment variable values. The elements of "vars_to_extract" should -# be the names of SRW App experiment variables that are (or can be) -# specified in the App's configuration file. Note that if a variable is -# not specified in the test configuration file, in most cases its value -# is set to an empty string (and recorded as such in the CSV file). In -# some cases, it is set to some other value (e.g. for the number of -# ensemble members NUM_ENS_MEMBERS, it is set to 1). -# - vars_to_extract=( "PREDEF_GRID_NAME" \ - "CCPP_PHYS_SUITE" \ - "EXTRN_MDL_NAME_ICS" \ - "EXTRN_MDL_NAME_LBCS" \ - "DATE_FIRST_CYCL" \ - "DATE_LAST_CYCL" \ - "INCR_CYCL_FREQ" \ - "FCST_LEN_HRS" \ - "LBC_SPEC_INTVL_HRS" \ - "NUM_ENS_MEMBERS" \ - ) - num_vars_to_extract="${#vars_to_extract[@]}" -# -# Create names of local arrays that will hold the value of the corresponding -# variable for each test. Then use these names to define them as empty -# arrays. [The arrays named "prim_..." are to hold values for only the -# primary tests, while other arrays are to hold values for all (primary -# plus alternate) tests.] -# - prim_array_names_vars_to_extract=( $( printf "prim_test_%s_vals " "${vars_to_extract[@]}" ) ) - array_names_vars_to_extract=( $( printf "%s_vals " "${vars_to_extract[@]}" ) ) - for (( k=0; k<=$((num_vars_to_extract-1)); k++ )); do - cmd="local ${prim_array_names_vars_to_extract[$k]}=()" - eval $cmd - cmd="local ${array_names_vars_to_extract[$k]}=()" - eval $cmd - done - - print_info_msg " -Gathering test descriptions and experiment variable values from the -configuration files of the primary WE2E tests... 
-" - - prim_test_descs=() - for (( i=0; i<=$((num_prim_tests-1)); i++ )); do - - test_name="${prim_test_names[$i]}" - print_info_msg "\ - Reading in the test description for primary WE2E test: \"${test_name}\" - In category (subdirectory): \"${subdir}\" -" - subdir=("${prim_test_subdirs[$i]}") - cd_vrfy "${test_configs_basedir}/$subdir" -# -# Keep reading lines from the current test's configuration line until -# a line is encountered that does not start with zero or more spaces, -# followed by the hash symbol (which is the bash comment character) -# possibly followed by a single space character. -# -# In the while-loop below, we read in every such line, strip it of any -# leading spaces, the hash symbol, and possibly another space and append -# what remains to the local variable test_desc. -# - config_fn="config.${test_name}.yaml" - config_fp="${test_configs_basedir}/$subdir/$config_fn" - test_desc="$(config_to_yaml_str $config_fp -k "metadata")" - test_desc="${test_desc:27}" - test_desc="${test_desc::${#test_desc}-1}" -# -# Finally, save the description of the current test as the next element -# of the array prim_test_descs. -# - prim_test_descs+=("${test_desc}") -# -# Get from the current test's configuration file the values of the -# variables specified in "vars_to_extract". Then save the value in the -# arrays specified by "prim_array_names_vars_to_extract". -# - config_content=$(config_to_shell_str $config_fp) - for (( k=0; k<=$((num_vars_to_extract-1)); k++ )); do - - var_name="${vars_to_extract[$k]}" - set +e - cmd=$( grep "^[ ]*${var_name}=" <<< "${config_content}" ) - set -e - eval $cmd - - if [ -z "${!var_name+x}" ]; then - - msg=" - The variable \"${var_name}\" is not defined in the current test's - configuration file (config_fn): - config_fn = \"${config_fn}\" - Setting the element in the array \"${prim_array_names_vars_to_extract[$k]}\" - corresponding to this test to" - - case "${var_name}" in - - "NUM_ENS_MEMBERS") - default_val="1" - msg=$msg": - ${var_name} = \"${default_val}\"" - ;; - - "INCR_CYCL_FREQ") - default_val="24" - msg=$msg": - ${var_name} = \"${default_val}\"" - ;; - - *) - default_val="" - msg=$msg" an empty string." - ;; - - esac - cmd="${var_name}=\"${default_val}\"" - eval $cmd - - print_info_msg "$verbose" "$msg" - cmd="${prim_array_names_vars_to_extract[$k]}+=(\"'${default_val}\")" - - else -# -# The following are important notes regarding how the variable "cmd" -# containing the command that will append an element to the array -# specified by ${prim_array_names_vars_to_extract[$k]} is formulated: -# -# 1) If all the experiment variables were scalars, then the more complex -# command below could be replaced with the following: -# -# cmd="${prim_array_names_vars_to_extract[$k]}+=(\"${!var_name}\")" -# -# But some variables are arrays, so we need the more complex approach -# to cover those cases. -# -# 2) The double quotes (which need to be escaped here, i.e. \") are needed -# so that for any experiment variables that are arrays, all the elements of -# the array are combined together and treated as a single element. For -# example, if a variable CYCL_HRS is set to the array ("00" "12"), we want -# the value saved in the local array here to be a single element consisting -# of "00 12". Otherwise, "00" and "12" will be treated as separate -# elements, and more than one element would be added to the array (which -# would be incorrect here). -# -# 3) The single quote (which needs to be escaped here, i.e. \') is needed -# so that any numbers (e.g. 
a set of cycle hours such as "00 12") are -# treated as strings when the CSV file is opened in Google Sheets. -# If this is not done, Google Sheets will remove leading zeros. -# - var_name_at="${var_name}[@]" - cmd="${prim_array_names_vars_to_extract[$k]}+=(\'\"${!var_name_at}\")" - fi - eval $cmd - - done -# -# Calculate the number of forecasts that will be launched by the current -# test. The "10#" forces bash to treat the following number as a decimal -# (not hexadecimal, etc). Note that INCR_CYCL_FREQ is in units of hours, -# so the factor of 3600 is needed to convert the number of seconds to hours. -# - # Convert cycles to seconds - if [[ $DATE_FIRST_CYCL != [0-9]* ]]; then - DATE_FIRST_CYCL=$(eval ${DATE_FIRST_CYCL}) - fi - if [[ $DATE_LAST_CYCL != [0-9]* ]]; then - DATE_LAST_CYCL=$(eval ${DATE_LAST_CYCL}) - fi - first=$(date --utc --date "${DATE_FIRST_CYCL:0:8} ${DATE_FIRST_CYCL:8:2}" +"%s") - last=$(date --utc --date "${DATE_LAST_CYCL:0:8} ${DATE_LAST_CYCL:8:2}" +"%s") - # Diff and convert seconds to number of cycles where freq is in - # hours - nf=$(( ($last - $first) / 3600 / 10#${INCR_CYCL_FREQ} )) -# -# Save the number of forecasts launched by the current test in an -# appropriately named array. In the following, the single quote at the -# beginning forces Google Sheets to interpret this quantity as a string. -# This prevents any automatic number fomatting from being applied when -# the CSV file is imported into Google Sheets. -# - prim_test_num_fcsts+=( "'$nf" ) -# -#----------------------------------------------------------------------- -# -# Calculate the relative dynamics cost of the test, i.e. the relative -# cost of running only the dynamics portion of the forecast model. Here, -# we define the absolute cost of running the dynamics as -# -# abs_cost = nx*ny*num_time_steps*num_fcsts -# -# where nx and ny are the horizontal dimensions of the grid, num_time_steps -# is the number of time steps that need to be taken to complete one -# forecast within the test, and num_fcsts are the number of forecasts -# the test makes (e.g. if the test performs an ensemble forecast, the -# value of this parameter will be greater than 1). -# -# The relative cost is obtained by dividing the absolute cost of a test -# by the absolute cost of a reference 6-hour forecast on the RRFS_CONUS_25km -# predefined grid using the default time step for that grid. This is -# calculated later below and saved in the variable abs_cost_ref. Thus, -# the relative cost is given by -# -# rel_cost = abs_cost/abs_cost_ref -# -# defined as abs_cost_ref. -# -# Note that the (absolute or relative) cost defined here does not take -# into account the costs of running different physics suites, nor does -# it take into account the costs of workflow tasks other than the forecast -# task (e.g. generation of initial and boundary conditions, post processing, -# verification, etc; that is why it is referred to as the relative DYNAMICS -# cost). Note also that if in the future the number of levels in the -# vertical becomes a user-specified parameter, that will also have to be -# added to the definition of the cost. -# -#----------------------------------------------------------------------- -# - -# -# To calculate the absolute cost as defined above, we need the number of -# points in the two horizontal directions, nx and ny. Also, to calculate -# the number of time steps, we need the size of the time step (dt_atmos). -# These depend on the grid being used and must be extracted from the grid -# parameters. 
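As a concrete illustration of the cycle-count arithmetic above, here is a small self-contained sketch. The cycle dates and frequency are made-up example values; only the variable names and the formula mirror the script above.

#!/bin/bash
# Hypothetical example values; the arithmetic mirrors the nf calculation above.
DATE_FIRST_CYCL="2019070100"
DATE_LAST_CYCL="2019070212"
INCR_CYCL_FREQ="12"
# Convert the cycle dates to seconds since the epoch.
first=$(date --utc --date "${DATE_FIRST_CYCL:0:8} ${DATE_FIRST_CYCL:8:2}" +"%s")
last=$(date --utc --date "${DATE_LAST_CYCL:0:8} ${DATE_LAST_CYCL:8:2}" +"%s")
# 36 hours between the first and last cycles, divided by a 12-hour cycle
# frequency, gives nf = 3.
nf=$(( (last - first) / 3600 / 10#${INCR_CYCL_FREQ} ))
echo "nf = ${nf}"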
The way the latter are obtained depends on whether or not -# a predefined grid is being used. -# -params=$(\ - PREDEF_GRID_NAME="${PREDEF_GRID_NAME}" \ - QUILTING="FALSE" \ - $USHdir/calculate_cost.py -c "${test_configs_basedir}/$subdir/${config_fn}") - -read dta nxny dta_r nxny_r <<< "${params}" - -# -# Save the value of dta (which is just dt_atmos) in an array. The single -# quote at the beginning forces Google Sheets to interpret this quantity -# as a string. This prevents any automatic number fomatting from being -# applied when the CSV file is imported into Google Sheets. -# - prim_test_dt_atmos+=( "'${dta}" ) -# -# Calculate the total number of horizontal grid points. -# - num_grid_pts=$nxny -# -# Calculate the number of time steps for the test. Note that FCST_LEN_HRS -# is in units of hours while dta is in units of seconds. Also, the factor -# dta - 1 in the numerator is to cause the division to round up to the -# nearest integer (adding the denominator minus one to the numerator will -# make this happen). -# - num_time_steps=$(( (FCST_LEN_HRS*3600 + dta - 1)/dta )) -# -# Calculate the absolute cost of the test. -# - ac=$(( num_grid_pts*num_time_steps*nf )) -# -# Save the absolute cost for this test in the array that will eventually -# contain the relative cost. The values in this array will be divided -# by abs_cost_ref later below to obtain relative costs. -# - prim_test_rel_cost+=( "$ac" ) -# -# Unset the experiment variables defined for the current test so that -# they are not accidentally used for the next one. -# - for (( k=0; k<=$((num_vars_to_extract-1)); k++ )); do - var_name="${vars_to_extract[$k]}" - cmd="unset ${var_name}" - eval $cmd - done - - done # End loop over primary tests -# -#----------------------------------------------------------------------- -# -# Normalize the absolute costs calculated above for each test by the -# absolute cost of a reference 6-hour forecast on the RRFS_CONUS_25km -# predefined grid (using the default time step for that grid). -# -#----------------------------------------------------------------------- -# - num_grid_pts=$nxny_r - fcst_len_hrs="6" - num_time_steps=$(( (fcst_len_hrs*3600 + dta_r - 1)/dta_r )) - abs_cost_ref=$(( num_grid_pts*num_time_steps )) - - for (( i=0; i<=$((num_prim_tests-1)); i++ )); do -# -# In the following, the single quote at the beginning forces Google Sheets -# to interpret this quantity as a string. This prevents any automatic -# number fomatting from being applied when the CSV file is imported into -# Google Sheets. -# - prim_test_rel_cost[$i]="'"$( printf "%g" \ - $( bc -l <<< " ${prim_test_rel_cost[$i]}/${abs_cost_ref}" ) ) - done - - fi -# -#----------------------------------------------------------------------- -# -# Create the arrays test_ids and test_descs that initially contain the -# test IDs and descriptions corresponding to the primary test names -# (those of the alternate test names will be appended below). Then, in -# the for-loop, do same for the arrays containing the experiment variable -# values for each test. 
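As a rough illustration of the cost formula above, the following standalone sketch uses entirely made-up grid sizes and time steps (they do not correspond to any real SRW predefined grid); in the script itself the reference cost comes from the RRFS_CONUS_25km 6-hour forecast via calculate_cost.py.

#!/bin/bash
# Hypothetical values only, to show the round-up integer division and the
# normalization step used above.
FCST_LEN_HRS=6
dta=64                       # forecast-model time step [s] (made up)
num_grid_pts=$(( 200*110 ))  # nx*ny (made up)
nf=3                         # number of forecasts in the test (made up)
# Adding (dta - 1) before dividing makes the integer division round up:
# 6*3600 = 21600 s of forecast; 21600/64 = 337.5, so 338 steps are needed.
num_time_steps=$(( (FCST_LEN_HRS*3600 + dta - 1)/dta ))
abs_cost=$(( num_grid_pts*num_time_steps*nf ))
# Normalize by a (hypothetical) reference 6-hour forecast to get the
# relative cost.
num_grid_pts_ref=$(( 220*128 ))
dta_ref=40
num_time_steps_ref=$(( (6*3600 + dta_ref - 1)/dta_ref ))
abs_cost_ref=$(( num_grid_pts_ref*num_time_steps_ref ))
rel_cost=$( printf "%g" $( bc -l <<< "${abs_cost}/${abs_cost_ref}" ) )
echo "num_time_steps = ${num_time_steps}, rel_cost = ${rel_cost}"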
-# -#----------------------------------------------------------------------- -# - test_ids=("${prim_test_ids[@]}") - if [ "${get_test_descs}" = "TRUE" ]; then - test_descs=("${prim_test_descs[@]}") - num_fcsts=("${prim_test_num_fcsts[@]}") - dt_atmos=("${prim_test_dt_atmos[@]}") - rel_cost=("${prim_test_rel_cost[@]}") - for (( k=0; k<=$((num_vars_to_extract-1)); k++ )); do - cmd="${array_names_vars_to_extract[$k]}=(\"\${${prim_array_names_vars_to_extract[$k]}[@]}\")" - eval $cmd - done - fi -# -#----------------------------------------------------------------------- -# -# Append to the arrays test_ids and test_descs the test IDs and descriptions -# of the alternate test names. We set the test ID and description of -# each alternate test name to those of the corresponding primary test -# name. Then, in the inner for-loop, do the same for the arrays containing -# the experiment variable values. -# -#----------------------------------------------------------------------- -# - for (( i=0; i<=$((num_alt_tests-1)); i++ )); do - - alt_test_name="${alt_test_names[$i]}" - alt_test_subdir=("${alt_test_subdirs[$i]}") - target_prim_test_name="${alt_test_prim_test_names[$i]}" - - num_occurrences=0 - for (( j=0; j<=$((num_prim_tests-1)); j++ )); do - if [ "${prim_test_names[$j]}" = "${target_prim_test_name}" ]; then - test_ids+=("${prim_test_ids[$j]}") - if [ "${get_test_descs}" = "TRUE" ]; then - test_descs+=("${prim_test_descs[$j]}") - num_fcsts+=("${prim_test_num_fcsts[$j]}") - dt_atmos+=("${prim_test_dt_atmos[$j]}") - rel_cost+=("${prim_test_rel_cost[$j]}") - for (( k=0; k<=$((num_vars_to_extract-1)); k++ )); do - cmd="${array_names_vars_to_extract[$k]}+=(\"\${${prim_array_names_vars_to_extract[$k]}[$j]}\")" - eval $cmd - done - fi - num_occurrences=$((num_occurrences+1)) - fi - done - - if [ "${num_occurrences}" -ne 1 ]; then - print_err_msg_exit "\ -Each alternate test name must have a corresponding primary test name that -occurs exactly once in the full list of primary test names. For the -current alternate test name (alt_test_name), the number of occurrences -(num_occurrences) of the corresponding primary test name (target_prim_test_name) -is not 1: - alt_test_name = \"${alt_test_name}\" - target_prim_test_name = \"${target_prim_test_name}\" - num_occurrences = \"${num_occurrences}\" -Please correct and rerun." - fi - - done -# -#----------------------------------------------------------------------- -# -# Sort in order of increasing test ID the arrays containing the names, -# IDs, category subdirectories, and descriptions of the WE2E tests as -# well as the arrays containing the experiment variable values for each -# test. -# -# For this purpose, we first create an array (test_ids_and_inds) each -# of whose elements consist of the test ID, the test type, and the index -# of the array element (with a space used as delimiter). The test type -# is simply an identifier to distinguish between primary test names and -# alternate (symlink-derived) ones. For the former, we set the test -# type to "A", and for the latter, we set it to "B". We do this in order -# to obtain a sorted result in which the elements are not only sorted by -# test ID but also sorted by test type such that within each group of -# elements/tests that has the same test ID, the primary test name is -# listed first followed by zero or more alternte test names. -# -# Next, we sort the array test_ids_and_inds using the "sort" utility -# and save the result in the new array test_ids_and_inds_sorted. 
The -# latter will be sorted according to test ID because that is the first -# quantity on each line (element) of the original array test_ids_and_inds. -# Also, as described above, for each group of test names that have the -# same ID, the names will be sorted such that the primary test name is -# listed first. -# -# Finally, we extract from test_ids_and_inds_sorted the second number -# in each element (the one after the first number, which is the test ID, -# and the test type, which we no longer need), which is the original -# array index before sorting, and save the results in the array sort_inds. -# This array will contain the original indices in sorted order that we -# then use to sort the arrays containing the WE2E test names, IDs, -# subdirectories, descriptions, and experiment variable values. -# -#----------------------------------------------------------------------- -# - test_ids_and_inds=() - for (( i=0; i<=$((num_tests-1)); i++ )); do - test_type="A" - if [ "$i" -ge "${num_prim_tests}" ]; then - test_type="B" - fi - test_ids_and_inds[$i]="${test_ids[$i]} ${test_type} $i" - done - - readarray -t "test_ids_and_inds_sorted" < \ - <( printf "%s\n" "${test_ids_and_inds[@]}" | sort --numeric-sort ) - - sort_inds=() - regex_search="^[ ]*([0-9]*)[ ]*[AB][ ]*([0-9]*)$" - for (( i=0; i<=$((num_tests-1)); i++ )); do - sort_inds[$i]=$( printf "%s" "${test_ids_and_inds_sorted[$i]}" | \ - sed -n -r -e "s/${regex_search}/\2/p" ) - done - - local test_names_orig=( "${test_names[@]}" ) - local test_subdirs_orig=( "${test_subdirs[@]}" ) - local test_ids_orig=( "${test_ids[@]}" ) - for (( i=0; i<=$((num_tests-1)); i++ )); do - ii="${sort_inds[$i]}" - test_names[$i]="${test_names_orig[$ii]}" - test_subdirs[$i]="${test_subdirs_orig[$ii]}" - test_ids[$i]="${test_ids_orig[$ii]}" - done - - if [ "${get_test_descs}" = "TRUE" ]; then - - local test_descs_orig=( "${test_descs[@]}" ) - local num_fcsts_orig=( "${num_fcsts[@]}" ) - local dt_atmos_orig=( "${dt_atmos[@]}" ) - local rel_cost_orig=( "${rel_cost[@]}" ) - for (( k=0; k<=$((num_vars_to_extract-1)); k++ )); do - cmd="local ${array_names_vars_to_extract[$k]}_orig=(\"\${${array_names_vars_to_extract[$k]}[@]}\")" - eval $cmd - done - - for (( i=0; i<=$((num_tests-1)); i++ )); do - ii="${sort_inds[$i]}" - test_descs[$i]="${test_descs_orig[$ii]}" - num_fcsts[$i]="${num_fcsts_orig[$ii]}" - dt_atmos[$i]="${dt_atmos_orig[$ii]}" - rel_cost[$i]="${rel_cost_orig[$ii]}" - for (( k=0; k<=$((num_vars_to_extract-1)); k++ )); do - cmd="${array_names_vars_to_extract[$k]}[$i]=\"\${${array_names_vars_to_extract[$k]}_orig[$ii]}\"" - eval $cmd - done - done - - fi -# -#----------------------------------------------------------------------- -# -# If generate_csv_file is set to "TRUE", generate a CSV (comma-separated -# value) file containing information about the WE2E tests. This file -# can be opened in a spreadsheet in Google Sheets (and possibly Microsoft -# Excel as well) to view information about all the WE2E tests. Note that -# in doing so, the user must specify the field delimiter to be the same -# character that csv_delimiter is set to below. -# -#----------------------------------------------------------------------- -# - if [ "${generate_csv_file}" = "TRUE" ]; then -# -# If a CSV file already exists, delete it. -# - rm_vrfy -f "${csv_fp}" -# -# Set the character used to delimit columns in the CSV file. This has -# to be something that would normally not appear in the fields being -# written to the CSV file. 
-# - csv_delimiter="|" -# -# Set the titles of the columns that will be in the file. Then write -# them to the file. The contents of the columns are described in more -# detail further below. -# - column_titles="\ -\"Test Name -(Subdirectory)\" ${csv_delimiter} \ -\"Alternate Test Names -(Subdirectories)\" ${csv_delimiter} \ -\"Test Purpose/Description\" ${csv_delimiter} \ -\"Relative Cost of Running Dynamics -(1 corresponds to running a 6-hour forecast on the RRFS_CONUS_25km predefined grid using the default time step)\" ${csv_delimiter} \ -\"Number of Forecast Model Runs\"" - for (( k=0; k<=$((num_vars_to_extract-1)); k++ )); do - - crnt_title="${vars_to_extract[$k]}" - # - # Add units for select fields. - # - units="" - case "${vars_to_extract[$k]}" in - "INCR_CYCL_FREQ") - units="[hr]" - ;; - "FCST_LEN_HRS") - units="[hr]" - ;; - "LBC_SPEC_INTVL_HRS") - units="[hr]" - ;; - esac - crnt_title="${crnt_title}${units:+ $units}" - - column_titles="${column_titles} ${csv_delimiter} \"${crnt_title}\"" - # - # Insert a column for DT_ATMOS right after the one for FCST_LEN_HRS. - # - if [ "${vars_to_extract[$k]}" = "FCST_LEN_HRS" ]; then - units="[sec]" - crnt_title="DT_ATMOS${units:+ $units}" - column_titles="${column_titles} ${csv_delimiter} \"${crnt_title}\"" - fi - - done - printf "%s\n" "${column_titles}" >> "${csv_fp}" -# -# Loop through the arrays containing the WE2E test information. Extract -# the necessary information and record it to the CSV file row-by-row. -# Note that each row corresponds to a primary test. When an alternate -# test is encountered, its information is stored in the row of the -# corresponding primary test (i.e. a new row is not created). -# - j=0 - jp1=$((j+1)) - while [ "$j" -lt "${num_tests}" ]; do -# -# Get the primary name of the test and the category subdirectory in which -# it is located. -# - prim_test_name_subdir="${test_names[$j]}"$'\n'"(${test_subdirs[$j]})" -# -# Get the test ID. -# - test_id="${test_ids[$j]}" -# -# Get the test description. -# - test_desc="${test_descs[$j]}" -# -# Replace any double-quotes in the test description with two double-quotes -# since this is the way a double-quote is escaped in a CSV file, at least -# a CSV file that is read in by Google Sheets. -# - test_desc=$( printf "%s" "${test_desc}" | sed -r -e "s/\"/\"\"/g" ) -# -# Get the time step. -# - dta="${dt_atmos[$j]}" -# -# Get the relative cost. -# - rc="${rel_cost[$j]}" -# -# Get the number of forecasts (number of times the forcast model is run). -# - nf="${num_fcsts[$j]}" -# -# In the following inner while-loop, we step through all alternate test -# names (if any) that follow the current primary name and construct a -# string (alt_test_names_subdirs) consisting of all the alternate test -# names for this primary name, with each followed by the subdirectory -# the corresponding symlink is in. Note that when the CSV file is opened -# as a spreadsheet (e.g. in Google Sheets), this alternate test name -# information all appears in one cell of the spreadsheet. -# - alt_test_names_subdirs="" - while [ "$jp1" -lt "${num_tests}" ]; do - test_id_next="${test_ids[$jp1]}" - if [ "${test_id_next}" -eq "${test_id}" ]; then - alt_test_names_subdirs="${alt_test_names_subdirs}${test_names[$jp1]}"$'\n'"(${test_subdirs[$jp1]})"$'\n' - j="$jp1" - jp1=$((j+1)) - else - break - fi - done -# Remove trailing newline. - alt_test_names_subdirs="${alt_test_names_subdirs%$'\n'}" -# -# Write a line to the CSV file representing a single row of the spreadsheet. 
-# This row contains the following columns: -# -# Column 1: -# The primary test name followed by the category subdirectory it is -# located in (the latter in parentheses). -# -# Column 2: -# Any alternate test names followed by their category subdirectories (in -# parentheses). Each alternate test name and subdirectory pair is followed -# by a newline, but all lines will appear in a single cell of the spreadsheet. -# -# Column 3: -# The test description. -# -# Column 4: -# The relative cost of running the dynamics in the test. See above for -# details. -# -# Column 5: -# The number of times the forecast model will be run by the test. This -# is calculated using quantities such as the number of cycle dates (i.e. -# forecast model start dates) and the number of of ensemble members (which -# is greater than 1 if running ensemble forecasts and 1 otherwise). The -# latter are in turn obtained directly or indirectly from the quantities -# in Columns 6, 7, .... -# -# Columns 6, 7, ...: -# The values of the experiment variables specified in vars_to_extract, -# plus DT_ATMOS (included right after FCST_LEN_HRS). Note that DT_ATMOS -# cannot be included in vars_to_extract because it is usually not in the -# WE2E test configuration file where this script looks for these variables -# (because most of the tests use predefined grids, and for those cases, -# DT_ATMOS is defined in the same file/script where the other grid -# parameters are defined). -# - row_content="\ -\"${prim_test_name_subdir}\" ${csv_delimiter} \ -\"${alt_test_names_subdirs}\" ${csv_delimiter} \ -\"${test_desc}\" ${csv_delimiter} \ -\"${rc}\" ${csv_delimiter} \ -\"${nf}\"" - - for (( k=0; k<=$((num_vars_to_extract-1)); k++ )); do - - unset "val" - cmd="val=\"\${${array_names_vars_to_extract[$k]}[$j]}\"" - eval $cmd - row_content="${row_content} ${csv_delimiter} \"${val}\"" -# -# Insert value of DT_ATMOS right after value of FCST_LEN_HRS. -# - if [ "${vars_to_extract[$k]}" = "FCST_LEN_HRS" ]; then - row_content="${row_content} ${csv_delimiter} \"${dta}\"" - fi - - done - - printf "%s\n" "${row_content}" >> "${csv_fp}" -# -# Update loop indices. -# - j="$jp1" - jp1=$((j+1)) - - done - - print_info_msg "\ -Successfully generated a CSV (Comma Separated Value) file (csv_fp) -containing information on all WE2E tests: - csv_fp = \"${csv_fp}\"" - - fi -# -#----------------------------------------------------------------------- -# -# Use the eval function to set this function's output variables. Note -# that each of these is set only if the corresponding input variable -# specifying the name to use for the output variable is not empty. -# -#----------------------------------------------------------------------- -# - if [ ! -z "${outvarname_test_configs_basedir}" ]; then - eval ${outvarname_test_configs_basedir}="${test_configs_basedir}" - fi - - if [ ! -z "${outvarname_test_names}" ]; then - test_names_str="( "$( printf "\"%s\" " "${test_names[@]}" )")" - eval ${outvarname_test_names}="${test_names_str}" - fi - - if [ ! -z "${outvarname_test_subdirs}" ]; then - test_subdirs_str="( "$( printf "\"%s\" " "${test_subdirs[@]}" )")" - eval ${outvarname_test_subdirs}="${test_subdirs_str}" - fi - - if [ ! -z "${outvarname_test_ids}" ]; then - test_ids_str="( "$( printf "\"%s\" " "${test_ids[@]}" )")" - eval ${outvarname_test_ids}="${test_ids_str}" - fi - - if [ ! 
-z "${outvarname_test_descs}" ]; then - test_descs_str="( "$( printf "'%s' " "${test_descs[@]}" )")" - eval ${output_varname_test_descs}="${test_descs_str}" - fi -# -#----------------------------------------------------------------------- -# -# Restore the shell options saved at the beginning of this script or -# function. -# -#----------------------------------------------------------------------- -# - { restore_shell_opts; } > /dev/null 2>&1 - -} - diff --git a/tests/WE2E/get_expts_status.sh b/tests/WE2E/get_expts_status.sh deleted file mode 100755 index de326589ca..0000000000 --- a/tests/WE2E/get_expts_status.sh +++ /dev/null @@ -1,475 +0,0 @@ -#!/bin/bash - -# -#----------------------------------------------------------------------- -# -# This script updates and reports back the workflow status of all active -# forecast experiments under a specified base directory (expts_basedir). -# It must be supplied exactly one argument, which is the full path to the -# experiments base directory. -# -# The script first determines which of the subdirectories under the base -# directory represent active experiments (see below for how this is done). -# For all such experiments, it calls the workflow (re)launch script to -# update the status of the workflow and prints the status out to screen. -# It also generates a status report file in the base directory that -# contains the last num_log_lines lines (defined below) of each experiment's -# workflow log file [which is generated by the (re)launch script] and thus -# has information on which tasks may have succeeded/failed]. -# -#----------------------------------------------------------------------- -# - -# -#----------------------------------------------------------------------- -# -# Do not allow uninitialized variables. -# -#----------------------------------------------------------------------- -# -set -u -# -#----------------------------------------------------------------------- -# -# Get the full path to the file in which this script/function is located -# (scrfunc_fp), the name of that file (scrfunc_fn), and the directory in -# which the file is located (scrfunc_dir). -# -#----------------------------------------------------------------------- -# -scrfunc_fp=$( readlink -f "${BASH_SOURCE[0]}" ) -scrfunc_fn=$( basename "${scrfunc_fp}" ) -scrfunc_dir=$( dirname "${scrfunc_fp}" ) -# -#----------------------------------------------------------------------- -# -# The current script should be located in the "tests" subdirectory of the -# workflow's top-level directory, which we denote by HOMEdir. Thus, -# HOMEdir is the directory one level above the directory in which the -# current script is located. Set HOMEdir accordingly. -# -#----------------------------------------------------------------------- -# -HOMEdir=${scrfunc_dir%/*/*} -# -#----------------------------------------------------------------------- -# -# Set directories. -# -#----------------------------------------------------------------------- -# -USHdir="$HOMEdir/ush" -# -#----------------------------------------------------------------------- -# -# Source bash utility functions. -# -#----------------------------------------------------------------------- -# -. $USHdir/source_util_funcs.sh -# -#----------------------------------------------------------------------- -# -# Set the usage message. 
-# -#----------------------------------------------------------------------- -# -usage_str="\ -Usage: - - ${scrfunc_fn} \\ - expts_basedir=\"...\" \\ - [launch_wflows=\"...\"] \\ - [num_log_lines=\"...\"] \\ - [verbose=\"...\"] - -The arguments in brackets are optional. The arguments are defined as -follows: - -expts_basedir: -Full path to the experiments base directory, i.e. the directory containing -the experiment subdirectories. - -launch_wflows: -Optional flag that determines whether each experiment's workflow should -be launched if hasn't already. Should be set to \"TRUE\" or \"FALSE\". -Default is \"FALSE\". - -num_log_lines: -Optional integer specifying the number of lines from the end of the -workflow launch log file (log.launch_FV3LAM_wflow) of each test to -include in the status report file that this script generates. - -verbose: -Optional verbosity flag. Should be set to \"TRUE\" or \"FALSE\". Default -is \"FALSE\". -" -# -#----------------------------------------------------------------------- -# -# Check to see if usage help for this script is being requested. If so, -# print it out and exit with a 0 exit code (success). -# -#----------------------------------------------------------------------- -# -help_flag="--help" -if [ "$#" -eq 1 ] && [ "$1" = "${help_flag}" ]; then - print_info_msg "${usage_str}" - exit 0 -fi -# -#----------------------------------------------------------------------- -# -# Specify the set of valid argument names for this script or function. -# Then process the arguments provided to it on the command line (which -# should consist of a set of name-value pairs of the form arg1="value1", -# arg2="value2", etc). -# -#----------------------------------------------------------------------- -# -valid_args=( \ - "expts_basedir" \ - "launch_wflows" \ - "num_log_lines" \ - "verbose" \ - ) -process_args valid_args "$@" -# -#----------------------------------------------------------------------- -# -# Default values for various input arguments. -# -#----------------------------------------------------------------------- -# -launch_wflows=${launch_wflows:-"FALSE"} -num_log_lines=${num_log_lines:-"40"} -verbose=${verbose:-"FALSE"} -# -#----------------------------------------------------------------------- -# -# Make sure "launch_wflows" and "verbose" have valid values. -# -#----------------------------------------------------------------------- -# -launch_wflows=$(boolify "${launch_wflows}") -verbose=$(boolify "$verbose") -# -#----------------------------------------------------------------------- -# -# Verify that the required arguments to this script have been specified. -# If not, print out an error message and exit. -# -#----------------------------------------------------------------------- -# -help_msg="\ -Use - ${scrfunc_fn} ${help_flag} -to get help on how to use this script." - -if [ -z "${expts_basedir}" ]; then - print_err_msg_exit "\ -The argument \"expts_basedir\" specifying the base directory containing -the experiment directories was not specified in the call to this script. \ -${help_msg}" -fi -# -#----------------------------------------------------------------------- -# -# Check that the specified experiments base directory exists and is -# actually a directory. If not, print out an error message and exit. -# -#----------------------------------------------------------------------- -# -if [ ! 
-d "${expts_basedir}" ]; then - print_err_msg_exit " -The specified experiments base directory (expts_basedir) does not exit -or is not actually a directory: - expts_basedir = \"${expts_basedir}\"" -fi -# -#----------------------------------------------------------------------- -# -# Create an array containing the names of the subdirectories in the -# experiment base directory. -# -#----------------------------------------------------------------------- -# -cd_vrfy "${expts_basedir}" -# -# Get a list of all subdirectories (but not files) in the experiment base -# directory. Note that the ls command below will return a string containing -# the subdirectory names, with each name followed by a backslash and a -# newline. -# -subdirs_list=$( \ls -1 -d */ ) -# -# Remove all backslashes from the ends of the subdirectory names. -# -subdirs_list=$( printf "${subdirs_list}" "%s" | sed -r 's|/||g' ) -# -# Create an array out of the string containing the newline-separated list -# of subdirectories. -# -subdirs_list=( ${subdirs_list} ) -# -#----------------------------------------------------------------------- -# -# Loop through the elements of the array subdirs_list and create an array -# containing a list of all active experiment subdirectories under the -# experiment base directory. These active subdirectories will be further -# processed later below. Here, by "active" experiment subdirectory, we -# mean a subdirectory that (1) contains a forecast experiment (i.e. was -# was created by the experiment generation scripts) and (2) does not -# represent an old experiment whose workflow status is no longer relevant. -# For this purpose, for each element in subdirs_list, we: -# -# 1) Change location to the subdirectory. -# -# 2) Check whether an experiment variable definitions file (var_defns.sh) -# exists. If so, we assume the subdirectory is an experiment directory. -# If not, we assume it is not, in which case the subdirectory will -# not be added to the list of active experiment subdirectories. -# -# 3) If the subdirectory is an experiment directory, ensure that it is -# an active experiment, i.e. that it is not an old experiment that -# has been renamed and whose experiment status is thus irrelevant. -# For this purpose, we source the variable definitions file in order -# to have available the workflow variable EXPT_SUBDIR that contains -# the name of the experiment when it was first created. If this -# matches the name of the current subdirectory, then add the latter -# to the list of active experiment subdirectories; otherwise, do not. -# In the latter case, we are assuming that the original experiment -# subdirectory was renamed (e.g. to something like the orginal name -# with the string "_old001" appended) and thus does not contain an -# active experiment whose workflow status is of interest. -# -# 4) Change location back to the experiments base directory. -# -#----------------------------------------------------------------------- -# -separator="======================================" - -var_defns_fn="var_defns.sh" -j="0" -expt_subdirs=() - -print_info_msg "\ -Checking for active experiment directories in the specified experiments -base directory (expts_basedir): - expts_basedir = \"${expts_basedir}\" -..." - -num_subdirs="${#subdirs_list[@]}" -for (( i=0; i<=$((num_subdirs-1)); i++ )); do - - subdir="${subdirs_list[$i]}" - msg=" -$separator -Checking whether the subdirectory - \"${subdir}\" -contains an active experiment..." 
-  print_info_msg "$verbose" "$msg"
-
-  cd_vrfy "${subdir}"
-#
-# If a variable definitions file does not exist, print out a message
-# and move on to the next subdirectory.
-#
-  if [ ! -f "${var_defns_fn}" ]; then
-
-    print_info_msg "$verbose" "
-The current subdirectory (subdir) under the experiments base directory
-(expts_basedir) does not contain an experiment variable definitions file
-(var_defns_fn):
-  expts_basedir = \"${expts_basedir}\"
-  subdir = \"${subdir}\"
-  var_defns_fn = \"${var_defns_fn}\"
-Thus, we will assume it is not an experiment directory and will not add
-it to the list of active experiment subdirectories whose workflow status
-must be checked."
-#
-# If a variable definitions file does exist, then...
-#
-  else
-#
-# Source the variable definitions file.
-#
-    . "./${var_defns_fn}"
-# We want a clean output from this script so disable debugging mode
-    export DEBUG="FALSE"
-#
-# If the workflow variable EXPT_SUBDIR is the same as the name of the
-# current subdirectory, then assume this subdirectory contains an active
-# experiment.  In this case, print out a message and add its name to the
-# list of such experiments.
-#
-    if [ "${EXPT_SUBDIR}" = "$subdir" ]; then
-
-      print_info_msg "$verbose" "
-The current subdirectory (subdir) under the experiments base directory
-(expts_basedir) contains an active experiment:
-  expts_basedir = \"${expts_basedir}\"
-  subdir = \"${subdir}\"
-Adding the current subdirectory to the list of active experiment
-subdirectories whose workflow status must be checked."
-
-      expt_subdirs[$j]="$subdir"
-      j=$((j+1))
-#
-# If the workflow variable EXPT_SUBDIR is not the same as the name of
-# the current subdirectory, then assume this subdirectory contains an
-# "inactive" experiment that has been renamed.  In this case, print out
-# a message and move on to the next subdirectory (without adding the
-# name of the current subdirectory to the list of active experiments).
-#
-    else
-
-      print_info_msg "$verbose" "
-The current subdirectory (subdir) under the experiments base directory
-(expts_basedir) contains an experiment whose original name (EXPT_SUBDIR)
-does not match the name of the current subdirectory:
-  expts_basedir = \"${expts_basedir}\"
-  subdir = \"${subdir}\"
-  EXPT_SUBDIR = \"${EXPT_SUBDIR}\"
-Thus, we will assume that the current subdirectory contains an inactive
-(i.e. old) experiment whose workflow status is not relevant and will not
-add it to the list of active experiment subdirectories whose workflow
-status must be checked."
-
-    fi
-
-  fi
-
-  print_info_msg "$verbose" "\
-$separator
-"
-#
-# Change location back to the experiments base directory.
-#
-  cd_vrfy "${expts_basedir}"
-
-done
-#
-#-----------------------------------------------------------------------
-#
-# Get the number of active experiments for which to check the workflow
-# status and print out an informational message.
-#
-#-----------------------------------------------------------------------
-#
-num_expts="${#expt_subdirs[@]}"
-expt_subdirs_str=$( printf " \'%s\'\n" "${expt_subdirs[@]}" )
-print_info_msg "
-The number of active experiments found is:
-  num_expts = ${num_expts}
-The list of experiments whose workflow status will be checked is:
-${expt_subdirs_str}
-"
-#
-#-----------------------------------------------------------------------
-#
-# Set the name and full path of the file in which the status report will
-# be saved.  If such a file already exists, rename it.
-# -#----------------------------------------------------------------------- -# -yyyymmddhhmn=$( date +%Y%m%d%H%M ) -expts_status_fn="expts_status_${yyyymmddhhmn}.txt" -expts_status_fp="${expts_basedir}/${expts_status_fn}" - -# Note that the check_for_preexist_dir_file function assumes that there -# is a variable named "VERBOSE" in the environment. Set that before -# calling the function. -VERBOSE="TRUE" -check_for_preexist_dir_file "${expts_status_fp}" "rename" -# -#----------------------------------------------------------------------- -# -# Loop through the elements of the array expt_subdirs. For each element -# (i.e. for each active experiment), change location to the experiment -# directory and call the script launch_FV3LAM_wflow.sh to update the log -# file log.launch_FV3LAM_wflow. Then take the last num_log_lines of -# this log file (along with an appropriate message) and add it to the -# status report file. -# -#----------------------------------------------------------------------- -# -launch_wflow_fn="launch_FV3LAM_wflow.sh" -launch_wflow_log_fn="log.launch_FV3LAM_wflow" - -for (( i=0; i<=$((num_expts-1)); i++ )); do - - expt_subdir="${expt_subdirs[$i]}" - msg="\ -$separator -Checking workflow status of experiment \"${expt_subdir}\" ..." - print_info_msg "$msg" - print_info_msg "$msg" >> "${expts_status_fp}" -# -# Change location to the experiment subdirectory, and check the launch -# log file for status -# - cd_vrfy "${expt_subdir}" - if [ -f "${launch_wflow_log_fn}" ]; then - # - # Print the workflow status to the screen. - # - # The "tail -1" is to get only the last occurrence of "Workflow status" - wflow_status=$( grep "Workflow status:" "${launch_wflow_log_fn}" | tail -1 ) - # Not sure why this doesn't work to strip leading spaces. - # wflow_status="${wflow_status## }" - # Remove leading spaces. - wflow_status=$( printf "${wflow_status}" "%s" | sed -r 's|^[ ]*||g' ) - print_info_msg "${wflow_status}" - print_info_msg "\ -$separator -" - # - # Combine message above with the last num_log_lines lines from the workflow - # launch log file and place the result in the status report file. - # - msg=$msg" -${wflow_status} -The last ${num_log_lines} lines of the workflow launch log file -(\"${launch_wflow_log_fn}\") are: -" - tail -n ${num_log_lines} ${launch_wflow_log_fn} >> "${expts_status_fp}" -# -# If a log file from the launch script is not present in the experiment -# directory, it means the workflow has not been launched. In this case, -# print out an appropriate message. Then, if launch_wflows is set to -# TRUE, launch the workflow and print out further info. -# - else - - wflow_status="Workflow status: NOT LAUNCHED YET" - if [ "${launch_wflows}" = "TRUE" ]; then - wflow_status=${wflow_status}" -Launching workflow using script \"${launch_wflow_fn}\"..." - fi - - print_info_msg "${wflow_status}" - print_info_msg "\ -$separator -" - - msg="${wflow_status} -" - print_info_msg "$msg" >> "${expts_status_fp}" - if [ "${launch_wflows}" = "TRUE" ]; then - ./${launch_wflow_fn} >> "${expts_status_fp}" 2>&1 - fi - - fi -# -# Change location back to the experiments base directory. -# - cd_vrfy "${expts_basedir}" - -done - -print_info_msg "\ -A status report has been created in: - expts_status_fp = \"${expts_status_fp}\" - -DONE." 
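For reference, a typical invocation of the get_expts_status.sh script removed above would have looked like the following. The paths are placeholders; the name="value" argument style matches the script's own usage message.

cd /path/to/ufs-srweather-app/tests/WE2E
./get_expts_status.sh \
  expts_basedir="/path/to/expt_dirs" \
  launch_wflows="FALSE" \
  num_log_lines="50" \
  verbose="TRUE"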
diff --git a/tests/WE2E/run_WE2E_tests.sh b/tests/WE2E/run_WE2E_tests.sh deleted file mode 100755 index d2319b6d87..0000000000 --- a/tests/WE2E/run_WE2E_tests.sh +++ /dev/null @@ -1,1379 +0,0 @@ -#!/bin/bash - -# -#----------------------------------------------------------------------- -# -# This script runs the specified WE2E tests. Type -# -# run_WE2E_tests.sh --help -# -# for a full description of how to use this script. -# -#----------------------------------------------------------------------- -# - -# -#----------------------------------------------------------------------- -# -# Get the full path to the file in which this script or function is -# located (scrfunc_fp), the name of that file (scrfunc_fn), and the -# directory in which the file is located (scrfunc_dir). -# -#----------------------------------------------------------------------- -# -scrfunc_fp=$( readlink -f "${BASH_SOURCE[0]}" ) -scrfunc_fn=$( basename "${scrfunc_fp}" ) -scrfunc_dir=$( dirname "${scrfunc_fp}" ) -# -#----------------------------------------------------------------------- -# -# Set the full path to the top-level directory of the UFS SRW App -# repository. We denote this path by HOMEdir. The current script -# should be located in the "tests/WE2E" subdirectory under this directory. -# Thus, HOMEdir is the directory two levels above the directory in which -# the current script is located. -# -#----------------------------------------------------------------------- -# -HOMEdir=${scrfunc_dir%/*/*} -# -#----------------------------------------------------------------------- -# -# Set other directories that depend on HOMEdir. -# -#----------------------------------------------------------------------- -# -export USHdir="$HOMEdir/ush" -TESTSdir="$HOMEdir/tests" -WE2Edir="$TESTSdir/WE2E" -# -#----------------------------------------------------------------------- -# -# Source bash utility functions. -# -#----------------------------------------------------------------------- -# -. $USHdir/source_util_funcs.sh -# -#----------------------------------------------------------------------- -# -# Source other needed files. -# -#----------------------------------------------------------------------- -# -. ${WE2Edir}/get_WE2Etest_names_subdirs_descs.sh -# -#----------------------------------------------------------------------- -# -# Run python checks -# -#----------------------------------------------------------------------- -# -python3 $USHdir/check_python_version.py -if [[ $? -ne 0 ]]; then - exit 1 -fi - -# -#----------------------------------------------------------------------- -# -# Save current shell options (in a global array). Then set new options -# for this script or function. -# -#----------------------------------------------------------------------- -# -{ save_shell_opts; . $USHdir/preamble.sh; } > /dev/null 2>&1 -# -#----------------------------------------------------------------------- -# -# Set the usage message. -# -#----------------------------------------------------------------------- -# -usage_str="\ -Usage: - - ${scrfunc_fn} \\ - tests_file=\"...\" \\ - machine=\"...\" \\ - account=\"...\" \\ - [expt_basedir=\"...\"] \\ - [exec_subdir=\"...\"] \\ - [use_cron_to_relaunch=\"...\"] \\ - [cron_relaunch_intvl_mnts=\"...\"] \\ - [debug=\"...\"] \\ - [verbose=\"...\"] \\ - [generate_csv_file=\"...\"] \\ - [machine_file=\"...\"] \\ - [opsroot=\"...\"] \\ - [run_envir=\"...\"] \\ - [compiler=\"...\"] \\ - [build_mod_fn=\"...\"] - -The arguments in brackets are optional. 
The arguments are defined as -follows: - -Exactly one of the following flags for defining which tests to run is -required - - tests_file: - Name of file or relative or absolute path to file containing the list - of WE2E tests to run. This file must contain one test name per line, - with no repeated names. - - test_type: - Name of a supported set of tests. Options are fundamental, - comprehensive, or all. - - test_name: - The name of a single test to run - -machine: -Argument used to explicitly set the experiment variable MACHINE in the -experiment configuration files of all the WE2E tests the user wants to -run. (A description of MACHINE can be found in the default experiment -configuration file.) This is a required argument. - -account: -Argument used to explicitly set the experiment variable ACCOUNT in the -experiment configuration files of all the WE2E tests the user wants to -run. (A description of ACCOUNT can be found in the default experiment -configuration file.) This is a required argument. - -expt_basedir: -Optional argument used to explicitly set the experiment variable -EXPT_BASEDIR in the experiment configuration files of all the WE2E tests -the user wants to run. (A description of EXPT_BASEDIR can be found in -the default experiment configuration file.) If expt_basedir is specified -in the call to this script, its value is used to set EXPT_BASEDIR in the -configuration files. If it is not specified, EXPT_BASEDIR is not set in -the configuration files, in which case the workflow generation script -sets it to a default value. Note that if expt_basedir is set to a -relative path (e.g. expt_basedir=\"testset1\" in the call to this script), -then the experiment generation script will set EXPT_BASEDIR for the -experiment to a default absolute path followed by \${expt_basedir}. -This feature can be used to group the WE2E tests into subdirectories for -convenience, e.g. a set of tests under subdirectory testset1, another -set of tests under testset2, etc. - -exec_subdir: -Optional argument used to explicitly set the experiment variable -EXEC_SUBDIR in the experiment configuration files of all the WE2E tests -the user wants to run. See the default experiment configuration file -\"config_defaults.sh\" for a full description of EXEC_SUBDIR. - -use_cron_to_relaunch: -Optional argument used to explicitly set the experiment variable -USE_CRON_TO_RELAUNCH in the experiment configuration files of all the -WE2E tests the user wants to run. (A description of USE_CRON_TO_RELAUNCH -can be found in the default experiment configuration file.) If -use_cron_to_relaunch is specified in the call to this script, its value -is used to set USE_CRON_TO_RELAUNCH in the configuration files. If it -is not specified, USE_CRON_TO_RELAUNCH is set to \"TRUE\" in the -configuration files, in which case cron jobs are used to (re)launch the -workflows for all tests (one cron job per test). Thus, use_cron_to_relaunch -needs to be specified only if the user wants to turn off use of cron jobs -for all tests (by specifying use_cron_to_relaunch=\"FALSE\" on the command -line). Note that it is not possible to specify a different value for -USE_CRON_TO_RELAUNCH for each test via this argument; either all tests -use cron jobs or none do. - -cron_relaunch_intvl_mnts: -Optional argument used to explicitly set the experiment variable -CRON_RELAUNCH_INTVL_MNTS in the experiment configuration files of -all the WE2E tests the user wants to run. 
(A description of -CRON_RELAUNCH_INTVL_MNTS can be found in the default experiment -configuration file.) If cron_relaunch_intvl_mnts is specified in the -call to this script, its value is used to set CRON_RELAUNCH_INTVL_MNTS -in the configuration files. If it is not specified, CRON_RELAUNCH_INTVL_MNTS -is set to \"02\" (i.e. two minutes) in the configuration files. Note -that it is not possible to specify a different value for -CRON_RELAUNCH_INTVL_MNTS for each test via this argument; all tests will -use the same value for USE_CRON_TO_RELAUNCH (either the value specified -in the call to this script or the default value of \"02\"). Note also -that the value of this argument matters only if the argument -use_cron_to_relaunch is not explicitly set to \"FALSE\" in the call to -this script. - -debug: -If true, run test case in debugging mode. - -verbose: -Optional argument used to explicitly set the experiment variable VERBOSE -in the experiment configuration files of all the WE2E tests the user -wants to run. (A description of VERBOSE can be found in the default -experiment configuration file.) If verbose is specified in the call to -this script, its value is used to set VERBOSE in the configuration files. -If it is not specified, VERBOSE is set to \"TRUE\" in the configuration -files. Note that it is not possible to specify a different value for -VERBOSE for each test via this argument; either all tests will have -VERBOSE set to \"TRUE\" or all will have it set to \"FALSE\". - -generate_csv_file: -Optional argument that specifies whether or not to generate a CSV file -containing summary information about all the tests available in the WE2E -testing system. Default value is \"TRUE\". - -machine_file: -Optional argument specifying the full path to a machine configuration -file. If not set, a supported platform machine file may be used. - -opsroot: -Operations root directory in NCO mode - -run_envir: -Overrides RUN_ENVIR variable to a new value ( nco or community ) - -compiler: -Optional argument used to explicitly set the experiment variable COMPILER -in the experiment configuration files of all the WE2E tests the user -wants to run. (A description of COMPILER can be found in the default -experiment configuration file.) If compiler is specified in the call to -this script, its value is used to set COMPILER in the configuration files. -If it is not specified, COMPILER is set to \"intel\" in the configuration -files. Note that it is not possible to specify a different value for -COMPILER for each test via this argument; all tests will use the same -value for COMPILER (either the value specified in the call to this script -or the default value of \"intel\"). - -build_mod_fn: -Optional argument used to explicitly set the experiment variable -BUILD_MOD_FN in the experiment configuration files of all the WE2E tests -the user wants to run (e.g. \"build_cheyenne_gnu\"). If the string -\"gnu\" appears in this file name, the \"compiler\" option to this -function must also be specified with the value \"gnu\". - - -Usage Examples: --------------- -Here, we give several common usage examples. In the following, assume -my_tests.txt is a text file in the same directory as this script containing -a list of test names that we want to run, e.g. 
- -> more my_tests.txt -new_ESGgrid -specify_DT_ATMOS_LAYOUT_XY_BLOCKSIZE - -Then: - -1) To run the tests listed in my_tests.txt on Hera and charge the core- - hours used to the \"rtrr\" account, use: - - > run_WE2E_tests.sh tests_file=\"my_tests.txt\" machine=\"hera\" account=\"rtrr\" - - This will create the experiment subdirectories for the two tests in - the directory - - \${HOMEdir}/../expt_dirs - - where HOMEdir is the directory in which the ufs-srweather-app - repository is cloned. Thus, the following two experiment directories - will be created: - - \${HOMEdir}/../expt_dirs/new_ESGgrid - \${HOMEdir}/../expt_dirs/specify_DT_ATMOS_LAYOUT_XY_BLOCKSIZE - - In addition, by default, cron jobs will be created in the user's cron - table to relaunch the workflows of these experiments every 2 minutes. - -2) To change the frequency with which the cron relaunch jobs are submitted - from the default of 2 minutes to 1 minute, use: - - > run_WE2E_tests.sh tests_file=\"my_tests.txt\" machine=\"hera\" account=\"rtrr\" cron_relaunch_intvl_mnts=\"01\" - -3) To disable use of cron (which means the worfkow for each test will - have to be relaunched manually from within each experiment directory), - use: - - > run_WE2E_tests.sh tests_file=\"my_tests.txt\" machine=\"hera\" account=\"rtrr\" use_cron_to_relaunch=\"FALSE\" - -4) To place the experiment subdirectories in a subdirectory named \"test_set_01\" - under - - \${HOMEdir}/../expt_dirs - - (instead of immediately under the latter), use: - - > run_WE2E_tests.sh tests_file=\"my_tests.txt\" machine=\"hera\" account=\"rtrr\" expt_basedir=\"test_set_01\" - - In this case, the full paths to the experiment directories will be: - - \${HOMEdir}/../expt_dirs/test_set_01/new_ESGgrid - \${HOMEdir}/../expt_dirs/test_set_01/specify_DT_ATMOS_LAYOUT_XY_BLOCKSIZE - -5) To use a list of tests that is located in - - /path/to/custom/my_tests.txt - - instead of in the same directory as this script, and to have the - experiment directories be placed in an arbitrary location, say - - /path/to/custom/expt_dirs - - use: - - > run_WE2E_tests.sh tests_file=\"/path/to/custom/my_tests.txt\" machine=\"hera\" account=\"rtrr\" expt_basedir=\"/path/to/custom/expt_dirs\" -" -# -#----------------------------------------------------------------------- -# -# Check to see if usage help for this script is being requested. If so, -# print it out and exit with a 0 exit code (success). -# -#----------------------------------------------------------------------- -# -help_flag="--help" -if [ "$#" -eq 1 ] && [ "$1" = "${help_flag}" ]; then - print_info_msg "${usage_str}" - exit 0 -fi -# -#----------------------------------------------------------------------- -# -# Specify the set of valid argument names for this script or function. -# Then process the arguments provided to it on the command line (which -# should consist of a set of name-value pairs of the form arg1="value1", -# arg2="value2", etc). -# -#----------------------------------------------------------------------- -# -valid_args=( \ - "tests_file" \ - "test_type" \ - "test_name" \ - "machine" \ - "account" \ - "expt_basedir" \ - "exec_subdir" \ - "use_cron_to_relaunch" \ - "cron_relaunch_intvl_mnts" \ - "debug" \ - "verbose" \ - "generate_csv_file" \ - "machine_file" \ - "opsroot" \ - "run_envir" \ - "compiler" \ - "build_mod_fn" \ - ) -process_args valid_args "$@" -# -#----------------------------------------------------------------------- -# -# For debugging purposes, print out values of arguments passed to this -# script. 
Note that these will be printed out only if VERBOSE is set to -# "TRUE". -# -#----------------------------------------------------------------------- -# -print_input_args "valid_args" -# -#----------------------------------------------------------------------- -# -# Verify that the required arguments to this script have been specified. -# If not, print out an error message and exit. -# -#----------------------------------------------------------------------- -# -help_msg="\ -Use - ${scrfunc_fn} ${help_flag} -to get help on how to use this script." - -if [ -z "${tests_file}" ] && [ -z "${test_name}" ] && [ -z "${test_type}" ] ; then - print_err_msg_exit "\ -At least on of the following arguments must be specified to run this -script: - tests_file - test_name - test_type -${help_msg}" -fi - -if [ -z "${machine}" ]; then - print_err_msg_exit "\ -The argument \"machine\" specifying the machine or platform on which to -run the WE2E tests was not specified in the call to this script. \ -${help_msg}" -fi -machine=${machine,,} - - # Cheyenne-specific test limitation - -if [ "${machine}" = "cheyenne" ]; then - use_cron_to_relaunch=FALSE - echo " -Due to system limitations, the 'use_cron_to_relaunch' command can not be used on -the '${machine}' machine. Setting this variable to false. - -" -fi - -if [ -z "${account}" ]; then - print_err_msg_exit "\ -The argument \"account\" specifying the account under which to submit -jobs to the queue when running the WE2E tests was not specified in the -call to this script. \ -${help_msg}" -fi -# -#----------------------------------------------------------------------- -# -# Set the list of tests to run. -# -#----------------------------------------------------------------------- -# -if [ -n "${test_name}" ] ; then - - # User specified a single test - user_spec_tests=( "${test_name}" ) - -elif [ "${test_type}" = "all" ] ; then - - # User would like to run all the tests available - user_spec_tests=() - for fp in $(find ${scrfunc_dir}/test_configs -name "config.*" -type f ) ; do - user_spec_tests+=("$(basename $fp | cut -f 2 -d .)") - done - -elif [ -n "${tests_file}" ] || [ -n "${test_type}" ] ; then - - # User wants to run a set of tests from a file, either their own or - # one managed in the repo - - if [ -n "${test_type}" ] ; then - # Check for a pre-defined set. It could be machine dependent or has the mode - # (community or nco), or default - user_spec_tests_fp=${scrfunc_dir}/machine_suites/${test_type}.${machine}.${compiler}.nco - if [ ! -f ${user_spec_tests_fp} ]; then - user_spec_tests_fp=${scrfunc_dir}/machine_suites/${test_type}.${machine}.${compiler}.com - if [ ! -f ${user_spec_tests_fp} ]; then - user_spec_tests_fp=${scrfunc_dir}/machine_suites/${test_type}.${machine}.${compiler} - if [ ! -f ${user_spec_tests_fp} ]; then - user_spec_tests_fp=${scrfunc_dir}/machine_suites/${test_type}.${machine} - if [ ! -f ${user_spec_tests_fp} ]; then - user_spec_tests_fp=${scrfunc_dir}/machine_suites/${test_type} - fi - fi - else - run_envir=${run_envir:-"community"} - fi - else - run_envir=${run_envir:-"nco"} - fi - elif [ -n "${tests_file}" ] ; then - user_spec_tests_fp=$( readlink -f "${tests_file}" ) - fi - - if [ ! 
-f "${user_spec_tests_fp}" ]; then - print_err_msg_exit "\ - The file containing the user-specified list of WE2E tests to run - (tests_file) that is passed in as an argument to this script does not - exit: - tests_file = \"${tests_file}\" - The full path to this script is: - user_spec_tests_fp = \"${user_spec_tests_fp}\" - Please ensure that this file exists and rerun." - fi - # - #----------------------------------------------------------------------- - # - # Read in each line of the file specified by user_spec_tests_fp and add - # each non-empty line to the array user_spec_tests. Note that the read - # command will remove any leading and trailing whitespace from each line - # in user_spec_tests_fp [because it treats whatever character(s) the bash - # variable IFS (Internal Field Separator) is set to as word separators - # on each line, and IFS is by default set to a space, a tab, and a - # newline]. - # - #----------------------------------------------------------------------- - # - user_spec_tests=() - while read -r line; do - if [ ! -z "$line" ]; then - user_spec_tests+=("$line") - fi - done < "${user_spec_tests_fp}" - -fi -# -#----------------------------------------------------------------------- -# -# Call a function to obtain the names of all available WE2E tests (i.e. -# not just the ones the user wants to run but all that are part of the -# WE2E testing system), the test IDs, and the category subdirectory in -# which each corresponding test configuration file is located. -# -# The array of test names (avail_WE2E_test_names) that the function -# called below returns contains both primary and alternate test names. -# A primary test name is a test name obtained from the name of a WE2E -# test configuration file that is an ordinary file, i.e. not a symlink, -# whereas an alternate name is one that is derived from the name of a -# symlink whose target is an ordinary test configuration file (but not -# another symlink). To be able to determine the set of test names that -# correspond to the same primary test, the function called also returns -# an array of test IDs (avail_WE2E_test_IDs) such that the IDs for a -# primary test name and all the alternate names that map to it (if any) -# are the same. These IDs will be used later below to ensure that the -# user does not list in the set of test names to run a given test more -# than once, e.g. by accidentally including in the list its primary name -# as well as one of its alternate names. -# -# The category subdirectories in the array avail_WE2E_test_subdirs -# returned by the function called below are relative to the base -# directory under which the WE2E test configuration files are located. -# This base directory is set by the function call below and is returned -# in the output variable avail_WE2E_test_configs_basedir. The i-th -# element of avail_WE2E_test_subdirs specifies the subdirectory under -# this base directory that contains the ordinary test configuration file -# (for a primary test name) or the symlink (for an alternate test name) -# corresponding to the i-th element (which may be a primary or alternate -# test name) in avail_WE2E_test_names. We refer to these subdirectories -# as "category" subdirectories because they are used for clarity to group -# the WE2E tests into types or categories. 
-# -# Finally, note that the returned arrays -# -# avail_WE2E_test_names -# avail_WE2E_test_ids -# avail_WE2E_test_subdirs -# -# are sorted in order of increasing test ID and such that for a given -# set of test names that share the same ID, the primary test name is -# listed first followed by zero or more alternate names. As an example, -# assume that there are three category subdirectories under the base -# directory specified by avail_WE2E_test_configs_basedir: dir1, dir2, -# and dir3. Also, assume that dir1 contains a test configuration file -# named config.primary_name.sh that is an ordinary file, and dir2 and dir3 -# contain the following symlinks that point config.primary_name.sh: -# -# ${avail_WE2E_test_configs_basedir}/dir2/config.alt_name_1.sh -# --> ${avail_WE2E_test_configs_basedir}/dir1/config.primary_name.sh -# -# ${avail_WE2E_test_configs_basedir}/dir3/config.alt_name_2.sh -# --> ${avail_WE2E_test_configs_basedir}/dir1/config.primary_name.sh -# -# Finally, assume that the ID of the test primary_name is 21 and that -# this ID is at indices 7, 8, and 9 in avail_WE2E_test_ids. Then indices -# 7, 8, and 9 of the three arrays returned by the function call below -# may be as follows: -# -# avail_WE2E_test_names[7]="primary_name" -# avail_WE2E_test_names[8]="alt_name_1" -# avail_WE2E_test_names[9]="alt_name_2" -# -# avail_WE2E_test_ids[7]="21" -# avail_WE2E_test_ids[8]="21" -# avail_WE2E_test_ids[9]="21" -# -# avail_WE2E_test_subdirs[7]="dir1" -# avail_WE2E_test_subdirs[8]="dir2" -# avail_WE2E_test_subdirs[9]="dir3" -# -#----------------------------------------------------------------------- -# -print_info_msg " -Getting information about all available WE2E tests..." - -get_WE2Etest_names_subdirs_descs \ - WE2Edir="${WE2Edir}" \ - generate_csv_file="${generate_csv_file}" \ - outvarname_test_configs_basedir="avail_WE2E_test_configs_basedir" \ - outvarname_test_names="avail_WE2E_test_names" \ - outvarname_test_subdirs="avail_WE2E_test_subdirs" \ - outvarname_test_ids="avail_WE2E_test_ids" -# -# Get the total number of available WE2E test names (including alternate -# names). -# -num_avail_WE2E_tests="${#avail_WE2E_test_names[@]}" -# -#----------------------------------------------------------------------- -# -# Loop through the elements of the array user_spec_tests and perform -# sanity checks. For each such element (i.e. for each WE2E test to run -# specified by the user), make sure that: -# -# 1) The name of the test exists in the complete list of available WE2E -# tests in avail_WE2E_test_names. -# 2) The test does not have an ID that is identical to a previously -# considered test in the user-specified list of tests to run (because -# if so, it would be identical to that previously considered test, -# and it would be a waste of computational resources to run). -# -# If these requirements are met, add the test name to the list of tests -# to run in the array names_tests_to_run, and add the test's category -# subdirectory to subdirs_tests_to_run. -# -#----------------------------------------------------------------------- -# -print_info_msg " -Performing sanity checks on user-specified list of WE2E tests to run..." - -names_tests_to_run=() -ids_tests_to_run=() -subdirs_tests_to_run=() -# -# Initialize the array that will contain the remaining available WE2E -# test names (including alternate names, if any) after finding a match -# for the i-th user-specified test name to run in user_spec_tests. 
-# -remaining_avail_WE2E_test_names=( "${avail_WE2E_test_names[@]}" ) - -num_user_spec_tests="${#user_spec_tests[@]}" -for (( i=0; i<=$((num_user_spec_tests-1)); i++ )); do - - user_spec_test="${user_spec_tests[$i]}" - - print_info_msg "\ - Checking user-specified WE2E test: \"${user_spec_test}\"" -# -# For the current user-specified WE2E test (user_spec_test), loop through -# the list of all remaining available WE2E test names (i.e. the ones that -# haven't yet been matched to any of the user-specified test names to -# run) and make sure that: -# -# 1) The name of the test exists (either as a primary test name or an -# alternate test name) in the list of all available WE2E test names. -# 2) The test is not repeated in the user-specified list of tests to run, -# either under the same name or an alternate name (i.e. make sure that -# it does not have the same test ID as a previously considered test). -# -# Note that in the loop below, the index j gets set to only those elements -# of remaining_avail_WE2E_test_names that are defined [the syntax -# "${!some_array[@]}" expands to the indices of some_array that have -# defined elements]. We do this for efficiency; we unset elements of -# remaining_avail_WE2E_test_names that have already been matched with -# one of the user-specified test names to run because we know that any -# remaining user-specified test names will not match those elements. -# - match_found="FALSE" - for j in "${!remaining_avail_WE2E_test_names[@]}"; do - - test_name="${avail_WE2E_test_names[$j]}" - test_id="${avail_WE2E_test_ids[$j]}" -# -# Check whether the name of the current user-specified test (user_spec_test) -# matches any of the names in the full list of WE2E tests. If so: -# -# 1) Set match_found to "TRUE". -# 2) Make sure that the test to run doesn't have a test ID that is -# identical to a previously considered test in the user-specified -# list of tests to run (which would mean the two tests are identical). -# If so, print out an error message and exit. -# - if [ "${test_name}" = "${user_spec_test}" ]; then - - match_found="TRUE" - - is_element_of "ids_tests_to_run" "${test_id}" && { - - user_spec_tests_str=$(printf " \"%s\"\n" "${user_spec_tests[@]}") - user_spec_tests_str=$(printf "(\n%s\n )" "${user_spec_tests_str}") - - all_names_for_test=() - for (( k=0; k<=$((num_avail_WE2E_tests-1)); k++ )); do - if [ "${avail_WE2E_test_ids[$k]}" = "${test_id}" ]; then - all_names_for_test+=("${avail_WE2E_test_names[$k]}") - fi - done - all_names_for_test_str=$(printf " \"%s\"\n" "${all_names_for_test[@]}") - - print_err_msg_exit "\ -The current user-specified test to run (user_spec_test) is already included -in the list of tests to run (user_spec_tests), either under the same name -or an alternate name: - user_spec_test = \"${user_spec_test}\" - user_spec_tests = ${user_spec_tests_str} -This test has the following primary and possible alternate names: -${all_names_for_test_str} -In order to avoid repeating the same WE2E test (and thus waste computational -resources), only one of these test names can be specified in the list of -tests to run. Please modify this list in the file - user_spec_tests_fp = \"${user_spec_tests_fp}\" -accordingly and rerun." - - } -# -# Append the name of the current user-specified test, its ID, and its -# category subdirectory to the arrays that contain the sanity-checked -# versions of of these quantities. 
-# - names_tests_to_run+=("${user_spec_test}") - ids_tests_to_run+=("${test_id}") - subdirs_tests_to_run+=("${avail_WE2E_test_subdirs[$j]}") -# -# Remove the j-th element of remaining_avail_WE2E_test_names so that for -# the next user-specified test to run, we do not need to check whether -# the j-th test is a match. Then break out of the loop over all remaining -# available WE2E tests. -# - unset remaining_avail_WE2E_test_names[$j] - break - - fi - - done -# -# If match_found is still "FALSE" after exiting the loop above, then a -# match for the current user-specifed test to run was not found in the -# list of all WE2E tests -- neither as a primary test name nor as an -# alternate name. In this case, print out an error message and exit. -# - if [ "${match_found}" = "FALSE" ]; then - avail_WE2E_test_names_str=$( printf " \"%s\"\n" "${avail_WE2E_test_names[@]}" ) - print_err_msg_exit "\ -The name of the current user-specified test to run (user_spec_test) does -not match any of the names (either primary or alternate) of the available -WE2E tests: - user_spec_test = \"${user_spec_test}\" -Valid values for user_spec_test consist of the names (primary or alternate) -of the available WE2E tests, which are: -${avail_WE2E_test_names_str} -Each name in the user-specified list of tests to run: - 1) Must match one of the (primary or alternate) test names of the - availabe WE2E tests. - 2) Must not be the primary or alternate name of a test that has its - primary or one of its alternate names already included in the user- - specified list of test to run, i.e. tests must not be repeated (in - order not to waste computational resources). -Please modify the user-specified list of tests to run such that it adheres -to the rules above and rerun. This list is in the file specified by the -input variable tests_file: - tests_file = \"${tests_file}\" -The full path to this file is: - user_spec_tests_fp = \"${user_spec_tests_fp}\"" - fi - -done -# -#----------------------------------------------------------------------- -# -# Get the number of WE2E tests to run and print out an informational -# message. -# -#----------------------------------------------------------------------- -# -num_tests_to_run="${#names_tests_to_run[@]}" -tests_to_run_str=$( printf " \'%s\'\n" "${names_tests_to_run[@]}" ) -print_info_msg " -After processing the user-specified list of WE2E tests to run, the number -of tests to run (num_tests_to_run) is - num_tests_to_run = ${num_tests_to_run} -and the list of WE2E tests to run (one test per line) is -${tests_to_run_str}" -# -#----------------------------------------------------------------------- -# -# Loop through the WE2E tests to run. For each test, use the corresponding -# test configuration file to generate a temporary experiment file and -# launch the experiment generation script using that file. -# -#----------------------------------------------------------------------- -# -for (( i=0; i<=$((num_tests_to_run-1)); i++ )); do - - test_name="${names_tests_to_run[$i]}" - test_subdir="${subdirs_tests_to_run[$i]}" -# -# Generate the full path to the current WE2E test's configuration file. -# Then ensure that this file exists. -# - test_config_fp="${avail_WE2E_test_configs_basedir}/${test_subdir}/config.${test_name}.yaml" - - if [ ! 
-f "${test_config_fp}" ]; then - print_err_msg_exit "\ -The experiment configuration file (test_config_fp) for the current WE2E -test (test_name) does not exist: - test_name = \"${test_name}\" - test_config_fp = \"${test_config_fp}\" -Please correct and rerun." - fi -# -#----------------------------------------------------------------------- -# -# Source the default experiment configuration file to set values of -# various experiment variables to their defaults. Then source the -# current WE2E test's configuration file to overwrite certain variables' -# default values with test-specific ones. -# -#----------------------------------------------------------------------- -# - - # Save the environment variable since a default will override when - # sourced. - save_USHdir=${USHdir} - source_config ${USHdir}/config_defaults.yaml - USHdir=${save_USHdir} - MACHINE_FILE=${machine_file:-"${USHdir}/machine/${machine}.yaml"} - source_config ${MACHINE_FILE} - source_config ${test_config_fp} -# -#----------------------------------------------------------------------- -# -# We will now construct a multiline variable consisting of the contents -# that we want the experiment configuration file for this WE2E test to -# have. Once this variable is constructed, we will write its contents -# to the generic configuration file that the experiment generation script -# reads in (specified by the variable EXPT_CONFIG_FN in the default -# configuration file config_defaults.yaml sourced above) and then run that -# script to generate an experiment for the current WE2E test. -# -# We name the multiline variable that will contain the contents of the -# experiment configuration file "expt_config_str" (short for "experiment -# configuration string"). Here, we initialize this to a null string, -# and we append to it later below. -# -#----------------------------------------------------------------------- -# - expt_config_str="" -# -#----------------------------------------------------------------------- -# -# Set (and then write to expt_config_str) various experiment variables -# that depend on the input arguments to this script (as opposed to -# variable settings in the test configuration file specified by -# test_config_fp). Note that any values of these parameters specified -# in the default experiment configuration file (config_defaults.yaml) -# or in the test configuraiton file (test_config_fp) that were sourced -# above will be overwritten by the settings below. -# -# Note also that if EXPT_BASEDIR ends up getting set to a null string, -# the experiment generation script that gets called further below will -# set it to a default path; if it gets set to a relative path, then the -# experiment generation script will set it to a path consisting of a -# default path with the relative path appended to it; and if it gets set -# to an absolute path, then the workflow will leave it set to that path. 
-# -#----------------------------------------------------------------------- -# - MACHINE="${machine^^}" - ACCOUNT="${account}" - COMPILER=${compiler:-"intel"} - BUILD_MOD_FN=${build_mod_fn:-"build_${machine}_${COMPILER}"} - EXPT_BASEDIR="${expt_basedir}" - EXPT_SUBDIR="${test_name}" - EXEC_SUBDIR="${exec_subdir}" - USE_CRON_TO_RELAUNCH=${use_cron_to_relaunch:-"TRUE"} - CRON_RELAUNCH_INTVL_MNTS=${cron_relaunch_intvl_mnts:-"02"} - DEBUG=${debug:-"FALSE"} - VERBOSE=${verbose:-"TRUE"} - - expt_config_str=${expt_config_str}"\ -# -# The machine on which to run, the account to which to charge computational -# resources, the base directory in which to create the experiment directory -# (if different from the default location), and the name of the experiment -# subdirectory. -# -MACHINE=\"${MACHINE}\" -ACCOUNT=\"${ACCOUNT}\" - -COMPILER=\"${COMPILER}\" -BUILD_MOD_FN=\"${BUILD_MOD_FN}\"" - - if [ -n "${EXEC_SUBDIR}" ]; then - expt_config_str=${expt_config_str}" -EXEC_SUBDIR=\"${EXEC_SUBDIR}\"" - fi - - if [ -n "${EXPT_BASEDIR}" ]; then - expt_config_str=${expt_config_str}" -EXPT_BASEDIR=\"${EXPT_BASEDIR}\"" - fi - - expt_config_str=${expt_config_str}" -EXPT_SUBDIR=\"${EXPT_SUBDIR}\" -# -# Flag specifying whether or not to automatically resubmit the worfklow -# to the batch system via cron and, if so, the frequency (in minutes) of -# resubmission. -# -USE_CRON_TO_RELAUNCH=\"${USE_CRON_TO_RELAUNCH}\" -CRON_RELAUNCH_INTVL_MNTS=\"${CRON_RELAUNCH_INTVL_MNTS}\" -# -# Flags specifying whether to run in debug and verbose mode. -# -DEBUG=\"${DEBUG}\" -VERBOSE=\"${VERBOSE}\"" -# -#----------------------------------------------------------------------- -# -# Append the contents of the current WE2E test's configuration file to -# the experiment configuration string. -# -#----------------------------------------------------------------------- -# - expt_config_str=${expt_config_str}" -# -#----------------------------------------------------------------------- -#----------------------------------------------------------------------- -# The following section is a copy of this WE2E test's configuration file. -# -" - expt_config_str=${expt_config_str}$( config_to_shell_str "${test_config_fp}" ) - expt_config_str=${expt_config_str}" -# -# End of section from this test's configuration file. -#----------------------------------------------------------------------- -#-----------------------------------------------------------------------" - -# -# Set RUN_ENVIR from the $run_envir argument passed to this script -# -if [ ! -z ${run_envir} ]; then - expt_config_str=${expt_config_str}" -# -# Set RUN_ENVIR -# -RUN_ENVIR=${run_envir}" - - RUN_ENVIR=${run_envir} -fi - -# -# Eval DATE_FIRST/LAST_CYCL commands -# -if [[ $DATE_FIRST_CYCL != [0-9]* ]]; then - DATE_FIRST_CYCL=$(eval ${DATE_FIRST_CYCL}) - expt_config_str=${expt_config_str}" -DATE_FIRST_CYCL=${DATE_FIRST_CYCL}" -fi -if [[ $DATE_LAST_CYCL != [0-9]* ]]; then - DATE_LAST_CYCL=$(eval ${DATE_LAST_CYCL}) - expt_config_str=${expt_config_str}" -DATE_LAST_CYCL=${DATE_LAST_CYCL}" -fi - -# -#----------------------------------------------------------------------- -# -# Modifications to the experiment configuration file if the WE2E test -# uses pre-generated grid, orography, or surface climatology files. -# -# If not running one or more of the grid, orography, and surface -# climatology file generation tasks, specify directories in which -# pregenerated versions of these files can be found. 
-# -#----------------------------------------------------------------------- -# - if [ "${RUN_TASK_MAKE_GRID}" = "FALSE" ] || \ - [ "${RUN_TASK_MAKE_OROG}" = "FALSE" ] || \ - [ "${RUN_TASK_MAKE_SFC_CLIMO}" = "FALSE" ]; then - - pregen_basedir=${TEST_PREGEN_BASEDIR:-} - - if [ ! -d "${pregen_basedir:-}" ] ; then - print_err_msg_exit "\ -The base directory (pregen_basedir) in which the pregenerated grid, -orography, and/or surface climatology files are located has not been -specified for this machine (MACHINE): - MACHINE= \"${MACHINE}\"" - fi - - pregen_dir="${pregen_basedir}/${PREDEF_GRID_NAME}" - expt_config_str=${expt_config_str}" -# -# Directory containing the pregenerated grid files. -# -DOMAIN_PREGEN_BASEDIR=\"${pregen_basedir}\"" - - fi -# -# Directory for pregenerated grid files. -# - if [ "${RUN_TASK_MAKE_GRID}" = "FALSE" ]; then - GRID_DIR="${pregen_dir}" - expt_config_str=${expt_config_str}" -# -# Directory containing the pregenerated grid files. -# -GRID_DIR=\"${GRID_DIR}\"" - fi -# -# Directory for pregenerated orography files. -# - if [ "${RUN_TASK_MAKE_OROG}" = "FALSE" ]; then - OROG_DIR="${pregen_dir}" - expt_config_str=${expt_config_str}" -# -# Directory containing the pregenerated orography files. -# -OROG_DIR=\"${OROG_DIR}\"" - fi -# -# Directory for pregenerated surface climatology files. -# - if [ "${RUN_TASK_MAKE_SFC_CLIMO}" = "FALSE" ]; then - SFC_CLIMO_DIR="${pregen_dir}" - expt_config_str=${expt_config_str}" -# -# Directory containing the pregenerated surface climatology files. -# -SFC_CLIMO_DIR=\"${SFC_CLIMO_DIR}\"" - fi -# -#----------------------------------------------------------------------- -# -# Modifications to the experiment configuration file if running the WE2E -# test in NCO mode. -# -#----------------------------------------------------------------------- -# - if [ "${RUN_ENVIR}" = "nco" ]; then -# -# Set RUN and envir. -# - expt_config_str=${expt_config_str}" -# -# Set NCO mode RUN and model_ver -# -RUN=\"\${EXPT_SUBDIR}\" -model_ver="we2e"" - -# -# Set OPSROOT. -# - expt_config_str=${expt_config_str}" -# -# Set NCO mode OPSROOT -# -OPSROOT=\"${opsroot:-$OPSROOT}\"" - - fi -# -#----------------------------------------------------------------------- -# -# Modifications to the experiment configuration file if the WE2E test -# uses user-staged external model files. -# -#----------------------------------------------------------------------- -# - if [ "${USE_USER_STAGED_EXTRN_FILES}" = "TRUE" ]; then - - # Ensure we only check on disk for these files - data_stores="disk" - - extrn_mdl_source_basedir=${TEST_EXTRN_MDL_SOURCE_BASEDIR:-} - if [ ! 
-d "${extrn_mdl_source_basedir:-}" ] ; then - print_err_msg_exit "\ -The base directory (extrn_mdl_source_basedir) in which the user-staged -external model files should be located has not been specified for this -machine (MACHINE): - MACHINE= \"${MACHINE}\"" - fi - EXTRN_MDL_SOURCE_BASEDIR_ICS="${extrn_mdl_source_basedir}/${EXTRN_MDL_NAME_ICS}" - if [ "${EXTRN_MDL_NAME_ICS}" = "FV3GFS" ] ; then - EXTRN_MDL_SOURCE_BASEDIR_ICS="${EXTRN_MDL_SOURCE_BASEDIR_ICS}/${FV3GFS_FILE_FMT_ICS}/\${yyyymmddhh}" - else - EXTRN_MDL_SOURCE_BASEDIR_ICS="${EXTRN_MDL_SOURCE_BASEDIR_ICS}/\${yyyymmddhh}" - fi - - EXTRN_MDL_SOURCE_BASEDIR_LBCS="${extrn_mdl_source_basedir}/${EXTRN_MDL_NAME_LBCS}" - if [ "${EXTRN_MDL_NAME_LBCS}" = "FV3GFS" ] ; then - EXTRN_MDL_SOURCE_BASEDIR_LBCS="${EXTRN_MDL_SOURCE_BASEDIR_LBCS}/${FV3GFS_FILE_FMT_LBCS}/\${yyyymmddhh}" - else - EXTRN_MDL_SOURCE_BASEDIR_LBCS="${EXTRN_MDL_SOURCE_BASEDIR_LBCS}/\${yyyymmddhh}" - fi -# -# Make sure that the forecast length is evenly divisible by the interval -# between the times at which the lateral boundary conditions will be -# specified. -# - rem=$(( 10#${FCST_LEN_HRS} % 10#${LBC_SPEC_INTVL_HRS} )) - if [ "$rem" -ne "0" ]; then - print_err_msg_exit "\ -The forecast length (FCST_LEN_HRS) must be evenly divisible by the lateral -boundary conditions specification interval (LBC_SPEC_INTVL_HRS): - FCST_LEN_HRS = ${FCST_LEN_HRS} - LBC_SPEC_INTVL_HRS = ${LBC_SPEC_INTVL_HRS} - rem = FCST_LEN_HRS%%LBC_SPEC_INTVL_HRS = $rem" - fi - expt_config_str="${expt_config_str} -# -# Locations and names of user-staged external model files for generating -# ICs and LBCs. -# -EXTRN_MDL_SOURCE_BASEDIR_ICS='${EXTRN_MDL_SOURCE_BASEDIR_ICS}' -EXTRN_MDL_FILES_ICS=( ${EXTRN_MDL_FILES_ICS[@]} ) -EXTRN_MDL_SOURCE_BASEDIR_LBCS='${EXTRN_MDL_SOURCE_BASEDIR_LBCS}' -EXTRN_MDL_FILES_LBCS=( ${EXTRN_MDL_FILES_LBCS[@]} ) -EXTRN_MDL_DATA_STORES=\"$data_stores\"" - - fi -# -#----------------------------------------------------------------------- -# -# Check that MET directories have been set appropriately, if needed. -# -#----------------------------------------------------------------------- -# - if [ "${RUN_TASK_VX_GRIDSTAT}" = "TRUE" ] || \ - [ "${RUN_TASK_VX_POINTSTAT}" = "TRUE" ] || \ - [ "${RUN_TASK_VX_ENSGRID}" = "TRUE" ] || \ - [ "${RUN_TASK_VX_ENSPOINT}" = "TRUE" ]; then - - check=0 - if [ ! -d ${MET_INSTALL_DIR} ] ; then - print_info_msg "\ - The MET installation location must be set for this machine! - MET_INSTALL_DIR = \"${MET_INSTALL_DIR}\"" - check=1 - fi - - if [ ! -d ${METPLUS_PATH} ] ; then - print_info_msg "\ - The MET+ installation location must be set for this machine! - METPLUS_PATH = \"${METPLUS_PATH}\"" - check=1 - fi - - if [ -z ${MET_BIN_EXEC} ] ; then - print_info_msg "\ - The MET execution command must be set for this machine! - MET_BIN_EXEC = \"${MET_BIN_EXEC}\"" - check=1 - fi - - if [ ! -d ${CCPA_OBS_DIR} ] ; then - print_info_msg "\ - The CCPA observation location must be set for this machine! - CCPA_OBS_DIR = \"${CCPA_OBS_DIR}\"" - check=1 - fi - - if [ ! -d ${MRMS_OBS_DIR} ] ; then - print_info_msg "\ - The MRMS observation location must be set for this machine! - MRMS_OBS_DIR = \"${MRMS_OBS_DIR}\"" - check=1 - fi - - if [ ! -d ${NDAS_OBS_DIR} ] ; then - print_info_msg "\ - The NDAS observation location must be set for this machine! 
- NDAS_OBS_DIR = \"${NDAS_OBS_DIR}\"" - check=1 - fi - - if [ ${check} = 1 ] ; then - print_err_msg_exit "\ - Please set MET variables in the machine file for \ - MACHINE = \"${MACHINE}\"" - fi - - fi -# -#----------------------------------------------------------------------- -# -# On some machines (e.g. cheyenne), some tasks often require multiple -# tries before they succeed. To make it more convenient to run the WE2E -# tests on these machines without manual intervention, change the number -# of attempts for such tasks on those machines to be more than one. -# -#----------------------------------------------------------------------- -# - add_maxtries="FALSE" - - if [ "$MACHINE" = "HERA" ]; then - add_maxtries="TRUE" - MAXTRIES_MAKE_ICS="2" - MAXTRIES_MAKE_LBCS="2" - MAXTRIES_RUN_POST="2" - elif [ "$MACHINE" = "CHEYENNE" ]; then - add_maxtries="TRUE" - MAXTRIES_MAKE_SFC_CLIMO="3" - MAXTRIES_MAKE_ICS="5" - MAXTRIES_MAKE_LBCS="10" - MAXTRIES_RUN_POST="10" - fi - - if [ "${add_maxtries}" = "TRUE" ]; then - - expt_config_str=${expt_config_str}" -# -# Maximum number of attempts at running each task. -# -MAXTRIES_MAKE_GRID=\"${MAXTRIES_MAKE_GRID}\" -MAXTRIES_MAKE_OROG=\"${MAXTRIES_MAKE_OROG}\" -MAXTRIES_MAKE_SFC_CLIMO=\"${MAXTRIES_MAKE_SFC_CLIMO}\" -MAXTRIES_GET_EXTRN_ICS=\"${MAXTRIES_GET_EXTRN_ICS}\" -MAXTRIES_GET_EXTRN_LBCS=\"${MAXTRIES_GET_EXTRN_LBCS}\" -MAXTRIES_MAKE_ICS=\"${MAXTRIES_MAKE_ICS}\" -MAXTRIES_MAKE_LBCS=\"${MAXTRIES_MAKE_LBCS}\" -MAXTRIES_RUN_FCST=\"${MAXTRIES_RUN_FCST}\" -MAXTRIES_RUN_POST=\"${MAXTRIES_RUN_POST}\"" - - fi -# -#----------------------------------------------------------------------- -# Write content to a temporary config file -#----------------------------------------------------------------------- -# - temp_file="$PWD/_config_temp_.sh" - expt_config_fp="${temp_file}" - printf "%s" "${expt_config_str}" > "${expt_config_fp}" -# -#----------------------------------------------------------------------- -# -# The following are changes that need to be made directly to the -# experiment configuration file created above (as opposed to the -# experiment configuration string expt_config_str) because they involve -# resetting of values that have already been set in the experiment -# configuration file. -# -# If EXTRN_MDL_SYSBASEDIR_ICS has been specified in the current WE2E -# test's base configuration file, it must be set to one of the following: -# -# 1) The string "set_to_non_default_location_in_testing_script" in order -# to allow this script to set it to a valid location depending on the -# machine and external model (for ICs). -# -# 2) To an existing directory. If it is set to a directory, then this -# script ensures that the directory exists (via the check below). -# -#----------------------------------------------------------------------- -# - if [ -n "${EXTRN_MDL_SYSBASEDIR_ICS}" ]; then - - if [ "${EXTRN_MDL_SYSBASEDIR_ICS}" = "set_to_non_default_location_in_testing_script" ]; then - - EXTRN_MDL_SYSBASEDIR_ICS="${TEST_ALT_EXTRN_MDL_SYSBASEDIR_ICS:-}" - - if [ -z "${EXTRN_MDL_SYSBASEDIR_ICS}" ]; then - print_err_msg_exit "\ -A non-default location for EXTRN_MDL_SYSBASEDIR_ICS for testing purposes -has not been specified for this machine (MACHINE) and external model for -initial conditions (EXTRN_MDL_NAME_ICS) combination: - MACHINE= \"${MACHINE}\" - EXTRN_MDL_NAME_ICS = \"${EXTRN_MDL_NAME_ICS}\"" - fi - - # Maintain any templates in EXTRN_MDL_SYSBASEDIR_ICS -- don't use - # quotes. 
- set_bash_param "${expt_config_fp}" \ - "EXTRN_MDL_SYSBASEDIR_ICS" ${EXTRN_MDL_SYSBASEDIR_ICS} - - fi - - # Check the base directory for the specified location. - if [ ! -d "$(dirname ${EXTRN_MDL_SYSBASEDIR_ICS%%\$*})" ]; then - print_err_msg_exit "\ -The non-default location specified by EXTRN_MDL_SYSBASEDIR_ICS does not -exist or is not a directory: - EXTRN_MDL_NAME_ICS = \"${EXTRN_MDL_NAME_ICS}\"" - fi - - - fi -# -#----------------------------------------------------------------------- -# -# Same as above but for EXTRN_MDL_SYSBASEDIR_LBCS. -# -#----------------------------------------------------------------------- -# - if [ -n "${EXTRN_MDL_SYSBASEDIR_LBCS}" ]; then - - if [ "${EXTRN_MDL_SYSBASEDIR_LBCS}" = "set_to_non_default_location_in_testing_script" ]; then - - EXTRN_MDL_SYSBASEDIR_LBCS="${TEST_ALT_EXTRN_MDL_SYSBASEDIR_LBCS:-}" - - if [ -z "${EXTRN_MDL_SYSBASEDIR_LBCS}" ]; then - print_err_msg_exit "\ -A non-default location for EXTRN_MDL_SYSBASEDIR_LBCS for testing purposes -has not been specified for this machine (MACHINE) and external model for -initial conditions (EXTRN_MDL_NAME_LBCS) combination: - MACHINE= \"${MACHINE}\" - EXTRN_MDL_NAME_LBCS = \"${EXTRN_MDL_NAME_LBCS}\"" - fi - - # Maintain any templates in EXTRN_MDL_SYSBASEDIR_ICS -- don't use - # quotes. - set_bash_param "${expt_config_fp}" \ - "EXTRN_MDL_SYSBASEDIR_LBCS" ${EXTRN_MDL_SYSBASEDIR_LBCS} - - fi - - # Check the base directory for the specified location. - if [ ! -d "$(dirname ${EXTRN_MDL_SYSBASEDIR_LBCS%%\$*})" ]; then - print_err_msg_exit "\ -The non-default location specified by EXTRN_MDL_SYSBASEDIR_LBCS does not -exist or is not a directory: - EXTRN_MDL_NAME_LBCS = \"${EXTRN_MDL_NAME_LBCS}\"" - fi - - - fi -# -#----------------------------------------------------------------------- -# -# Set the full path to the configuration file that the experiment -# generation script reads in. Then write the contents of expt_config_str -# to that file. -# -#----------------------------------------------------------------------- -# - expt_config_fp="$USHdir/${EXPT_CONFIG_FN}" - ext="${EXPT_CONFIG_FN##*.}" - config_to_str "${ext}" "${temp_file}" -t "$USHdir/config_defaults.yaml" >"${expt_config_fp}" - rm -rf "${temp_file}" -# -#----------------------------------------------------------------------- -# -# Call the experiment generation script to generate an experiment -# directory and a rocoto workflow XML for the current WE2E test to run. -# -#----------------------------------------------------------------------- -# - $USHdir/generate_FV3LAM_wflow.py - - if [ $? != 0 ] ; then - print_err_msg_exit "\ -Could not generate an experiment for the test specified by test_name: - test_name = \"${test_name}\"" - fi - -done - -# Print notes about monitoring/running jobs if use_cron_to_relaunch = FALSE -topdir=${scrfunc_dir%/*/*/*} -expt_dirs_fullpath="${topdir}/expt_dirs" - -echo " - ======================================================================== - ======================================================================== - - All experiments have been generated in the directory - ${expt_dirs_fullpath} - - ======================================================================== - ======================================================================== -" - -if [ "${use_cron_to_relaunch,,}" = "false" ]; then - echo " - -The variable 'use_cron_to_relaunch' has been set to FALSE. Jobs will not be automatically run via crontab. 
- -You can run each task manually in the experiment directory: -(${expt_dirs_fullpath}) - -Or you can use the 'run_srw_tests.py' script in the ush/ directory: - - cd $USHdir - ./run_srw_tests.py -e=${expt_dirs_fullpath} - -" -fi - -# -#----------------------------------------------------------------------- -# -# Restore the shell options saved at the beginning of this script or -# function. -# -#----------------------------------------------------------------------- -# -{ restore_shell_opts; } > /dev/null 2>&1 - diff --git a/tests/WE2E/setup_WE2E_tests.sh b/tests/WE2E/setup_WE2E_tests.sh index c031ad89b2..a0ddb94ec9 100755 --- a/tests/WE2E/setup_WE2E_tests.sh +++ b/tests/WE2E/setup_WE2E_tests.sh @@ -54,16 +54,16 @@ test_type=${4:-fundamental} #---------------------------------------------------------------------- opts= if [[ "$*" != *"debug"* ]]; then - opts="${opts} debug=TRUE" + opts="${opts} --debug=TRUE" fi if [[ "$*" != *"verbose"* ]]; then - opts="${opts} verbose=TRUE" + opts="${opts} --verbose=TRUE" fi if [[ "$*" != *"cron_relaunch_intvl_mnts"* ]]; then - opts="${opts} cron_relaunch_intvl_mnts=4" + opts="${opts} --cron_relaunch_intvl_mnts=4" fi if [[ "$*" != *"exec_subdir"* ]]; then - opts="${opts} exec_subdir=install_${compiler}/exec" + opts="${opts} --exec_subdir=install_${compiler}/exec" fi #----------------------------------------------------------------------- @@ -74,11 +74,11 @@ fi source ../../ush/load_modules_wflow.sh ${machine} # Run the E2E Workflow tests -./run_WE2E_tests.sh \ - machine=${machine} \ - account=${account} \ - compiler=${compiler} \ - test_type=${test_type} \ +./run_WE2E_tests.py \ + --machine=${machine} \ + --account=${account} \ + --compiler=${compiler} \ + --tests=${test_type} \ ${opts} \ "${@:5}" From 5621385e5cc419c7d227c879b817d71a9a188c70 Mon Sep 17 00:00:00 2001 From: "Michael Kavulich, Jr" Date: Mon, 6 Mar 2023 18:52:24 +0000 Subject: [PATCH 32/52] Updates to Jenkins test script for new python workflow --- .cicd/scripts/srw_test.sh | 48 ++++----------------------------------- 1 file changed, 4 insertions(+), 44 deletions(-) diff --git a/.cicd/scripts/srw_test.sh b/.cicd/scripts/srw_test.sh index 8df2ff5c2f..7a4796dc8e 100755 --- a/.cicd/scripts/srw_test.sh +++ b/.cicd/scripts/srw_test.sh @@ -38,58 +38,18 @@ else fi cd ${we2e_test_dir} +# Progress file +progress_file="${workspace}/we2e_test_results-${platform}-${SRW_COMPILER}.txt" ./setup_WE2E_tests.sh ${platform} ${SRW_PROJECT} ${SRW_COMPILER} ${test_type} \ expt_basedir=${we2e_experiment_base_dir} \ - opsroot=${nco_dir} - -# Run the new run_srw_tests script if the machine is Cheyenne. -if [[ "${platform}" = "cheyenne" ]]; then - cd ${workspace}/ush - ./run_srw_tests.py -e=${we2e_experiment_base_dir} - cd ${we2e_test_dir} -fi + opsroot=${nco_dir} | tee ${progress_file} # Progress file progress_file="${workspace}/we2e_test_results-${platform}-${SRW_COMPILER}.txt" -# Allow the tests to start before checking for status. -# TODO: Create a parameter that sets the initial start delay. -if [[ "${platform}" != "cheyenne" ]]; then - sleep 300 -fi - -# Wait for all tests to complete. -while true; do - - # Check status of all experiments - ./get_expts_status.sh expts_basedir="${we2e_experiment_base_dir}" \ - verbose="FALSE" | tee ${progress_file} - - # Exit loop only if there are not tests in progress - set +e - grep -q "Workflow status: IN PROGRESS" ${progress_file} - exit_code=$? - set -e - - if [[ $exit_code -ne 0 ]]; then - break - fi - - # TODO: Create a paremeter that sets the poll frequency. 
- sleep 60 -done - -# Allow we2e cron jobs time to complete and clean up themselves -# TODO: Create parameter that sets the interval for the we2e cron jobs; this -# value should be some factor of that interval to ensure the cron jobs execute -# before the workspace is cleaned up. -if [[ "${platform}" != "cheyenne" ]]; then - sleep 600 -fi - # Set exit code to number of failures set +e -failures=$(grep "Workflow status: FAILURE" ${progress_file} | wc -l) +failures=$(grep " DEAD " ${progress_file} | wc -l) if [[ $failures -ne 0 ]]; then failures=1 fi From 7e5436d9eff53d0fa50b765d3ea48c39f7c934e2 Mon Sep 17 00:00:00 2001 From: "Michael Kavulich, Jr" Date: Mon, 6 Mar 2023 20:01:17 +0000 Subject: [PATCH 33/52] Fix unit test for new behavior of calculate_cost.py --- ush/calculate_cost.py | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/ush/calculate_cost.py b/ush/calculate_cost.py index 56e0ed9cf8..87f789aebc 100755 --- a/ush/calculate_cost.py +++ b/ush/calculate_cost.py @@ -103,16 +103,6 @@ def calculate_cost(config_fn): class Testing(unittest.TestCase): def test_calculate_cost(self): USHdir = os.path.dirname(os.path.abspath(__file__)) - params = calculate_cost(None) - self.assertCountEqual(params, [36, 1987440, 36, 28689]) - - def setUp(self): - set_env_var("DEBUG", False) - set_env_var("VERBOSE", False) - set_env_var("PREDEF_GRID_NAME", "RRFS_CONUS_3km") - set_env_var("DT_ATMOS", 36) - set_env_var("LAYOUT_X", 18) - set_env_var("LAYOUT_Y", 36) - set_env_var("BLOCKSIZE", 28) - set_env_var("QUILTING", False) - set_env_var("RUN_ENVIR", "community") + params = calculate_cost('config.community.yaml') + self.assertCountEqual(params, [180, 28689, 180, 28689]) + From 1dc12293e03f045f966896d0f8db9f67829cdceb Mon Sep 17 00:00:00 2001 From: "Michael Kavulich, Jr" Date: Mon, 6 Mar 2023 20:07:10 +0000 Subject: [PATCH 34/52] Fix unit test for real this time? 
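A relative filename such as 'config.community.yaml' is resolved against
whatever directory the test happens to be invoked from, so the previous
fix presumably only passed when the unit test was run from inside ush/.
Anchoring the path to the test module's own location makes it independent
of the caller's working directory. A minimal standalone sketch of the idea
(not the repository code itself):

    import os

    # Directory containing this test module; using __file__ keeps the
    # lookup independent of the current working directory.
    USHdir = os.path.dirname(os.path.abspath(__file__))
    config_path = os.path.join(USHdir, "config.community.yaml")
    assert os.path.isabs(config_path)
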
--- ush/calculate_cost.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ush/calculate_cost.py b/ush/calculate_cost.py index 87f789aebc..731cce76f7 100755 --- a/ush/calculate_cost.py +++ b/ush/calculate_cost.py @@ -103,6 +103,6 @@ def calculate_cost(config_fn): class Testing(unittest.TestCase): def test_calculate_cost(self): USHdir = os.path.dirname(os.path.abspath(__file__)) - params = calculate_cost('config.community.yaml') + params = calculate_cost(os.path.join(USHdir, 'config.community.yaml')) self.assertCountEqual(params, [180, 28689, 180, 28689]) From 399b10d5e10901b91895c6b2b1e2876447feb3c1 Mon Sep 17 00:00:00 2001 From: "Michael Kavulich, Jr" Date: Mon, 6 Mar 2023 16:04:04 -0700 Subject: [PATCH 35/52] Don't call rocotorun for WE2E_summary --- tests/WE2E/utils.py | 40 ++++++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 18 deletions(-) diff --git a/tests/WE2E/utils.py b/tests/WE2E/utils.py index 01e12eded7..5c2eebf0ac 100755 --- a/tests/WE2E/utils.py +++ b/tests/WE2E/utils.py @@ -167,7 +167,7 @@ def create_expt_dict(expt_dir: str) -> dict: continue #Update the experiment dictionary logging.debug(f"Reading status of experiment {item}") - update_expt_status(expt_dict[item],item,True,False) + update_expt_status(expt_dict[item],item,True,False,False) summary_file = f'WE2E_tests_{datetime.now().strftime("%Y%m%d%H%M%S")}.yaml' return summary_file, expt_dict @@ -226,7 +226,8 @@ def write_monitor_file(monitor_file: str, expt_dict: dict): raise -def update_expt_status(expt: dict, name: str, refresh: bool = False, debug: bool = False) -> dict: +def update_expt_status(expt: dict, name: str, refresh: bool = False, debug: bool = False, + submit: bool = True) -> dict: """ This function reads the dictionary showing the location of a given experiment, runs a `rocotorun` command to update the experiment (running new jobs and updating the status of @@ -270,6 +271,9 @@ def update_expt_status(expt: dict, name: str, refresh: bool = False, debug: bool debug (bool): Will capture all output from rocotorun. This will allow information such as job cards and job submit messages to appear in the log files, but can slow down the process drastically. + submit (bool): In addition to reading the rocoto database, script will advance the + workflow by calling rocotorun. If simply generating a report, set this + to False Returns: dict: The updated experiment dictionary. 
""" @@ -277,24 +281,24 @@ def update_expt_status(expt: dict, name: str, refresh: bool = False, debug: bool #If we are no longer tracking this experiment, return unchanged if (expt["status"] in ['DEAD','ERROR','COMPLETE']) and not refresh: return expt - - if refresh: - logging.info(f"Updating database for experiment {name}") # Update experiment, read rocoto database rocoto_db = f"{expt['expt_dir']}/FV3LAM_wflow.db" - if debug: - rocotorun_cmd = ["rocotorun", f"-w {expt['expt_dir']}/FV3LAM_wflow.xml", f"-d {rocoto_db}", "-v 10"] - p = subprocess.run(rocotorun_cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True) - logging.debug(p.stdout) - - #Run rocotorun again to get around rocotobqserver proliferation issue - p = subprocess.run(rocotorun_cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True) - logging.debug(p.stdout) - else: - rocotorun_cmd = ["rocotorun", f"-w {expt['expt_dir']}/FV3LAM_wflow.xml", f"-d {rocoto_db}"] - subprocess.run(rocotorun_cmd) - #Run rocotorun again to get around rocotobqserver proliferation issue - subprocess.run(rocotorun_cmd) + if submit: + if refresh: + logging.info(f"Updating database for experiment {name}") + if debug: + rocotorun_cmd = ["rocotorun", f"-w {expt['expt_dir']}/FV3LAM_wflow.xml", f"-d {rocoto_db}", "-v 10"] + p = subprocess.run(rocotorun_cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True) + logging.debug(p.stdout) + + #Run rocotorun again to get around rocotobqserver proliferation issue + p = subprocess.run(rocotorun_cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True) + logging.debug(p.stdout) + else: + rocotorun_cmd = ["rocotorun", f"-w {expt['expt_dir']}/FV3LAM_wflow.xml", f"-d {rocoto_db}"] + subprocess.run(rocotorun_cmd) + #Run rocotorun again to get around rocotobqserver proliferation issue + subprocess.run(rocotorun_cmd) logging.debug(f"Reading database for experiment {name}, updating experiment dictionary") try: From 0bd9c5025f815ccbb307e259f7faea2747c9a5f0 Mon Sep 17 00:00:00 2001 From: "Michael Kavulich, Jr" Date: Mon, 6 Mar 2023 18:08:53 -0700 Subject: [PATCH 36/52] Add directory name to test summary --- tests/WE2E/utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/WE2E/utils.py b/tests/WE2E/utils.py index 5c2eebf0ac..687dea04f8 100755 --- a/tests/WE2E/utils.py +++ b/tests/WE2E/utils.py @@ -51,6 +51,7 @@ def print_WE2E_summary(expt_dict: dict, debug: bool = False): expt_details.append('') expt_details.append('-'*REPORT_WIDTH) expt_details.append(f'Detailed summary of experiment {expt}') + expt_details.append(f"in directory {expt_dict[expt]['expt_dir']}") expt_details.append(f'{" "*40} | Status | Walltime | Core hours used') expt_details.append('-'*REPORT_WIDTH) From 903d881d96d5d653f28bd793672e6ea4ec5d93c3 Mon Sep 17 00:00:00 2001 From: "Michael Kavulich, Jr" Date: Tue, 7 Mar 2023 02:13:44 +0000 Subject: [PATCH 37/52] - More general cleanup, including suggestions from pylint - Rename monitor_jobs.yaml to WE2E_tests.yaml to conform with naming convention of auto-generated yaml files - Rename instances of "expt_dir" to "expts_dir" if it is a dictionary of multiple experiment dictionaries; retain "expt_dir" for single-experiment dictionaries - Remove duplicate copy of calculate_core_hours() that somehow pylint doesnt catch?? 
--- tests/WE2E/WE2E_summary.py | 16 +- .../{monitor_jobs.yaml => WE2E_tests.yaml} | 0 tests/WE2E/monitor_jobs.py | 50 +++-- tests/WE2E/run_WE2E_tests.py | 82 +++++--- tests/WE2E/utils.py | 177 ++++++++---------- 5 files changed, 164 insertions(+), 161 deletions(-) rename tests/WE2E/{monitor_jobs.yaml => WE2E_tests.yaml} (100%) diff --git a/tests/WE2E/WE2E_summary.py b/tests/WE2E/WE2E_summary.py index 26d1ccd731..de478a0f38 100755 --- a/tests/WE2E/WE2E_summary.py +++ b/tests/WE2E/WE2E_summary.py @@ -10,9 +10,7 @@ from check_python_version import check_python_version -from utils import calculate_core_hours, create_expt_dict, print_WE2E_summary, write_monitor_file - -REPORT_WIDTH = 100 +from utils import calculate_core_hours, create_expts_dict, print_WE2E_summary, write_monitor_file def setup_logging(debug: bool = False) -> None: """ @@ -44,7 +42,7 @@ def setup_logging(debug: bool = False) -> None: req = parser.add_mutually_exclusive_group(required=True) req.add_argument('-y', '--yaml_file', type=str, help='YAML-format file specifying the information of jobs to be summarized; '\ - 'for an example file, see monitor_jobs.yaml') + 'for an example file, see WE2E_tests.yaml') req.add_argument('-e', '--expt_dir', type=str, help='The full path of an experiment directory, containing one or more '\ 'subdirectories with UFS SRW App experiments in them') @@ -59,15 +57,15 @@ def setup_logging(debug: bool = False) -> None: # Set up dictionary of experiments if args.expt_dir: - yaml_file, expt_dict = create_expt_dict(args.expt_dir) + yaml_file, expts_dict = create_expts_dict(args.expt_dir) elif args.yaml_file: - expt_dict = load_config_file(args.yaml_file) + expts_dict = load_config_file(args.yaml_file) else: raise ValueError(f'Bad arguments; run {__file__} -h for more information') # Calculate core hours and update yaml - expt_dict = calculate_core_hours(expt_dict) - write_monitor_file(yaml_file,expt_dict) + expts_dict = calculate_core_hours(expts_dict) + write_monitor_file(yaml_file,expts_dict) #Call function to print summary - print_WE2E_summary(expt_dict, args.debug) + print_WE2E_summary(expts_dict, args.debug) diff --git a/tests/WE2E/monitor_jobs.yaml b/tests/WE2E/WE2E_tests.yaml similarity index 100% rename from tests/WE2E/monitor_jobs.yaml rename to tests/WE2E/WE2E_tests.yaml diff --git a/tests/WE2E/monitor_jobs.py b/tests/WE2E/monitor_jobs.py index a5634b52d4..626b0d8f81 100755 --- a/tests/WE2E/monitor_jobs.py +++ b/tests/WE2E/monitor_jobs.py @@ -17,18 +17,17 @@ from utils import calculate_core_hours, write_monitor_file, update_expt_status,\ update_expt_status_parallel -def monitor_jobs(expt_dict: dict, monitor_file: str = '', procs: int = 1, debug: bool = False) -> str: +def monitor_jobs(expts_dict: dict, monitor_file: str = '', procs: int = 1, debug: bool = False) -> str: """Function to monitor and run jobs for the specified experiment using Rocoto Args: - expt_dict (dict): A dictionary containing the information needed to run + expts_dict (dict): A dictionary containing the information needed to run one or more experiments. 
See example file monitor_jobs.yaml monitor_file (str): [optional] debug (bool): [optional] Enable extra output for debugging Returns: str: The name of the file used for job monitoring (when script is finished, this contains results/summary) - """ starttime = datetime.now() @@ -37,68 +36,65 @@ def monitor_jobs(expt_dict: dict, monitor_file: str = '', procs: int = 1, debug: monitor_file = f'WE2E_tests_{starttime.strftime("%Y%m%d%H%M%S")}.yaml' logging.info(f"Writing information for all experiments to {monitor_file}") - write_monitor_file(monitor_file,expt_dict) + write_monitor_file(monitor_file,expts_dict) # Perform initial setup for each experiment logging.info("Checking tests available for monitoring...") if procs > 1: print(f'Starting experiments in parallel with {procs} processes') - expt_dict = update_expt_status_parallel(expt_dict, procs, True, debug) + expts_dict = update_expt_status_parallel(expts_dict, procs, True, debug) else: - for expt in expt_dict: + for expt in expts_dict: logging.info(f"Starting experiment {expt} running") - expt_dict[expt] = update_expt_status(expt_dict[expt], expt, True, debug) + expts_dict[expt] = update_expt_status(expts_dict[expt], expt, True, debug) - write_monitor_file(monitor_file,expt_dict) + write_monitor_file(monitor_file,expts_dict) - logging.info(f'Setup complete; monitoring {len(expt_dict)} experiments') + logging.info(f'Setup complete; monitoring {len(expts_dict)} experiments') logging.info('Use ctrl-c to pause job submission/monitoring') #Make a copy of experiment dictionary; will use this copy to monitor active experiments - running_expts = expt_dict.copy() + running_expts = expts_dict.copy() i = 0 while running_expts: i += 1 if procs > 1: - expt_dict = update_expt_status_parallel(expt_dict, procs) + expts_dict = update_expt_status_parallel(expts_dict, procs) else: for expt in running_expts.copy(): - expt_dict[expt] = update_expt_status(expt_dict[expt], expt) + expts_dict[expt] = update_expt_status(expts_dict[expt], expt) for expt in running_expts.copy(): - running_expts[expt] = expt_dict[expt] - if running_expts[expt]["status"] in ['DEAD','ERROR','COMPLETE']: + running_expts[expt] = expts_dict[expt] + if running_expts[expt]["status"] in ['DEAD','ERROR','COMPLETE']: logging.info(f'Experiment {expt} is {running_expts[expt]["status"]};'\ 'will no longer monitor.') running_expts.pop(expt) continue - logging.debug(f'Experiment {expt} status is {expt_dict[expt]["status"]}') - + logging.debug(f'Experiment {expt} status is {expts_dict[expt]["status"]}') - write_monitor_file(monitor_file,expt_dict) + write_monitor_file(monitor_file,expts_dict) endtime = datetime.now() total_walltime = endtime - starttime logging.debug(f"Finished loop {i}\nWalltime so far is {str(total_walltime)}") - #Slow things down just a tad between loops so experiments behave better time.sleep(5) - endtime = datetime.now() total_walltime = endtime - starttime - logging.info(f'All {len(expt_dict)} experiments finished in {str(total_walltime)}') - logging.info(f'Calculating core-hour usage and printing final summary') + logging.info(f'All {len(expts_dict)} experiments finished in {str(total_walltime)}') + logging.info('Calculating core-hour usage and printing final summary') # Calculate core hours and update yaml - expt_dict = calculate_core_hours(expt_dict) - write_monitor_file(monitor_file,expt_dict) + expts_dict = calculate_core_hours(expts_dict) + write_monitor_file(monitor_file,expts_dict) #Call function to print summary - print_WE2E_summary(expt_dict, debug) + 
print_WE2E_summary(expts_dict, debug) return monitor_file @@ -141,7 +137,7 @@ def setup_logging(logfile: str = "log.run_WE2E_tests", debug: bool = False) -> N parser.add_argument('-y', '--yaml_file', type=str, help='YAML-format file specifying the information of jobs to be run; '\ 'for an example file, see monitor_jobs.yaml', required=True) - parser.add_argument('-p', '--procs', type=int, + parser.add_argument('-p', '--procs', type=int, help='Run resource-heavy tasks (such as calls to rocotorun) in parallel, '\ 'with provided number of parallel tasks', default=1) parser.add_argument('-d', '--debug', action='store_true', @@ -151,7 +147,7 @@ def setup_logging(logfile: str = "log.run_WE2E_tests", debug: bool = False) -> N setup_logging(logfile,args.debug) - expt_dict = load_config_file(args.yaml_file) + expts_dict = load_config_file(args.yaml_file) if args.procs < 1: raise ValueError('You can not have less than one parallel process; select a valid value for --procs') @@ -159,7 +155,7 @@ def setup_logging(logfile: str = "log.run_WE2E_tests", debug: bool = False) -> N #Call main function try: - monitor_jobs(expt_dict,args.yaml_file,args.procs,args.debug) + monitor_jobs(expts_dict,args.yaml_file,args.procs,args.debug) except KeyboardInterrupt: logging.info("\n\nUser interrupted monitor script; to resume monitoring jobs run:\n") logging.info(f"{__file__} -y={args.yaml_file} -p={args.procs}\n") diff --git a/tests/WE2E/run_WE2E_tests.py b/tests/WE2E/run_WE2E_tests.py index 8d51507689..9815221feb 100755 --- a/tests/WE2E/run_WE2E_tests.py +++ b/tests/WE2E/run_WE2E_tests.py @@ -65,7 +65,7 @@ def run_we2e_tests(homedir, args) -> None: tests_to_check = [] for f in alltests: filename = os.path.basename(f) - # We just want the test name in this list, so cut out the + # We just want the test name in this list, so cut out the # "config." 
prefix and ".yaml" extension tests_to_check.append(filename[7:-5]) logging.debug(f"Will check all tests:\n{tests_to_check}") @@ -332,20 +332,24 @@ def check_task_get_extrn_bcs(cfg: dict, mach: dict, dflt: dict, ics_or_lbcs: str # If USE_USER_STAGED_EXTRN_FILES not specified or false, do nothing and return if not cfg_bcs.get('USE_USER_STAGED_EXTRN_FILES'): - logging.debug(f'USE_USER_STAGED_EXTRN_FILES not specified or False in task_get_extrn_{ics_or_lbcs} section of config') + logging.debug('USE_USER_STAGED_EXTRN_FILES not specified or False in '\ + f'task_get_extrn_{ics_or_lbcs} section of config') return cfg_bcs # If EXTRN_MDL_SYSBASEDIR_* is "set_to_non_default_location_in_testing_script", replace with # test value from machine file - if cfg_bcs.get(f'EXTRN_MDL_SYSBASEDIR_{I_OR_L}') == "set_to_non_default_location_in_testing_script": + if cfg_bcs.get(f'EXTRN_MDL_SYSBASEDIR_{I_OR_L}') == \ + "set_to_non_default_location_in_testing_script": if f'TEST_ALT_EXTRN_MDL_SYSBASEDIR_{I_OR_L}' in mach['platform']: if os.path.isdir(mach['platform'][f'TEST_ALT_EXTRN_MDL_SYSBASEDIR_{I_OR_L}']): - raise FileNotFoundError(f"Non-default input file location TEST_ALT_EXTRN_MDL_SYSBASEDIR_{I_OR_L} from machine file does not exist or is not a directory") + raise FileNotFoundError("Non-default input file location "\ + f"TEST_ALT_EXTRN_MDL_SYSBASEDIR_{I_OR_L} from machine "\ + "file does not exist or is not a directory") cfg_bcs[f'EXTRN_MDL_SYSBASEDIR_{I_OR_L}'] = \ mach['platform'][f'TEST_ALT_EXTRN_MDL_SYSBASEDIR_{I_OR_L}'] else: - raise KeyError(f"Non-default input file location "\ - "TEST_ALT_EXTRN_MDL_SYSBASEDIR_{I_OR_L} not set in machine file") + raise KeyError("Non-default input file location "\ + f"TEST_ALT_EXTRN_MDL_SYSBASEDIR_{I_OR_L} not set in machine file") return cfg_bcs # Because USE_USER_STAGED_EXTRN_FILES is true, only look on disk, and ensure the staged data @@ -356,9 +360,9 @@ def check_task_get_extrn_bcs(cfg: dict, mach: dict, dflt: dict, ics_or_lbcs: str "has not been specified in the machine file for this platform") if not os.path.isdir(mach['platform']['TEST_EXTRN_MDL_SOURCE_BASEDIR']): raise FileNotFoundError(dedent( - f"""The directory for staged test data specified in this platform's machine file - TEST_EXTRN_MDL_SOURCE_BASEDIR = {mach['platform']['TEST_EXTRN_MDL_SOURCE_BASEDIR']} - does not exist.""")) + f"""The directory for staged test data specified in this platform's machine file + TEST_EXTRN_MDL_SOURCE_BASEDIR = {mach['platform']['TEST_EXTRN_MDL_SOURCE_BASEDIR']} + does not exist.""")) # Different input data types have different directory structures; set data dir accordingly if cfg_bcs[f'EXTRN_MDL_NAME_{I_OR_L}'] == 'FV3GFS': @@ -458,34 +462,57 @@ def setup_logging(logfile: str = "log.run_WE2E_tests", debug: bool = False) -> N logfile='log.run_WE2E_tests' #Parse arguments - parser = argparse.ArgumentParser(epilog="For more information about config arguments (denoted in CAPS), see ush/config_defaults.yaml\n") + parser = argparse.ArgumentParser(epilog="For more information about config arguments (denoted "\ + "in CAPS), see ush/config_defaults.yaml\n") # Create a group for optional arguments so they can be listed after required args optional = parser._action_groups.pop() required = parser.add_argument_group('required arguments') - required.add_argument('-m', '--machine', type=str, help='Machine name; see ush/machine/ for valid values', required=True) - required.add_argument('-a', '--account', type=str, help='Account name for running submitted jobs', required=True) - 
required.add_argument('-t', '--tests', type=str, nargs="*", help="""Can be one of three options (in order of priority): + required.add_argument('-m', '--machine', type=str, + help='Machine name; see ush/machine/ for valid values', required=True) + required.add_argument('-a', '--account', type=str, + help='Account name for running submitted jobs', required=True) + required.add_argument('-t', '--tests', type=str, nargs="*", + help="""Can be one of three options (in order of priority): 1. A test name or list of test names. 2. A test suite name ("fundamental", "comprehensive", or "all") 3. The name of a file (full or relative path) containing a list of test names. """, required=True) - parser.add_argument('-c', '--compiler', type=str, help='Compiler used for building the app', default='intel') - parser.add_argument('-d', '--debug', action='store_true', help='Script will be run in debug mode with more verbose output') - parser.add_argument('-q', '--quiet', action='store_true', help='Suppress console output from workflow generation; this will help keep the screen uncluttered') - parser.add_argument('-p', '--procs', type=int, help='Run resource-heavy tasks (such as calls to rocotorun) in parallel, with provided number of parallel tasks', default=1) + parser.add_argument('-c', '--compiler', type=str, + help='Compiler used for building the app', default='intel') + parser.add_argument('-d', '--debug', action='store_true', + help='Script will be run in debug mode with more verbose output') + parser.add_argument('-q', '--quiet', action='store_true', + help='Suppress console output from workflow generation; this will help '\ + 'keep the screen uncluttered') + parser.add_argument('-p', '--procs', type=int, + help='Run resource-heavy tasks (such as calls to rocotorun) in parallel, '\ + 'with provided number of parallel tasks', default=1) parser.add_argument('--modulefile', type=str, help='Modulefile used for building the app') - parser.add_argument('--run_envir', type=str, help='Overrides RUN_ENVIR variable to a new value ( "nco" or "community" ) for all experiments', default='') - parser.add_argument('--expt_basedir', type=str, help='Explicitly set EXPT_BASEDIR for all experiments') - parser.add_argument('--exec_subdir', type=str, help='Explicitly set EXEC_SUBDIR for all experiments') - parser.add_argument('--use_cron_to_relaunch', action='store_true', help='Explicitly set USE_CRON_TO_RELAUNCH for all experiments; this option disables the "monitor" script functionality') - parser.add_argument('--cron_relaunch_intvl_mnts', type=int, help='Overrides CRON_RELAUNCH_INTVL_MNTS for all experiments') - parser.add_argument('--opsroot', type=str, help='If test is for NCO mode, sets OPSROOT (see config_defaults.yaml for details)') - parser.add_argument('--print_test_info', action='store_true', help='Create a "WE2E_test_info.txt" file summarizing each test prior to starting experiment') - parser.add_argument('--debug_tests', action='store_true', help='Explicitly set DEBUG=TRUE for all experiments') - parser.add_argument('--verbose_tests', action='store_true', help='Explicitly set VERBOSE=TRUE for all experiments') + parser.add_argument('--run_envir', type=str, + help='Overrides RUN_ENVIR variable to a new value ("nco" or "community") '\ + 'for all experiments', default='') + parser.add_argument('--expt_basedir', type=str, + help='Explicitly set EXPT_BASEDIR for all experiments') + parser.add_argument('--exec_subdir', type=str, + help='Explicitly set EXEC_SUBDIR for all experiments') + 
parser.add_argument('--use_cron_to_relaunch', action='store_true', + help='Explicitly set USE_CRON_TO_RELAUNCH for all experiments; this '\ + 'option disables the "monitor" script functionality') + parser.add_argument('--cron_relaunch_intvl_mnts', type=int, + help='Overrides CRON_RELAUNCH_INTVL_MNTS for all experiments') + parser.add_argument('--opsroot', type=str, + help='If test is for NCO mode, sets OPSROOT (see config_defaults.yaml for '\ + 'more details on this variable)') + parser.add_argument('--print_test_info', action='store_true', + help='Create a "WE2E_test_info.txt" file summarizing each test prior to'\ + 'starting experiment') + parser.add_argument('--debug_tests', action='store_true', + help='Explicitly set DEBUG=TRUE for all experiments') + parser.add_argument('--verbose_tests', action='store_true', + help='Explicitly set VERBOSE=TRUE for all experiments') parser._action_groups.append(optional) @@ -495,7 +522,8 @@ def setup_logging(logfile: str = "log.run_WE2E_tests", debug: bool = False) -> N if args.modulefile is None: args.modulefile = f'build_{args.machine.lower()}_{args.compiler}' if args.procs < 1: - raise ValueError('You can not have less than one parallel process; select a valid value for --procs') + raise ValueError('You can not have less than one parallel process; select a valid value '\ + 'for --procs') # Print test details (if requested) if args.print_test_info: diff --git a/tests/WE2E/utils.py b/tests/WE2E/utils.py index 687dea04f8..93943f7df2 100755 --- a/tests/WE2E/utils.py +++ b/tests/WE2E/utils.py @@ -26,12 +26,12 @@ REPORT_WIDTH = 100 -def print_WE2E_summary(expt_dict: dict, debug: bool = False): +def print_WE2E_summary(expts_dict: dict, debug: bool = False): """Function that creates a summary for the specified experiment Args: - expt_dict (dict): A dictionary containing the information needed to run - one or more experiments. See example file monitor_jobs.yaml + expts_dict (dict): A dictionary containing the information needed to run + one or more experiments. 
See example file WE2E_tests.yaml debug (bool): [optional] Enable extra output for debugging Returns: None @@ -45,8 +45,8 @@ def print_WE2E_summary(expt_dict: dict, debug: bool = False): total_core_hours = 0 statuses = [] expt_details = [] - for expt in expt_dict: - statuses.append(expt_dict[expt]["status"]) + for expt in expts_dict: + statuses.append(expts_dict[expt]["status"]) ch = 0 expt_details.append('') expt_details.append('-'*REPORT_WIDTH) @@ -55,15 +55,15 @@ def print_WE2E_summary(expt_dict: dict, debug: bool = False): expt_details.append(f'{" "*40} | Status | Walltime | Core hours used') expt_details.append('-'*REPORT_WIDTH) - for task in expt_dict[expt]: + for task in expts_dict[expt]: # Skip non-task entries if task in ["expt_dir","status"]: continue - status = expt_dict[expt][task]["status"] - walltime = expt_dict[expt][task]["walltime"] + status = expts_dict[expt][task]["status"] + walltime = expts_dict[expt][task]["walltime"] expt_details.append(f'{task[:40]:<40s} {status:<12s} {walltime:>10.1f}') - if "core_hours" in expt_dict[expt][task]: - task_ch = expt_dict[expt][task]["core_hours"] + if "core_hours" in expts_dict[expt][task]: + task_ch = expts_dict[expt][task]["core_hours"] ch += task_ch expt_details[-1] = f'{expt_details[-1]} {task_ch:>13.2f}' else: @@ -102,45 +102,7 @@ def print_WE2E_summary(expt_dict: dict, debug: bool = False): for line in expt_details: f.write(f"{line}\n") -def calculate_core_hours(expt_dict: dict) -> dict: - """ - Function takes in an experiment dictionary, reads the var_defns file for necessary information, - and calculates the core hours used by each task, updating expt_dict with this info - - Args: - expt_dict (dict) : Experiment dictionary - Returns: - dict : Experiment dictionary updated with core hours - """ - - for expt in expt_dict: - # Read variable definitions file - vardefs = load_shell_config(os.path.join(expt_dict[expt]["expt_dir"],"var_defns.sh")) - vdf = flatten_dict(vardefs) - cores_per_node = vdf["NCORES_PER_NODE"] - for task in expt_dict[expt]: - # Skip non-task entries - if task in ["expt_dir","status"]: - continue - # Cycle is last 12 characters, task name is rest (minus separating underscore) - taskname = task[:-13] - # Handle task names that have ensemble and/or fhr info appended with regex - taskname = re.sub('_mem\d{3}', '', taskname) - taskname = re.sub('_f\d{3}', '', taskname) - nnodes_var = f'NNODES_{taskname.upper()}' - if nnodes_var in vdf: - nnodes = vdf[nnodes_var] - # Users are charged for full use of nodes, so core hours are CPN * nodes * time in hrs - core_hours = cores_per_node * nnodes * expt_dict[expt][task]['walltime'] / 3600 - expt_dict[expt][task]['exact_count'] = True - else: - # If we can't find the number of nodes, assume full usage (may undercount) - core_hours = expt_dict[expt][task]['cores'] * expt_dict[expt][task]['walltime'] / 3600 - expt_dict[expt][task]['exact_count'] = False - expt_dict[expt][task]['core_hours'] = round(core_hours,2) - return expt_dict - -def create_expt_dict(expt_dir: str) -> dict: +def create_expts_dict(expt_dir: str) -> dict: """ Function takes in a directory, searches that directory for subdirectories containing experiments, and creates a skeleton dictionary that can be filled out by update_expt_status() @@ -152,7 +114,7 @@ def create_expt_dict(expt_dir: str) -> dict: """ contents = os.listdir(expt_dir) - expt_dict=dict() + expts_dict=dict() for item in contents: # Look for FV3LAM_wflow.xml to indicate directories with experiments in them fullpath = os.path.join(expt_dir, item) 
@@ -160,36 +122,37 @@ def create_expt_dict(expt_dir: str) -> dict: continue xmlfile = os.path.join(expt_dir, item, 'FV3LAM_wflow.xml') if os.path.isfile(xmlfile): - expt_dict[item] = dict() - expt_dict[item].update({"expt_dir": os.path.join(expt_dir,item)}) - expt_dict[item].update({"status": "CREATED"}) + expts_dict[item] = dict() + expts_dict[item].update({"expt_dir": os.path.join(expt_dir,item)}) + expts_dict[item].update({"status": "CREATED"}) else: logging.debug(f'Skipping directory {item}, experiment XML file not found') continue #Update the experiment dictionary logging.debug(f"Reading status of experiment {item}") - update_expt_status(expt_dict[item],item,True,False,False) + update_expt_status(expts_dict[item],item,True,False,False) summary_file = f'WE2E_tests_{datetime.now().strftime("%Y%m%d%H%M%S")}.yaml' - return summary_file, expt_dict + return summary_file, expts_dict -def calculate_core_hours(expt_dict: dict) -> dict: +def calculate_core_hours(expts_dict: dict) -> dict: """ Function takes in an experiment dictionary, reads the var_defns file for necessary information, - and calculates the core hours used by each task, updating expt_dict with this info + and calculates the core hours used by each task, updating expts_dict with this info Args: - expt_dict (dict) : Experiment dictionary + expts_dict (dict): A dictionary containing the information needed to run + one or more experiments. See example file WE2E_tests.yaml Returns: - dict : Experiment dictionary updated with core hours + dict : Experiments dictionary updated with core hours """ - for expt in expt_dict: + for expt in expts_dict: # Read variable definitions file - vardefs = load_shell_config(os.path.join(expt_dict[expt]["expt_dir"],"var_defns.sh")) + vardefs = load_shell_config(os.path.join(expts_dict[expt]["expt_dir"],"var_defns.sh")) vdf = flatten_dict(vardefs) cores_per_node = vdf["NCORES_PER_NODE"] - for task in expt_dict[expt]: + for task in expts_dict[expt]: # Skip non-task entries if task in ["expt_dir","status"]: continue @@ -201,27 +164,29 @@ def calculate_core_hours(expt_dict: dict) -> dict: nnodes_var = f'NNODES_{taskname.upper()}' if nnodes_var in vdf: nnodes = vdf[nnodes_var] - # Users are charged for full use of nodes, so core hours are CPN * nodes * time in hrs - core_hours = cores_per_node * nnodes * expt_dict[expt][task]['walltime'] / 3600 - expt_dict[expt][task]['exact_count'] = True + # Users are charged for full use of nodes, so core hours = CPN * nodes * time in hrs + core_hours = cores_per_node * nnodes * expts_dict[expt][task]['walltime'] / 3600 + expts_dict[expt][task]['exact_count'] = True else: # If we can't find the number of nodes, assume full usage (may undercount) - core_hours = expt_dict[expt][task]['cores'] * expt_dict[expt][task]['walltime'] / 3600 - expt_dict[expt][task]['exact_count'] = False - expt_dict[expt][task]['core_hours'] = round(core_hours,2) - return expt_dict + core_hours = expts_dict[expt][task]['cores'] * \ + expts_dict[expt][task]['walltime'] / 3600 + expts_dict[expt][task]['exact_count'] = False + expts_dict[expt][task]['core_hours'] = round(core_hours,2) + return expts_dict -def write_monitor_file(monitor_file: str, expt_dict: dict): +def write_monitor_file(monitor_file: str, expts_dict: dict): try: with open(monitor_file,"w", encoding="utf-8") as f: f.write("### WARNING ###\n") - f.write("### THIS FILE IS AUTO_GENERATED AND REGULARLY OVER-WRITTEN BY WORKFKLOW SCRIPTS\n") + f.write("### THIS FILE IS AUTO_GENERATED AND REGULARLY OVER-WRITTEN BY WORKFLOW SCRIPTS\n") 
f.write("### EDITS MAY RESULT IN MISBEHAVIOR OF EXPERIMENTS RUNNING\n") - f.writelines(cfg_to_yaml_str(expt_dict)) + f.writelines(cfg_to_yaml_str(expts_dict)) except: logging.fatal("\n********************************\n") - logging.fatal(f"WARNING WARNING WARNING\nFailure occurred while writing monitor file {monitor_file}") + logging.fatal(f"WARNING WARNING WARNING\n") + logging.fatal("Failure occurred while writing monitor file {monitor_file}") logging.fatal("File may be corrupt or invalid for re-run!!") logging.fatal("\n********************************\n") raise @@ -252,8 +217,9 @@ def update_expt_status(expt: dict, name: str, refresh: bool = False, debug: bool ERROR: Could not read the rocoto database file. This will require manual intervention to solve, so we will no longer monitor this experiment. This status may also appear if we fail to read the rocoto database file. - RUNNING: One or more jobs are at status RUNNING, and the rest are either status QUEUED, SUBMITTED, - or SUCCEEDED. This is a normal state; we will continue to monitor this experiment. + RUNNING: One or more jobs are at status RUNNING, and the rest are either status QUEUED, + SUBMITTED, or SUCCEEDED. This is a normal state; we will continue to monitor this + experiment. QUEUED: One or more jobs are at status QUEUED, and some others may be at status SUBMITTED or SUCCEEDED. This is a normal state; we will continue to monitor this experiment. @@ -288,15 +254,19 @@ def update_expt_status(expt: dict, name: str, refresh: bool = False, debug: bool if refresh: logging.info(f"Updating database for experiment {name}") if debug: - rocotorun_cmd = ["rocotorun", f"-w {expt['expt_dir']}/FV3LAM_wflow.xml", f"-d {rocoto_db}", "-v 10"] - p = subprocess.run(rocotorun_cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True) + rocotorun_cmd = ["rocotorun", f"-w {expt['expt_dir']}/FV3LAM_wflow.xml", + f"-d {rocoto_db}", "-v 10"] + p = subprocess.run(rocotorun_cmd, stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, text=True) logging.debug(p.stdout) #Run rocotorun again to get around rocotobqserver proliferation issue - p = subprocess.run(rocotorun_cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True) + p = subprocess.run(rocotorun_cmd, stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, text=True) logging.debug(p.stdout) else: - rocotorun_cmd = ["rocotorun", f"-w {expt['expt_dir']}/FV3LAM_wflow.xml", f"-d {rocoto_db}"] + rocotorun_cmd = ["rocotorun", f"-w {expt['expt_dir']}/FV3LAM_wflow.xml", + f"-d {rocoto_db}"] subprocess.run(rocotorun_cmd) #Run rocotorun again to get around rocotobqserver proliferation issue subprocess.run(rocotorun_cmd) @@ -319,8 +289,9 @@ def update_expt_status(expt: dict, name: str, refresh: bool = False, debug: bool return expt for task in db: - # For each entry from rocoto database, store that task's info under a dictionary key named TASKNAME_CYCLE - # Cycle comes from the database in Unix Time (seconds), so convert to human-readable + # For each entry from rocoto database, store that task's info under a dictionary key named + # TASKNAME_CYCLE; Cycle comes from the database in Unix Time (seconds), so convert to + # human-readable cycle = datetime.utcfromtimestamp(task[1]).strftime('%Y%m%d%H%M') if f"{task[0]}_{cycle}" not in expt: expt[f"{task[0]}_{cycle}"] = dict() @@ -338,7 +309,8 @@ def update_expt_status(expt: dict, name: str, refresh: bool = False, debug: bool if "DEAD" in statuses: still_live = ["RUNNING", "SUBMITTING", "QUEUED", "FAILED"] if any(status in still_live for status in 
statuses): - logging.debug(f'DEAD job in experiment {name}; continuing to track until all jobs are complete') + logging.debug(f'DEAD job in experiment {name}; continuing to track until all jobs are '\ + 'complete') expt["status"] = "DYING" else: expt["status"] = "DEAD" @@ -386,7 +358,8 @@ def update_expt_status(expt: dict, name: str, refresh: bool = False, debug: bool return expt -def update_expt_status_parallel(expt_dict: dict, procs: int, refresh: bool = False, debug: bool = False) -> dict: +def update_expt_status_parallel(expts_dict: dict, procs: int, refresh: bool = False, + debug: bool = False) -> dict: """ This function updates an entire set of experiments in parallel, drastically speeding up the process if given enough parallel processes. Given an experiment dictionary, it will @@ -396,7 +369,7 @@ def update_expt_status_parallel(expt_dict: dict, procs: int, refresh: bool = Fal Making use of the python multiprocessing starmap functionality, takes Args: - expt_dict (dict): A dictionary containing information for all experiments + expts_dict (dict): A dictionary containing information for all experiments procs (int): The number of parallel processes refresh (bool): "Refresh" flag to pass to update_expt_status() debug (bool): Will capture all output from rocotorun. This will allow information such @@ -409,8 +382,8 @@ def update_expt_status_parallel(expt_dict: dict, procs: int, refresh: bool = Fal args = [] # Define a tuple of arguments to pass to starmap - for expt in expt_dict: - args.append( (expt_dict[expt],expt,refresh,debug) ) + for expt in expts_dict: + args.append( (expts_dict[expt],expt,refresh,debug) ) # call update_expt_status() in parallel with Pool(processes=procs) as pool: @@ -418,11 +391,11 @@ def update_expt_status_parallel(expt_dict: dict, procs: int, refresh: bool = Fal # Update dictionary with output from all calls to update_expt_status() i = 0 - for expt in expt_dict: - expt_dict[expt] = output[i] + for expt in expts_dict: + expts_dict[expt] = output[i] i += 1 - return expt_dict + return expts_dict @@ -455,9 +428,12 @@ def print_test_info(txtfile: str = "WE2E_test_info.txt") -> None: testdict[testname]["directory"] = dirname testdict[testname]["cost"] = cost #Calculate number of forecasts for a cycling run - if testdict[testname]['workflow']["DATE_FIRST_CYCL"] != testdict[testname]['workflow']["DATE_LAST_CYCL"]: - begin = datetime.strptime(testdict[testname]['workflow']["DATE_FIRST_CYCL"], '%Y%m%d%H') - end = datetime.strptime(testdict[testname]['workflow']["DATE_LAST_CYCL"], '%Y%m%d%H') + if testdict[testname]['workflow']["DATE_FIRST_CYCL"] != \ + testdict[testname]['workflow']["DATE_LAST_CYCL"]: + begin = datetime.strptime(testdict[testname]['workflow']["DATE_FIRST_CYCL"], + '%Y%m%d%H') + end = datetime.strptime(testdict[testname]['workflow']["DATE_LAST_CYCL"], + '%Y%m%d%H') diff = end - begin diffh = diff.total_seconds() // 3600 nf = diffh // testdict[testname]['workflow']["INCR_CYCL_FREQ"] @@ -486,7 +462,8 @@ def print_test_info(txtfile: str = "WE2E_test_info.txt") -> None: f.write(f"\"{expt}\n(") f.write(f"{testdict[expt]['directory']}){d}") if "alternate_name" in testdict[expt]: - f.write(f"{testdict[expt]['alternate_name']}\n({testdict[expt]['alternate_directory_name']}){d}") + f.write(f"{testdict[expt]['alternate_name']}\n"\ + f"({testdict[expt]['alternate_directory_name']}){d}") else: f.write(f"{d}\n") desc = testdict[expt]['metadata']['description'].splitlines() @@ -497,14 +474,17 @@ def print_test_info(txtfile: str = "WE2E_test_info.txt") -> None: 
f.write(f"{d}'{round(testdict[expt]['cost'],2)}{d}'{round(testdict[expt]['num_fcsts'])}") f.write(f"{d}" + get_or_print_blank(testdict[expt],'workflow','PREDEF_GRID_NAME')) f.write(f"{d}" + get_or_print_blank(testdict[expt],'workflow','CCPP_PHYS_SUITE')) - f.write(f"{d}" + get_or_print_blank(testdict[expt],'task_get_extrn_ics','EXTRN_MDL_NAME_ICS')) - f.write(f"{d}" + get_or_print_blank(testdict[expt],'task_get_extrn_lbcs','EXTRN_MDL_NAME_LBCS')) + f.write(f"{d}" + get_or_print_blank(testdict[expt],'task_get_extrn_ics', + 'EXTRN_MDL_NAME_ICS')) + f.write(f"{d}" + get_or_print_blank(testdict[expt],'task_get_extrn_lbcs', + 'EXTRN_MDL_NAME_LBCS')) f.write(f"{d}" + get_or_print_blank(testdict[expt],'workflow','DATE_FIRST_CYCL')) f.write(f"{d}" + get_or_print_blank(testdict[expt],'workflow','DATE_LAST_CYCL')) f.write(f"{d}" + get_or_print_blank(testdict[expt],'workflow','INCR_CYCL_FREQ')) f.write(f"{d}" + get_or_print_blank(testdict[expt],'workflow','FCST_LEN_HRS')) f.write(f"{d}" + get_or_print_blank(testdict[expt],'task_run_fcst','DT_ATMOS')) - f.write(f"{d}" + get_or_print_blank(testdict[expt],'task_get_extrn_lbcs','LBC_SPEC_INTVL_HRS')) + f.write(f"{d}" + get_or_print_blank(testdict[expt],'task_get_extrn_lbcs', + 'LBC_SPEC_INTVL_HRS')) f.write(f"{d}" + get_or_print_blank(testdict[expt],'global','NUM_ENS_MEMBERS') + "\n") def get_or_print_blank(d,key1,key2): @@ -538,7 +518,8 @@ def compare_rocotostat(expt_dict,name): # Call rocotostat and store output rocoto_db = f"{expt_dict['expt_dir']}/FV3LAM_wflow.db" - rocotorun_cmd = ["rocotostat", f"-w {expt_dict['expt_dir']}/FV3LAM_wflow.xml", f"-d {rocoto_db}", "-v 10"] + rocotorun_cmd = ["rocotostat", f"-w {expt_dict['expt_dir']}/FV3LAM_wflow.xml", + f"-d {rocoto_db}", "-v 10"] p = subprocess.run(rocotorun_cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True) rsout = p.stdout @@ -575,7 +556,7 @@ def compare_rocotostat(expt_dict,name): elif expt_dict['status'] == 'STALLED': expt_dict['status'] = 'STUCK' elif expt_dict['status'] == 'STUCK': - msg = f"WARNING: For experiment {name}, there are some jobs that are not being submitted:" + msg = f"WARNING: For experiment {name}, there are jobs that are not being submitted:" for ut in untracked_tasks: msg += ut msg = msg + f"""WARNING: For experiment {name}, From e7380cab9ad0f70636a54e29c95ff677a18f3aa3 Mon Sep 17 00:00:00 2001 From: "Michael Kavulich, Jr" Date: Tue, 7 Mar 2023 04:57:36 +0000 Subject: [PATCH 38/52] Fix missed "expt_dict" rename, widen test name column in summary file --- tests/WE2E/utils.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/WE2E/utils.py b/tests/WE2E/utils.py index 93943f7df2..453724761c 100755 --- a/tests/WE2E/utils.py +++ b/tests/WE2E/utils.py @@ -40,7 +40,7 @@ def print_WE2E_summary(expts_dict: dict, debug: bool = False): # Create summary table as list of strings summary = [] summary.append('-'*REPORT_WIDTH) - summary.append(f'Experiment name {" "*43} | Status | Core hours used ') + summary.append(f'Experiment name {" "*48} | Status | Core hours used ') summary.append('-'*REPORT_WIDTH) total_core_hours = 0 statuses = [] @@ -51,7 +51,7 @@ def print_WE2E_summary(expts_dict: dict, debug: bool = False): expt_details.append('') expt_details.append('-'*REPORT_WIDTH) expt_details.append(f'Detailed summary of experiment {expt}') - expt_details.append(f"in directory {expt_dict[expt]['expt_dir']}") + expt_details.append(f"in directory {expts_dict[expt]['expt_dir']}") expt_details.append(f'{" "*40} | Status | Walltime | Core hours used') 
expt_details.append('-'*REPORT_WIDTH) @@ -70,7 +70,7 @@ def print_WE2E_summary(expts_dict: dict, debug: bool = False): expt_details[-1] = f'{expt_details[-1]} -' expt_details.append('-'*REPORT_WIDTH) expt_details.append(f'Total {" "*34} {statuses[-1]:<12s} {" "*11} {ch:>13.2f}') - summary.append(f'{expt[:60]:<60s} {statuses[-1]:<12s} {ch:>13.2f}') + summary.append(f'{expt[:65]:<65s} {statuses[-1]:<12s} {ch:>13.2f}') total_core_hours += ch if "ERROR" in statuses: total_status = "ERROR" @@ -85,7 +85,7 @@ def print_WE2E_summary(expts_dict: dict, debug: bool = False): else: total_status = "UNKNOWN" summary.append('-'*REPORT_WIDTH) - summary.append(f'Total {" "*54} {total_status:<12s} {total_core_hours:>13.2f}') + summary.append(f'Total {" "*59} {total_status:<12s} {total_core_hours:>13.2f}') # Print summary to screen for line in summary: From 18389ca8446596da621291061c0c2ceb78b98a8d Mon Sep 17 00:00:00 2001 From: "Michael Kavulich, Jr" Date: Tue, 7 Mar 2023 05:44:37 +0000 Subject: [PATCH 39/52] Add missing column header for test info file --- tests/WE2E/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/WE2E/utils.py b/tests/WE2E/utils.py index 453724761c..92c190f602 100755 --- a/tests/WE2E/utils.py +++ b/tests/WE2E/utils.py @@ -454,7 +454,7 @@ def print_test_info(txtfile: str = "WE2E_test_info.txt") -> None: txt_output = ['"Test Name'] txt_output.append(f'(Subdirectory){d}Alternate Test Names') txt_output.append(f'(Subdirectories){d}Test Purpose/Description{d}Relative Cost of Running Dynamics') - txt_output.append(f'(1 corresponds to running a 6-hour forecast on the RRFS_CONUS_25km predefined grid using the default time step){d}PREDEF_GRID_NAME{d}CCPP_PHYS_SUITE{d}EXTRN_MDL_NAME_ICS{d}EXTRN_MDL_NAME_LBCS{d}DATE_FIRST_CYCL{d}DATE_LAST_CYCL{d}INCR_CYCL_FREQ{d}FCST_LEN_HRS{d}LBC_SPEC_INTVL_HRS{d}NUM_ENS_MEMBERS') + txt_output.append(f'(1 corresponds to running a 6-hour forecast on the RRFS_CONUS_25km predefined grid using the default time step){d}PREDEF_GRID_NAME{d}CCPP_PHYS_SUITE{d}EXTRN_MDL_NAME_ICS{d}EXTRN_MDL_NAME_LBCS{d}DATE_FIRST_CYCL{d}DATE_LAST_CYCL{d}INCR_CYCL_FREQ{d}FCST_LEN_HRS{d}DT_ATMOS{d}LBC_SPEC_INTVL_HRS{d}NUM_ENS_MEMBERS') for line in txt_output: f.write(f"{line}\n") From 5255a827523ae670d4fdd48dc6c519781f355500 Mon Sep 17 00:00:00 2001 From: "Michael Kavulich, Jr" Date: Tue, 7 Mar 2023 13:53:33 -0700 Subject: [PATCH 40/52] Fixes to Jenkins testing scripts from Mike Lueken --- .cicd/scripts/srw_test.sh | 7 ++----- tests/WE2E/setup_WE2E_tests.sh | 4 ++-- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/.cicd/scripts/srw_test.sh b/.cicd/scripts/srw_test.sh index 7a4796dc8e..8c6ef42528 100755 --- a/.cicd/scripts/srw_test.sh +++ b/.cicd/scripts/srw_test.sh @@ -41,11 +41,8 @@ cd ${we2e_test_dir} # Progress file progress_file="${workspace}/we2e_test_results-${platform}-${SRW_COMPILER}.txt" ./setup_WE2E_tests.sh ${platform} ${SRW_PROJECT} ${SRW_COMPILER} ${test_type} \ - expt_basedir=${we2e_experiment_base_dir} \ - opsroot=${nco_dir} | tee ${progress_file} - -# Progress file -progress_file="${workspace}/we2e_test_results-${platform}-${SRW_COMPILER}.txt" + --expt_basedir=${we2e_experiment_base_dir} \ + --opsroot=${nco_dir} | tee ${progress_file} # Set exit code to number of failures set +e diff --git a/tests/WE2E/setup_WE2E_tests.sh b/tests/WE2E/setup_WE2E_tests.sh index a0ddb94ec9..26d025a694 100755 --- a/tests/WE2E/setup_WE2E_tests.sh +++ b/tests/WE2E/setup_WE2E_tests.sh @@ -54,10 +54,10 @@ test_type=${4:-fundamental} 
#----------------------------------------------------------------------
 opts=
 if [[ "$*" != *"debug"* ]]; then
-    opts="${opts} --debug=TRUE"
+    opts="${opts} --debug"
 fi
 if [[ "$*" != *"verbose"* ]]; then
-    opts="${opts} --verbose=TRUE"
+    opts="${opts} --verbose"
 fi
 if [[ "$*" != *"cron_relaunch_intvl_mnts"* ]]; then
     opts="${opts} --cron_relaunch_intvl_mnts=4"

From 35da5beb3feb027286876d08295afebd568408db Mon Sep 17 00:00:00 2001
From: "Michael Kavulich, Jr"
Date: Tue, 7 Mar 2023 13:57:00 -0700
Subject: [PATCH 41/52] If the database is not loaded, we need to return every
 time, even if the error is tolerable.

---
 tests/WE2E/utils.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tests/WE2E/utils.py b/tests/WE2E/utils.py
index 92c190f602..56abcbd80a 100755
--- a/tests/WE2E/utils.py
+++ b/tests/WE2E/utils.py
@@ -286,7 +286,8 @@ def update_expt_status(expt: dict, name: str, refresh: bool = False, debug: bool
         if not refresh:
             logging.warning(f"Unable to read database {rocoto_db}\nCan not track experiment {name}")
             expt["status"] = "ERROR"
-            return expt
+
+        return expt
 
     for task in db:
         # For each entry from rocoto database, store that task's info under a dictionary key named

From d484d14cf849b555805ed8ab0a105ecfab221ed0 Mon Sep 17 00:00:00 2001
From: "Michael J. Kavulich, Jr"
Date: Wed, 8 Mar 2023 04:40:36 +0000
Subject: [PATCH 42/52] Rocoto requires $HOME to be set to a writable path in
 the user's environment, so we have to pass it as an argument to
 setup_WE2E_tests.sh to be exported prior to running the WE2E test script

---
 tests/WE2E/setup_WE2E_tests.sh | 33 +++++++++++++++++++++++++--------
 1 file changed, 25 insertions(+), 8 deletions(-)

diff --git a/tests/WE2E/setup_WE2E_tests.sh b/tests/WE2E/setup_WE2E_tests.sh
index 26d025a694..75994156fd 100755
--- a/tests/WE2E/setup_WE2E_tests.sh
+++ b/tests/WE2E/setup_WE2E_tests.sh
@@ -1,5 +1,10 @@
 #!/usr/bin/env bash
-[ -n "$HOME" ] && exec -c "$0" "$@"
+
+# `exec -c` runs this script with clean environment; this avoids some problems
+# with double-loading conda environments.
Since we do need $HOME to be set for +# rocoto to run properly, pass it as an argument and export it later + +[ -n "$HOME" ] && exec -c "$0" "$HOME" "$@" #---------------------------------------------------------------------- # Wrapper for the automation of UFS Short Range Weather App Workflow @@ -26,8 +31,9 @@ function usage { echo - echo "Usage: $0 machine account [compiler] [test_type] [others] | -h" + echo "Usage: $0 homedir machine account [compiler] [test_type] [others] | -h" echo + echo " homedir [required] user's home directory; this space must be writable" echo " machine [required] is one of: ${machines[@]}" echo " account [required] case sensitive name of the user-specific slurm account" echo " compiler [optional] compiler used to build binaries (intel or gnu)" @@ -42,12 +48,21 @@ function usage { machines=( hera jet cheyenne orion wcoss2 gaea odin singularity macos noaacloud ) if [ "$1" = "-h" ] ; then usage ; fi -[[ $# -le 1 ]] && usage +[[ $# -le 2 ]] && usage + +homedir=$1 +machine=${2,,} +account=$3 +compiler=${4:-intel} +test_type=${5:-fundamental} + +echo $homedir +echo $machine +echo $account +echo $compiler +echo $test_type + -machine=${1,,} -account=$2 -compiler=${3:-intel} -test_type=${4:-fundamental} #---------------------------------------------------------------------- # Set some default options, if user did not pass them @@ -69,6 +84,8 @@ fi #----------------------------------------------------------------------- # Run E2E Tests #----------------------------------------------------------------------- +# Export HOME environment variable; needed for rocoto +export HOME=$homedir # Load Python Modules source ../../ush/load_modules_wflow.sh ${machine} @@ -80,5 +97,5 @@ source ../../ush/load_modules_wflow.sh ${machine} --compiler=${compiler} \ --tests=${test_type} \ ${opts} \ - "${@:5}" + "${@:6}" From b4f7319d3a8d08643584cae0a9efbf347310784a Mon Sep 17 00:00:00 2001 From: "Michael J. 
Kavulich, Jr" Date: Wed, 8 Mar 2023 18:22:21 +0000 Subject: [PATCH 43/52] Update archiving of relevant log files in Jenkinsfile --- .cicd/Jenkinsfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.cicd/Jenkinsfile b/.cicd/Jenkinsfile index 92d01de481..c2d054d719 100644 --- a/.cicd/Jenkinsfile +++ b/.cicd/Jenkinsfile @@ -177,7 +177,7 @@ pipeline { post { always { // Archive the test log files - sh 'cd "${SRW_WE2E_EXPERIMENT_BASE_DIR}" && tar --create --gzip --verbose --dereference --file "${WORKSPACE}/we2e_test_logs-${SRW_PLATFORM}-${SRW_COMPILER}.tgz" */log.generate_FV3LAM_wflow */log.launch_FV3LAM_wflow */log/*' + sh 'cd "${SRW_WE2E_EXPERIMENT_BASE_DIR}" && tar --create --gzip --verbose --dereference --file "${WORKSPACE}/we2e_test_logs-${SRW_PLATFORM}-${SRW_COMPILER}.tgz" */log.generate_FV3LAM_wflow */log/* ${WORKSPACE}/tests/WE2E/WE2E_tests_*yaml ${WORKSPACE}/tests/WE2E/WE2E_summary*txt ${WORKSPACE}/tests/WE2E/log.*' // Remove the data sets from the experiments directory to conserve disk space sh 'find "${SRW_WE2E_EXPERIMENT_BASE_DIR}" -regextype posix-extended -regex "^.*(orog|[0-9]{10})$" -type d | xargs rm -rf' s3Upload consoleLogLevel: 'INFO', dontSetBuildResultOnFailure: false, dontWaitForConcurrentBuildCompletion: false, entries: [[bucket: 'woc-epic-jenkins-artifacts', excludedFile: '', flatten: false, gzipFiles: false, keepForever: false, managedArtifacts: true, noUploadOnFailure: false, selectedRegion: 'us-east-1', showDirectlyInBrowser: false, sourceFile: 'we2e_test_results-*-*.txt', storageClass: 'STANDARD', uploadFromSlave: false, useServerSideEncryption: false], [bucket: 'woc-epic-jenkins-artifacts', excludedFile: '', flatten: false, gzipFiles: false, keepForever: false, managedArtifacts: true, noUploadOnFailure: false, selectedRegion: 'us-east-1', showDirectlyInBrowser: false, sourceFile: 'we2e_test_logs-*-*.tgz', storageClass: 'STANDARD', uploadFromSlave: false, useServerSideEncryption: false]], pluginFailureResultConstraint: 'FAILURE', profileName: 'main', userMetadata: [] From dca8c2591a2fa67288c6517d4f9791390a489513 Mon Sep 17 00:00:00 2001 From: "Michael J. Kavulich, Jr" Date: Fri, 10 Mar 2023 01:29:34 +0000 Subject: [PATCH 44/52] Fixes suggested by Daniel --- tests/WE2E/setup_WE2E_tests.sh | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/tests/WE2E/setup_WE2E_tests.sh b/tests/WE2E/setup_WE2E_tests.sh index 75994156fd..20ae60e856 100755 --- a/tests/WE2E/setup_WE2E_tests.sh +++ b/tests/WE2E/setup_WE2E_tests.sh @@ -31,9 +31,8 @@ function usage { echo - echo "Usage: $0 homedir machine account [compiler] [test_type] [others] | -h" + echo "Usage: $0 machine account [compiler] [test_type] [others] | -h" echo - echo " homedir [required] user's home directory; this space must be writable" echo " machine [required] is one of: ${machines[@]}" echo " account [required] case sensitive name of the user-specific slurm account" echo " compiler [optional] compiler used to build binaries (intel or gnu)" @@ -56,14 +55,6 @@ account=$3 compiler=${4:-intel} test_type=${5:-fundamental} -echo $homedir -echo $machine -echo $account -echo $compiler -echo $test_type - - - #---------------------------------------------------------------------- # Set some default options, if user did not pass them #---------------------------------------------------------------------- From e2e7a6cdbf30068b9be3061a4cf20ffe05aec5c4 Mon Sep 17 00:00:00 2001 From: "Michael J. 
Kavulich, Jr" Date: Fri, 10 Mar 2023 16:54:44 +0000 Subject: [PATCH 45/52] Fix usage instructions for more flexible "tests" argument --- tests/WE2E/setup_WE2E_tests.sh | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/tests/WE2E/setup_WE2E_tests.sh b/tests/WE2E/setup_WE2E_tests.sh index 20ae60e856..b41d076c44 100755 --- a/tests/WE2E/setup_WE2E_tests.sh +++ b/tests/WE2E/setup_WE2E_tests.sh @@ -31,14 +31,15 @@ function usage { echo - echo "Usage: $0 machine account [compiler] [test_type] [others] | -h" + echo "Usage: $0 machine account [compiler] [tests] [others] | -h" echo - echo " machine [required] is one of: ${machines[@]}" - echo " account [required] case sensitive name of the user-specific slurm account" - echo " compiler [optional] compiler used to build binaries (intel or gnu)" - echo " test_type [optional] test type: fundamental or comprehensive or all or any other name" - echo " others [optional] All other arguments are forwarded to run_WE2E_tests.sh" - echo " -h display this help" + echo " machine [required] is one of: ${machines[@]}" + echo " account [required] case sensitive name of the user-specific slurm account" + echo " compiler [optional] compiler used to build binaries (intel or gnu)" + echo " tests [optional] tests to run: can be a suite (all|comprehensive|fundamental) + a filename, or a test name" + echo " others [optional] All other arguments are forwarded to run_WE2E_tests.sh" + echo " -h display this help" echo exit 1 @@ -53,7 +54,7 @@ homedir=$1 machine=${2,,} account=$3 compiler=${4:-intel} -test_type=${5:-fundamental} +tests=${5:-fundamental} #---------------------------------------------------------------------- # Set some default options, if user did not pass them @@ -86,7 +87,7 @@ source ../../ush/load_modules_wflow.sh ${machine} --machine=${machine} \ --account=${account} \ --compiler=${compiler} \ - --tests=${test_type} \ + --tests=${tests} \ ${opts} \ "${@:6}" From effbc010e1047f1dc3652c558c2e65b93144ffb8 Mon Sep 17 00:00:00 2001 From: "Michael J. 
Kavulich, Jr" Date: Wed, 15 Mar 2023 17:27:09 +0000 Subject: [PATCH 46/52] Address PR comments - Correct import location for print_WE2E_summary - Use os.path.join() for path strings - Correct script name - Set global variables for column width in job summary Also a bug fix for cases where variable definitions file doesn't exist (can occur if experiment is moved or re-created after yaml generation) --- tests/WE2E/monitor_jobs.py | 3 +-- tests/WE2E/run_WE2E_tests.py | 11 ++++++----- tests/WE2E/setup_WE2E_tests.sh | 4 ++-- tests/WE2E/utils.py | 23 +++++++++++++++-------- 4 files changed, 24 insertions(+), 17 deletions(-) diff --git a/tests/WE2E/monitor_jobs.py b/tests/WE2E/monitor_jobs.py index 626b0d8f81..5d1d4a63af 100755 --- a/tests/WE2E/monitor_jobs.py +++ b/tests/WE2E/monitor_jobs.py @@ -13,9 +13,8 @@ from check_python_version import check_python_version -from WE2E_summary import print_WE2E_summary from utils import calculate_core_hours, write_monitor_file, update_expt_status,\ - update_expt_status_parallel + update_expt_status_parallel, print_WE2E_summary def monitor_jobs(expts_dict: dict, monitor_file: str = '', procs: int = 1, debug: bool = False) -> str: """Function to monitor and run jobs for the specified experiment using Rocoto diff --git a/tests/WE2E/run_WE2E_tests.py b/tests/WE2E/run_WE2E_tests.py index 9815221feb..8b4d4cc35c 100755 --- a/tests/WE2E/run_WE2E_tests.py +++ b/tests/WE2E/run_WE2E_tests.py @@ -190,7 +190,7 @@ def run_we2e_tests(homedir, args) -> None: logging.debug(f"Writing updated config.yaml for test {test_name}\n"\ "based on specified command-line arguments:\n") logging.debug(cfg_to_yaml_str(test_cfg)) - with open(ushdir + "/config.yaml","w", encoding="utf-8") as f: + with open(os.path.join(ushdir,"/config.yaml"),"w", encoding="utf-8") as f: f.writelines(cfg_to_yaml_str(test_cfg)) logging.info(f"Calling workflow generation function for test {test_name}\n") @@ -370,12 +370,13 @@ def check_task_get_extrn_bcs(cfg: dict, mach: dict, dflt: dict, ics_or_lbcs: str cfg_bcs[f'FV3GFS_FILE_FMT_{I_OR_L}'] = \ dflt[f'task_get_extrn_{ics_or_lbcs}'][f'FV3GFS_FILE_FMT_{I_OR_L}'] cfg_bcs[f'EXTRN_MDL_SOURCE_BASEDIR_{I_OR_L}'] = \ - f"{mach['platform']['TEST_EXTRN_MDL_SOURCE_BASEDIR']}/"\ - f"{cfg_bcs[f'EXTRN_MDL_NAME_{I_OR_L}']}/{cfg_bcs[f'FV3GFS_FILE_FMT_{I_OR_L}']}/${{yyyymmddhh}}" + os.path.join(f"{mach['platform']['TEST_EXTRN_MDL_SOURCE_BASEDIR']}", + f"{cfg_bcs[f'EXTRN_MDL_NAME_{I_OR_L}']}",f"{cfg_bcs[f'FV3GFS_FILE_FMT_{I_OR_L}']}", + f"${{yyyymmddhh}}") else: cfg_bcs[f'EXTRN_MDL_SOURCE_BASEDIR_{I_OR_L}'] = \ - f"{mach['platform']['TEST_EXTRN_MDL_SOURCE_BASEDIR']}/"\ - f"{cfg_bcs[f'EXTRN_MDL_NAME_{I_OR_L}']}/${{yyyymmddhh}}" + os.path.join(f"{mach['platform']['TEST_EXTRN_MDL_SOURCE_BASEDIR']}", + f"{cfg_bcs[f'EXTRN_MDL_NAME_{I_OR_L}']}/${{yyyymmddhh}}") return cfg_bcs diff --git a/tests/WE2E/setup_WE2E_tests.sh b/tests/WE2E/setup_WE2E_tests.sh index b41d076c44..b617709327 100755 --- a/tests/WE2E/setup_WE2E_tests.sh +++ b/tests/WE2E/setup_WE2E_tests.sh @@ -12,7 +12,7 @@ # # The wrapper loads the appropriate workflow environment for the # machine, and sets the machine test suite file before invoking the -# run_WE2E_tests.sh. +# run_WE2E_tests.py script. # # The script is dependent on a successful build of this repo using the # tests/build.sh script in the ufs-srweather-app repository. 
The UFS @@ -38,7 +38,7 @@ function usage { echo " compiler [optional] compiler used to build binaries (intel or gnu)" echo " tests [optional] tests to run: can be a suite (all|comprehensive|fundamental) a filename, or a test name" - echo " others [optional] All other arguments are forwarded to run_WE2E_tests.sh" + echo " others [optional] All other arguments are forwarded to run_WE2E_tests.py" echo " -h display this help" echo exit 1 diff --git a/tests/WE2E/utils.py b/tests/WE2E/utils.py index 56abcbd80a..827a3e0ba1 100755 --- a/tests/WE2E/utils.py +++ b/tests/WE2E/utils.py @@ -25,7 +25,8 @@ ) REPORT_WIDTH = 100 - +EXPT_COLUMN_WIDTH = 65 +TASK_COLUMN_WIDTH = 40 def print_WE2E_summary(expts_dict: dict, debug: bool = False): """Function that creates a summary for the specified experiment @@ -40,7 +41,7 @@ def print_WE2E_summary(expts_dict: dict, debug: bool = False): # Create summary table as list of strings summary = [] summary.append('-'*REPORT_WIDTH) - summary.append(f'Experiment name {" "*48} | Status | Core hours used ') + summary.append(f'Experiment name {" "*(EXPT_COLUMN_WIDTH-17)} | Status | Core hours used ') summary.append('-'*REPORT_WIDTH) total_core_hours = 0 statuses = [] @@ -52,7 +53,7 @@ def print_WE2E_summary(expts_dict: dict, debug: bool = False): expt_details.append('-'*REPORT_WIDTH) expt_details.append(f'Detailed summary of experiment {expt}') expt_details.append(f"in directory {expts_dict[expt]['expt_dir']}") - expt_details.append(f'{" "*40} | Status | Walltime | Core hours used') + expt_details.append(f'{" "*TASK_COLUMN_WIDTH}| Status | Walltime | Core hours used') expt_details.append('-'*REPORT_WIDTH) for task in expts_dict[expt]: @@ -61,7 +62,7 @@ def print_WE2E_summary(expts_dict: dict, debug: bool = False): continue status = expts_dict[expt][task]["status"] walltime = expts_dict[expt][task]["walltime"] - expt_details.append(f'{task[:40]:<40s} {status:<12s} {walltime:>10.1f}') + expt_details.append(f'{task[:TASK_COLUMN_WIDTH]:<{TASK_COLUMN_WIDTH}s} {status:<12s} {walltime:>10.1f}') if "core_hours" in expts_dict[expt][task]: task_ch = expts_dict[expt][task]["core_hours"] ch += task_ch @@ -69,8 +70,8 @@ def print_WE2E_summary(expts_dict: dict, debug: bool = False): else: expt_details[-1] = f'{expt_details[-1]} -' expt_details.append('-'*REPORT_WIDTH) - expt_details.append(f'Total {" "*34} {statuses[-1]:<12s} {" "*11} {ch:>13.2f}') - summary.append(f'{expt[:65]:<65s} {statuses[-1]:<12s} {ch:>13.2f}') + expt_details.append(f'Total {" "*(TASK_COLUMN_WIDTH - 6)} {statuses[-1]:<12s} {" "*11} {ch:>13.2f}') + summary.append(f'{expt[:EXPT_COLUMN_WIDTH]:<{EXPT_COLUMN_WIDTH}s} {statuses[-1]:<12s} {ch:>13.2f}') total_core_hours += ch if "ERROR" in statuses: total_status = "ERROR" @@ -85,7 +86,7 @@ def print_WE2E_summary(expts_dict: dict, debug: bool = False): else: total_status = "UNKNOWN" summary.append('-'*REPORT_WIDTH) - summary.append(f'Total {" "*59} {total_status:<12s} {total_core_hours:>13.2f}') + summary.append(f'Total {" "*(EXPT_COLUMN_WIDTH - 6)} {total_status:<12s} {total_core_hours:>13.2f}') # Print summary to screen for line in summary: @@ -149,7 +150,13 @@ def calculate_core_hours(expts_dict: dict) -> dict: for expt in expts_dict: # Read variable definitions file - vardefs = load_shell_config(os.path.join(expts_dict[expt]["expt_dir"],"var_defns.sh")) + vardefs_file = os.path.join(expts_dict[expt]["expt_dir"],"var_defns.sh") + if not os.path.isfile(vardefs_file): + logging.warning(f"\nWARNING: For experiment {expt}, variable definitions file") + 
logging.warning(f"{vardefs_file}\ndoes not exist!\n\nDropping experiment from summary") + continue + logging.debug(f'Reading variable definitions file {vardefs_file}') + vardefs = load_shell_config(vardefs_file) vdf = flatten_dict(vardefs) cores_per_node = vdf["NCORES_PER_NODE"] for task in expts_dict[expt]: From 099a2d2e1fe3fb6e878489594c219091d3c33000 Mon Sep 17 00:00:00 2001 From: "Michael J. Kavulich, Jr" Date: Wed, 15 Mar 2023 17:47:03 +0000 Subject: [PATCH 47/52] A couple bug fixes from latest changes --- tests/WE2E/run_WE2E_tests.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/WE2E/run_WE2E_tests.py b/tests/WE2E/run_WE2E_tests.py index 8b4d4cc35c..b1ef55c9ed 100755 --- a/tests/WE2E/run_WE2E_tests.py +++ b/tests/WE2E/run_WE2E_tests.py @@ -190,7 +190,7 @@ def run_we2e_tests(homedir, args) -> None: logging.debug(f"Writing updated config.yaml for test {test_name}\n"\ "based on specified command-line arguments:\n") logging.debug(cfg_to_yaml_str(test_cfg)) - with open(os.path.join(ushdir,"/config.yaml"),"w", encoding="utf-8") as f: + with open(os.path.join(ushdir,"config.yaml"),"w", encoding="utf-8") as f: f.writelines(cfg_to_yaml_str(test_cfg)) logging.info(f"Calling workflow generation function for test {test_name}\n") @@ -223,7 +223,7 @@ def run_we2e_tests(homedir, args) -> None: debug=args.debug) except KeyboardInterrupt: logging.info("\n\nUser interrupted monitor script; to resume monitoring jobs run:\n") - logging.info(f"./monitor_jobs.py -y={monitor_file} -p={args.procs} -d={args.debug}\n") + logging.info(f"./monitor_jobs.py -y={monitor_file} -p={args.procs}\n") else: logging.info("All experiments are complete") logging.info(f"Summary of results available in {monitor_file}") From b88c4a7dc569a4420a0fafc0918fe80be24a174c Mon Sep 17 00:00:00 2001 From: "Michael J. 
Kavulich, Jr" Date: Wed, 15 Mar 2023 17:49:14 +0000 Subject: [PATCH 48/52] Addressing more review comments: Update dicts in place --- tests/WE2E/WE2E_summary.py | 36 ++++++++++++++++++++---------------- tests/WE2E/monitor_jobs.py | 10 +++++----- tests/WE2E/utils.py | 33 +++++++++++---------------------- 3 files changed, 36 insertions(+), 43 deletions(-) diff --git a/tests/WE2E/WE2E_summary.py b/tests/WE2E/WE2E_summary.py index de478a0f38..828b4c411a 100755 --- a/tests/WE2E/WE2E_summary.py +++ b/tests/WE2E/WE2E_summary.py @@ -12,6 +12,25 @@ from utils import calculate_core_hours, create_expts_dict, print_WE2E_summary, write_monitor_file +def WE2E_summary(args): + yaml_file = args.yaml_file + + # Set up dictionary of experiments + if args.expt_dir: + yaml_file, expts_dict = create_expts_dict(args.expt_dir) + elif args.yaml_file: + expts_dict = load_config_file(args.yaml_file) + else: + raise ValueError(f'Bad arguments; run {__file__} -h for more information') + + # Calculate core hours and update yaml + calculate_core_hours(expts_dict) + write_monitor_file(yaml_file,expts_dict) + + #Call function to print summary + print_WE2E_summary(expts_dict, args.debug) + + def setup_logging(debug: bool = False) -> None: """ Sets up logging, printing high-priority (INFO and higher) messages to screen, and printing all @@ -53,19 +72,4 @@ def setup_logging(debug: bool = False) -> None: setup_logging(args.debug) - yaml_file = args.yaml_file - - # Set up dictionary of experiments - if args.expt_dir: - yaml_file, expts_dict = create_expts_dict(args.expt_dir) - elif args.yaml_file: - expts_dict = load_config_file(args.yaml_file) - else: - raise ValueError(f'Bad arguments; run {__file__} -h for more information') - - # Calculate core hours and update yaml - expts_dict = calculate_core_hours(expts_dict) - write_monitor_file(yaml_file,expts_dict) - - #Call function to print summary - print_WE2E_summary(expts_dict, args.debug) + WE2E_summary(args) diff --git a/tests/WE2E/monitor_jobs.py b/tests/WE2E/monitor_jobs.py index 5d1d4a63af..ff6d880b69 100755 --- a/tests/WE2E/monitor_jobs.py +++ b/tests/WE2E/monitor_jobs.py @@ -42,11 +42,11 @@ def monitor_jobs(expts_dict: dict, monitor_file: str = '', procs: int = 1, debug if procs > 1: print(f'Starting experiments in parallel with {procs} processes') - expts_dict = update_expt_status_parallel(expts_dict, procs, True, debug) + update_expt_status_parallel(expts_dict, procs, True, debug) else: for expt in expts_dict: logging.info(f"Starting experiment {expt} running") - expts_dict[expt] = update_expt_status(expts_dict[expt], expt, True, debug) + update_expt_status(expts_dict[expt], expt, True, debug) write_monitor_file(monitor_file,expts_dict) @@ -60,10 +60,10 @@ def monitor_jobs(expts_dict: dict, monitor_file: str = '', procs: int = 1, debug while running_expts: i += 1 if procs > 1: - expts_dict = update_expt_status_parallel(expts_dict, procs) + update_expt_status_parallel(expts_dict, procs) else: for expt in running_expts.copy(): - expts_dict[expt] = update_expt_status(expts_dict[expt], expt) + update_expt_status(expts_dict[expt], expt) for expt in running_expts.copy(): running_expts[expt] = expts_dict[expt] @@ -89,7 +89,7 @@ def monitor_jobs(expts_dict: dict, monitor_file: str = '', procs: int = 1, debug logging.info('Calculating core-hour usage and printing final summary') # Calculate core hours and update yaml - expts_dict = calculate_core_hours(expts_dict) + calculate_core_hours(expts_dict) write_monitor_file(monitor_file,expts_dict) #Call function to print summary 
diff --git a/tests/WE2E/utils.py b/tests/WE2E/utils.py index 827a3e0ba1..1932683dbb 100755 --- a/tests/WE2E/utils.py +++ b/tests/WE2E/utils.py @@ -27,6 +27,7 @@ REPORT_WIDTH = 100 EXPT_COLUMN_WIDTH = 65 TASK_COLUMN_WIDTH = 40 + def print_WE2E_summary(expts_dict: dict, debug: bool = False): """Function that creates a summary for the specified experiment @@ -136,7 +137,7 @@ def create_expts_dict(expt_dir: str) -> dict: return summary_file, expts_dict -def calculate_core_hours(expts_dict: dict) -> dict: +def calculate_core_hours(expts_dict: dict) -> None: """ Function takes in an experiment dictionary, reads the var_defns file for necessary information, and calculates the core hours used by each task, updating expts_dict with this info @@ -145,7 +146,7 @@ def calculate_core_hours(expts_dict: dict) -> dict: expts_dict (dict): A dictionary containing the information needed to run one or more experiments. See example file WE2E_tests.yaml Returns: - dict : Experiments dictionary updated with core hours + None """ for expt in expts_dict: @@ -180,7 +181,6 @@ def calculate_core_hours(expts_dict: dict) -> dict: expts_dict[expt][task]['walltime'] / 3600 expts_dict[expt][task]['exact_count'] = False expts_dict[expt][task]['core_hours'] = round(core_hours,2) - return expts_dict def write_monitor_file(monitor_file: str, expts_dict: dict): @@ -200,7 +200,7 @@ def write_monitor_file(monitor_file: str, expts_dict: dict): def update_expt_status(expt: dict, name: str, refresh: bool = False, debug: bool = False, - submit: bool = True) -> dict: + submit: bool = True) -> None: """ This function reads the dictionary showing the location of a given experiment, runs a `rocotorun` command to update the experiment (running new jobs and updating the status of @@ -249,12 +249,12 @@ def update_expt_status(expt: dict, name: str, refresh: bool = False, debug: bool workflow by calling rocotorun. If simply generating a report, set this to False Returns: - dict: The updated experiment dictionary. + None """ #If we are no longer tracking this experiment, return unchanged if (expt["status"] in ['DEAD','ERROR','COMPLETE']) and not refresh: - return expt + return # Update experiment, read rocoto database rocoto_db = f"{expt['expt_dir']}/FV3LAM_wflow.db" if submit: @@ -293,8 +293,7 @@ def update_expt_status(expt: dict, name: str, refresh: bool = False, debug: bool if not refresh: logging.warning(f"Unable to read database {rocoto_db}\nCan not track experiment {name}") expt["status"] = "ERROR" - - return expt + return for task in db: # For each entry from rocoto database, store that task's info under a dictionary key named @@ -322,7 +321,7 @@ def update_expt_status(expt: dict, name: str, refresh: bool = False, debug: bool expt["status"] = "DYING" else: expt["status"] = "DEAD" - return expt + return elif "RUNNING" in statuses: expt["status"] = "RUNNING" elif "QUEUED" in statuses: @@ -364,10 +363,9 @@ def update_expt_status(expt: dict, name: str, refresh: bool = False, debug: bool if expt["status"] in ["SUCCEEDED","STALLED","STUCK"]: expt = compare_rocotostat(expt,name) - return expt def update_expt_status_parallel(expts_dict: dict, procs: int, refresh: bool = False, - debug: bool = False) -> dict: + debug: bool = False) -> None: """ This function updates an entire set of experiments in parallel, drastically speeding up the process if given enough parallel processes. 
Given an experiment dictionary, it will @@ -385,7 +383,7 @@ def update_expt_status_parallel(expts_dict: dict, procs: int, refresh: bool = Fa slow down the process drastically. Returns: - dict: The updated dictionary of experiment dictionaries + None """ args = [] @@ -395,16 +393,7 @@ def update_expt_status_parallel(expts_dict: dict, procs: int, refresh: bool = Fa # call update_expt_status() in parallel with Pool(processes=procs) as pool: - output = pool.starmap(update_expt_status, args) - - # Update dictionary with output from all calls to update_expt_status() - i = 0 - for expt in expts_dict: - expts_dict[expt] = output[i] - i += 1 - - return expts_dict - + pool.starmap(update_expt_status, args) def print_test_info(txtfile: str = "WE2E_test_info.txt") -> None: From 40e5fb56ad6b2b1f7b4f3b90b7d99a6e7cf9c785 Mon Sep 17 00:00:00 2001 From: "Michael J. Kavulich, Jr" Date: Wed, 15 Mar 2023 18:28:03 +0000 Subject: [PATCH 49/52] Revert "Addressing more review comments: Update dicts in place" This reverts commit b88c4a7dc569a4420a0fafc0918fe80be24a174c. --- tests/WE2E/WE2E_summary.py | 36 ++++++++++++++++-------------------- tests/WE2E/monitor_jobs.py | 10 +++++----- tests/WE2E/utils.py | 33 ++++++++++++++++++++++----------- 3 files changed, 43 insertions(+), 36 deletions(-) diff --git a/tests/WE2E/WE2E_summary.py b/tests/WE2E/WE2E_summary.py index 828b4c411a..de478a0f38 100755 --- a/tests/WE2E/WE2E_summary.py +++ b/tests/WE2E/WE2E_summary.py @@ -12,25 +12,6 @@ from utils import calculate_core_hours, create_expts_dict, print_WE2E_summary, write_monitor_file -def WE2E_summary(args): - yaml_file = args.yaml_file - - # Set up dictionary of experiments - if args.expt_dir: - yaml_file, expts_dict = create_expts_dict(args.expt_dir) - elif args.yaml_file: - expts_dict = load_config_file(args.yaml_file) - else: - raise ValueError(f'Bad arguments; run {__file__} -h for more information') - - # Calculate core hours and update yaml - calculate_core_hours(expts_dict) - write_monitor_file(yaml_file,expts_dict) - - #Call function to print summary - print_WE2E_summary(expts_dict, args.debug) - - def setup_logging(debug: bool = False) -> None: """ Sets up logging, printing high-priority (INFO and higher) messages to screen, and printing all @@ -72,4 +53,19 @@ def setup_logging(debug: bool = False) -> None: setup_logging(args.debug) - WE2E_summary(args) + yaml_file = args.yaml_file + + # Set up dictionary of experiments + if args.expt_dir: + yaml_file, expts_dict = create_expts_dict(args.expt_dir) + elif args.yaml_file: + expts_dict = load_config_file(args.yaml_file) + else: + raise ValueError(f'Bad arguments; run {__file__} -h for more information') + + # Calculate core hours and update yaml + expts_dict = calculate_core_hours(expts_dict) + write_monitor_file(yaml_file,expts_dict) + + #Call function to print summary + print_WE2E_summary(expts_dict, args.debug) diff --git a/tests/WE2E/monitor_jobs.py b/tests/WE2E/monitor_jobs.py index ff6d880b69..5d1d4a63af 100755 --- a/tests/WE2E/monitor_jobs.py +++ b/tests/WE2E/monitor_jobs.py @@ -42,11 +42,11 @@ def monitor_jobs(expts_dict: dict, monitor_file: str = '', procs: int = 1, debug if procs > 1: print(f'Starting experiments in parallel with {procs} processes') - update_expt_status_parallel(expts_dict, procs, True, debug) + expts_dict = update_expt_status_parallel(expts_dict, procs, True, debug) else: for expt in expts_dict: logging.info(f"Starting experiment {expt} running") - update_expt_status(expts_dict[expt], expt, True, debug) + expts_dict[expt] = 
update_expt_status(expts_dict[expt], expt, True, debug) write_monitor_file(monitor_file,expts_dict) @@ -60,10 +60,10 @@ def monitor_jobs(expts_dict: dict, monitor_file: str = '', procs: int = 1, debug while running_expts: i += 1 if procs > 1: - update_expt_status_parallel(expts_dict, procs) + expts_dict = update_expt_status_parallel(expts_dict, procs) else: for expt in running_expts.copy(): - update_expt_status(expts_dict[expt], expt) + expts_dict[expt] = update_expt_status(expts_dict[expt], expt) for expt in running_expts.copy(): running_expts[expt] = expts_dict[expt] @@ -89,7 +89,7 @@ def monitor_jobs(expts_dict: dict, monitor_file: str = '', procs: int = 1, debug logging.info('Calculating core-hour usage and printing final summary') # Calculate core hours and update yaml - calculate_core_hours(expts_dict) + expts_dict = calculate_core_hours(expts_dict) write_monitor_file(monitor_file,expts_dict) #Call function to print summary diff --git a/tests/WE2E/utils.py b/tests/WE2E/utils.py index 1932683dbb..827a3e0ba1 100755 --- a/tests/WE2E/utils.py +++ b/tests/WE2E/utils.py @@ -27,7 +27,6 @@ REPORT_WIDTH = 100 EXPT_COLUMN_WIDTH = 65 TASK_COLUMN_WIDTH = 40 - def print_WE2E_summary(expts_dict: dict, debug: bool = False): """Function that creates a summary for the specified experiment @@ -137,7 +136,7 @@ def create_expts_dict(expt_dir: str) -> dict: return summary_file, expts_dict -def calculate_core_hours(expts_dict: dict) -> None: +def calculate_core_hours(expts_dict: dict) -> dict: """ Function takes in an experiment dictionary, reads the var_defns file for necessary information, and calculates the core hours used by each task, updating expts_dict with this info @@ -146,7 +145,7 @@ def calculate_core_hours(expts_dict: dict) -> None: expts_dict (dict): A dictionary containing the information needed to run one or more experiments. See example file WE2E_tests.yaml Returns: - None + dict : Experiments dictionary updated with core hours """ for expt in expts_dict: @@ -181,6 +180,7 @@ def calculate_core_hours(expts_dict: dict) -> None: expts_dict[expt][task]['walltime'] / 3600 expts_dict[expt][task]['exact_count'] = False expts_dict[expt][task]['core_hours'] = round(core_hours,2) + return expts_dict def write_monitor_file(monitor_file: str, expts_dict: dict): @@ -200,7 +200,7 @@ def write_monitor_file(monitor_file: str, expts_dict: dict): def update_expt_status(expt: dict, name: str, refresh: bool = False, debug: bool = False, - submit: bool = True) -> None: + submit: bool = True) -> dict: """ This function reads the dictionary showing the location of a given experiment, runs a `rocotorun` command to update the experiment (running new jobs and updating the status of @@ -249,12 +249,12 @@ def update_expt_status(expt: dict, name: str, refresh: bool = False, debug: bool workflow by calling rocotorun. If simply generating a report, set this to False Returns: - None + dict: The updated experiment dictionary. 
""" #If we are no longer tracking this experiment, return unchanged if (expt["status"] in ['DEAD','ERROR','COMPLETE']) and not refresh: - return + return expt # Update experiment, read rocoto database rocoto_db = f"{expt['expt_dir']}/FV3LAM_wflow.db" if submit: @@ -293,7 +293,8 @@ def update_expt_status(expt: dict, name: str, refresh: bool = False, debug: bool if not refresh: logging.warning(f"Unable to read database {rocoto_db}\nCan not track experiment {name}") expt["status"] = "ERROR" - return + + return expt for task in db: # For each entry from rocoto database, store that task's info under a dictionary key named @@ -321,7 +322,7 @@ def update_expt_status(expt: dict, name: str, refresh: bool = False, debug: bool expt["status"] = "DYING" else: expt["status"] = "DEAD" - return + return expt elif "RUNNING" in statuses: expt["status"] = "RUNNING" elif "QUEUED" in statuses: @@ -363,9 +364,10 @@ def update_expt_status(expt: dict, name: str, refresh: bool = False, debug: bool if expt["status"] in ["SUCCEEDED","STALLED","STUCK"]: expt = compare_rocotostat(expt,name) + return expt def update_expt_status_parallel(expts_dict: dict, procs: int, refresh: bool = False, - debug: bool = False) -> None: + debug: bool = False) -> dict: """ This function updates an entire set of experiments in parallel, drastically speeding up the process if given enough parallel processes. Given an experiment dictionary, it will @@ -383,7 +385,7 @@ def update_expt_status_parallel(expts_dict: dict, procs: int, refresh: bool = Fa slow down the process drastically. Returns: - None + dict: The updated dictionary of experiment dictionaries """ args = [] @@ -393,7 +395,16 @@ def update_expt_status_parallel(expts_dict: dict, procs: int, refresh: bool = Fa # call update_expt_status() in parallel with Pool(processes=procs) as pool: - pool.starmap(update_expt_status, args) + output = pool.starmap(update_expt_status, args) + + # Update dictionary with output from all calls to update_expt_status() + i = 0 + for expt in expts_dict: + expts_dict[expt] = output[i] + i += 1 + + return expts_dict + def print_test_info(txtfile: str = "WE2E_test_info.txt") -> None: From b3213715b4f5a2bb8e96b72e2943a596baa5f961 Mon Sep 17 00:00:00 2001 From: "Michael J. 
Kavulich, Jr" Date: Wed, 15 Mar 2023 18:31:17 +0000 Subject: [PATCH 50/52] Final set of review comments --- tests/WE2E/utils.py | 89 +++++++++++++++++---------------------------- 1 file changed, 33 insertions(+), 56 deletions(-) diff --git a/tests/WE2E/utils.py b/tests/WE2E/utils.py index 827a3e0ba1..1a6d4aae12 100755 --- a/tests/WE2E/utils.py +++ b/tests/WE2E/utils.py @@ -257,12 +257,12 @@ def update_expt_status(expt: dict, name: str, refresh: bool = False, debug: bool return expt # Update experiment, read rocoto database rocoto_db = f"{expt['expt_dir']}/FV3LAM_wflow.db" + rocoto_xml = f"{expt['expt_dir']}/FV3LAM_wflow.xml" if submit: if refresh: logging.info(f"Updating database for experiment {name}") if debug: - rocotorun_cmd = ["rocotorun", f"-w {expt['expt_dir']}/FV3LAM_wflow.xml", - f"-d {rocoto_db}", "-v 10"] + rocotorun_cmd = ["rocotorun", f"-w {rocoto_xml}", f"-d {rocoto_db}", "-v 10"] p = subprocess.run(rocotorun_cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True) logging.debug(p.stdout) @@ -272,8 +272,7 @@ def update_expt_status(expt: dict, name: str, refresh: bool = False, debug: bool stderr=subprocess.STDOUT, text=True) logging.debug(p.stdout) else: - rocotorun_cmd = ["rocotorun", f"-w {expt['expt_dir']}/FV3LAM_wflow.xml", - f"-d {rocoto_db}"] + rocotorun_cmd = ["rocotorun", f"-w {rocoto_xml}", f"-d {rocoto_db}"] subprocess.run(rocotorun_cmd) #Run rocotorun again to get around rocotobqserver proliferation issue subprocess.run(rocotorun_cmd) @@ -370,19 +369,17 @@ def update_expt_status_parallel(expts_dict: dict, procs: int, refresh: bool = Fa debug: bool = False) -> dict: """ This function updates an entire set of experiments in parallel, drastically speeding up - the process if given enough parallel processes. Given an experiment dictionary, it will - output the updated dictionary. - - parallelizes the call to update_expt_status across the given number of processes. - Making use of the python multiprocessing starmap functionality, takes + the process if given enough parallel processes. Given a dictionary of experiments, it will + pass each individual experiment dictionary to update_expt_status() to be updated, making use + of the python multiprocessing starmap functionality to achieve this in parallel Args: expts_dict (dict): A dictionary containing information for all experiments - procs (int): The number of parallel processes - refresh (bool): "Refresh" flag to pass to update_expt_status() - debug (bool): Will capture all output from rocotorun. This will allow information such - as job cards and job submit messages to appear in the log files, but can - slow down the process drastically. + procs (int): The number of parallel processes + refresh (bool): "Refresh" flag to pass to update_expt_status() + debug (bool): Will capture all output from rocotorun. This will allow information such + as job cards and job submit messages to appear in the log files, but can + slow down the process drastically. 
From 8f5525d152965511d65b17a575d973ac9d1d6626 Mon Sep 17 00:00:00 2001
From: "Michael J. Kavulich, Jr"
Date: Wed, 15 Mar 2023 20:00:23 +0000
Subject: [PATCH 51/52] Un-revert intended change to flow of WE2E_summary.py

---
 tests/WE2E/WE2E_summary.py | 36 ++++++++++++++++++++----------------
 1 file changed, 20 insertions(+), 16 deletions(-)

diff --git a/tests/WE2E/WE2E_summary.py b/tests/WE2E/WE2E_summary.py
index de478a0f38..828b4c411a 100755
--- a/tests/WE2E/WE2E_summary.py
+++ b/tests/WE2E/WE2E_summary.py
@@ -12,6 +12,25 @@
 
 from utils import calculate_core_hours, create_expts_dict, print_WE2E_summary, write_monitor_file
 
+def WE2E_summary(args):
+    yaml_file = args.yaml_file
+
+    # Set up dictionary of experiments
+    if args.expt_dir:
+        yaml_file, expts_dict = create_expts_dict(args.expt_dir)
+    elif args.yaml_file:
+        expts_dict = load_config_file(args.yaml_file)
+    else:
+        raise ValueError(f'Bad arguments; run {__file__} -h for more information')
+
+    # Calculate core hours and update yaml
+    calculate_core_hours(expts_dict)
+    write_monitor_file(yaml_file,expts_dict)
+
+    #Call function to print summary
+    print_WE2E_summary(expts_dict, args.debug)
+
+
 def setup_logging(debug: bool = False) -> None:
     """
     Sets up logging, printing high-priority (INFO and higher) messages to screen, and printing all
@@ -53,19 +72,4 @@ def setup_logging(debug: bool = False) -> None:
 
     setup_logging(args.debug)
 
-    yaml_file = args.yaml_file
-
-    # Set up dictionary of experiments
-    if args.expt_dir:
-        yaml_file, expts_dict = create_expts_dict(args.expt_dir)
-    elif args.yaml_file:
-        expts_dict = load_config_file(args.yaml_file)
-    else:
-        raise ValueError(f'Bad arguments; run {__file__} -h for more information')
-
-    # Calculate core hours and update yaml
-    expts_dict = calculate_core_hours(expts_dict)
-    write_monitor_file(yaml_file,expts_dict)
-
-    #Call function to print summary
-    print_WE2E_summary(expts_dict, args.debug)
+    WE2E_summary(args)
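The patch above mainly moves the script's flow out of the `if __name__ == "__main__"`
block and into a WE2E_summary(args) driver function, so the same flow can be called
from other code as well as from the command line. A schematic sketch of that
entry-point pattern, with illustrative option names rather than the script's actual
interface:

    import argparse

    def summarize(args: argparse.Namespace) -> None:
        # Driver function: keeping the flow here (rather than under __main__) lets
        # other scripts import and call it, and keeps the entry point small.
        print(f"Would summarize experiments from: {args.expt_dir or args.yaml_file}")

    if __name__ == "__main__":
        parser = argparse.ArgumentParser(description="Hypothetical summary driver")
        parser.add_argument("-e", "--expt_dir", default=None)
        parser.add_argument("-y", "--yaml_file", default=None)
        summarize(parser.parse_args())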
From 4ec7dfc28a29cb7ea30cbf25e30766fe019e32b0 Mon Sep 17 00:00:00 2001
From: "Michael J. Kavulich, Jr"
Date: Wed, 15 Mar 2023 23:32:01 +0000
Subject: [PATCH 52/52] Revert "Un-revert intended change to flow of WE2E_summary.py"

This reverts commit 8f5525d152965511d65b17a575d973ac9d1d6626.
---
 tests/WE2E/WE2E_summary.py | 36 ++++++++++++++++--------------------
 1 file changed, 16 insertions(+), 20 deletions(-)

diff --git a/tests/WE2E/WE2E_summary.py b/tests/WE2E/WE2E_summary.py
index 828b4c411a..de478a0f38 100755
--- a/tests/WE2E/WE2E_summary.py
+++ b/tests/WE2E/WE2E_summary.py
@@ -12,25 +12,6 @@
 
 from utils import calculate_core_hours, create_expts_dict, print_WE2E_summary, write_monitor_file
 
-def WE2E_summary(args):
-    yaml_file = args.yaml_file
-
-    # Set up dictionary of experiments
-    if args.expt_dir:
-        yaml_file, expts_dict = create_expts_dict(args.expt_dir)
-    elif args.yaml_file:
-        expts_dict = load_config_file(args.yaml_file)
-    else:
-        raise ValueError(f'Bad arguments; run {__file__} -h for more information')
-
-    # Calculate core hours and update yaml
-    calculate_core_hours(expts_dict)
-    write_monitor_file(yaml_file,expts_dict)
-
-    #Call function to print summary
-    print_WE2E_summary(expts_dict, args.debug)
-
-
 def setup_logging(debug: bool = False) -> None:
     """
     Sets up logging, printing high-priority (INFO and higher) messages to screen, and printing all
@@ -72,4 +53,19 @@ def setup_logging(debug: bool = False) -> None:
 
     setup_logging(args.debug)
 
-    WE2E_summary(args)
+    yaml_file = args.yaml_file
+
+    # Set up dictionary of experiments
+    if args.expt_dir:
+        yaml_file, expts_dict = create_expts_dict(args.expt_dir)
+    elif args.yaml_file:
+        expts_dict = load_config_file(args.yaml_file)
+    else:
+        raise ValueError(f'Bad arguments; run {__file__} -h for more information')
+
+    # Calculate core hours and update yaml
+    expts_dict = calculate_core_hours(expts_dict)
+    write_monitor_file(yaml_file,expts_dict)
+
+    #Call function to print summary
+    print_WE2E_summary(expts_dict, args.debug)
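The practical difference between the reverted flow and the one it replaces is how the
result of calculate_core_hours() is handled: the kept version reassigns the returned
dictionary, while the dropped version relied on the function updating its argument in
place. The sketch below uses a hypothetical add_core_hours() to show why the
reassignment style is the safer default; it makes no claim about what the real
calculate_core_hours() does internally:

    def add_core_hours(expts: dict) -> dict:
        # Stand-in for a core-hour calculator: annotate each experiment and return
        # the dictionary so callers can use either calling style.
        for name, expt in expts.items():
            expt["core_hours"] = expt.get("cores", 0) * expt.get("wall_hours", 0.0)
        return expts

    expts = {"test_a": {"cores": 120, "wall_hours": 0.5}}

    # Style kept by the revert: reassign the return value. This keeps working even
    # if the function is later changed to build and return a new dictionary instead
    # of mutating its argument.
    expts = add_core_hours(expts)

    # Style from the un-reverted patch: rely purely on in-place mutation.
    add_core_hours(expts)

    print(expts["test_a"]["core_hours"])  # 60.0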