CliMA · charleskawczynski · Sep 27, 2024 · Sep 20, 2024 · Sep 20, 2024 · Sep 23, 2024
diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml
@@ -56,11 +56,11 @@ steps:
 
   - wait
 
-  - group: "Regression tests"
+  - group: "Reproducibility tests"
     steps:
 
       - label: ":computer: Ensure mse tables are reset when necessary"
-        command: "julia --color=yes --project=examples regression_tests/test_reset.jl"
+        command: "julia --color=yes --project=examples reproducibility_tests/test_reset.jl"
 
   - group: "Radiation"
     steps:
@@ -256,7 +256,7 @@ steps:
           --config_file $CONFIG_PATH/sphere_baroclinic_wave_rhoe_equilmoist.yml
           --job_id sphere_baroclinic_wave_rhoe_equilmoist
 
-          julia --color=yes --project=examples regression_tests/test_mse.jl
+          julia --color=yes --project=examples reproducibility_tests/test_mse.jl
           --job_id sphere_baroclinic_wave_rhoe_equilmoist
           --out_dir sphere_baroclinic_wave_rhoe_equilmoist/output_active
         artifact_paths: "sphere_baroclinic_wave_rhoe_equilmoist/output_active/*"
@@ -269,7 +269,7 @@ steps:
           --config_file $CONFIG_PATH/deep_sphere_baroclinic_wave_rhoe_equilmoist.yml
           --job_id deep_sphere_baroclinic_wave_rhoe_equilmoist
 
-          julia --color=yes --project=examples regression_tests/test_mse.jl
+          julia --color=yes --project=examples reproducibility_tests/test_mse.jl
           --job_id deep_sphere_baroclinic_wave_rhoe_equilmoist
           --out_dir deep_sphere_baroclinic_wave_rhoe_equilmoist/output_active
         artifact_paths: "deep_sphere_baroclinic_wave_rhoe_equilmoist/output_active/*"
@@ -302,7 +302,7 @@ steps:
           --config_file $CONFIG_PATH/sphere_held_suarez_rhoe_equilmoist_hightop_sponge.yml
           --job_id sphere_held_suarez_rhoe_equilmoist_hightop_sponge
 
-          julia --color=yes --project=examples regression_tests/test_mse.jl
+          julia --color=yes --project=examples reproducibility_tests/test_mse.jl
           --job_id sphere_held_suarez_rhoe_equilmoist_hightop_sponge
           --out_dir sphere_held_suarez_rhoe_equilmoist_hightop_sponge/output_active
         artifact_paths: "sphere_held_suarez_rhoe_equilmoist_hightop_sponge/output_active/*"
@@ -327,7 +327,7 @@ steps:
           --config_file $CONFIG_PATH/sphere_aquaplanet_rhoe_equilmoist_allsky_gw_raw_zonallyasymmetric.yml
           --job_id sphere_aquaplanet_rhoe_equilmoist_allsky_gw_raw_zonallyasymmetric
 
-          julia --color=yes --project=examples regression_tests/test_mse.jl
+          julia --color=yes --project=examples reproducibility_tests/test_mse.jl
           --job_id sphere_aquaplanet_rhoe_equilmoist_allsky_gw_raw_zonallyasymmetric
           --out_dir sphere_aquaplanet_rhoe_equilmoist_allsky_gw_raw_zonallyasymmetric/output_active
         artifact_paths: "sphere_aquaplanet_rhoe_equilmoist_allsky_gw_raw_zonallyasymmetric/output_active/*"
@@ -610,7 +610,7 @@ steps:
           --config_file $CONFIG_PATH/diagnostic_edmfx_aquaplanet.yml
           --job_id diagnostic_edmfx_aquaplanet
 
-          julia --color=yes --project=examples regression_tests/test_mse.jl
+          julia --color=yes --project=examples reproducibility_tests/test_mse.jl
           --job_id diagnostic_edmfx_aquaplanet
           --out_dir diagnostic_edmfx_aquaplanet/output_active
         artifact_paths: "diagnostic_edmfx_aquaplanet/output_active/*"
@@ -1111,10 +1111,10 @@ steps:
     continue_on_failure: true
 
   - label: ":robot_face: Print new mse tables"
-    command: "julia --color=yes --project=examples regression_tests/print_new_mse.jl"
+    command: "julia --color=yes --project=examples reproducibility_tests/print_new_mse.jl"
 
   - label: ":robot_face: Print new reference counter"
-    command: "julia --color=yes --project=examples regression_tests/print_new_ref_counter.jl"
+    command: "julia --color=yes --project=examples reproducibility_tests/print_new_ref_counter.jl"
 
   - label: ":bar_chart: Tabulate performance summary"
     command: "julia --color=yes --project=perf perf/tabulate_perf_summary.jl"
@@ -1128,4 +1128,4 @@ steps:
   - wait
 
   - label: ":robot_face: Move main results"
-    command: "julia --color=yes --project=examples regression_tests/move_output.jl"
+    command: "julia --color=yes --project=examples reproducibility_tests/move_output.jl"
diff --git a/config/default_configs/default_config.yml b/config/default_configs/default_config.yml
@@ -214,8 +214,8 @@ non_orographic_gravity_wave:
 nh_poly:
   help: "Horizontal polynomial degree. Note: The number of quadrature points in 1D within each horizontal element is then Nq = <--nh_poly> + 1"
   value: 3
-regression_test:
-  help: "(Bool) perform regression test"
+reproducibility_test:
+  help: "(Bool) perform reproducibility test"
   value: false
 check_conservation:
   help: "Check conservation of mass and energy [`false` (default), `true`]"

diff --git a/config/model_configs/deep_sphere_baroclinic_wave_rhoe_equilmoist.yml b/config/model_configs/deep_sphere_baroclinic_wave_rhoe_equilmoist.yml
@@ -1,6 +1,6 @@
 precip_model: "0M"
 dt_save_state_to_disk: "2days"
-regression_test: true
+reproducibility_test: true
 initial_condition: "MoistBaroclinicWave"
 dt: "450secs"
 t_end: "10days"

diff --git a/config/model_configs/diagnostic_edmfx_aquaplanet.yml b/config/model_configs/diagnostic_edmfx_aquaplanet.yml
@@ -16,6 +16,6 @@ precip_model: 1M
 dt: 100secs
 t_end: 12hours
 dt_save_state_to_disk: 12hours
-regression_test: true
+reproducibility_test: true
 toml: [toml/diagnostic_edmfx.toml]
 ode_algo: ARS343
diff --git a/config/model_configs/single_column_precipitation_test.yml b/config/model_configs/single_column_precipitation_test.yml
@@ -14,7 +14,7 @@ precip_model: "1M"
 vert_diff: "FriersonDiffusion"
 implicit_diffusion: true
 approximate_linear_solve_iters: 2
-regression_test: false
+reproducibility_test: false
 toml: [toml/single_column_precipitation_test.toml]
 diagnostics:
   - short_name: [hus, clw, cli, husra, hussn, ta, wa]

diff --git a/config/model_configs/sphere_aquaplanet_rhoe_equilmoist_allsky_gw_raw_zonallyasymmetric.yml b/config/model_configs/sphere_aquaplanet_rhoe_equilmoist_allsky_gw_raw_zonallyasymmetric.yml
@@ -16,7 +16,7 @@ cloud_model: "grid_scale"
 surface_temperature: "ZonallyAsymmetric"
 moist: "equil"
 albedo_model: "RegressionFunctionAlbedo"
-regression_test: true
+reproducibility_test: true
 aerosol_radiation: true
 prescribed_aerosols: ["CB1", "CB2", "DST01", "DST02", "DST03", "DST04", "OC1", "OC2", "SO4"]
 toml: [toml/sphere_aquaplanet_rhoe_equilmoist_allsky_gw_raw_zonallyasymmetric.toml]
diff --git a/config/model_configs/sphere_baroclinic_wave_rhoe_equilmoist.yml b/config/model_configs/sphere_baroclinic_wave_rhoe_equilmoist.yml
@@ -1,6 +1,6 @@
 precip_model: "0M"
 dt_save_state_to_disk: "2days"
-regression_test: true
+reproducibility_test: true
 initial_condition: "MoistBaroclinicWave"
 dt: "450secs"
 t_end: "10days"

diff --git a/config/model_configs/sphere_held_suarez_rhoe_equilmoist_hightop_sponge.yml b/config/model_configs/sphere_held_suarez_rhoe_equilmoist_hightop_sponge.yml
@@ -8,7 +8,7 @@ vert_diff: true
 forcing: "held_suarez"
 z_max: 55000.0
 precip_model: "0M"
-regression_test: true
+reproducibility_test: true
 viscous_sponge: true
 moist: "equil"
 toml: [toml/sphere_held_suarez_rhoe_equilmoist_hightop_sponge.toml]
diff --git a/examples/hybrid/driver.jl b/examples/hybrid/driver.jl
@@ -84,26 +84,28 @@ if CA.is_distributed(config.comms_ctx)
 end
 
 # Check if selected output has changed from the previous recorded output (bit-wise comparison)
-include(joinpath(@__DIR__, "..", "..", "regression_tests", "mse_tables.jl"))
-if config.parsed_args["regression_test"]
+include(
+    joinpath(@__DIR__, "..", "..", "reproducibility_tests", "mse_tables.jl"),
+)
+if config.parsed_args["reproducibility_test"]
     # Test results against main branch
     include(
         joinpath(
             @__DIR__,
             "..",
             "..",
-            "regression_tests",
-            "regression_tests.jl",
+            "reproducibility_tests",
+            "reproducibility_tests.jl",
         ),
     )
-    @testset "Test regression table entries" begin
+    @testset "Test reproducibility table entries" begin
         mse_keys = sort(collect(keys(all_best_mse[simulation.job_id])))
         pcs = collect(Fields.property_chains(sol.u[end]))
         for prop_chain in mse_keys
             @test prop_chain in pcs
         end
     end
-    perform_regression_tests(
+    perform_reproducibility_tests(
         simulation.job_id,
         sol.u[end],
         all_best_mse,
@@ -143,7 +145,11 @@ end
 # Visualize the solution
 if ClimaComms.iamroot(config.comms_ctx)
     include(
-        joinpath(pkgdir(CA), "regression_tests", "self_reference_or_path.jl"),
+        joinpath(
+            pkgdir(CA),
+            "reproducibility_tests",
+            "self_reference_or_path.jl",
+        ),
     )
     @info "Plotting"
     path = self_reference_or_path() # __build__ path (not job path)

diff --git a/regression_tests/README.md → reproducibility_tests/README.md b/regression_tests/README.md → reproducibility_tests/README.md
@@ -1,29 +1,29 @@
-# User guide to regression tests
+# User guide to reproducibility tests
 
-This document outlines how regression tests work and how to update PRs to pass regression tests.
+This document outlines how reproducibility tests work and how to update PRs to pass reproducibility tests.
 
-## The basic idea of how regression tests work
+## The basic idea of how reproducibility tests work
 
-When a particular job opts-in to testing regressions (using the `regression_test` command line option and the `julia --project=examples regression_tests/test_mse.jl` command), we compare the solution dataset (the prognostic state at the last timestep) of that job with a reference dataset.
+When a particular job opts in to reproducibility testing (using the `reproducibility_test` command line option and the `julia --project=examples reproducibility_tests/test_mse.jl` command), we compare the solution dataset (the prognostic state at the last timestep) of that job with a reference dataset.
 
 We don't always have a reference to compare against, due to what we'll call **failure modes**. For a full list of failure modes, see [Failure modes](#Failure-modes), but here are a few examples:
 
  - There is no reference to compare against when we add a new experiment / buildkite job.
 
- - There is no reference to compare against when we add a new variable to opt-in for regression tests.
+ - There is no reference to compare against when we add a new variable to opt-in for reproducibility tests.
 
 Our solution to dealing with failure modes is by providing users with two workflows: one when a comparable reference dataset exists (non-failure mode case), and another when it does not (failure mode case).
 
  - A comparable reference dataset exists:
    - [Update mse tables](#How-to-update-mse-tables)
 
  - A comparable reference dataset does **not** exists:
-   - Increment the reference counter in `regression_tests/ref_counter.jl`. This triggers a "self-reference".
+   - Increment the reference counter in `reproducibility_tests/ref_counter.jl`. This triggers a "self-reference".
    - [Update mse tables](#How-to-update-mse-tables) _all to zero values_
 
 At this moment, it's crucial to mention several important points:
 
- - When a reference dataset does not exist, we still perform a regression test so that we continuously exercise the testing infrastructure. However, we compare the solution dataset with itself (which we call a "self-reference"). Therefore, _all regression tests for all jobs will pass_ (no matter what the results look like) when the reference counter is incremented. So, it is important to review the quality of the results when the reference counter is incremented.
+ - When a reference dataset does not exist, we still perform a reproducibility test so that we continuously exercise the testing infrastructure. However, we compare the solution dataset with itself (which we call a "self-reference"). Therefore, _all reproducibility tests for all jobs will pass_ (no matter what the results look like) when the reference counter is incremented. So, it is important to review the quality of the results when the reference counter is incremented.
 
  - Every time the reference counter is incremented, data from that PR is saved onto Caltech's central cluster. And that solution's dataset is the new reference dataset that all future PRs are compared against (until the reference counter is incremented again).
 
@@ -34,51 +34,51 @@ To update the mse tables:
  - Click the *Print new mse tables* buildkite job
  - Click the *Running commands* entry in the *Log* tab
  - Copy this output until `-- DO NOT COPY --`
- - Paste these contents into `regression_tests/mse_tables.jl`
+ - Paste these contents into `reproducibility_tests/mse_tables.jl`
  - Add, commit, and push these changes.
 
-## Adding a new regression test
+## Adding a new reproducibility test
 
-To add a new regression test:
+To add a new reproducibility test:
 
- - Set the command-line `regression_test` to true, and add `julia --color=yes --project=examples regression_tests/test_mse.jl --job_id [job_id] --out_dir [job_id]` as a separate command for the new (or existing) job
+ - Set the command-line `reproducibility_test` to true, and add `julia --color=yes --project=examples reproducibility_tests/test_mse.jl --job_id [job_id] --out_dir [job_id]` as a separate command for the new (or existing) job
  - Copy the `all_best_mse` dict template from the job's log
- - Paste the `all_best_mse` dict template into `regression_test/mse_tables.jl`
+ - Paste the `all_best_mse` dict template into `reproducibility_tests/mse_tables.jl`
 
 <!-- TODO: improve names / mark off sections for all_best_mse dict -->
 
 ## Failure modes
 
 Here are some situations where we cannot (or cannot easily) compare against [existing] reference datasets. For example, when
 
- - a new regression test is added: no reference to compare against
+ - a new reproducibility test is added: no reference to compare against
 
- - a new variable is added to an existing regression test: no reference to compare against
+ - a new variable is added to an existing reproducibility test: no reference to compare against
 
  - a variable name has changed: cannot (easily) compare variables with two different names
 
  - the grid resolution has changed: see [A note on grid resolution failure mode]
 
 ### A note on grid resolution failure mode
 
-We cannot (easily) compare the output with a reference if we change the spatial resolution (without interpolation). Coupling the regression infrastructure with interpolation would introduce a great deal of complexity since this depends on the type of grid (e.g., column, box, sphere), and details of those grids (e.g., radius of earth). Using interpolation in the regression infrastructure is not impossible, but it tightly couples details of the model / configuration, and introduces a lot of software complexity that results in specialized and difficult to maintain code.
+We cannot (easily) compare the output with a reference if we change the spatial resolution (without interpolation). Coupling the reproducibility infrastructure with interpolation would introduce a great deal of complexity since this depends on the type of grid (e.g., column, box, sphere), and details of those grids (e.g., radius of earth). Using interpolation in the reproducibility infrastructure is not impossible, but it tightly couples details of the model / configuration, and introduces a lot of software complexity that results in specialized and difficult to maintain code.
 
-# Developer guide to regression tests
+# Developer guide to reproducibility tests
 
-## A detailed procedure of how regression tests are performed
+## A detailed procedure of how reproducibility tests are performed
 
-Regression tests are performed at the end of `examples/hybrid/driver.jl`, after a simulation completes, and relies on a unique job id (`job_id`). Here is an outline of the regression test procedure:
+Reproducibility tests are performed at the end of `examples/hybrid/driver.jl`, after a simulation completes, and rely on a unique job id (`job_id`). Here is an outline of the reproducibility test procedure:
 
  0) Run a simulation, with a particular `job_id`, to the final time.
  1) Load a dictionary, `all_best_mse`, of previous "best" mean-squared errors from `mse_tables.jl` and extract the mean squared errors for the given `job_id` (store in job-specific dictionary, `best_mse`).
  2) Export the solution (a `FieldVector`) at the final simulation time to an `NCDataset` file.
  3) Compute the errors between the exported solution and the exported solution from the reference `NCDataset` file (which is saved in a dedicated folder on the Caltech Central cluster) and save into a dictionary, called `computed_mse`.
  4) Export this dictionary (`computed_mse`) to the output folder
- 5) Test that `computed_mse` is no worse than `best_mse` (determines if regression test passes or not).
+ 5) Test that `computed_mse` is no worse than `best_mse` (determines if reproducibility test passes or not).
 
 After these steps are performed at the end of the driver, additional jobs are run:
 
- 1) Print `computed_mse` for all jobs to make updating `regression_tests/mse_tables.jl` easy
+ 1) Print `computed_mse` for all jobs to make updating `reproducibility_tests/mse_tables.jl` easy
  2) If we're on the github queue merging branch (all tests have passed, and the PR is effectively merging), move the `NCDataset`s from the scratch directory onto the dedicated folder on the Caltech Central cluster.
 
 ## How we track which dataset to compare against
@@ -105,11 +105,11 @@ The way this works is:
  1) We start off with a self reference: print a new reference
     counter in the `print new reference counter` job.
 
- 2) (PR author) copy-paste counter into `regression_tests/ref_counter.jl`
+ 2) (PR author) copy-paste counter into `reproducibility_tests/ref_counter.jl`
 
  3) Upon next CI run, before performing CI test,
     we check if the counter indicates a self-reference by
-    checking if `regression_tests/ref_counter.jl` in the PR
+    checking if `reproducibility_tests/ref_counter.jl` in the PR
     matches (e.g.,) `aRsVoY/ref_counter.jl` in the last
     merged commit (on central). If yes, then it's a self
     reference, if not, then we look-up the dataset based

diff --git a/regression_tests/compute_mse.jl → reproducibility_tests/compute_mse.jl b/regression_tests/compute_mse.jl → reproducibility_tests/compute_mse.jl
@@ -6,7 +6,7 @@ import ClimaCoreTempestRemap as CCTR
 include("self_reference_or_path.jl")
 
 """
-    regression_test(;
+    reproducibility_test(;
         job_id,
         reference_mse,
         ds_filename_computed,
@@ -24,7 +24,12 @@ via `varname`.
 If running on buildkite, we get `ds_filename_reference`
 from the latest merged dataset on Caltech central.
 """
-function regression_test(; job_id, reference_mse, ds_filename_computed, varname)
+function reproducibility_test(;
+    job_id,
+    reference_mse,
+    ds_filename_computed,
+    varname,
+)
     local ds_filename_reference
 
     if haskey(ENV, "BUILDKITE_COMMIT")
@@ -64,21 +69,21 @@ function regression_test(; job_id, reference_mse, ds_filename_computed, varname)
             msg *= "\n"
             msg *= "    was created, or the name of the dataset\n"
             msg *= "    has changed. Please increment the reference\n"
-            msg *= "    counter in `regression_tests/ref_counter.jl`.\n"
+            msg *= "    counter in `reproducibility_tests/ref_counter.jl`.\n"
             msg *= "\n"
             msg *= "    If this is not the case, then please\n"
             msg *= "    open an issue with a link pointing to this\n"
             msg *= "    PR and build.\n"
             msg *= "\n"
             msg *= "For more information, please find\n"
-            msg *= "`regression_tests/README.md` and read the section\n\n"
+            msg *= "`reproducibility_tests/README.md` and read the section\n\n"
             msg *= "  `How to merge pull requests (PR) that get approved\n"
-            msg *= "   but *break* regression tests`\n\n"
+            msg *= "   but *break* reproducibility tests`\n\n"
             msg *= "for how to merge this PR."
             error(msg)
         end
     else
-        @warn "Buildkite not detected. Skipping regression tests."
+        @warn "Buildkite not detected. Skipping reproducibility tests."
         @info "Please review output results before merging."
         return reference_mse
     end
@@ -94,9 +99,9 @@ function regression_test(; job_id, reference_mse, ds_filename_computed, varname)
         )
     catch err
         msg = ""
-        msg *= "The regression test broke. Please find\n"
-        msg *= "`regression_tests/README.md` and read the section\n\n"
-        msg *= "  `How to merge pull requests (PR) that get approved but *break* regression tests`\n\n"
+        msg *= "The reproducibility test broke. Please find\n"
+        msg *= "`reproducibility_tests/README.md` and read the section\n\n"
+        msg *= "  `How to merge pull requests (PR) that get approved but *break* reproducibility tests`\n\n"
         msg *= "for how to merge this PR."
         @info msg
         rethrow(err)

diff --git a/regression_tests/move_output.jl → reproducibility_tests/move_output.jl b/regression_tests/move_output.jl → reproducibility_tests/move_output.jl
@@ -27,7 +27,7 @@ if buildkite_ci
         mkpath(cluster_data_prefix)
         path = joinpath(cluster_data_prefix, commit_sha)
         mkpath(path)
-        # Only move regression data if self reference:
+        # Only move reproducibility data if self reference:
         if self_reference
             for folder_name in job_ids
                 src = folder_name

diff --git a/regression_tests/mse_tables.jl → reproducibility_tests/mse_tables.jl b/regression_tests/mse_tables.jl → reproducibility_tests/mse_tables.jl
diff --git a/regression_tests/print_new_mse.jl → reproducibility_tests/print_new_mse.jl b/regression_tests/print_new_mse.jl → reproducibility_tests/print_new_mse.jl