diff --git a/src/subscript/params2csv/params2csv.py b/src/subscript/params2csv/params2csv.py
index a7685c217..d41d0abf4 100755
--- a/src/subscript/params2csv/params2csv.py
+++ b/src/subscript/params2csv/params2csv.py
@@ -3,12 +3,13 @@
 data, ensuring labels for each value matches).
 """
 
+from __future__ import annotations
 
 import argparse
 import logging
-import re
 import shutil
 from glob import glob
+from pathlib import Path
 
 import pandas as pd
 from ert.config import ErtScript
@@ -27,7 +28,7 @@
 In the CSV file, each individual parameter file will be represented by one
 data row. The order of parameters in each text file is not conserved.
 
-The original filename for each file is written to the column ‘filename’.
+The original filename for each file is written to the column 'filename'.
 Beware if you have that as a in the text files.
 """
@@ -153,26 +154,33 @@ def params2csv_main(args: argparse.Namespace) -> None:
     if args.verbose:
         logger.setLevel(logging.INFO)
 
-    # Expand wildcards if not being expanded
-    args.parameterfile = [
-        path for pattern in args.parameterfile for path in sorted(glob(pattern))
+    possible_metadata_columns = [
+        "ENSEMBLESET",
+        "REAL",
+        "ENSEMBLE",
+        "ITER",
+        args.filenamecolumnname,
     ]
 
-    ens = pd.DataFrame()
+    # Expand wildcards if not being expanded
+    paramfile_paths = [
+        Path(path) for pattern in args.parameterfile for path in sorted(glob(pattern))
+    ]
 
-    parsedfiles = 0
-    for _, parameterfilename in enumerate(args.parameterfile, start=0):
-        try:
-            paramtable = pd.read_csv(parameterfilename, header=None, sep=r"\s+")
-            parsedfiles = parsedfiles + 1
-        except IOError:
+    dfs = []
+    for parameterfilename in paramfile_paths:
+        if parameterfilename.exists():
+            paramtable = pd.read_csv(
+                parameterfilename,
+                names=["key", "value"],
+                header=None,
+                usecols=[0, 1],
+                sep=r"\s+",
+            )
+        else:
             logger.warning("%s not found, skipping..", parameterfilename)
             continue
 
-        # Chop to only two colums, set keys, and transpose, and then
-        # merge with the previous tables
-        paramtable = pd.DataFrame(paramtable.iloc[:, 0:2])
-        paramtable.columns = ["key", "value"]
         paramtable.drop_duplicates(
             "key", keep="last", inplace=True
         )  # if key is repeated, keep the last one.
@@ -185,28 +193,28 @@ def params2csv_main(args: argparse.Namespace) -> None:
                 parameterfilename,
             )
         else:
-            transposed.insert(0, args.filenamecolumnname, parameterfilename)
-
-        # Look for meta-information in filename
-        realregex = r".*realization-(\d*)/"
-        iterregex = r".*iter-(\d*)/"
-        if (
-            re.match(realregex, parameterfilename)
-            and "Realization" not in transposed.columns
-        ):
-            transposed.insert(
-                0,
-                "Realization",
-                re.match(realregex, parameterfilename).group(1),  # type: ignore
-            )
-        if re.match(iterregex, parameterfilename) and "Iter" not in transposed.columns:
-            transposed.insert(
-                0,
-                "Iter",
-                re.match(iterregex, parameterfilename).group(1),  # type: ignore
-            )
+            transposed[args.filenamecolumnname] = str(parameterfilename)
+
+            path_metadata = get_metadata_from_path(parameterfilename.resolve())
+            if path_metadata is not None:
+                case_folder, iter_folder, iteration, real = path_metadata
+                transposed["ENSEMBLESET"] = case_folder
+                transposed["ENSEMBLE"] = iter_folder
+                transposed["ITER"] = iteration
+                transposed["REAL"] = real
+            dfs.append(transposed)
+
+    if not dfs:
+        raise ValueError("No parameter files found, check the input path provided")
+    ens = pd.concat(dfs)
+
+    metadata_columns = [col for col in possible_metadata_columns if col in ens]
+    parameter_columns = [col for col in ens.columns if col not in metadata_columns]
 
-        ens = pd.concat([ens, transposed], sort=True)
+    # Reorder the dataframe and sort by ensemble and realization if present
+    ens = ens[metadata_columns + parameter_columns]
+    if "REAL" in metadata_columns:
+        ens = ens.sort_values(["ENSEMBLE", "REAL"])
 
     if args.clean:
         # Users wants the script to write back to parameters.txt a
@@ -214,23 +222,21 @@ def params2csv_main(args: argparse.Namespace) -> None:
         # parameters is equal in an entire ensemble, and so that
         # duplicate keys are removed Parameters only existing in some
        # realizations will be NaN-padded in the others.
-        ensfilenames = ens.reset_index()["filename"]
-        ensidx = ens.reset_index().drop(["index", "filename"], axis=1)
-        for row in list(ensidx.index.values):
-            paramfile = ensfilenames.loc[row]
+        for paramfile, realdf in ens.groupby(args.filenamecolumnname):
             shutil.copyfile(paramfile, paramfile + ".backup")
             logger.info("Writing to %s", paramfile)
-            ensidx.loc[row].to_csv(paramfile, sep=" ", na_rep="NaN", header=False)
+            realdf = realdf[parameter_columns].transpose()
+            realdf.to_csv(paramfile, sep=" ", na_rep="NaN", header=False)
 
     # Drop constant columns:
     if not args.keepconstantcolumns:
-        for col in ens.columns:
+        for col in parameter_columns:
             if len(ens[col].unique()) == 1:
                 del ens[col]
                 logger.warning("Dropping constant column %s", col)
 
     ens.to_csv(args.output, index=False)
-    logger.info("%s parameterfiles written to %s", parsedfiles, args.output)
+    logger.info("%s parameter files written to %s", len(dfs), args.output)
 
 
 def main() -> None:
@@ -251,5 +257,33 @@ def legacy_ertscript_workflow(config) -> None:
     workflow.category = CATEGORY
 
 
+def get_metadata_from_path(paramfile: Path) -> tuple[str, str, int, int] | None:
+    """Extract ensemble metadata from the path of a parameter file"""
+
+    real_path = get_realization_path(paramfile)
+    if not real_path:
+        return None
+
+    real = get_number_from_folder(real_path.stem)
+    case_folder = real_path.parent.stem
+
+    # No iteration folder if the realization folder is the file's direct parent
+    iter_folder = paramfile.parent.stem if real_path != paramfile.parent else "iter-0"
+    iteration = (
+        get_number_from_folder(iter_folder) if iter_folder.startswith("iter-") else 0
+    )
+    return case_folder, iter_folder, iteration, real
+
+
+def get_realization_path(path: Path) -> Path | None:
+    """Retrieve the realization path, return None if not found"""
+    return next((p for p in path.parents if p.stem.startswith("realization-")), None)
+
+
+def get_number_from_folder(foldername: str) -> int:
+    """Retrieve the integer after the '-' from the folder name"""
+    return int(foldername.split("-")[-1])
+
+
 if __name__ == "__main__":
     main()
diff --git a/tests/test_params2csv.py b/tests/test_params2csv.py
index 2b22f6884..498624589 100644
--- a/tests/test_params2csv.py
+++ b/tests/test_params2csv.py
@@ -15,6 +15,15 @@
 HAVE_ERT = False
 
 
+ERT_CONFIG_WF = [
+    "QUEUE_SYSTEM LOCAL",
+    "NUM_REALIZATIONS 1",
+    "RUNPATH <CONFIG_PATH>",
+    "",
+    "LOAD_WORKFLOW PARAMS2CSV",
+]
+
+
 def test_main(tmp_path, mocker):
     """Test invocation from command line"""
     os.chdir(tmp_path)
@@ -165,7 +174,7 @@ def test_ert_workflow(tmp_path):
             f"real\t{i}", encoding="utf8"
         )
 
-    Path("PARAMS2CSV_ITER0").write_text(
+    Path("PARAMS2CSV").write_text(
         (
             'PARAMS2CSV "-o" <CONFIG_PATH>/parameters.csv '
             "<CONFIG_PATH>/realization-*/iter-0/parameters.txt"
@@ -173,20 +182,116 @@ def test_ert_workflow(tmp_path):
     )
 
     ert_config_fname = "test_params2csv.ert"
-    ert_config = [
-        "QUEUE_SYSTEM LOCAL",
-        "NUM_REALIZATIONS 1",
-        "RUNPATH <CONFIG_PATH>",
-        "",
-        "LOAD_WORKFLOW PARAMS2CSV_ITER0",
-    ]
-    Path(ert_config_fname).write_text("\n".join(ert_config), encoding="utf8")
-    subprocess.run(
-        ["ert", "workflow", "PARAMS2CSV_ITER0", ert_config_fname], check=True
-    )
+    Path(ert_config_fname).write_text("\n".join(ERT_CONFIG_WF), encoding="utf8")
+    subprocess.run(["ert", "workflow", "PARAMS2CSV", ert_config_fname], check=True)
 
     dframe = pd.read_csv("parameters.csv")
     assert not dframe.empty
-    assert "Realization" in dframe
+    assert "REAL" in dframe
     assert "real" in dframe
     assert len(dframe.index) == realizations
+
+
+@pytest.mark.integration
+@pytest.mark.skipif(not HAVE_ERT, reason="Requires ERT to be installed")
+def test_ert_workflow_multiple_iter(tmp_path):
+    """
+    Test that PARAMS2CSV can be run as an ERT workflow/plugin on
+    multiple iterations.
+    """
+    os.chdir(tmp_path)
+
+    realizations = 3
+    for i in range(realizations):
+        Path(f"realization-{i}/iter-0").mkdir(parents=True)
+        Path(f"realization-{i}/iter-0/parameters.txt").write_text(
+            f"myparam\t{i}", encoding="utf8"
+        )
+        Path(f"realization-{i}/iter-1").mkdir(parents=True)
+        Path(f"realization-{i}/iter-1/parameters.txt").write_text(
+            f"myparam\t{i}", encoding="utf8"
+        )
+
+    Path("PARAMS2CSV").write_text(
+        (
+            'PARAMS2CSV "-o" <CONFIG_PATH>/parameters.csv '
+            "<CONFIG_PATH>/realization-*/iter-*/parameters.txt"
+        )
+    )
+
+    ert_config_fname = "test_params2csv.ert"
+    Path(ert_config_fname).write_text("\n".join(ERT_CONFIG_WF), encoding="utf8")
+    subprocess.run(["ert", "workflow", "PARAMS2CSV", ert_config_fname], check=True)
+
+    dframe = pd.read_csv("parameters.csv")
+    assert not dframe.empty
+    assert "myparam" in dframe
+    assert set(dframe["REAL"].unique()) == {0, 1, 2}
+    assert set(dframe["ENSEMBLESET"].unique()) == {tmp_path.stem}
+    assert set(dframe["ENSEMBLE"].unique()) == {"iter-0", "iter-1"}
+    assert set(dframe["ITER"].unique()) == {0, 1}
+    assert len(dframe.index) == realizations * 2
+
+
+@pytest.mark.integration
+@pytest.mark.skipif(not HAVE_ERT, reason="Requires ERT to be installed")
+def test_ert_workflow_pred_params(tmp_path):
+    """Test that PARAMS2CSV can be run on folders not starting with iter"""
+    os.chdir(tmp_path)
+
+    realizations = 3
+    for i in range(realizations):
+        Path(f"realization-{i}/pred").mkdir(parents=True)
+        Path(f"realization-{i}/pred/parameters.txt").write_text(
+            f"myparam\t{i}", encoding="utf8"
+        )
+
+    Path("PARAMS2CSV").write_text(
+        (
+            'PARAMS2CSV "-o" <CONFIG_PATH>/parameters.csv '
+            "<CONFIG_PATH>/realization-*/pred/parameters.txt"
+        )
+    )
+
+    ert_config_fname = "test_params2csv.ert"
+    Path(ert_config_fname).write_text("\n".join(ERT_CONFIG_WF), encoding="utf8")
+    subprocess.run(["ert", "workflow", "PARAMS2CSV", ert_config_fname], check=True)
+
+    dframe = pd.read_csv("parameters.csv")
+    assert not dframe.empty
+    assert "myparam" in dframe
+    assert set(dframe["REAL"].unique()) == {0, 1, 2}
+    assert set(dframe["ENSEMBLE"].unique()) == {"pred"}
+    assert set(dframe["ITER"].unique()) == {0}
+
+
+@pytest.mark.integration
+@pytest.mark.skipif(not HAVE_ERT, reason="Requires ERT to be installed")
+def test_ert_workflow_no_iter_folder(tmp_path):
+    """Test that PARAMS2CSV can be run on cases without iteration folders"""
+    os.chdir(tmp_path)
+
+    realizations = 3
+    for i in range(realizations):
+        Path(f"realization-{i}").mkdir(parents=True)
+        Path(f"realization-{i}/parameters.txt").write_text(
+            f"myparam\t{i}", encoding="utf8"
+        )
+
+    Path("PARAMS2CSV").write_text(
+        (
+            'PARAMS2CSV "-o" <CONFIG_PATH>/parameters.csv '
+            "<CONFIG_PATH>/realization-*/parameters.txt"
+        )
+    )
+
+    ert_config_fname = "test_params2csv.ert"
+    Path(ert_config_fname).write_text("\n".join(ERT_CONFIG_WF), encoding="utf8")
+    subprocess.run(["ert", "workflow", "PARAMS2CSV", ert_config_fname], check=True)
+
+    dframe = pd.read_csv("parameters.csv")
+    assert not dframe.empty
+    assert "myparam" in dframe
+    assert set(dframe["REAL"].unique()) == {0, 1, 2}
+    assert set(dframe["ENSEMBLE"].unique()) == {"iter-0"}
+    assert set(dframe["ITER"].unique()) == {0}
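
Note (not part of the diff): a minimal, self-contained sketch of how the new path-metadata helpers added to params2csv.py are expected to behave, mirroring get_realization_path, get_number_from_folder and get_metadata_from_path above. The /scratch/mycase paths are invented for illustration only; the new tests in tests/test_params2csv.py remain the authoritative description of the behaviour.

    from pathlib import Path


    def get_realization_path(path: Path) -> Path | None:
        # First ancestor folder named "realization-<N>", or None when the
        # file does not live inside an ERT runpath.
        return next(
            (p for p in path.parents if p.stem.startswith("realization-")), None
        )


    def get_number_from_folder(foldername: str) -> int:
        # "realization-3" -> 3, "iter-1" -> 1
        return int(foldername.split("-")[-1])


    def get_metadata_from_path(paramfile: Path) -> tuple[str, str, int, int] | None:
        real_path = get_realization_path(paramfile)
        if not real_path:
            return None
        real = get_number_from_folder(real_path.stem)
        case_folder = real_path.parent.stem
        # No iteration folder when the realization folder is the file's direct parent
        iter_folder = (
            paramfile.parent.stem if real_path != paramfile.parent else "iter-0"
        )
        iteration = (
            get_number_from_folder(iter_folder)
            if iter_folder.startswith("iter-")
            else 0
        )
        return case_folder, iter_folder, iteration, real


    # Illustrative paths only; "/scratch/mycase" is made up for this example.
    assert get_metadata_from_path(
        Path("/scratch/mycase/realization-3/iter-1/parameters.txt")
    ) == ("mycase", "iter-1", 1, 3)
    assert get_metadata_from_path(
        Path("/scratch/mycase/realization-3/pred/parameters.txt")
    ) == ("mycase", "pred", 0, 3)
    assert get_metadata_from_path(
        Path("/scratch/mycase/realization-3/parameters.txt")
    ) == ("mycase", "iter-0", 0, 3)
    assert get_metadata_from_path(Path("/tmp/parameters.txt")) is None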