Standardize column names in params2csv
tnatt committed Mar 4, 2024
1 parent 011afce commit 3fd2480
Showing 2 changed files with 215 additions and 60 deletions.
144 changes: 97 additions & 47 deletions src/subscript/params2csv/params2csv.py
@@ -3,12 +3,13 @@
data, ensuring labels for each value matches).
"""
from __future__ import annotations

import argparse
import logging
import re
import shutil
from glob import glob
from pathlib import Path

import pandas as pd
from ert.config import ErtScript
@@ -27,7 +28,7 @@
In the CSV file, each individual parameter file will be represented by one data row.
The order of parameters in each text file is not conserved.
The original filename for each file is written to the column filename.
The original filename for each file is written to the column 'filename'.
Beware if you have that as a <key> in the text files.
"""

@@ -55,15 +56,31 @@
# The following string is used for the ERT workflow documentation; note
# the very subtle difference in variable name.
WORKFLOW_EXAMPLE = """
Add a file named e.g. ``ert/bin/workflows/PARAMS2CSV_ITER0`` with the contents::
Add a file named e.g. ``ert/bin/workflows/wf_params2csv_iter0`` with the contents::
MAKE_DIRECTORY <SCRATCH>/<USER>/<CASE_DIR>/share/results/tables
PARAMS2CSV "--verbose" "-o" <SCRATCH>/<USER>/<CASE_DIR>/share/results/tables/parameters_iter-0.csv <SCRATCH>/<USER>/<CASE_DIR>/realization-*/iter-0/parameters.txt
Add to your ERT config to have the workflow loaded upon launching::
LOAD_WORKFLOW ../bin/workflows/PARAMS2CSV_ITER0
LOAD_WORKFLOW ../bin/workflows/wf_params2csv_iter0
It is then possible to run the workflow either through ERT CLI or GUI.
Wildcards can be used to extract parameters from multiple iterations,
as is done in the example below. Note also the use of ``HOOK_WORKFLOW`` to automatically
run the workflow when all realizations have finished.
Add a file named e.g. ``ert/bin/workflows/wf_params2csv_hist`` with the contents::
MAKE_DIRECTORY <SCRATCH>/<USER>/<CASE_DIR>/share/results/tables
PARAMS2CSV "--verbose" "-o" <SCRATCH>/<USER>/<CASE_DIR>/share/results/tables/parameters_hist.csv <SCRATCH>/<USER>/<CASE_DIR>/realization-*/iter-*/parameters.txt
Add to your ERT config to have the workflow automatically executed on successful runs::
LOAD_WORKFLOW ../bin/workflows/wf_params2csv_hist
HOOK_WORKFLOW wf_params2csv_hist POST_SIMULATION
It is then possible to run the workflow either through ERT CLI or GUI.
""" # noqa


@@ -153,26 +170,33 @@ def params2csv_main(args: argparse.Namespace) -> None:
if args.verbose:
logger.setLevel(logging.INFO)

# Expand wildcards if not being expanded
args.parameterfile = [
path for pattern in args.parameterfile for path in sorted(glob(pattern))
possible_metadata_columns = [
"ENSEMBLESET",
"REAL",
"ENSEMBLE",
"ITER",
args.filenamecolumnname,
]

ens = pd.DataFrame()
# Expand wildcards if the shell has not already expanded them
paramfile_paths = [
Path(path) for pattern in args.parameterfile for path in sorted(glob(pattern))
]

parsedfiles = 0
for _, parameterfilename in enumerate(args.parameterfile, start=0):
try:
paramtable = pd.read_csv(parameterfilename, header=None, sep=r"\s+")
parsedfiles = parsedfiles + 1
except IOError:
dfs = []
for parameterfilename in paramfile_paths:
if not parameterfilename.exists():
logger.warning("%s not found, skipping..", parameterfilename)
continue

# Chop to only two columns, set keys, and transpose, and then
# merge with the previous tables
paramtable = pd.DataFrame(paramtable.iloc[:, 0:2])
paramtable.columns = ["key", "value"]
paramtable = pd.read_csv(
parameterfilename,
names=["key", "value"],
header=None,
usecols=[0, 1],
sep=r"\s+",
)
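# The read above keeps only the first two whitespace-separated tokens
# per line (usecols=[0, 1]); any trailing tokens are ignored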

paramtable.drop_duplicates(
"key", keep="last", inplace=True
) # if key is repeated, keep the last one.
@@ -185,52 +209,50 @@ def params2csv_main(args: argparse.Namespace) -> None:
parameterfilename,
)
else:
transposed.insert(0, args.filenamecolumnname, parameterfilename)

# Look for meta-information in filename
realregex = r".*realization-(\d*)/"
iterregex = r".*iter-(\d*)/"
if (
re.match(realregex, parameterfilename)
and "Realization" not in transposed.columns
):
transposed.insert(
0,
"Realization",
re.match(realregex, parameterfilename).group(1), # type: ignore
)
if re.match(iterregex, parameterfilename) and "Iter" not in transposed.columns:
transposed.insert(
0,
"Iter",
re.match(iterregex, parameterfilename).group(1), # type: ignore
)
transposed[args.filenamecolumnname] = str(parameterfilename)

ens = pd.concat([ens, transposed], sort=True)
path_metadata = get_metadata_from_path(parameterfilename.resolve())
if path_metadata is not None:
case_folder, iter_folder, iteration, real = path_metadata
transposed["ENSEMBLESET"] = case_folder
transposed["ENSEMBLE"] = iter_folder
transposed["ITER"] = iteration
transposed["REAL"] = real
dfs.append(transposed)

if not dfs:
raise ValueError("No parameter files were found, check the input path provided")
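# Parameters present in only some files become NaN in the other rows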
ens = pd.concat(dfs)

metadata_columns = [col for col in possible_metadata_columns if col in ens]
parameter_columns = [col for col in ens.columns if col not in metadata_columns]

# reorder dataframe and sort by ensemble and realization if present
ens = ens[metadata_columns + parameter_columns]
if "REAL" in metadata_columns:
ens = ens.sort_values(["ENSEMBLE", "REAL"])

if args.clean:
# The user wants the script to write back to parameters.txt a
# possible subset of parameter values so that the number of
# parameters is equal across the entire ensemble, and so that
# duplicate keys are removed. Parameters only existing in some
# realizations will be NaN-padded in the others.
ensfilenames = ens.reset_index()["filename"]
ensidx = ens.reset_index().drop(["index", "filename"], axis=1)
for row in list(ensidx.index.values):
paramfile = ensfilenames.loc[row]
for paramfile, realdf in ens.groupby(args.filenamecolumnname):
shutil.copyfile(paramfile, paramfile + ".backup")
logger.info("Writing to %s", paramfile)
ensidx.loc[row].to_csv(paramfile, sep=" ", na_rep="NaN", header=False)
realdf = realdf[parameter_columns].transpose()
realdf.to_csv(paramfile, sep=" ", na_rep="NaN", header=False)

# Drop constant columns:
if not args.keepconstantcolumns:
for col in ens.columns:
for col in parameter_columns:
if len(ens[col].unique()) == 1:
del ens[col]
logger.warning("Dropping constant column %s", col)

ens.to_csv(args.output, index=False)
logger.info("%s parameterfiles written to %s", parsedfiles, args.output)
logger.info("%s parameterfiles written to %s", len(dfs), args.output)


def main() -> None:
@@ -251,5 +273,33 @@ def legacy_ertscript_workflow(config) -> None:
workflow.category = CATEGORY


def get_metadata_from_path(paramfile: Path) -> tuple[str, str, int, int] | None:
"""Get some metadata from the Path object"""

real_path = get_realization_path(paramfile)
if not real_path:
return None

real = get_number_from_folder(real_path.stem)
case_folder = real_path.parent.stem

# if the realization folder is the direct parent of the parameter file, there is no iteration folder
iter_folder = paramfile.parent.stem if real_path != paramfile.parent else "iter-0"
iteration = (
get_number_from_folder(iter_folder) if iter_folder.startswith("iter-") else 0
)
return case_folder, iter_folder, iteration, real


def get_realization_path(path: Path) -> Path | None:
"""Retrive the realization path, return None if not found"""
return next((p for p in path.parents if p.stem.startswith("realization-")), None)


def get_number_from_folder(foldername: str) -> int:
"""Retrive the integer after the '-' from the folder name"""
return int(foldername.split("-")[-1])
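
As a rough sketch of how these helpers interact (path invented for this
example, not part of the commit)::

  from pathlib import Path

  p = Path("/scratch/user/mycase/realization-3/iter-1/parameters.txt")
  get_realization_path(p)                  # Path("/scratch/user/mycase/realization-3")
  get_number_from_folder("realization-3")  # 3
  get_metadata_from_path(p)                # ("mycase", "iter-1", 1, 3)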


if __name__ == "__main__":
main()
131 changes: 118 additions & 13 deletions tests/test_params2csv.py
@@ -15,6 +15,15 @@
HAVE_ERT = False


ERT_CONFIG_WF = [
"QUEUE_SYSTEM LOCAL",
"NUM_REALIZATIONS 1",
"RUNPATH <CONFIG_PATH>",
"",
"LOAD_WORKFLOW wf_params2csv",
]


def test_main(tmp_path, mocker):
"""Test invocation from command line"""
os.chdir(tmp_path)
@@ -165,28 +174,124 @@ def test_ert_workflow(tmp_path):
f"real\t{i}", encoding="utf8"
)

Path("PARAMS2CSV_ITER0").write_text(
Path("wf_params2csv").write_text(
(
'PARAMS2CSV "-o" <CONFIG_PATH>/parameters.csv '
"<CONFIG_PATH>/realization-*/iter-0/parameters.txt"
)
)

ert_config_fname = "test_params2csv.ert"
ert_config = [
"QUEUE_SYSTEM LOCAL",
"NUM_REALIZATIONS 1",
"RUNPATH <CONFIG_PATH>",
"",
"LOAD_WORKFLOW PARAMS2CSV_ITER0",
]
Path(ert_config_fname).write_text("\n".join(ert_config), encoding="utf8")
subprocess.run(
["ert", "workflow", "PARAMS2CSV_ITER0", ert_config_fname], check=True
)
Path(ert_config_fname).write_text("\n".join(ERT_CONFIG_WF), encoding="utf8")
subprocess.run(["ert", "workflow", "wf_params2csv", ert_config_fname], check=True)

dframe = pd.read_csv("parameters.csv")
assert not dframe.empty
assert "Realization" in dframe
assert "REAL" in dframe
assert "real" in dframe
assert len(dframe.index) == realizations


@pytest.mark.integration
@pytest.mark.skipif(not HAVE_ERT, reason="Requires ERT to be installed")
def test_ert_workflow_multiple_iter(tmp_path):
"""
Test that PARAMS2CSV can be run as an ERT workflow/plugin on
multiple iterations.
"""
os.chdir(tmp_path)

realizations = 3
for i in range(realizations):
Path(f"realization-{i}/iter-0").mkdir(parents=True)
Path(f"realization-{i}/iter-0/parameters.txt").write_text(
f"myparam\t{i}", encoding="utf8"
)
Path(f"realization-{i}/iter-1").mkdir(parents=True)
Path(f"realization-{i}/iter-1/parameters.txt").write_text(
f"myparam\t{i}", encoding="utf8"
)

Path("wf_params2csv").write_text(
(
'PARAMS2CSV "-o" <CONFIG_PATH>/parameters.csv '
"<CONFIG_PATH>/realization-*/iter-*/parameters.txt"
)
)

ert_config_fname = "test_params2csv.ert"
Path(ert_config_fname).write_text("\n".join(ERT_CONFIG_WF), encoding="utf8")
subprocess.run(["ert", "workflow", "wf_params2csv", ert_config_fname], check=True)

dframe = pd.read_csv("parameters.csv")
assert not dframe.empty
assert "myparam" in dframe
assert set(dframe["REAL"].unique()) == {0, 1, 2}
assert set(dframe["ENSEMBLESET"].unique()) == {tmp_path.stem}
assert set(dframe["ENSEMBLE"].unique()) == {"iter-0", "iter-1"}
assert set(dframe["ITER"].unique()) == {0, 1}
assert len(dframe.index) == realizations * 2


@pytest.mark.integration
@pytest.mark.skipif(not HAVE_ERT, reason="Requires ERT to be installed")
def test_ert_workflow_pred_params(tmp_path):
"""Test that PARAMS2CSV can be run on folders not starting with iter"""
os.chdir(tmp_path)

realizations = 3
for i in range(realizations):
Path(f"realization-{i}/pred").mkdir(parents=True)
Path(f"realization-{i}/pred/parameters.txt").write_text(
f"myparam\t{i}", encoding="utf8"
)

Path("wf_params2csv").write_text(
(
'PARAMS2CSV "-o" <CONFIG_PATH>/parameters.csv '
"<CONFIG_PATH>/realization-*/pred/parameters.txt"
)
)

ert_config_fname = "test_params2csv.ert"
Path(ert_config_fname).write_text("\n".join(ERT_CONFIG_WF), encoding="utf8")
subprocess.run(["ert", "workflow", "wf_params2csv", ert_config_fname], check=True)

dframe = pd.read_csv("parameters.csv")
assert not dframe.empty
assert "myparam" in dframe
assert set(dframe["REAL"].unique()) == {0, 1, 2}
assert set(dframe["ENSEMBLE"].unique()) == {"pred"}
assert set(dframe["ITER"].unique()) == {0}


@pytest.mark.integration
@pytest.mark.skipif(not HAVE_ERT, reason="Requires ERT to be installed")
def test_ert_workflow_no_iter_folder(tmp_path):
"""Test that PARAMS2CSV can be run on cases without iteration folders"""
os.chdir(tmp_path)

realizations = 3
for i in range(realizations):
Path(f"realization-{i}").mkdir(parents=True)
Path(f"realization-{i}/parameters.txt").write_text(
f"myparam\t{i}", encoding="utf8"
)

Path("wf_params2csv").write_text(
(
'PARAMS2CSV "-o" <CONFIG_PATH>/parameters.csv '
"<CONFIG_PATH>/realization-*/parameters.txt"
)
)

ert_config_fname = "test_params2csv.ert"
Path(ert_config_fname).write_text("\n".join(ERT_CONFIG_WF), encoding="utf8")
subprocess.run(["ert", "workflow", "wf_params2csv", ert_config_fname], check=True)

dframe = pd.read_csv("parameters.csv")
assert not dframe.empty
assert "myparam" in dframe
assert set(dframe["REAL"].unique()) == {0, 1, 2}
assert set(dframe["ENSEMBLE"].unique()) == {"iter-0"}
assert set(dframe["ITER"].unique()) == {0}
