Skip to content

Commit

Permalink
Extend metadata_column_to_perfdata to multiple columns (#216)
Browse files Browse the repository at this point in the history
* Improve metadata_column_to_perfdata to add multiple columns at the same time

* Rename argument for clarity

* Apply black formatting
  • Loading branch information
michaelmckinsey1 authored Oct 25, 2024
1 parent 0b77659 commit 91d62d1
Show file tree
Hide file tree
Showing 5 changed files with 95 additions and 44 deletions.
4 changes: 2 additions & 2 deletions thicket/ensemble.py
Original file line number Diff line number Diff line change
Expand Up @@ -210,7 +210,7 @@ def _handle_perfdata():
new_profiles = [i for i in range(len(thickets_cp[0].profile))]
for i in range(len(thickets_cp)):
thickets_cp[i].metadata["new_profiles"] = new_profiles
thickets_cp[i].metadata_column_to_perfdata(
thickets_cp[i].metadata_columns_to_perfdata(
"new_profiles", drop=True
)
thickets_cp[i].dataframe.reset_index(level=inner_idx, inplace=True)
Expand All @@ -233,7 +233,7 @@ def _handle_perfdata():
else: # Change second-level index to be from metadata's "metadata_key" column
for i in range(len(thickets_cp)):
if metadata_key not in thickets_cp[i].dataframe.index.names:
thickets_cp[i].metadata_column_to_perfdata(metadata_key)
thickets_cp[i].metadata_columns_to_perfdata(metadata_key)
thickets_cp[i].dataframe.reset_index(level=inner_idx, inplace=True)
new_mappings.update(
pd.Series(
Expand Down
2 changes: 1 addition & 1 deletion thicket/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ def _agg_rows(col_series):
if col not in index_names:
if col in tk_c.metadata.columns or col in df_columns:
if col not in df_columns:
tk_c.metadata_column_to_perfdata(col)
tk_c.metadata_columns_to_perfdata(col)
tk_c.dataframe = tk_c.dataframe.set_index(col, append=True)
else:
raise KeyError(f'"{col}" is not in the PerfData or MetaData.')
Expand Down
79 changes: 57 additions & 22 deletions thicket/tests/test_thicket.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,28 +80,63 @@ def _test_multiindex():
assert bool(re.search("1.000.*Basic_COPY8", tree_output))


def test_metadata_column_to_perfdata(mpi_scaling_cali):
t_ens = Thicket.from_caliperreader(mpi_scaling_cali, disable_tqdm=True)

example_column = "jobsize"
example_column_metrics = [27, 64, 125, 216, 343]

# Column should be in metadata table
assert example_column in t_ens.metadata
# Column should not be in performance data table
assert example_column not in t_ens.dataframe
# Assume second level index is profile
assert t_ens.dataframe.index.names[1] == "profile"

t_ens.metadata_column_to_perfdata(example_column)

# Column should be in performance data table
assert example_column in t_ens.dataframe

# Check that the metrics exist in the performance data table
values = t_ens.dataframe[example_column].values.astype("int")
for metric in example_column_metrics:
assert metric in values
def test_metadata_columns_to_perfdata(
    rajaperf_cuda_block128_1M_cali, rajaperf_seq_O3_1M_cali
):
    """Exercise Thicket.metadata_columns_to_perfdata.

    Covers adding several columns at once, the overwrite warning, dropping
    from the metadata table, the join_key error, an alternate join key, and
    column-axis thickets built with and without a metadata_key.
    """
    profile_files = [rajaperf_cuda_block128_1M_cali[0], rajaperf_seq_O3_1M_cali[0]]
    tk = Thicket.from_caliperreader(profile_files, disable_tqdm=True)
    # Untouched copy, used later to build the column-axis thickets
    pristine = tk.deepcopy()

    # Both requested columns must land in the performance data table
    tk.metadata_columns_to_perfdata(["variant", "tuning"])
    perf_columns = tk.dataframe.columns
    assert "variant" in perf_columns and "tuning" in perf_columns

    # Adding them again without overwrite must warn
    with pytest.warns(UserWarning, match=r"Column .* already exists"):
        tk.metadata_columns_to_perfdata(["variant", "tuning"])

    # drop=True removes the column from the metadata table
    dropped = tk.deepcopy()
    dropped.metadata_columns_to_perfdata("variant", overwrite=True, drop=True)
    assert "variant" not in dropped.metadata

    # Without the 'profile' level in the performance data, the default join_key must fail
    dropped.dataframe = dropped.dataframe.reset_index(level="profile", drop=True)
    with pytest.raises(KeyError, match="'profile' must be present"):
        dropped.metadata_columns_to_perfdata("tuning", overwrite=True)

    # Join on a key other than 'profile'
    tk.metadata_columns_to_perfdata("ProblemSizeRunParam")
    tk.metadata_columns_to_perfdata("user", join_key="ProblemSizeRunParam")
    assert "user" in tk.dataframe

    # Column-axis thickets: one sub-thicket per (variant, tuning) group
    groups = pristine.groupby(["variant", "tuning"])

    def _concat_columns(**extra):
        # Fresh lists on every call, exactly as if written out inline
        return Thicket.concat_thickets(
            thickets=list(groups.values()),
            axis="columns",
            headers=list(groups.keys()),
            **extra,
        )

    cuda_header = ("Base_CUDA", "block_128")

    # 1. without metadata_key
    size_label = (cuda_header, "ProblemSizeRunParam")
    ctk = _concat_columns()
    ctk.metadata_columns_to_perfdata(metadata_columns=[size_label])
    assert size_label in ctk.dataframe.columns

    # 2. with metadata_key
    user_label = (cuda_header, "user")
    ctk2 = _concat_columns(metadata_key="ProblemSizeRunParam")
    ctk2.metadata_columns_to_perfdata(
        metadata_columns=[user_label],
        join_key="ProblemSizeRunParam",
    )
    assert user_label in ctk2.dataframe.columns


def test_perfdata_column_to_statsframe(literal_thickets, mpi_scaling_cali):
Expand Down
3 changes: 1 addition & 2 deletions thicket/tests/test_tree.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,7 @@ def test_indices(rajaperf_unique_tunings):
# No error
tk.tree(metric_column="Avg time/rank", indices=tk.profile[0])

tk.metadata_column_to_perfdata("variant")
tk.metadata_column_to_perfdata("tuning")
tk.metadata_columns_to_perfdata(["variant", "tuning"])

# Error because there are duplicate variants. We need to add the tuning to the index as well.
tk.dataframe = (
Expand Down
51 changes: 34 additions & 17 deletions thicket/thicket.py
Original file line number Diff line number Diff line change
Expand Up @@ -629,35 +629,52 @@ def _rep_agg_func(col):
rsuffix="_right",
)

def metadata_columns_to_perfdata(
    self, metadata_columns, overwrite=False, drop=False, join_key="profile"
):
    """Add columns from the metadata table to the performance data table.

    The selected metadata columns are joined onto the performance data table
    using join_key, which must name an index level or a column in both tables.

    Arguments:
        metadata_columns (list or str or tuple): Column label, or list of column
            labels, from the metadata table. A lone tuple is treated as a single
            (MultiIndex) column label, not as a sequence of labels.
        overwrite (bool): If True, columns that already exist in the performance
            data table are dropped and re-added. If False, a warning is issued
            for the first such column and the tables are left unchanged.
        drop (bool): Whether to drop the columns from the metadata table afterwards
        join_key (str): Name of the index/column to join on if not 'profile'

    Raises:
        KeyError: If join_key is not an index level or column of both the
            performance data table and the metadata table.
    """

    def _has_join_key(table):
        # Look at the labels directly; reset_index() would copy the whole table
        return join_key in table.index.names or join_key in table.columns

    # Raise error if join_key is not present in both tables
    if not (_has_join_key(self.dataframe) and _has_join_key(self.metadata)):
        raise KeyError(
            f"'{join_key}' must be present (index or columns) for both the performance data table and metadata table."
        )

    # A lone label (str, or tuple for MultiIndex columns) becomes a one-item list
    if isinstance(metadata_columns, (str, tuple)):
        metadata_columns = [metadata_columns]

    for mkey in metadata_columns:
        if mkey in self.dataframe.columns:
            # Drop column to overwrite, otherwise warn and return
            if overwrite:
                self.dataframe.drop(mkey, axis=1, inplace=True)
            else:
                # f-string so tuple labels format instead of raising TypeError
                warnings.warn(
                    f"Column {mkey} already exists. Set 'overwrite=True' to force update the column."
                )
                return

    # Add the columns to the performance data table.
    # NOTE(review): DataFrame.join matches `on` against the *index* of the
    # joined table; if join_key is only a column of the metadata table the rows
    # may not align on it -- confirm the intended behavior for that case.
    self.dataframe = self.dataframe.join(self.metadata[metadata_columns], on=join_key)

    # Drop columns from the metadata table
    if drop:
        self.metadata.drop(metadata_columns, axis=1, inplace=True)

def squash(self, update_inc_cols=True, new_statsframe=True):
"""Rewrite the Graph to include only nodes present in the performance
Expand Down

0 comments on commit 91d62d1

Please sign in to comment.