Skip to content

Commit

Permalink
passing tests and better logging for ignoring columns, fixes hackingm…
Browse files Browse the repository at this point in the history
  • Loading branch information
ardunn committed Oct 14, 2019
1 parent 1e77201 commit ffb2cb9
Show file tree
Hide file tree
Showing 3 changed files with 76 additions and 12 deletions.
38 changes: 29 additions & 9 deletions automatminer/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import os
import copy
import pickle
from typing import Dict

import pandas as pd

Expand Down Expand Up @@ -179,7 +180,7 @@ def transform(self, df, **transform_kwargs):
return self.predict(df, **transform_kwargs)

@check_fitted
def predict(self, df, ignore=None):
def predict(self, df, ignore="all"):
"""
Predict a target property of a set of materials.
Expand All @@ -191,25 +192,44 @@ def predict(self, df, ignore=None):
Args:
df (pandas.DataFrame): Pipe will be fit to this dataframe.
ignore ([str]): String names of columns in all dataframes to ignore.
This will not stop samples from being dropped.
ignore ([str], None): Select which columns to ignore.
These columns will not be used for learning/prediction, but will
simply be appended back to the predicted df at the end of
prediction REGARDLESS of the pipeline configuration.
This will not stop samples from being dropped. If
columns not present in the fitting are not ignored, they will
be automatically dropped. Similarly, if the AutoFeaturizer
is not configured to preserve inputs and they are not ignored,
they will be automatically dropped. Ignoring columns supercedes
all inner operations.
Select columns using:
- [str]: String names of columns to ignore.
- None: input columns will be automatically dropped if they are
inputs. User defined features will be preserved if usable
as ML input.
Returns:
(pandas.DataFrame): The dataframe with target property predictions.
"""
if ignore:
ignored_df = df[ignore]
df = df.drop(columns=ignored_df)
self.logger.warning(
f"MatPipe will ignore and append (after prediction) the "
f"following columns: \n{ignore}"
)
ignore_df = df[list(ignore)]
df = df.drop(columns=ignore_df)
else:
ignored_df = pd.DataFrame()
ignore_df = pd.DataFrame()

self.logger.info("Beginning MatPipe prediction using fitted pipeline.")
df = self.autofeaturizer.transform(df, self.target)
df = self.cleaner.transform(df, self.target)
df = self.reducer.transform(df, self.target)
predictions = self.learner.predict(df, self.target)
self.logger.info("MatPipe prediction completed.")
merged_df = predictions.join(ignored_df, how="left")
merged_df = predictions.join(ignore_df, how="left")
return merged_df

@set_fitted
Expand Down Expand Up @@ -301,7 +321,7 @@ def benchmark(self, df, target, kfold, fold_subset=None, cache=False):
return results

@check_fitted
def inspect(self, filename=None):
def inspect(self, filename=None) -> Dict[str, str]:
"""
Get all details of the pipeline in human-readable format.
Expand All @@ -321,7 +341,7 @@ def inspect(self, filename=None):
return attrs

@check_fitted
def summarize(self, filename=None):
def summarize(self, filename=None) -> Dict[str, str]:
"""
Get an executive summary of the most important parts of the pipeline.
Useful for understanding the pipeline at a high level.
Expand Down
47 changes: 44 additions & 3 deletions automatminer/tests/test_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ def make_matpipe_test(config_preset, skip=None):
skippables = [
"transferability",
"user_features",
"ignore",
"benchmarking",
"persistence",
"digests",
Expand Down Expand Up @@ -131,6 +132,39 @@ def test_user_features(self):
test = df_test[self.target + " predicted"]
self.assertTrue(r2_score(true, test) > 0.75)


@unittest.skipIf("ignore" in skip, reason)
def test_ignore(self):
df = self.df
# pd.set_option('display.max_rows', 500)
# pd.set_option('display.max_columns', 500)
# pd.set_option('display.width', 1000)
# print(df)

df_train = df.iloc[:200]
df_test = df.iloc[201:250]
ef = "ExtraFeature"
df_test[ef] = [i + 100 for i in range(df_test.shape[0])]
self.pipe.fit(df_train, self.target)

self.assertTrue(ef in df_test.columns)
self.assertTrue("composition" in df_test.columns)

ignore = [ef, "composition"]
predicted_ignored = self.pipe.predict(df_test, ignore=ignore)
self.assertTrue(ef in predicted_ignored.columns)
self.assertTrue("composition" in predicted_ignored.columns)

predicted_none = self.pipe.predict(df_test, ignore=None)
self.assertFalse(ef in predicted_none.columns)
self.assertFalse("composition" in predicted_none.columns)

some = ["composition"]
predicted_some = self.pipe.predict(df_test, ignore=some)
self.assertFalse(ef in predicted_some.columns)
self.assertTrue("composition" in predicted_some.columns)


@unittest.skipIf("benchmarking" in skip, reason)
def test_benchmarking_no_cache(self):
pipe = self.pipe
Expand Down Expand Up @@ -212,11 +246,18 @@ def tearDown(self) -> None:

@unittest.skipIf(int(os.environ.get("SKIP_INTENSIVE", 0)),
"Test too intensive for CircleCI commit builds.")
class MatPipeDebugTest(make_matpipe_test("debug")):
pass
# class MatPipeDebugTest(make_matpipe_test("debug")):
# pass


class MatPipeDebugSingleTest(make_matpipe_test("debug_single")):
class MatPipeDebugSingleTest(make_matpipe_test("debug_single", skip=[
"transferability",
"user_features",
# "ignore",
"benchmarking",
"persistence",
"digests",
])):
pass


Expand Down
3 changes: 3 additions & 0 deletions docs/source/advanced.rst
Original file line number Diff line number Diff line change
Expand Up @@ -66,11 +66,14 @@ Time Savers and Practical Tools
-------------------------------
ignoring a column

using user defined features


Customizing pipelines
---------------------



Using DFTransformers individually
---------------------------------

Expand Down

0 comments on commit ffb2cb9

Please sign in to comment.