diff --git a/.circleci/config.yml b/.circleci/config.yml
index dd70bc3c..fb8f1b25 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -20,4 +20,4 @@ jobs:
             pip install codecov
             coverage run setup.py test
             codecov
-          no_output_timeout: 30m
\ No newline at end of file
+          no_output_timeout: 120m
\ No newline at end of file
diff --git a/.coveragerc b/.coveragerc
index d04264e0..18a3fac1 100644
--- a/.coveragerc
+++ b/.coveragerc
@@ -2,4 +2,8 @@
 source=mslearn
 
 [report]
-ignore_errors=True
\ No newline at end of file
+ignore_errors=True
+omit =
+    dev_scripts/*
+    examples/*
+    .circleci/*
diff --git a/dev_scripts/evaluation/benchmark.py b/dev_scripts/evaluation/benchmark.py
new file mode 100644
index 00000000..854ce39f
--- /dev/null
+++ b/dev_scripts/evaluation/benchmark.py
@@ -0,0 +1,172 @@
+import os
+import time
+import hashlib
+import copy
+
+import git
+from pymongo import MongoClient
+from fireworks import FireTaskBase, Firework, Workflow, FWAction, LaunchPad
+from matminer.datasets.dataset_retrieval import load_dataset
+from sklearn.metrics import f1_score, r2_score
+
+from mslearn.featurization import AutoFeaturizer
+from mslearn.preprocessing import DataCleaner, FeatureReducer
+from mslearn.automl.adaptors import TPOTAdaptor
+from mslearn.pipeline import MatPipe
+
+DB_USER = "miner"
+DB_PASSWORD = "Materials2019"
+# DB = MongoClient("mongodb://%s:%s@ds111244.mlab.com:11244/automatminer" % (DB_USER, DB_PASSWORD)).automatminer
+DB = MongoClient('localhost', 27017).automatminer
+
+DATASET_SET = ["elastic_tensor_2015"]
+TARGETS = {"elastic_tensor_2015": "K_VRH"}
+SCORING = {"elastic_tensor_2015": "r2"}
+REWRITE_COLS = {"elastic_tensor_2015": {"formula": "composition"}}
+RELEVANT_COLS = {"elastic_tensor_2015": ["K_VRH", "structure", "composition"]}
+
+
+# todo: eventually this should use a test_idx to ensure that for every dataset,
+# for every repetition, the same test set is used!
+
+class RunPipe(FireTaskBase):
+    _fw_name = 'RunPipe'
+
+    def run_task(self, fw_spec):
+        if fw_spec["learner_name"] == "TPOTAdaptor":
+            learner = TPOTAdaptor
+        else:
+            raise ValueError("{} is an unknown learner name!".format(
+ "".format(self["learner_name"])) + + # Set up the pipeline and data + pipe_config_dict = fw_spec["pipe_config"] + pipe_config = {"learner": learner(**pipe_config_dict["learner_kwargs"]), + "reducer": FeatureReducer( + **pipe_config_dict["reducer_kwargs"]), + "cleaner": DataCleaner( + **pipe_config_dict["cleaner_kwargs"]), + "autofeaturizer_kwargs": + AutoFeaturizer( + **pipe_config_dict["autofeaturizer_kwargs"])} + pipe = MatPipe(**pipe_config) + dataset = fw_spec["dataset"] + df = load_dataset(dataset) + df = df.rename(columns=REWRITE_COLS[dataset])[RELEVANT_COLS[dataset]] + target = TARGETS[dataset] + + # Run the benchmark + t1 = time.time() + predicted_test_df = pipe.benchmark(df, target, test_spec=0.2) + elapsed_time = time.time() - t1 + + # Save everything + savedir = fw_spec["save_dir"] + pipe.save(os.path.join(savedir, "pipe.p")) + pipe.digest(os.path.join(savedir, "digest.txt")) + predicted_test_df.to_csv(os.path.join(savedir, "test_df.csv")) + pipe.post_fit_df.to_csv(os.path.join(savedir, "fitted_df.csv")) + + # Evaluate model + true = predicted_test_df[target] + test = predicted_test_df[target + " predicted"] + if SCORING[dataset] == "r2": + scorer = r2_score + elif SCORING[dataset] == "f1": + scorer = f1_score + else: + raise KeyError("Scoring {} not among valid options: [r2, f1].") + score = scorer(true, test) + + # Extract important details for storage + best_model = pipe.learner.best_models[0] + features = pipe.learner.features + n_features = len(features) + n_test_samples_original = len(df[fw_spec["test_idx"]]) + + pass_to_storage = {"score": score, "target": target, + "best_model": best_model, + "elapsed_time": elapsed_time, + "features": features, + "n_features": n_features, + "n_test_samples_original": n_test_samples_original, + "n_train_samples_original": len( + df) - n_test_samples_original, + "n_train_samples": len(pipe.post_fit_df), + "n_test_samples": len(test) + } + return FWAction(update_spec=pass_to_storage) + + +class StorePipeResults(FireTaskBase): + _fw_name = "StorePipeResults" + + def run_task(self, fw_spec): + DB.pipes.insert_one(fw_spec) + + +class ConsolidateRuns(FireTaskBase): + _fw_name = "ConsolidateRuns" + + + def run_task(self, fw_spec): + builds = DB.builds + pipes = DB.pipes + build_hash = fw_spec["build_hash"] + + tags_all = [] + performance_dict = {k: [] for k in DATASET_SET} + features_dict = {k : [] for k in DATASET_SET} + time_dict = {k: [] for k in DATASET_SET} + + for doc in pipes.find({'build_hash': build_hash}): + tags_all.extend(doc["tags"]) + performance_dict[doc["dataset"]].append(doc["score"]) + pass + #todo: finish this + + + +def submit_build(launchpad, name, pipe_config, tags=None): + top_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "../..") + repo = git.Repo(top_dir) + last_commit = str(repo.head.commit) + # Build hash is the combination of pipe configuration and current commit + build_config_for_hash = copy.deepcopy(pipe_config) + build_config_for_hash["last_commit"] = last_commit + build_config_for_hash = str(build_config_for_hash).encode("UTF-8") + build_hash = hashlib.sha1(build_config_for_hash).hexdigest()[:10] + + fws = [] + for dataset in DATASET_SET: + spec = {"dataset": dataset, + "pipe_config": pipe_config, + "commit": last_commit, + "name": name, + "build_hash": build_hash, + "tags": tags if tags else []} + for trial in range(5): + spec["trial"] = trial + fws.append(Firework([RunPipe(), StorePipeResults()], + spec=spec, + name="{}: {} - trial {}".format(name, dataset, + trial))) + + # todo: link fws 
diff --git a/dev_scripts/evaluation/benchmarker.py b/dev_scripts/evaluation/benchmarker.py
deleted file mode 100644
index 1c8445fe..00000000
--- a/dev_scripts/evaluation/benchmarker.py
+++ /dev/null
@@ -1,17 +0,0 @@
-"""
-This file will eventually hold a function that tests a mslearn
-pipeline on a set of datasets for predictive power.
-"""
-
-from matminer.datasets.dataset_retrieval import load_dataset, get_available_datasets
-# from matminer.datasets.convenience_loaders import
-
-if __name__ == "__main__":
-    df_piezo = load_dataset("piezoelectric_tensor")
-    df_exgap = load_dataset("expt_gap")
-    df_elastic = load_dataset("elastic_tensor_2015")
-    df_glass = load_dataset("glass_binary")
-
-
-
-
diff --git a/dev_scripts/evaluation/datasets.py b/dev_scripts/evaluation/datasets.py
new file mode 100644
index 00000000..5a287835
--- /dev/null
+++ b/dev_scripts/evaluation/datasets.py
@@ -0,0 +1,31 @@
+"""
+This file will eventually hold a function that tests an mslearn
+pipeline on a set of datasets for predictive power.
+
+Target dataset mix:
+3 computational / 3 experimental
+2 classification / 4 regression
+3 small datasets / 3 large datasets
+"""
+
+from matminer.datasets.dataset_retrieval import (
+    load_dataset, get_available_datasets, get_dataset_column_description)
+# from matminer.datasets.convenience_loaders import
+
+if __name__ == "__main__":
+    # print(get_available_datasets())
+    # df_piezo = load_dataset("piezoelectric_tensor")  # 941, regression, computational, hard (predict max of piezoelectric tensor)
+    # df_exgap = load_dataset("expt_gap")  # 6,354, classification, experimental, moderate (predict metal vs nonmetal)
+    df_elastic = load_dataset("elastic_tensor_2015")  # 1,181, regression, computational, easy (predict bulk modulus)
+    # df_glass = load_dataset("glass_binary")  # 5,959, classification, experimental, moderate (predict if metallic glass forms)
+    df_steel = load_dataset("steel_strength")  # 371, regression, experimental, moderate (predict tensile strength of steels)
+    # df_boltz = load_dataset("boltztrap_mp")  # 8,924, regression, computational, hard (predict effective masses)
+    # df_mp = load_dataset("mp_all")  # 70,000, regression, computational, hard
+    # df_exform = load_dataset("expt_formation_enthalpy")  # 1,276, regression, experimental, moderate (predict formation enthalpy)
+
+    print(df_elastic)
+    # df_steel is loaded above so its column descriptions can be printed
+    for column in df_steel.columns:
+        print(column, get_dataset_column_description('steel_strength', column))
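On the test_idx todo at the top of benchmark.py: one possible way (an
assumption, not part of this PR) to derive a deterministic per-dataset test
index, so that every trial of every build scores against the same held-out
rows. The helper name fixed_test_idx is hypothetical:

    import hashlib

    import numpy as np

    def fixed_test_idx(n_samples, dataset_name, test_frac=0.2):
        # Seed the RNG from the dataset name so the split is identical
        # across runs, machines, and builds.
        seed = int(hashlib.sha1(dataset_name.encode("UTF-8")).hexdigest(),
                   16) % (2 ** 32)
        rng = np.random.RandomState(seed)
        idx = rng.permutation(n_samples)
        return np.sort(idx[:int(test_frac * n_samples)])

    # e.g., fixed_test_idx(len(df), "elastic_tensor_2015") could be stored
    # in the Firework spec that RunPipe reads as fw_spec["test_idx"].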
diff --git a/mslearn/pipeline.py b/mslearn/pipeline.py
index b01bdb17..6d8d44a1 100644
--- a/mslearn/pipeline.py
+++ b/mslearn/pipeline.py
@@ -51,10 +51,17 @@ class MatPipe(DataframeTransformer, LoggableMixin):
     to predict the properties of another. Furthermore, the entire pipeline and
     all constituent objects can be summarized in text with "digest".
 
+    ----------------------------------------------------------------------------
+    Note: This pipeline should function the same regardless of which
+    "component" classes it is made out of. E.g., the steps for each method
+    should remain the same whether using the TPOTAdaptor class as the learner
+    or an AutoKerasAdaptor class as the learner.
+    ----------------------------------------------------------------------------
+
     Examples:
         # A benchmarking experiment, where all property values are known
         pipe = MatPipe()
-        validation_predictions = pipe.benchmark(df, "target_property")
+        test_predictions = pipe.benchmark(df, "target_property")
 
         # Creating a pipe with data containing known properties, then predicting
         # on new materials
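Given the interchangeable-component note added to the MatPipe docstring above,
a minimal usage sketch. The TPOTAdaptor kwarg comes from the commented build
config in benchmark.py; df is a placeholder dataframe holding structures or
compositions plus the target column, and an AutoKeras-style adaptor would drop
in the same way per the docstring's example:

    from mslearn.automl.adaptors import TPOTAdaptor
    from mslearn.pipeline import MatPipe

    # Swap the learner without touching the rest of the pipeline.
    pipe = MatPipe(learner=TPOTAdaptor(max_time_mins=60))
    test_predictions = pipe.benchmark(df, "K_VRH", test_spec=0.2)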