From 2f528dc758cbdfd807ddc51b58da2c5d89c25e18 Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Sun, 5 May 2024 12:23:06 +0100 Subject: [PATCH 1/4] test: refactor into separate HelpMessages class --- pysr/test/test.py | 195 ++++++++++++++++++++++++---------------------- 1 file changed, 100 insertions(+), 95 deletions(-) diff --git a/pysr/test/test.py b/pysr/test/test.py index 404d5a139..d2a169e16 100644 --- a/pysr/test/test.py +++ b/pysr/test/test.py @@ -563,6 +563,105 @@ def test_csv_to_pkl_conversion(self): test_pkl_file = _csv_filename_to_pkl_filename(str(equation_file)) self.assertEqual(test_pkl_file, str(expected_pkl_file)) + def test_pickle_with_temp_equation_file(self): + """If we have a temporary equation file, unpickle the estimator.""" + model = PySRRegressor( + populations=int(1 + DEFAULT_POPULATIONS / 5), + temp_equation_file=True, + procs=0, + multithreading=False, + ) + nout = 3 + X = np.random.randn(100, 2) + y = np.random.randn(100, nout) + model.fit(X, y) + contents = model.equation_file_contents_.copy() + + y_predictions = model.predict(X) + + equation_file_base = model.equation_file_ + for i in range(1, nout + 1): + assert not os.path.exists(str(equation_file_base) + f".out{i}.bkup") + + with tempfile.NamedTemporaryFile() as pickle_file: + pkl.dump(model, pickle_file) + pickle_file.seek(0) + model2 = pkl.load(pickle_file) + + contents2 = model2.equation_file_contents_ + cols_to_check = ["equation", "loss", "complexity"] + for frame1, frame2 in zip(contents, contents2): + pd.testing.assert_frame_equal(frame1[cols_to_check], frame2[cols_to_check]) + + y_predictions2 = model2.predict(X) + np.testing.assert_array_equal(y_predictions, y_predictions2) + + def test_scikit_learn_compatibility(self): + """Test PySRRegressor compatibility with scikit-learn.""" + model = PySRRegressor( + niterations=int(1 + DEFAULT_NITERATIONS / 10), + populations=int(1 + DEFAULT_POPULATIONS / 3), + ncycles_per_iteration=int(2 + DEFAULT_NCYCLES / 10), + verbosity=0, + progress=False, + random_state=0, + deterministic=True, # Deterministic as tests require this. + procs=0, + multithreading=False, + warm_start=False, + temp_equation_file=True, + ) # Return early. + + check_generator = check_estimator(model, generate_only=True) + exception_messages = [] + for _, check in check_generator: + if check.func.__name__ == "check_complex_data": + # We can use complex data, so avoid this check. + continue + try: + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + check(model) + print("Passed", check.func.__name__) + except Exception: + error_message = str(traceback.format_exc()) + exception_messages.append( + f"{check.func.__name__}:\n" + error_message + "\n" + ) + print("Failed", check.func.__name__, "with:") + # Add a leading tab to error message, which + # might be multi-line: + print("\n".join([(" " * 4) + row for row in error_message.split("\n")])) + # If any checks failed don't let the test pass. + self.assertEqual(len(exception_messages), 0) + + def test_param_groupings(self): + """Test that param_groupings are complete""" + param_groupings_file = Path(__file__).parent.parent / "param_groupings.yml" + if not param_groupings_file.exists(): + return + + # Read the file, discarding lines ending in ":", + # and removing leading "\s*-\s*": + params = [] + with open(param_groupings_file, "r") as f: + for line in f.readlines(): + if line.strip().endswith(":"): + continue + if line.strip().startswith("-"): + params.append(line.strip()[1:].strip()) + + regressor_params = [ + p for p in DEFAULT_PARAMS.keys() if p not in ["self", "kwargs"] + ] + + # Check the sets are equal: + self.assertSetEqual(set(params), set(regressor_params)) + + +class TestHelpMessages(unittest.TestCase): + """Test user help messages.""" + def test_deprecation(self): """Ensure that deprecation works as expected. @@ -705,101 +804,6 @@ def test_bad_kwargs(self): model.get_best() print("Failed", opt["kwargs"]) - def test_pickle_with_temp_equation_file(self): - """If we have a temporary equation file, unpickle the estimator.""" - model = PySRRegressor( - populations=int(1 + DEFAULT_POPULATIONS / 5), - temp_equation_file=True, - procs=0, - multithreading=False, - ) - nout = 3 - X = np.random.randn(100, 2) - y = np.random.randn(100, nout) - model.fit(X, y) - contents = model.equation_file_contents_.copy() - - y_predictions = model.predict(X) - - equation_file_base = model.equation_file_ - for i in range(1, nout + 1): - assert not os.path.exists(str(equation_file_base) + f".out{i}.bkup") - - with tempfile.NamedTemporaryFile() as pickle_file: - pkl.dump(model, pickle_file) - pickle_file.seek(0) - model2 = pkl.load(pickle_file) - - contents2 = model2.equation_file_contents_ - cols_to_check = ["equation", "loss", "complexity"] - for frame1, frame2 in zip(contents, contents2): - pd.testing.assert_frame_equal(frame1[cols_to_check], frame2[cols_to_check]) - - y_predictions2 = model2.predict(X) - np.testing.assert_array_equal(y_predictions, y_predictions2) - - def test_scikit_learn_compatibility(self): - """Test PySRRegressor compatibility with scikit-learn.""" - model = PySRRegressor( - niterations=int(1 + DEFAULT_NITERATIONS / 10), - populations=int(1 + DEFAULT_POPULATIONS / 3), - ncycles_per_iteration=int(2 + DEFAULT_NCYCLES / 10), - verbosity=0, - progress=False, - random_state=0, - deterministic=True, # Deterministic as tests require this. - procs=0, - multithreading=False, - warm_start=False, - temp_equation_file=True, - ) # Return early. - - check_generator = check_estimator(model, generate_only=True) - exception_messages = [] - for _, check in check_generator: - if check.func.__name__ == "check_complex_data": - # We can use complex data, so avoid this check. - continue - try: - with warnings.catch_warnings(): - warnings.simplefilter("ignore") - check(model) - print("Passed", check.func.__name__) - except Exception: - error_message = str(traceback.format_exc()) - exception_messages.append( - f"{check.func.__name__}:\n" + error_message + "\n" - ) - print("Failed", check.func.__name__, "with:") - # Add a leading tab to error message, which - # might be multi-line: - print("\n".join([(" " * 4) + row for row in error_message.split("\n")])) - # If any checks failed don't let the test pass. - self.assertEqual(len(exception_messages), 0) - - def test_param_groupings(self): - """Test that param_groupings are complete""" - param_groupings_file = Path(__file__).parent.parent / "param_groupings.yml" - if not param_groupings_file.exists(): - return - - # Read the file, discarding lines ending in ":", - # and removing leading "\s*-\s*": - params = [] - with open(param_groupings_file, "r") as f: - for line in f.readlines(): - if line.strip().endswith(":"): - continue - if line.strip().startswith("-"): - params.append(line.strip()[1:].strip()) - - regressor_params = [ - p for p in DEFAULT_PARAMS.keys() if p not in ["self", "kwargs"] - ] - - # Check the sets are equal: - self.assertSetEqual(set(params), set(regressor_params)) - TRUE_PREAMBLE = "\n".join( [ @@ -1148,6 +1152,7 @@ def runtests(just_tests=False): TestBest, TestFeatureSelection, TestMiscellaneous, + TestHelpMessages, TestLaTeXTable, TestDimensionalConstraints, ] From ea7ced402bd5db4e675175e4527cca804bb9f170 Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Sun, 5 May 2024 12:23:42 +0100 Subject: [PATCH 2/4] feat: automatically suggest related keywords --- pysr/sr.py | 19 ++++++++++++++++--- pysr/test/test.py | 20 ++++++++++++++++++++ 2 files changed, 36 insertions(+), 3 deletions(-) diff --git a/pysr/sr.py b/pysr/sr.py index 4b8d5b30e..307098806 100644 --- a/pysr/sr.py +++ b/pysr/sr.py @@ -1,6 +1,8 @@ """Define the PySRRegressor scikit-learn interface.""" import copy +import difflib +import inspect import os import pickle as pkl import re @@ -907,9 +909,11 @@ def __init__( FutureWarning, ) else: - raise TypeError( - f"{k} is not a valid keyword argument for PySRRegressor." - ) + suggested_keywords = self._suggest_keywords(k) + err_msg = f"{k} is not a valid keyword argument for PySRRegressor." + if len(suggested_keywords) > 0: + err_msg += f" Did you mean {' or '.join(suggested_keywords)}?" + raise TypeError(err_msg) @classmethod def from_file( @@ -1991,6 +1995,15 @@ def fit( return self + def _suggest_keywords(self, k: str) -> List[str]: + valid_keywords = [ + param + for param in inspect.signature(self.__init__).parameters + if param not in ["self", "kwargs"] + ] + suggestions = difflib.get_close_matches(k, valid_keywords, n=3) + return suggestions + def refresh(self, checkpoint_file=None) -> None: """ Update self.equations_ with any new options passed. diff --git a/pysr/test/test.py b/pysr/test/test.py index d2a169e16..2d27f78b6 100644 --- a/pysr/test/test.py +++ b/pysr/test/test.py @@ -804,6 +804,26 @@ def test_bad_kwargs(self): model.get_best() print("Failed", opt["kwargs"]) + def test_suggest_keywords(self): + model = PySRRegressor() + # Easy + self.assertEqual(model._suggest_keywords("loss_function"), ["loss_function"]) + + # More complex, and with error + with self.assertRaises(TypeError) as cm: + model = PySRRegressor(ncyclesperiterationn=5) + + self.assertIn("ncyclesperiterationn is not a valid keyword", str(cm.exception)) + self.assertIn("Did you mean", str(cm.exception)) + self.assertIn("ncycles_per_iteration or", str(cm.exception)) + self.assertIn("niteration", str(cm.exception)) + + # Farther matches (this might need to be changed) + with self.assertRaises(TypeError) as cm: + model = PySRRegressor(operators=["+", "-"]) + + self.assertIn("unary_operators or binary_operators", str(cm.exception)) + TRUE_PREAMBLE = "\n".join( [ From 62481837578a2b118c6dd8db9031d6c36eb521bd Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Sun, 5 May 2024 12:29:50 +0100 Subject: [PATCH 3/4] refactor: clean up use of `__init__` in keyword suggestion --- pysr/sr.py | 21 +++++++++++---------- pysr/test/test.py | 12 +++++++++--- 2 files changed, 20 insertions(+), 13 deletions(-) diff --git a/pysr/sr.py b/pysr/sr.py index 307098806..0cfce42b5 100644 --- a/pysr/sr.py +++ b/pysr/sr.py @@ -909,7 +909,7 @@ def __init__( FutureWarning, ) else: - suggested_keywords = self._suggest_keywords(k) + suggested_keywords = _suggest_keywords(PySRRegressor, k) err_msg = f"{k} is not a valid keyword argument for PySRRegressor." if len(suggested_keywords) > 0: err_msg += f" Did you mean {' or '.join(suggested_keywords)}?" @@ -1995,15 +1995,6 @@ def fit( return self - def _suggest_keywords(self, k: str) -> List[str]: - valid_keywords = [ - param - for param in inspect.signature(self.__init__).parameters - if param not in ["self", "kwargs"] - ] - suggestions = difflib.get_close_matches(k, valid_keywords, n=3) - return suggestions - def refresh(self, checkpoint_file=None) -> None: """ Update self.equations_ with any new options passed. @@ -2455,6 +2446,16 @@ def latex_table( return with_preamble(table_string) +def _suggest_keywords(cls, k: str) -> List[str]: + valid_keywords = [ + param + for param in inspect.signature(cls.__init__).parameters + if param not in ["self", "kwargs"] + ] + suggestions = difflib.get_close_matches(k, valid_keywords, n=3) + return suggestions + + def idx_model_selection(equations: pd.DataFrame, model_selection: str): """Select an expression and return its index.""" if model_selection == "accuracy": diff --git a/pysr/test/test.py b/pysr/test/test.py index 2d27f78b6..4a5c23a61 100644 --- a/pysr/test/test.py +++ b/pysr/test/test.py @@ -15,7 +15,12 @@ from ..export_latex import sympy2latex from ..feature_selection import _handle_feature_selection, run_feature_selection from ..julia_helpers import init_julia -from ..sr import _check_assertions, _process_constraints, idx_model_selection +from ..sr import ( + _check_assertions, + _process_constraints, + _suggest_keywords, + idx_model_selection, +) from ..utils import _csv_filename_to_pkl_filename from .params import ( DEFAULT_NCYCLES, @@ -805,9 +810,10 @@ def test_bad_kwargs(self): print("Failed", opt["kwargs"]) def test_suggest_keywords(self): - model = PySRRegressor() # Easy - self.assertEqual(model._suggest_keywords("loss_function"), ["loss_function"]) + self.assertEqual( + _suggest_keywords(PySRRegressor, "loss_function"), ["loss_function"] + ) # More complex, and with error with self.assertRaises(TypeError) as cm: From 6088859cdd32666862f95bdde2091d5ddf749691 Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Sun, 5 May 2024 12:38:57 +0100 Subject: [PATCH 4/4] docs: better formatting for errors --- pysr/sr.py | 12 +++++++----- pysr/test/test.py | 10 ++++++---- 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/pysr/sr.py b/pysr/sr.py index 0cfce42b5..36c33cf38 100644 --- a/pysr/sr.py +++ b/pysr/sr.py @@ -884,15 +884,15 @@ def __init__( updated_kwarg_name = DEPRECATED_KWARGS[k] setattr(self, updated_kwarg_name, v) warnings.warn( - f"{k} has been renamed to {updated_kwarg_name} in PySRRegressor. " + f"`{k}` has been renamed to `{updated_kwarg_name}` in PySRRegressor. " "Please use that instead.", FutureWarning, ) # Handle kwargs that have been moved to the fit method elif k in ["weights", "variable_names", "Xresampled"]: warnings.warn( - f"{k} is a data dependant parameter so should be passed when fit is called. " - f"Ignoring parameter; please pass {k} during the call to fit instead.", + f"`{k}` is a data-dependent parameter and should be passed when fit is called. " + f"Ignoring parameter; please pass `{k}` during the call to fit instead.", FutureWarning, ) elif k == "julia_project": @@ -910,9 +910,11 @@ def __init__( ) else: suggested_keywords = _suggest_keywords(PySRRegressor, k) - err_msg = f"{k} is not a valid keyword argument for PySRRegressor." + err_msg = ( + f"`{k}` is not a valid keyword argument for PySRRegressor." + ) if len(suggested_keywords) > 0: - err_msg += f" Did you mean {' or '.join(suggested_keywords)}?" + err_msg += f" Did you mean {', '.join(map(lambda s: f'`{s}`', suggested_keywords))}?" raise TypeError(err_msg) @classmethod diff --git a/pysr/test/test.py b/pysr/test/test.py index 4a5c23a61..14393fce0 100644 --- a/pysr/test/test.py +++ b/pysr/test/test.py @@ -819,16 +819,18 @@ def test_suggest_keywords(self): with self.assertRaises(TypeError) as cm: model = PySRRegressor(ncyclesperiterationn=5) - self.assertIn("ncyclesperiterationn is not a valid keyword", str(cm.exception)) + self.assertIn( + "`ncyclesperiterationn` is not a valid keyword", str(cm.exception) + ) self.assertIn("Did you mean", str(cm.exception)) - self.assertIn("ncycles_per_iteration or", str(cm.exception)) - self.assertIn("niteration", str(cm.exception)) + self.assertIn("`ncycles_per_iteration`, ", str(cm.exception)) + self.assertIn("`niterations`", str(cm.exception)) # Farther matches (this might need to be changed) with self.assertRaises(TypeError) as cm: model = PySRRegressor(operators=["+", "-"]) - self.assertIn("unary_operators or binary_operators", str(cm.exception)) + self.assertIn("`unary_operators`, `binary_operators`", str(cm.exception)) TRUE_PREAMBLE = "\n".join(