This repository has been archived by the owner on Feb 14, 2024. It is now read-only.

Fix issue 3 #4

Open
wants to merge 11 commits into base: master
108 changes: 75 additions & 33 deletions pyPhenology/models/base.py
@@ -29,11 +29,11 @@ def fit(self, observations, predictors, loss_function='rmse',
temperature, precipitation, and day length

loss_function : str, or function
A string for built-in loss functions (currently only 'rmse'),
or a customized function which accepts 2 arguments, obs and pred,
both numpy arrays of the same shape

method : str
Optimization method to use. Either 'DE' or 'BF' for differential
evolution or brute force methods.
@@ -49,16 +49,31 @@ def fit(self, observations, predictors, loss_function='rmse',
display various internals

"""

validation.validate_predictors(predictors, self._required_data['predictor_columns'])
validation.validate_observations(observations)
self._set_loss_function(loss_function)
if len(self._parameters_to_estimate) == 0:
raise RuntimeError('No parameters to estimate')

self._organize_predictors(predictors=predictors,
observations=observations,
for_prediction=False)
# check if the data is sklearn compatible
valid_sklearn, X, y = validation.validate_sklearn_Xy(predictors, observations)

# sklearn compatible option is not implemented for M1 yet.
if "M1" in type(self).__name__:
valid_sklearn = False

if valid_sklearn:
self._organize_sklearn_predictors(y=y,
X=X,
for_prediction=False)
else:
warn('Data or model is not sklearn compatible, falling back to the '
'pyPhenology data format. See '
'https://pyphenology.readthedocs.io/en/master/data_structures.html#data-structure')
# check if pyphenology compatible
validation.validate_predictors(predictors, self._required_data['predictor_columns'])
validation.validate_observations(observations)

self._organize_predictors(predictors=predictors,
observations=observations,
for_prediction=False)

if debug:
verbose = True
@@ -105,7 +120,7 @@ def predict(self, to_predict=None, predictors=None, **kwargs):
Parameters:
to_predict : dataframe, optional
pandas dataframe of site/year combinations to predict from
the given predictor data. Just like the observations
dataframe used in fit() but (optionally) without the doy column

predictors : dataframe, optional
@@ -125,28 +140,38 @@ def predict(self, to_predict=None, predictors=None, **kwargs):
{'to_predict':pd.DataFrame,'predictors':pd.DataFrame}
{'to_predict':None,'predictors':None}
"""
if to_predict is None and predictors is None:
# Making predictions on data used for fitting
if self.obs_fitting is not None and self.fitting_predictors is not None:
predictors = self.fitting_predictors
else:
raise TypeError('No to_predict + temperature passed, and '
'no fitting done. Nothing to predict')

elif to_predict is None and predictors is not None:
# check if the data is sklearn compatible
valid_sklearn, X = validation.validate_sklearn_X(predictors)
if valid_sklearn:
# not implemented for M1
predictors = self._organize_sklearn_predictors(y=None,
X=X,
for_prediction=True)

elif isinstance(predictors, dict):
# predictors is a dict containing data that can be
# used directly in _apply_model()
self._validate_formatted_predictors(predictors)
else:
raise TypeError('Invalid arguments.')

if to_predict is None and isinstance(predictors, dict):
# predictors is a dict containing data that can be
# used directly in _apply_model()
self._validate_formatted_predictors(predictors)

elif isinstance(to_predict, pd.DataFrame) and isinstance(predictors, pd.DataFrame):
elif to_predict is not None and predictors is not None:
# New data to predict
validation.validate_predictors(predictors, self._required_data['predictor_columns'])
validation.validate_observations(to_predict, for_prediction=True)

predictors = self._organize_predictors(observations=to_predict,
predictors=predictors,
for_prediction=True)

elif to_predict is None and predictors is None:
# Making predictions on data used for fitting
if self.obs_fitting is not None and self.fitting_predictors is not None:
predictors = self.fitting_predictors
else:
raise TypeError('No to_predict + temperature passed, and '
'no fitting done. Nothing to predict')
else:
raise TypeError('Invalid arguments. to_predict and predictors ' +
'must both be pandas dataframes of new data to predict,' +
@@ -161,7 +186,7 @@ def _set_loss_function(self, loss_function):
"""The loss function (ie. RMSE)

Either a string for a built-in function, or a customized
function which accepts 2 arguments, obs and pred, both
numpy arrays of the same shape
"""
if isinstance(loss_function, str):
@@ -176,7 +201,7 @@ def _organize_predictors(self, observations, predictors, for_prediction):
"""Convert data to internal structure used by models

This function inside _base() is used for all the models which
have temperature as the only predictor variable (which is most of them).
Models which have other predictors have their own _organize_predictors() method.
"""
if for_prediction:
@@ -193,15 +218,32 @@ def _organize_predictors(self, observations, predictors, for_prediction):
'doy_series': doy_series}
self.obs_fitting = cleaned_observations

def _organize_sklearn_predictors(self, y, X, for_prediction):
"""Convert incoming data to expected structure.
It is documented in
https://pyphenology.readthedocs.io/en/master/data_structures.html In
pyphenology, the processed temperature has a shape of (features,
samples) whereas in sklearn has (samples, features), this is the reason
for X.T below. `doy_series` is the julian date of the temperature, here
it is an array of numbers each corresponds to a column of X.
"""
if for_prediction:
return {'temperature': X.T,
'doy_series': np.arange(X.shape[1])}
else:
self.fitting_predictors = {'temperature': X.T,
'doy_series': np.arange(X.shape[1])}
self.obs_fitting = y
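A minimal sketch of the shape convention described in this docstring (illustrative only; the array values are made up):

# Illustrative sketch, not part of the diff
import numpy as np

# sklearn-style input: 3 samples (site/years), each with 5 daily temperatures
X = np.arange(15, dtype=float).reshape(3, 5)        # shape (samples, features)

# internal pyPhenology structure: temperature is (features, samples)
predictors = {'temperature': X.T,                   # shape (5, 3)
              'doy_series': np.arange(X.shape[1])}  # one entry per column of X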

def _validate_formatted_predictors(self, predictors):
"""Make sure everything is valid.

This is used when pre-formatted data (as opposed to dataframes)
is passed to predict() or fit().

This function inside _base() is used for all the models which
have temperature as the only predictor variable (which is most of them).
Models which have other predictors have their own
_validate_formatted_predictors() method.
"""
# Don't allow any nan values in 2d temperature array
@@ -324,7 +366,7 @@ def _translate_scipy_parameters(self, parameters_array):
"""Map parameters from a 1D array to a dictionary for
use in phenology model functions. Ordering matters
in unpacking the scipy_array since it isn't labeled. Thus
it relies on self._parameters_to_estimate being an
OrderedDict
"""
# If only a single value is being fit, some scipy.
@@ -388,25 +430,25 @@ def score(self, metric='rmse', doy_observed=None,
Metrics available are root mean square error (``rmse``) and AIC (``aic``).
For AIC the number of parameters in the model is set to the number of
parameters actually estimated in ``fit()``, not the total number of
model parameters.

Parameters:
metric : str, optional
The metric used, either 'rmse' for the root mean square error,
or 'aic' for the Akaike information criterion.

doy_observed : numpy array, optional
The true doy values to evaluate with. This must be a numpy
array the same length as the number of rows in to_predict

to_predict : dataframe, optional
pandas dataframe of site/year combinations to predict from
the given predictor data. Just like the observations
dataframe used in fit() but (optionally) without the doy column

predictors : dataframe, optional
pandas dataframe in the format specific to this package

Returns:
The score as a float
"""
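Taken together, the changes in base.py are intended to let fit() and predict() accept either the original pyPhenology dataframes or sklearn-style arrays. Below is a minimal illustrative sketch, not part of the diff: the data are synthetic, and the custom loss is only an example of the "function accepting obs and pred" contract described in the docstring.

# Illustrative sketch, not part of the diff -- synthetic data
import numpy as np
from pyPhenology import models

# sklearn-style inputs: X is (samples, days) of daily mean temperature,
# y is the observed day of year for each sample
rng = np.random.default_rng(42)
X = rng.uniform(-5, 25, size=(50, 180))
y = rng.integers(90, 150, size=50)

# Any callable taking (obs, pred) numpy arrays can be used as the loss
def mae(obs, pred):
    return np.mean(np.abs(obs - pred))

model = models.ThermalTime()
model.fit(y, X, loss_function=mae)         # observations=y, predictors=X
predictions = model.predict(predictors=X)  # predicted day of year per sample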
42 changes: 11 additions & 31 deletions pyPhenology/models/sklearn_thermaltime.py
@@ -6,6 +6,13 @@


class SklearnThermalTime(BaseEstimator, RegressorMixin):
"""
The sklearn data structure assumes 'predictors' with shape (a, b), where a is
equal to the sample size in 'observations', and b is equal to the number of
days in the yearly temperature time series (e.g. Jan 1 - July 30), with
no gaps.
"""

def __init__(
self,
t1=None,
@@ -38,34 +45,11 @@ def fit(
self.X_ = X
self.y_ = y

# Define temperature, doy_series
# Fit the model
self.model_ = ThermalTime()

# Convert incoming data to expected structure as documented here
# https://pyphenology.readthedocs.io/en/master/data_structures.html

# year: The year of the temperature timeseries, here we assume that they
# are in the same year of 2000
# site_id: A site identifier for each location, here we assume that the
# number of sites is equal to number of rows in X
# doy: The julian date of the temperature, here it is a list of numbers
# each corresponds to a column of X
# temperature: The temperature i.e. X the predictor

predictors = pd.DataFrame(
{
"year": X.size * [2000],
"site_id": np.repeat(np.arange(len(X)), X.shape[1]),
"doy": list(range(X.shape[1])) * X.shape[0],
"temperature": X.flatten(),
}
)
observations = pd.DataFrame(
{"year": len(y) * [2000], "site_id": range(len(y)), "doy": y}
)
self.model_.fit(
observations,
predictors,
y,
X,
optimizer_params=self.optimizer_params,
loss_function=self.loss_function,
method=self.method,
@@ -81,9 +65,5 @@ def predict(self, X):

# Input validation
X = check_array(X)
predictors = {
"temperature": X.T,
"doy_series": list(range(X.shape[1]))
}

return self.model_.predict(predictors=predictors)
return self.model_.predict(predictors=X)
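A minimal usage sketch of the wrapper after this change (synthetic data; the optimizer preset name passed here is an assumption, not a value taken from this PR):

# Illustrative sketch, not part of the diff -- synthetic data
import numpy as np
from pyPhenology.models.sklearn_thermaltime import SklearnThermalTime

# 40 site/years, each with 180 days of mean temperature and no gaps
rng = np.random.default_rng(0)
X = rng.uniform(-5, 25, size=(40, 180))   # shape (samples, days)
y = rng.integers(80, 150, size=40)        # observed day of year per sample

est = SklearnThermalTime(optimizer_params='testing')  # preset name assumed
est.fit(X, y)
doy_pred = est.predict(X)                 # predicted day of year per sample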
20 changes: 17 additions & 3 deletions pyPhenology/models/stat_models.py
@@ -7,7 +7,7 @@ class Linear(BaseModel):
"""Linear Regression Model

A 2 parameter regression model with :math:`DOY` as
the response variable.

.. math::
DOY = \\beta_{1} + \\beta_{2}T_{mean}
@@ -55,9 +55,9 @@ def _apply_model(self, temperature, doy_series, intercept, slope,
class Naive(BaseModel):
"""A naive model of the spatially interpolated mean

This is the mean doy for an event adjusted for latitude, essentially
a 2 parameter regression model with :math:`DOY` as
the response variable.

.. math::
DOY = \\beta_{1} + \\beta_{2}Latitude
@@ -103,6 +103,20 @@ def _organize_predictors(self, predictors, observations, for_prediction):
self.fitting_predictors = {'latitude': obs_with_latitude.latitude.values}
self.obs_fitting = obs_with_latitude.doy.values

def _organize_sklearn_predictors(self, y, X, for_prediction):
"""Convert incoming data to expected structure.
It is documented in
https://pyphenology.readthedocs.io/en/master/data_structures.html In
pyphenology, the processed temperature has a shape of (features,
samples) whereas in sklearn has (samples, features), this is the reason
for X.T below.
"""
if for_prediction:
return {'latitude': X.T}
else:
self.fitting_predictors = {'latitude': X.T}
self.obs_fitting = y
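For the Naive model the single sklearn feature would be latitude, so the mapping in the method above reduces to the following (a hypothetical illustration, not code from this PR):

# Illustrative sketch, not part of the diff
import numpy as np

X = np.array([[45.5], [52.1], [60.3]])  # shape (samples, 1): one latitude per site/year
y = np.array([120, 131, 140])           # observed day of year per sample

fitting_predictors = {'latitude': X.T}  # shape (1, samples), as stored above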

def _validate_formatted_predictors(self, predictors):
pass

27 changes: 27 additions & 0 deletions pyPhenology/models/validation.py
@@ -1,4 +1,31 @@
import pandas as pd
from sklearn.utils import check_X_y, check_array


def validate_sklearn_Xy(predictors, observations):
"""Input validation for standard estimators.

check_X_y will raise a ValueError if the inputs are not compatible."""
valid_sklearn_Xy = True
try:
predictors, observations = check_X_y(predictors, observations)
except ValueError:
valid_sklearn_Xy = False

return valid_sklearn_Xy, predictors, observations


def validate_sklearn_X(predictors):
"""Input validation for standard estimators.

check_array will raise a ValueError if the inputs are not compatible."""
valid_sklearn_X = True
try:
predictors = check_array(predictors)
except ValueError:
valid_sklearn_X = False

return valid_sklearn_X, predictors
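A minimal sketch of how these helpers behave (illustrative arrays; the dataframe columns follow the documented pyPhenology format):

# Illustrative sketch, not part of the diff
import numpy as np
import pandas as pd

# sklearn-compatible input: numeric 2-D X with a matching 1-D y
X = np.ones((3, 5))
y = np.array([100, 110, 120])
ok, X_checked, y_checked = validate_sklearn_Xy(X, y)   # ok is True

# A pyPhenology-style observations dataframe is not a valid 1-D y,
# so check_X_y raises ValueError, the helper reports False,
# and fit() falls back to the dataframe path
predictors_df = pd.DataFrame({'site_id': [1], 'year': [2000],
                              'doy': [1], 'temperature': [5.0]})
observations_df = pd.DataFrame({'site_id': [1], 'year': [2000], 'doy': [120]})
ok, _, _ = validate_sklearn_Xy(predictors_df, observations_df)  # ok is False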


def validate_predictors(predictor_df, valid_columns):
5 changes: 3 additions & 2 deletions test/test_sklearn_thermaltime.py
@@ -74,10 +74,11 @@ def test_pycaret_compatible(self, tmp_path):
df["doy"] = doy_array
df.dropna(inplace=True)
df_test = df.iloc[:10] # same as above
df_train = df.iloc[11:]

# Create pycaret instances
exp = RegressionExperiment()
exp.setup(df, target="doy", session_id=123, test_data=df_test, index=False, preprocess=False)
exp.setup(df_train, target="doy", session_id=123, test_data=df_test, index=True, preprocess=False)
model = exp.create_model(SklearnThermalTime(optimizer_params=optimizer_params), cross_validation=False)

# it should be possible to get `RMSE` from `model` but I want to test
@@ -92,7 +93,7 @@

# Note: if data changes, the test might fail!
# Note: RMSE of pyPhenology and pyCaret are not exactly the same!
assert abs(rmse_phenology - rmse_pycaret) < 1 # 1 doy, not strict
assert abs(rmse_phenology - rmse_pycaret) < 0.5 # half doy, not strict


if __name__ == "__main__":