This repository has been archived by the owner on Feb 14, 2024. It is now read-only.

Fix issue 3 #4

Open
wants to merge 11 commits into base: master
108 changes: 75 additions & 33 deletions pyPhenology/models/base.py
@@ -29,11 +29,11 @@ def fit(self, observations, predictors, loss_function='rmse',
temperature, precipitation, and day length

loss_function : str, or function
A string for built-in loss functions (currently only 'rmse'),
or a customized function which accepts 2 arguments, obs and pred,
both numpy arrays of the same shape

method : str
Optimization method to use. Either 'DE' or 'BF' for differential
evolution or brute force methods.
@@ -49,16 +49,31 @@ def fit(self, observations, predictors, loss_function='rmse',
display various internals

"""

validation.validate_predictors(predictors, self._required_data['predictor_columns'])
validation.validate_observations(observations)
self._set_loss_function(loss_function)
if len(self._parameters_to_estimate) == 0:
raise RuntimeError('No parameters to estimate')

self._organize_predictors(predictors=predictors,
observations=observations,
for_prediction=False)
# check if the data is sklearn compatible
valid_sklearn, X, y = validation.validate_sklearn_Xy(predictors, observations)

# sklearn compatible option is not implemented for M1 yet.
if "M1" in type(self).__name__:
valid_sklearn = False

if valid_sklearn:
self._organize_sklearn_predictors(y=y,
X=X,
for_prediction=False)
else:
warn('Data or model is not sklearn compatible, falling back to the '
'pyPhenology data format. See '
'https://pyphenology.readthedocs.io/en/master/data_structures.html#data-structure')
# check if pyphenology compatible
validation.validate_predictors(predictors, self._required_data['predictor_columns'])
validation.validate_observations(observations)

self._organize_predictors(predictors=predictors,
observations=observations,
for_prediction=False)

if debug:
verbose = True
@@ -105,7 +120,7 @@ def predict(self, to_predict=None, predictors=None, **kwargs):
Parameters:
to_predict : dataframe, optional
pandas dataframe of site/year combinations to predict from
the given predictor data. Just like the observations
dataframe used in fit() but (optionally) without the doy column

predictors : dataframe, optional
@@ -125,28 +140,38 @@ def predict(self, to_predict=None, predictors=None, **kwargs):
{'to_predict':pd.DataFrame,'predictors':pd.DataFrame}
{'to_predict':None,'predictors':None}
"""
if to_predict is None and predictors is None:
# Making predictions on data used for fitting
if self.obs_fitting is not None and self.fitting_predictors is not None:
predictors = self.fitting_predictors
else:
raise TypeError('No to_predict + temperature passed, and '
'no fitting done. Nothing to predict')

elif to_predict is None and predictors is not None:
# check if the data is sklearn compatible
valid_sklearn, X = validation.validate_sklearn_X(predictors)
if valid_sklearn:
# not implemented for M1
predictors = self._organize_sklearn_predictors(y=None,
X=X,
for_prediction=True)

elif isinstance(predictors, dict):
# predictors is a dict containing data that can be
# used directly in _apply_model()
self._validate_formatted_predictors(predictors)
else:
raise TypeError('Invalid arguments.')

if to_predict is None and isinstance(predictors, dict):
# predictors is a dict containing data that can be
# used directly in _apply_model()
self._validate_formatted_predictors(predictors)

elif isinstance(to_predict, pd.DataFrame) and isinstance(predictors, pd.DataFrame):
elif to_predict is not None and predictors is not None:
# New data to predict
validation.validate_predictors(predictors, self._required_data['predictor_columns'])
validation.validate_observations(to_predict, for_prediction=True)

predictors = self._organize_predictors(observations=to_predict,
predictors=predictors,
for_prediction=True)

elif to_predict is None and predictors is None:
# Making predictions on data used for fitting
if self.obs_fitting is not None and self.fitting_predictors is not None:
predictors = self.fitting_predictors
else:
raise TypeError('No to_predict + temperature passed, and '
'no fitting done. Nothing to predict')
else:
raise TypeError('Invalid arguments. to_predict and predictors ' +
'must both be pandas dataframes of new data to predict,' +
@@ -161,7 +186,7 @@ def _set_loss_function(self, loss_function):
"""The loss function (ie. RMSE)

Either a string for a built-in function, or a customized
function which accepts 2 arguments, obs and pred, both
numpy arrays of the same shape
"""
if isinstance(loss_function, str):
@@ -176,7 +201,7 @@ def _organize_predictors(self, observations, predictors, for_prediction):
"""Convert data to internal structure used by models

This function inside _base() is used for all the models which
have temperature as the only predictor variable (which is most of them).
Models which have other predictors have their own _organize_predictors() method.
"""
if for_prediction:
@@ -193,15 +218,32 @@ def _organize_predictors(self, observations, predictors, for_prediction):
'doy_series': doy_series}
self.obs_fitting = cleaned_observations

def _organize_sklearn_predictors(self, y, X, for_prediction):
"""Convert incoming data to expected structure.
It is documented in
https://pyphenology.readthedocs.io/en/master/data_structures.html In
pyphenology, the processed temperature has a shape of (features,
samples) whereas in sklearn has (samples, features), this is the reason
for X.T below. `doy_series` is the julian date of the temperature, here
it is an array of numbers each corresponds to a column of X.
"""
if for_prediction:
return {'temperature': X.T,
'doy_series': np.arange(X.shape[1])}
else:
self.fitting_predictors = {'temperature': X.T,
'doy_series': np.arange(X.shape[1])}
self.obs_fitting = y
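A minimal sketch of the shape convention described in this docstring (illustrative only; the array values are made up):

# Illustrative sketch, not part of the diff
import numpy as np

# sklearn-style input: 3 samples (site/years), each with 5 daily temperatures
X = np.arange(15, dtype=float).reshape(3, 5)        # shape (samples, features)

# internal pyPhenology structure: temperature is (features, samples)
predictors = {'temperature': X.T,                   # shape (5, 3)
              'doy_series': np.arange(X.shape[1])}  # one entry per column of X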

def _validate_formatted_predictors(self, predictors):
"""Make sure everything is valid.

This is used when pre-formatted data (as opposed to dataframes)
is passed to predict() or fit().

This function inside _base() is used for all the models which
have temperature as the only predictor variable (which is most of them).
Models which have other predictors have their own
_validate_formatted_predictors() method.
"""
# Don't allow any nan values in 2d temperature array
@@ -324,7 +366,7 @@ def _translate_scipy_parameters(self, parameters_array):
"""Map parameters from a 1D array to a dictionary for
use in phenology model functions. Ordering matters
in unpacking the scipy_array since it isn't labeled. Thus
it relies on self._parameters_to_estimate being an
OrderedDict
"""
# If only a single value is being fit, some scipy.
@@ -388,25 +430,25 @@ def score(self, metric='rmse', doy_observed=None,
Metrics available are root mean square error (``rmse``) and AIC (``aic``).
For AIC the number of parameters in the model is set to the number of
parameters actually estimated in ``fit()``, not the total number of
model parameters.

Parameters:
metric : str, optional
The metric used, either 'rmse' for the root mean square error,
or 'aic' for the Akaike information criterion.

doy_observed : numpy array, optional
The true doy values to evaluate with. This must be a numpy
array the same length as the number of rows in to_predict

to_predict : dataframe, optional
pandas dataframe of site/year combinations to predict from
the given predictor data. Just like the observations
dataframe used in fit() but (optionally) without the doy column

predictors : dataframe, optional
pandas dataframe in the format specific to this package

Returns:
The score as a float
"""
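Taken together, the changes in base.py are intended to let fit() and predict() accept either the original pyPhenology dataframes or sklearn-style arrays. Below is a minimal illustrative sketch, not part of the diff: the data are synthetic, and the custom loss is only an example of the "function accepting obs and pred" contract described in the docstring.

# Illustrative sketch, not part of the diff -- synthetic data
import numpy as np
from pyPhenology import models

# sklearn-style inputs: X is (samples, days) of daily mean temperature,
# y is the observed day of year for each sample
rng = np.random.default_rng(42)
X = rng.uniform(-5, 25, size=(50, 180))
y = rng.integers(90, 150, size=50)

# Any callable taking (obs, pred) numpy arrays can be used as the loss
def mae(obs, pred):
    return np.mean(np.abs(obs - pred))

model = models.ThermalTime()
model.fit(y, X, loss_function=mae)         # observations=y, predictors=X
predictions = model.predict(predictors=X)  # predicted day of year per sample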
42 changes: 11 additions & 31 deletions pyPhenology/models/sklearn_thermaltime.py
@@ -6,6 +6,13 @@


class SklearnThermalTime(BaseEstimator, RegressorMixin):
"""
The sklearn data structure assumes 'predictors' with shape (a, b), where a is
equal to the sample size in 'observations', and b is equal to the number of
days in the yearly temperature time series (e.g. Jan 1 - July 30), with
no gaps.
"""

def __init__(
self,
t1=None,
@@ -38,34 +45,11 @@ def fit(
self.X_ = X
self.y_ = y

# Define temperature, doy_series
# Fit the model
self.model_ = ThermalTime()

# Convert incoming data to expected structure as documented here
# https://pyphenology.readthedocs.io/en/master/data_structures.html

# year: The year of the temperature timeseries, here we assume that they
# are in the same year of 2000
# site_id: A site identifier for each location, here we assume that the
# number of sites is equal to number of rows in X
# doy: The julian date of the temperature, here it is a list of numbers
# each corresponds to a column of X
# temperature: The temperature i.e. X the predictor

predictors = pd.DataFrame(
{
"year": X.size * [2000],
"site_id": np.repeat(np.arange(len(X)), X.shape[1]),
"doy": list(range(X.shape[1])) * X.shape[0],
"temperature": X.flatten(),
}
)
observations = pd.DataFrame(
{"year": len(y) * [2000], "site_id": range(len(y)), "doy": y}
)
self.model_.fit(
observations,
predictors,
y,
X,
optimizer_params=self.optimizer_params,
loss_function=self.loss_function,
method=self.method,
@@ -81,9 +65,5 @@ def predict(self, X):

# Input validation
X = check_array(X)
predictors = {
"temperature": X.T,
"doy_series": list(range(X.shape[1]))
}

return self.model_.predict(predictors=predictors)
return self.model_.predict(predictors=X)
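A minimal usage sketch of the wrapper after this change (synthetic data; the optimizer preset name passed here is an assumption, not a value taken from this PR):

# Illustrative sketch, not part of the diff -- synthetic data
import numpy as np
from pyPhenology.models.sklearn_thermaltime import SklearnThermalTime

# 40 site/years, each with 180 days of mean temperature and no gaps
rng = np.random.default_rng(0)
X = rng.uniform(-5, 25, size=(40, 180))   # shape (samples, days)
y = rng.integers(80, 150, size=40)        # observed day of year per sample

est = SklearnThermalTime(optimizer_params='testing')  # preset name assumed
est.fit(X, y)
doy_pred = est.predict(X)                 # predicted day of year per sample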
20 changes: 17 additions & 3 deletions pyPhenology/models/stat_models.py
@@ -7,7 +7,7 @@ class Linear(BaseModel):
"""Linear Regression Model

A 2 parameter regression model with :math:`DOY` as
the response variable.

.. math::
DOY = \\beta_{1} + \\beta_{2}T_{mean}
@@ -55,9 +55,9 @@ def _apply_model(self, temperature, doy_series, intercept, slope,
class Naive(BaseModel):
"""A naive model of the spatially interpolated mean

This is the mean doy for an event adjusted for latitude, essentially
a 2 parameter regression model with :math:`DOY` as
the response variable.

.. math::
DOY = \\beta_{1} + \\beta_{2}Latitude
@@ -103,6 +103,20 @@ def _organize_predictors(self, predictors, observations, for_prediction):
self.fitting_predictors = {'latitude': obs_with_latitude.latitude.values}
self.obs_fitting = obs_with_latitude.doy.values

def _organize_sklearn_predictors(self, y, X, for_prediction):
"""Convert incoming data to expected structure.
It is documented in
https://pyphenology.readthedocs.io/en/master/data_structures.html In
pyphenology, the processed temperature has a shape of (features,
samples) whereas in sklearn has (samples, features), this is the reason
for X.T below.
"""
if for_prediction:
return {'latitude': X.T}
else:
self.fitting_predictors = {'latitude': X.T}
self.obs_fitting = y
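For the Naive model the single sklearn feature would be latitude, so the mapping in the method above reduces to the following (a hypothetical illustration, not code from this PR):

# Illustrative sketch, not part of the diff
import numpy as np

X = np.array([[45.5], [52.1], [60.3]])  # shape (samples, 1): one latitude per site/year
y = np.array([120, 131, 140])           # observed day of year per sample

fitting_predictors = {'latitude': X.T}  # shape (1, samples), as stored above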

def _validate_formatted_predictors(self, predictors):
pass

27 changes: 27 additions & 0 deletions pyPhenology/models/validation.py
@@ -1,4 +1,31 @@
import pandas as pd
from sklearn.utils import check_X_y, check_array


def validate_sklearn_Xy(predictors, observations):
"""Input validation for standard estimators.

check_X_y will raise a ValueError if the inputs are not compatible."""
valid_sklearn_Xy = True
try:
predictors, observations = check_X_y(predictors, observations)
except ValueError:
valid_sklearn_Xy = False

return valid_sklearn_Xy, predictors, observations


def validate_sklearn_X(predictors):
"""Input validation for standard estimators.

check_array will raise a ValueError if the inputs are not compatible."""
valid_sklearn_X = True
try:
predictors = check_array(predictors)
except ValueError:
valid_sklearn_X = False

return valid_sklearn_X, predictors
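A minimal sketch of how these helpers behave (illustrative arrays; the dataframe columns follow the documented pyPhenology format):

# Illustrative sketch, not part of the diff
import numpy as np
import pandas as pd

# sklearn-compatible input: numeric 2-D X with a matching 1-D y
X = np.ones((3, 5))
y = np.array([100, 110, 120])
ok, X_checked, y_checked = validate_sklearn_Xy(X, y)   # ok is True

# A pyPhenology-style observations dataframe is not a valid 1-D y,
# so check_X_y raises ValueError, the helper reports False,
# and fit() falls back to the dataframe path
predictors_df = pd.DataFrame({'site_id': [1], 'year': [2000],
                              'doy': [1], 'temperature': [5.0]})
observations_df = pd.DataFrame({'site_id': [1], 'year': [2000], 'doy': [120]})
ok, _, _ = validate_sklearn_Xy(predictors_df, observations_df)  # ok is False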


def validate_predictors(predictor_df, valid_columns):
5 changes: 3 additions & 2 deletions test/test_sklearn_thermaltime.py
@@ -74,10 +74,11 @@ def test_pycaret_compatible(self, tmp_path):
df["doy"] = doy_array
df.dropna(inplace=True)
df_test = df.iloc[:10] # same as above
df_train = df.iloc[11:]

# Create pycaret instances
exp = RegressionExperiment()
exp.setup(df, target="doy", session_id=123, test_data=df_test, index=False, preprocess=False)
exp.setup(df_train, target="doy", session_id=123, test_data=df_test, index=True, preprocess=False)
model = exp.create_model(SklearnThermalTime(optimizer_params=optimizer_params), cross_validation=False)

# it should be possible to get `RMSE` from `model` but I want to test
@@ -92,7 +93,7 @@

# Note: if data changes, the test might fail!
# Note: RMSE of pyPhenology and pyCaret are not exactly the same!
assert abs(rmse_phenology - rmse_pycaret) < 1 # 1 doy, not strict
assert abs(rmse_phenology - rmse_pycaret) < 0.5 # half doy, not strict


if __name__ == "__main__":