Adding a bagged estimator to get better results in terms of monotonicity #82

Draft · wants to merge 1 commit into base: main
3 changes: 2 additions & 1 deletion hazardous/__init__.py
@@ -1,6 +1,6 @@
 from pathlib import Path
 
-from ._survival_boost import SurvivalBoost
+from ._survival_boost import BaggedSurvivalBoost, SurvivalBoost
 
 with open(Path(__file__).parent / "VERSION.txt") as _fh:
     __version__ = _fh.read().strip()
@@ -9,4 +9,5 @@
 __all__ = [
     "metrics",
     "SurvivalBoost",
+    "BaggedSurvivalBoost",
 ]
74 changes: 74 additions & 0 deletions hazardous/_survival_boost.py
@@ -633,3 +633,77 @@ def score(self, X, y):
             )
             ibs_events.append(ibs_event)
         return -np.mean(ibs_events)


+class BaggedSurvivalBoost(BaseEstimator, ClassifierMixin):
+    def __init__(
+        # TODO: run a grid search on a few datasets to find good defaults.

Collaborator: 👍 For the TODO

+        self,
+        hard_zero_fraction=0.1,
+        # TODO: implement convergence criterion and use max_iter instead of
+        # n_iter.
+        n_iter=100,
+        learning_rate=0.05,
+        max_leaf_nodes=31,
+        max_depth=None,
+        min_samples_leaf=50,
+        show_progressbar=True,
+        n_time_grid_steps=100,
+        time_horizon=None,
+        ipcw_strategy="alternating",
+        n_iter_before_feedback=20,
+        random_state=None,
+        n_horizons_per_observation=3,
+        bagging=5,
+    ):
+        self.hard_zero_fraction = hard_zero_fraction
+        self.n_iter = n_iter
+        self.learning_rate = learning_rate
+        self.max_depth = max_depth
+        self.max_leaf_nodes = max_leaf_nodes
+        self.min_samples_leaf = min_samples_leaf
+        self.show_progressbar = show_progressbar
+        self.n_time_grid_steps = n_time_grid_steps
+        self.time_horizon = time_horizon
+        self.n_iter_before_feedback = n_iter_before_feedback
+        self.ipcw_strategy = ipcw_strategy
+        self.random_state = random_state
+        self.n_horizons_per_observation = n_horizons_per_observation
+        self.bagging = bagging  # number of models to train

+    def fit(self, X, y, times=None):
+        self.models = []

Collaborator: In scikit-learn conventions, we add an underscore suffix to identify attributes that are outputs of the fitting process.

Suggested change:
-        self.models = []
+        self.models_ = []
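(scikit-learn utilities such as check_is_fitted rely on this trailing-underscore convention to decide whether an estimator has been fitted.)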

+        survival_boost_params = self.get_params()
+        survival_boost_params.pop("random_state")
+        survival_boost_params.pop("bagging")

Collaborator: Now that I'm thinking about this, what is your opinion on adding a bagging parameter to SurvivalBoost, instead of defining this new class? This would slightly complicate the internal methods of SurvivalBoost, but we would only have one class to deal with.
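For illustration only (hypothetical API, not in this PR), that alternative would read:

    SurvivalBoost(n_iter=100, bagging=5).fit(X, y)

with bagging=1 recovering the current single-model behavior.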

+        for i in range(self.bagging):
+            model = SurvivalBoost(
+                random_state=self.random_state + i, **survival_boost_params

Collaborator: You need to handle the default case when self.random_state is None.
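A minimal sketch of one way to handle that (not part of the diff), using sklearn.utils.check_random_state to draw one integer seed per sub-model; it works whether random_state is None, an int, or a RandomState instance:

    from sklearn.utils import check_random_state

    rng = check_random_state(self.random_state)
    # One independent integer seed per bagged model; valid whether
    # self.random_state is None, an int, or a np.random.RandomState.
    seeds = rng.randint(np.iinfo(np.int32).max, size=self.bagging)
    for i in range(self.bagging):
        model = SurvivalBoost(random_state=seeds[i], **survival_boost_params)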

+            )
+            model.fit(X, y, times)
+            self.models.append(model)
+        return self
+
+    def predict_proba(self, X, time_horizon=None):
+        self.predictions = []
+        for model in self.models:
+            self.predictions.append(model.predict_proba(X, time_horizon))
+        return np.mean(self.predictions, axis=0)

+    def predict_cumulative_incidence(self, X, times=None):
+        self.predictions = []
+        for model in self.models:
+            self.predictions.append(model.predict_cumulative_incidence(X, times))
+        return np.mean(self.predictions, axis=0)
+
+    def predict_survival_function(self, X, times=None):
+        self.predictions = []
+        for model in self.models:
+            self.predictions.append(model.predict_survival_function(X, times))
+        return np.mean(self.predictions, axis=0)
+
+    def score(self, X, y):
+        self.scores = []
+        for model in self.models:
+            self.scores.append(model.score(X, y))
+        return np.mean(self.scores)
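A minimal usage sketch of the new estimator (illustration only, not part of the diff); it assumes X and y are already loaded and follow the hazardous conventions, with y holding "event" and "duration" columns:

    from hazardous import BaggedSurvivalBoost

    # Train 5 independently seeded SurvivalBoost models and average them.
    model = BaggedSurvivalBoost(n_iter=100, bagging=5, random_state=0)
    model.fit(X, y)

    # Averaging across the ensemble reduces variance, which smooths the
    # non-monotone wiggles a single model's incidence curves can show.
    cif = model.predict_cumulative_incidence(X)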