You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
def test_random_forest_posteriors_on_independent():
    """Test regression from :gh:`283`.

    Posteriors were biased when the classes were independent and using the
    bootstrap and oob sample technique to estimate the final population test
    statistic. This resulted in a biased estimate of the AUC score.
    Stratification of the bootstrapping samples was the solution to this
    problem.

    The features are drawn as pure noise, independently of the labels, so the
    AUC computed from the averaged out-of-bag posteriors is expected to lie
    close to 0.5. The test averages that AUC over several seeded repetitions
    and asserts the mean falls inside (0.49, 0.51).
    """
    import numpy as np
    from sklearn.metrics import roc_auc_score
    from sktree import RandomForestClassifier

    n_repeats = 5
    n_per_class = 64
    n_features = 4096
    n_samples = 2 * n_per_class

    # Balanced binary labels: first half class 0, second half class 1.
    # Kept 1-D, which is the shape both ``fit`` and ``roc_auc_score`` expect.
    y = np.concatenate([np.zeros(n_per_class), np.ones(n_per_class)])
    all_idx = np.arange(n_samples)

    scores = []
    for seed in range(n_repeats):
        # Seeded generator so every run of the test sees the same data.
        rng = np.random.default_rng(seed)
        # create a dataset with overlapping classes (X carries no label signal)
        X = rng.standard_normal(size=(n_samples, n_features))

        clf = RandomForestClassifier(
            n_estimators=100,
            random_state=seed,
            bootstrap=True,
            max_samples=1.0,
            n_jobs=-1,
            # stratify=True,
        )
        clf.fit(X, y)

        # NaN-filled so that in-bag (never predicted) slots are ignored by
        # ``np.nanmean`` below; ``np.empty`` would leave arbitrary values there.
        oob_posteriors = np.full((len(clf.estimators_), n_samples, 2), np.nan)
        for tree_idx, (tree, inbag_idx) in enumerate(
            zip(clf.estimators_, clf.estimators_samples_)
        ):
            # Samples that were not drawn into this tree's bootstrap sample.
            oob_idx = np.setdiff1d(all_idx, inbag_idx)
            if oob_idx.size == 0:
                # Nothing held out for this tree; leave its row as NaN.
                continue
            oob_posteriors[tree_idx, oob_idx, :] = tree.predict_proba(X[oob_idx])

        # Average each sample's class-1 posterior over the trees where it was
        # out-of-bag, then score it against the true labels.
        auc_score = roc_auc_score(y, np.nanmean(oob_posteriors, axis=0)[:, 1])
        scores.append(auc_score)

    # Without stratification, this test should fail
    mean_score = np.mean(scores)
    print(mean_score, scores)
    assert 0.49 < mean_score < 0.51, f"{mean_score} {scores}"
The text was updated successfully, but these errors were encountered:
We can test:
The text was updated successfully, but these errors were encountered: