Gradient Boost Regression #5

Merged: 3 commits, May 5, 2024
Changes from all commits
5 changes: 4 additions & 1 deletion algorithms/linfa-ensemble/Cargo.toml
@@ -30,7 +30,10 @@ linfa-datasets = { version = "0.7.0", path = "../../datasets/", features = ["iris"] }
 ndarray = { version = "0.15" , features = ["rayon", "approx"]}
 ndarray-rand = "0.14"
 rand = { version = "0.8", features = ["small_rng"] }
+approx = {version = "0.5"}

 [dev-dependencies]
 rand = { version = "0.8", features = ["small_rng"] }
-linfa-datasets = { version = "0.7.0", path = "../../datasets/", features = ["iris"] }
+linfa-datasets = { version = "0.7.0", path = "../../datasets/", features = ["iris"] }
+ndarray = { version = "0.15" , features = ["rayon", "approx"]}
+approx = {version = "0.5"}
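The `approx` crate added here (to both [dependencies] and [dev-dependencies]) provides approximate floating-point comparison macros. A minimal sketch of the kind of assertion it enables; the value and tolerance below are illustrative only and not taken from this PR:

```rust
// Illustrative sketch: compare two floats within an absolute tolerance.
use approx::assert_abs_diff_eq;

fn main() {
    let accuracy: f64 = 0.95001;
    // Passes because |0.95001 - 0.95| <= 1e-3.
    assert_abs_diff_eq!(accuracy, 0.95, epsilon = 1e-3);
}
```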
15 changes: 10 additions & 5 deletions algorithms/linfa-ensemble/examples/adaboost.rs
@@ -8,7 +8,6 @@ use rand::rngs::SmallRng;
use linfa::prelude::*;
use linfa_ensemble::{Adaboost, Result};

fn main() -> Result<()> {
    let mut rng = SmallRng::seed_from_u64(42);

@@ -17,9 +16,15 @@ fn main() -> Result<()> {
        .split_with_ratio(0.8);

    println!("Training model with Adaboost ...");
    let ada_model = Adaboost::<f64, usize>::params()
        .n_estimators(10)
        .d_tree_params(
            DecisionTreeParams::new()
                .max_depth(Some(2))
                .min_weight_leaf(0.00001)
                .min_weight_split(0.00001),
        )
        .fit(&train)?;

    let ada_pred_y = ada_model.predict(&test);
    let cm = ada_pred_y.confusion_matrix(&test)?;
@@ -32,4 +37,4 @@
    );

    Ok(())
}
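To try the reformatted example locally, something like `cargo run -p linfa-ensemble --example adaboost` from the workspace root should work, assuming the package and example keep these names.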
53 changes: 26 additions & 27 deletions algorithms/linfa-ensemble/examples/random_forest.rs
@@ -1,38 +1,37 @@
//! Random Forest
use linfa::prelude::{Predict, ToConfusionMatrix};
use linfa::traits::Fit;
use linfa_ensemble::EnsembleLearnerParams;
use linfa_trees::DecisionTree;
use ndarray_rand::rand::SeedableRng;
use rand::rngs::SmallRng;

fn main() {
    //Number of models in the ensemble
    let ensemble_size = 100;
    //Proportion of training data given to each model
    let bootstrap_proportion = 0.7;

    //Load dataset
    let mut rng = SmallRng::seed_from_u64(42);
    let (train, test) = linfa_datasets::iris()
        .shuffle(&mut rng)
        .split_with_ratio(0.7);

    //Train ensemble learner model
    let model = EnsembleLearnerParams::new(DecisionTree::params())
        .ensemble_size(ensemble_size)
        .bootstrap_proportion(bootstrap_proportion)
        .fit(&train)
        .unwrap();

    //Return highest ranking predictions
    let final_predictions_ensemble = model.predict(&test);
    println!("Final Predictions: \n{:?}", final_predictions_ensemble);

    let cm = final_predictions_ensemble.confusion_matrix(&test).unwrap();

    println!("{:?}", cm);
    println!("Test accuracy: {} \n with default Decision Tree params, \n Ensemble Size: {},\n Bootstrap Proportion: {}",
        100.0 * cm.accuracy(), ensemble_size, bootstrap_proportion);
}
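For context on the "highest ranking predictions" comment: each of the `ensemble_size` trees is fit on a bootstrap sample containing `bootstrap_proportion` of the training data, and the ensemble reports the class most trees vote for. A sketch of that decision rule, assuming simple unweighted plurality voting:

$$\hat{y}(x) = \operatorname*{arg\,max}_{k} \sum_{b=1}^{B} \mathbf{1}\!\left[T_b(x) = k\right], \qquad B = \texttt{ensemble\_size} = 100,$$

where $T_b$ is the $b$-th decision tree in the ensemble.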
101 changes: 54 additions & 47 deletions algorithms/linfa-ensemble/src/adaboost/algorithm.rs
@@ -1,19 +1,13 @@
use std::{collections::HashMap, iter::zip};

use linfa::{dataset::Labels, error::Error, error::Result, traits::*, DatasetBase, Float, Label};
use linfa_trees::DecisionTree;

use super::AdaboostValidParams;
use linfa::dataset::AsSingleTargets;
use ndarray::{Array1, ArrayBase, Data, Ix2};
#[cfg(feature = "serde")]
use serde_crate::{Deserialize, Serialize};
// adaboost will be a vector of stumps

// stump will contain a decision tree and a weight associated with that stump
@@ -25,13 +19,13 @@ use ndarray::{Array1, ArrayBase, Data, Ix2};
    serde(crate = "serde_crate")
)]
#[derive(Debug, Clone, PartialEq)]
pub struct Stump<F: Float, L: Label> {
    tree: DecisionTree<F, L>,
    weight: f32,
}

impl<F: Float, L: Label + std::fmt::Debug> Stump<F, L> {
    fn make_stump(tree: DecisionTree<F, L>, weight: f32) -> Self {
        Stump { tree, weight }
    }
}
@@ -42,8 +36,8 @@ impl <F: Float, L: Label + std::fmt::Debug> Stump<F,L> {
    serde(crate = "serde_crate")
)]
#[derive(Debug, Clone, PartialEq)]
pub struct Adaboost<F: Float, L: Label> {
    stumps: Vec<Stump<F, L>>,
}

impl<F: Float, L: Label + Default, D: Data<Elem = F>> PredictInplace<ArrayBase<D, Ix2>, Array1<L>>
@@ -58,7 +52,7 @@ impl<F: Float, L: Label + Default, D: Data<Elem = F>> PredictInplace<ArrayBase<D, Ix2>, Array1<L>>
        );

        // Create a vector that has a hashmap with key as label and value as the weight for that label to hold the aggregate of the predictions from every stump for every data record
        let mut map: Vec<HashMap<L, f32>> = Vec::new();
        for stump in self.stumps.iter() {
            // go over each and aggregate the weights of the stump in a hashmap
            for pred in stump.tree.predict(x).iter() {
@@ -90,9 +84,8 @@
    }
}

impl<'a, F: Float, L: Label + 'a + std::fmt::Debug, D, T> Fit<ArrayBase<D, Ix2>, T, Error>
    for AdaboostValidParams<F, L>
where
    D: Data<Elem = F>,
    T: AsSingleTargets<Elem = L> + Labels<Elem = L>,
@@ -108,21 +101,25 @@ where
        let weights = vec![sample_weight; dataset.records().nrows()];

        // updating the dataset to have the weights by creating a new dataset
        let mut data = DatasetBase::new(
            dataset.records.view().clone(),
            dataset.targets.as_targets().clone(),
        )
        .with_feature_names(dataset.feature_names().clone())
        .with_weights(Array1::from_vec(weights));

        // for lifetime purpose
        let binding = dataset.targets.as_targets();
        // collect all the different unique classes
        let classes: std::collections::HashSet<&L> = binding.iter().collect();
        let num_classes = classes.len();

        // lowest f32 value allowed
        let eps = f32::EPSILON;
        let differential = 1.0 - eps;

        let mut stumps: Vec<Stump<F, L>> = Vec::new();
        for i in 0..self.n_estimators() {
            let tree_params = self.d_tree_params();
            let tree = tree_params.fit(&data)?;
            // Debug:
@@ -134,58 +131,63 @@ where
            // predict the data and accumulate the error for wrongly predicted samples
            let predictions = tree.predict(&data);

            for ((idx, pred), weight) in zip(
                dataset.targets().as_targets().iter().enumerate(),
                data.weights().unwrap().iter(),
            ) {
                if predictions[idx] != *pred {
                    error += weight;
                }
            }

            // To avoid 0 errors
            error = error.min(differential);

            let alpha: f32 = ((num_classes - 1) as f32).ln()
                + self.learning_rate() * ((1.0 - error) / error).ln();

            // From sklearn: sample_weight = np.exp(np.log(sample_weight)+ estimator_weight * incorrect * (sample_weight > 0))

            // update weights in dataset
            let mut updated_weights: Vec<f32> = Vec::new();
            for ((idx, pred), weight) in zip(
                dataset.targets().as_targets().iter().enumerate(),
                data.weights().unwrap().iter(),
            ) {
                if *weight > 0.0 && predictions[idx] != *pred {
                    let delta = f32::exp(f32::ln(*weight) + alpha);
                    updated_weights.push(delta);
                } else {
                    updated_weights.push(*weight);
                }
            }

            // normalize the weights
            let updated_weights = &Array1::from_vec(updated_weights);
            let normalized_weights = (updated_weights) / (updated_weights.sum());

            // update the weights in the dataset for new stump
            data = DatasetBase::new(
                dataset.records.view().clone(),
                dataset.targets.as_targets().clone(),
            )
            .with_feature_names(dataset.feature_names().clone())
            .with_weights(normalized_weights);

            // push the stump with it's weight
            stumps.push(Stump::make_stump(tree, alpha));
        }
        Ok(Adaboost { stumps })
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    use linfa::{error::Result, Dataset};
    use linfa_trees::DecisionTreeParams;
    use ndarray::array;

    use crate::AdaboostParams;

Expand All @@ -203,12 +205,17 @@ mod tests {
        let targets = array![0, 0, 1];

        let dataset = Dataset::new(data.clone(), targets);
        let model = Adaboost::params()
            .n_estimators(5)
            .d_tree_params(
                DecisionTreeParams::new()
                    .min_weight_leaf(0.00001)
                    .min_weight_split(0.00001),
            )
            .fit(&dataset)?;

        assert_eq!(model.predict(&data), array![0, 0, 1]);

        Ok(())
    }
}
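To make the weighting logic in `fit` easier to follow: the per-stump `alpha` and the sample-weight update follow the multi-class SAMME form of AdaBoost, i.e. the scikit-learn formula quoted in the comment above. With $K$ classes, learning rate $\eta$ (`self.learning_rate()`), and $\mathrm{err}_m$ the summed weight of the samples the $m$-th stump misclassifies,

$$\alpha_m = \ln(K - 1) + \eta \,\ln\frac{1 - \mathrm{err}_m}{\mathrm{err}_m},$$

and every misclassified sample has its weight scaled by $e^{\alpha_m}$ before the whole weight vector is renormalized:

$$w_i \leftarrow w_i \, e^{\alpha_m} \ \text{ if } h_m(x_i) \neq y_i, \qquad w_i \leftarrow \frac{w_i}{\sum_j w_j}.$$

At prediction time, `predict_inplace` accumulates each stump's `alpha` into a per-class score for every sample, so the returned label is presumably $\hat{y}(x) = \operatorname*{arg\,max}_k \sum_m \alpha_m \, \mathbf{1}[h_m(x) = k]$; the arg-max step itself falls in a part of the diff that is not shown here.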