diff --git a/algorithms/linfa-ensemble/Cargo.toml b/algorithms/linfa-ensemble/Cargo.toml index c5440506c..9985c6c1c 100644 --- a/algorithms/linfa-ensemble/Cargo.toml +++ b/algorithms/linfa-ensemble/Cargo.toml @@ -30,7 +30,10 @@ linfa-datasets = { version = "0.7.0", path = "../../datasets/", features = ["iri ndarray = { version = "0.15" , features = ["rayon", "approx"]} ndarray-rand = "0.14" rand = { version = "0.8", features = ["small_rng"] } +approx = {version = "0.5"} [dev-dependencies] rand = { version = "0.8", features = ["small_rng"] } -linfa-datasets = { version = "0.7.0", path = "../../datasets/", features = ["iris"] } \ No newline at end of file +linfa-datasets = { version = "0.7.0", path = "../../datasets/", features = ["iris"] } +ndarray = { version = "0.15" , features = ["rayon", "approx"]} +approx = {version = "0.5"} \ No newline at end of file diff --git a/algorithms/linfa-ensemble/examples/adaboost.rs b/algorithms/linfa-ensemble/examples/adaboost.rs index 2fa0ea019..69bd2fa78 100644 --- a/algorithms/linfa-ensemble/examples/adaboost.rs +++ b/algorithms/linfa-ensemble/examples/adaboost.rs @@ -8,7 +8,6 @@ use rand::rngs::SmallRng; use linfa::prelude::*; use linfa_ensemble::{Adaboost, Result}; - fn main() -> Result<()> { let mut rng = SmallRng::seed_from_u64(42); @@ -17,9 +16,15 @@ fn main() -> Result<()> { .split_with_ratio(0.8); println!("Training model with Adaboost ..."); - let ada_model = Adaboost::::params().n_estimators(10) - .d_tree_params(DecisionTreeParams::new().max_depth(Some(2)).min_weight_leaf(0.00001).min_weight_split(0.00001)) - .fit(&train)?; + let ada_model = Adaboost::::params() + .n_estimators(10) + .d_tree_params( + DecisionTreeParams::new() + .max_depth(Some(2)) + .min_weight_leaf(0.00001) + .min_weight_split(0.00001), + ) + .fit(&train)?; let ada_pred_y = ada_model.predict(&test); let cm = ada_pred_y.confusion_matrix(&test)?; @@ -32,4 +37,4 @@ fn main() -> Result<()> { ); Ok(()) -} \ No newline at end of file +} diff --git a/algorithms/linfa-ensemble/examples/random_forest.rs b/algorithms/linfa-ensemble/examples/random_forest.rs index 3361d218d..5edf25391 100644 --- a/algorithms/linfa-ensemble/examples/random_forest.rs +++ b/algorithms/linfa-ensemble/examples/random_forest.rs @@ -1,38 +1,37 @@ //! 
Random Forest -use ndarray_rand::rand::SeedableRng; -use rand::rngs::SmallRng; -use linfa_trees::DecisionTree; -use linfa::traits::Fit; use linfa::prelude::{Predict, ToConfusionMatrix}; +use linfa::traits::Fit; use linfa_ensemble::EnsembleLearnerParams; +use linfa_trees::DecisionTree; +use ndarray_rand::rand::SeedableRng; +use rand::rngs::SmallRng; fn main() { + //Number of models in the ensemble + let ensemble_size = 100; + //Proportion of training data given to each model + let bootstrap_proportion = 0.7; - //Number of models in the ensemble - let ensemble_size = 100; - //Proportion of training data given to each model - let bootstrap_proportion = 0.7; - - //Load dataset - let mut rng = SmallRng::seed_from_u64(42); - let (train, test) = linfa_datasets::iris() - .shuffle(&mut rng) - .split_with_ratio(0.7); + //Load dataset + let mut rng = SmallRng::seed_from_u64(42); + let (train, test) = linfa_datasets::iris() + .shuffle(&mut rng) + .split_with_ratio(0.7); - //Train ensemble learner model - let model = EnsembleLearnerParams::new(DecisionTree::params()) - .ensemble_size(ensemble_size) - .bootstrap_proportion(bootstrap_proportion) - .fit(&train) - .unwrap(); + //Train ensemble learner model + let model = EnsembleLearnerParams::new(DecisionTree::params()) + .ensemble_size(ensemble_size) + .bootstrap_proportion(bootstrap_proportion) + .fit(&train) + .unwrap(); - //Return highest ranking predictions - let final_predictions_ensemble = model.predict(&test); - println!("Final Predictions: \n{:?}", final_predictions_ensemble); + //Return highest ranking predictions + let final_predictions_ensemble = model.predict(&test); + println!("Final Predictions: \n{:?}", final_predictions_ensemble); - let cm = final_predictions_ensemble.confusion_matrix(&test).unwrap(); + let cm = final_predictions_ensemble.confusion_matrix(&test).unwrap(); - println!("{:?}", cm); - println!("Test accuracy: {} \n with default Decision Tree params, \n Ensemble Size: {},\n Bootstrap Proportion: {}", + println!("{:?}", cm); + println!("Test accuracy: {} \n with default Decision Tree params, \n Ensemble Size: {},\n Bootstrap Proportion: {}", 100.0 * cm.accuracy(), ensemble_size, bootstrap_proportion); -} \ No newline at end of file +} diff --git a/algorithms/linfa-ensemble/src/adaboost/algorithm.rs b/algorithms/linfa-ensemble/src/adaboost/algorithm.rs index e6a48c1d1..6592096c7 100644 --- a/algorithms/linfa-ensemble/src/adaboost/algorithm.rs +++ b/algorithms/linfa-ensemble/src/adaboost/algorithm.rs @@ -1,19 +1,13 @@ -use std::{collections::{HashMap}, iter::zip}; +use std::{collections::HashMap, iter::zip}; -use linfa_trees::{DecisionTree}; -use linfa::{ - dataset::{Labels}, - error::Error, - error::Result, - traits::*, - DatasetBase, Float, Label, -}; +use linfa::{dataset::Labels, error::Error, error::Result, traits::*, DatasetBase, Float, Label}; +use linfa_trees::DecisionTree; -#[cfg(feature = "serde")] -use serde_crate::{Deserialize, Serialize}; +use super::AdaboostValidParams; use linfa::dataset::AsSingleTargets; -use super::{AdaboostValidParams}; use ndarray::{Array1, ArrayBase, Data, Ix2}; +#[cfg(feature = "serde")] +use serde_crate::{Deserialize, Serialize}; // adaboost will be a vector of stumps // stump will contain a decision tree and a weight associated with that stump @@ -25,13 +19,13 @@ use ndarray::{Array1, ArrayBase, Data, Ix2}; serde(crate = "serde_crate") )] #[derive(Debug, Clone, PartialEq)] -pub struct Stump { - tree: DecisionTree, +pub struct Stump { + tree: DecisionTree, weight: f32, } -impl Stump { - fn 
make_stump(tree: DecisionTree ,weight: f32) -> Self { +impl Stump { + fn make_stump(tree: DecisionTree, weight: f32) -> Self { Stump { tree, weight } } } @@ -42,8 +36,8 @@ impl Stump { serde(crate = "serde_crate") )] #[derive(Debug, Clone, PartialEq)] -pub struct Adaboost { - stumps: Vec>, +pub struct Adaboost { + stumps: Vec>, } impl> PredictInplace, Array1> @@ -58,7 +52,7 @@ impl> PredictInplace> = Vec::new(); + let mut map: Vec> = Vec::new(); for stump in self.stumps.iter() { // go over each and aggregate the weights of the stump in a hashmap for pred in stump.tree.predict(x).iter() { @@ -90,9 +84,8 @@ impl> PredictInplace Fit, T, Error> - for AdaboostValidParams + for AdaboostValidParams where D: Data, T: AsSingleTargets + Labels, @@ -108,21 +101,25 @@ where let weights = vec![sample_weight; dataset.records().nrows()]; // updating the dataset to have the weights by creating a new dataset - let mut data = DatasetBase::new(dataset.records.view().clone(), dataset.targets.as_targets().clone()).with_feature_names(dataset.feature_names().clone()).with_weights(Array1::from_vec(weights)); + let mut data = DatasetBase::new( + dataset.records.view().clone(), + dataset.targets.as_targets().clone(), + ) + .with_feature_names(dataset.feature_names().clone()) + .with_weights(Array1::from_vec(weights)); // for lifetime purpose let binding = dataset.targets.as_targets(); // collect all the different unique classes let classes: std::collections::HashSet<&L> = binding.iter().collect(); let num_classes = classes.len(); - + // lowest f32 value allowed let eps = f32::EPSILON; let differential = 1.0 - eps; - let mut stumps: Vec> = Vec::new(); + let mut stumps: Vec> = Vec::new(); for i in 0..self.n_estimators() { - let tree_params = self.d_tree_params(); let tree = tree_params.fit(&data)?; // Debug: @@ -134,58 +131,63 @@ where // predict the data and accumulate the error for wrongly predicted samples let predictions = tree.predict(&data); - for ((idx, pred), weight) in zip(dataset.targets().as_targets().iter().enumerate(), data.weights().unwrap().iter()){ + for ((idx, pred), weight) in zip( + dataset.targets().as_targets().iter().enumerate(), + data.weights().unwrap().iter(), + ) { if predictions[idx] != *pred { error += weight; } } - + // To avoid 0 errors error = error.min(differential); - let alpha: f32 = ((num_classes-1) as f32).ln() + self.learning_rate() * ((1.0 - error) / error ).ln(); + let alpha: f32 = ((num_classes - 1) as f32).ln() + + self.learning_rate() * ((1.0 - error) / error).ln(); // From sklearn: sample_weight = np.exp(np.log(sample_weight)+ estimator_weight * incorrect * (sample_weight > 0)) - + // update weights in dataset let mut updated_weights: Vec = Vec::new(); - for ((idx,pred), weight) in zip(dataset.targets().as_targets().iter().enumerate(), data.weights().unwrap().iter()){ + for ((idx, pred), weight) in zip( + dataset.targets().as_targets().iter().enumerate(), + data.weights().unwrap().iter(), + ) { if *weight > 0.0 && predictions[idx] != *pred { let delta = f32::exp(f32::ln(*weight) + alpha); updated_weights.push(delta); - } else { - updated_weights.push(*weight); + updated_weights.push(*weight); } } - + // normalize the weights let updated_weights = &Array1::from_vec(updated_weights); - let normalized_weights = (updated_weights)/(updated_weights.sum()); + let normalized_weights = (updated_weights) / (updated_weights.sum()); // update the weights in the dataset for new stump - data = DatasetBase::new(dataset.records.view().clone(), 
dataset.targets.as_targets().clone()).with_feature_names(dataset.feature_names().clone()).with_weights(normalized_weights); + data = DatasetBase::new( + dataset.records.view().clone(), + dataset.targets.as_targets().clone(), + ) + .with_feature_names(dataset.feature_names().clone()) + .with_weights(normalized_weights); // push the stump with it's weight stumps.push(Stump::make_stump(tree, alpha)); - } - Ok(Adaboost{ - stumps, - }) - + Ok(Adaboost { stumps }) } } - - #[cfg(test)] mod tests { use super::*; use linfa::{error::Result, Dataset}; use linfa_trees::DecisionTreeParams; - use ndarray::{array}; + use ndarray::array; use crate::AdaboostParams; @@ -203,12 +205,17 @@ mod tests { let targets = array![0, 0, 1]; let dataset = Dataset::new(data.clone(), targets); - let model = Adaboost::params().n_estimators(5).d_tree_params(DecisionTreeParams::new().min_weight_leaf(0.00001).min_weight_split(0.00001)).fit(&dataset)?; + let model = Adaboost::params() + .n_estimators(5) + .d_tree_params( + DecisionTreeParams::new() + .min_weight_leaf(0.00001) + .min_weight_split(0.00001), + ) + .fit(&dataset)?; assert_eq!(model.predict(&data), array![0, 0, 1]); Ok(()) } - - -} \ No newline at end of file +} diff --git a/algorithms/linfa-ensemble/src/adaboost/hyperparams.rs b/algorithms/linfa-ensemble/src/adaboost/hyperparams.rs index 351816b8a..064e77557 100644 --- a/algorithms/linfa-ensemble/src/adaboost/hyperparams.rs +++ b/algorithms/linfa-ensemble/src/adaboost/hyperparams.rs @@ -7,21 +7,20 @@ use linfa::{ use serde_crate::{Deserialize, Serialize}; use crate::Adaboost; -use linfa_trees::{DecisionTreeParams}; +use linfa_trees::DecisionTreeParams; #[cfg_attr( feature = "serde", derive(Serialize, Deserialize), serde(crate = "serde_crate") )] #[derive(Clone, Copy, Debug, PartialEq)] -pub struct AdaboostValidParams { +pub struct AdaboostValidParams { n_estimators: usize, learning_rate: f32, d_tree_params: DecisionTreeParams, } -impl AdaboostValidParams { - +impl AdaboostValidParams { pub fn learning_rate(&self) -> f32 { self.learning_rate } @@ -30,7 +29,7 @@ impl AdaboostValidParams { self.n_estimators } - pub fn d_tree_params(&self) -> DecisionTreeParams { + pub fn d_tree_params(&self) -> DecisionTreeParams { self.d_tree_params.clone() } } @@ -41,14 +40,16 @@ impl AdaboostValidParams { serde(crate = "serde_crate") )] #[derive(Clone, Copy, Debug, PartialEq)] -pub struct AdaboostParams(AdaboostValidParams); +pub struct AdaboostParams(AdaboostValidParams); -impl AdaboostParams { +impl AdaboostParams { pub fn new() -> Self { - Self(AdaboostValidParams { + Self(AdaboostValidParams { learning_rate: 0.5, n_estimators: 50, - d_tree_params: DecisionTreeParams::new().min_weight_leaf(0.00001).min_weight_split(0.00001), + d_tree_params: DecisionTreeParams::new() + .min_weight_leaf(0.00001) + .min_weight_split(0.00001), }) } @@ -65,15 +66,13 @@ impl AdaboostParams { } /// Sets the params for the weak learner used in Adaboost - pub fn d_tree_params(mut self, d_tree_params: DecisionTreeParams) -> Self { + pub fn d_tree_params(mut self, d_tree_params: DecisionTreeParams) -> Self { self.0.d_tree_params = d_tree_params; self } - - } -impl Default for AdaboostParams { +impl Default for AdaboostParams { fn default() -> Self { Self::new() } @@ -85,13 +84,13 @@ impl Adaboost { /// * `learning_rate = 0.00001` // Violates the convention that new should return a value of type `Self` #[allow(clippy::new_ret_no_self)] - pub fn params() -> AdaboostParams { + pub fn params() -> AdaboostParams { AdaboostParams::new() } } -impl ParamGuard 
for AdaboostParams { - type Checked = AdaboostValidParams; +impl ParamGuard for AdaboostParams { + type Checked = AdaboostValidParams; type Error = Error; fn check_ref(&self) -> Result<&Self::Checked> { @@ -100,8 +99,7 @@ impl ParamGuard for AdaboostParams { "Minimum learning rate should be greater than zero, but was {}", self.0.learning_rate ))) - } - else { + } else { Ok(&self.0) } } @@ -110,4 +108,4 @@ impl ParamGuard for AdaboostParams { self.check_ref()?; Ok(self.0) } -} \ No newline at end of file +} diff --git a/algorithms/linfa-ensemble/src/adaboost/mod.rs b/algorithms/linfa-ensemble/src/adaboost/mod.rs index e41dce25d..35c8da056 100644 --- a/algorithms/linfa-ensemble/src/adaboost/mod.rs +++ b/algorithms/linfa-ensemble/src/adaboost/mod.rs @@ -2,4 +2,4 @@ mod algorithm; mod hyperparams; pub use algorithm::*; -pub use hyperparams::*; \ No newline at end of file +pub use hyperparams::*; diff --git a/algorithms/linfa-ensemble/src/gradient_boost/README.md b/algorithms/linfa-ensemble/src/gradient_boost/README.md new file mode 100644 index 000000000..dc3ec74ec --- /dev/null +++ b/algorithms/linfa-ensemble/src/gradient_boost/README.md @@ -0,0 +1,3 @@ +# Classifier +- Start with `log(odds)` as the initial leaf (a.k.a base estimator) of prediction using the classes. +- `log(proba_kth_class)` should be one-vs-all label classification diff --git a/algorithms/linfa-ensemble/src/gradient_boost/algorithm.rs b/algorithms/linfa-ensemble/src/gradient_boost/algorithm.rs new file mode 100644 index 000000000..fcd0ebd98 --- /dev/null +++ b/algorithms/linfa-ensemble/src/gradient_boost/algorithm.rs @@ -0,0 +1,160 @@ +use ndarray::{Array1, Array2}; + +use crate::random_forest::DecisionTreeRegressor; + + +/* +Source of Algorithm implemented is taken from the blog: +https://lewtun.github.io/hepml/lesson05_gradient-boosting-deep-dive/ +*/ + +pub struct GBRegressor { + trees: Vec, + num_trees: usize, + learning_rate: f64, + max_features: usize, // Maximum number of features to consider for each split + max_depth: usize, + min_samples_split: usize, + init_train_target_mean: f64, +} + +impl GBRegressor { + pub fn new( + num_trees: usize, + learning_rate: f64, + max_features: usize, + max_depth: usize, + min_samples_split: usize, + ) -> Self { + GBRegressor { + trees: Vec::new(), + num_trees, + learning_rate, + max_features, + max_depth, + min_samples_split, + init_train_target_mean: 0.0, + } + } + + pub fn fit(&mut self, features: &Array2, targets: &Array1) { + + self.init_train_target_mean = targets.mean().unwrap_or(0.0); + + let mut y_pred = vec![self.init_train_target_mean; targets.dim()]; + + for idx in 0..self.num_trees { + let gradient = self.gradient(&y_pred, targets); + // let tree = DecisionTreeRegressor::new(self.max_depth, self.min_samples_split); + // tree.fit(&features, &Array1::from_vec(gradient)); + // self.trees.push(tree); + + let tree = self.create_and_train_tree(features, &Array1::from_vec(gradient)); + self.trees.push(tree); + + let update: Array1 = self.trees[idx].predict(features); + y_pred = self.update_preds(&update, &y_pred); + } + } + + fn gradient(&self, y_pred: &Vec, targets: &Array1) -> Vec { + y_pred + .iter() + .zip(targets.iter()) + .map(|(pred, target)| (target - pred)) + .collect() + } + + fn update_preds(&self, update_vals: &Array1, y_pred: &Vec) -> Vec { + y_pred + .iter() + .zip(update_vals.iter()) + .map(|(pred, val)| pred + (self.learning_rate * val)) + .collect() + } + + fn create_and_train_tree( + &mut self, + features: &Array2, + targets: &Array1, + ) -> 
DecisionTreeRegressor { + let mut tree = DecisionTreeRegressor::new(self.max_depth, self.min_samples_split); + tree.fit(&features, &targets); + tree + } + + pub fn predict(&self, features: &Array2) -> Array1 { + + let mut predictions: Vec> = Vec::new(); + for tree in &self.trees { + let prediction = tree.predict(features) * self.learning_rate; + predictions.push(prediction); + } + + let num_samples = features.nrows(); + let mut final_predictions = Array1::::zeros(num_samples); + final_predictions.fill(self.init_train_target_mean); + + for prediction in predictions { + final_predictions += &prediction; + } + // final_predictions /= self.num_trees as f64; + final_predictions + } +} + + +#[cfg(test)] +mod tests { + use super::*; + use ndarray::{Array1, Array2}; // For floating-point assertions + + fn load_iris_data() -> (Array2, Array1) { + // Load the dataset + let dataset = linfa_datasets::iris(); + + // Extract features; assuming all rows and all but the last column if last is target + let features = dataset.records().clone(); + + // Assuming the target are the labels, we need to convert them or use them as is depending on the use case. + // If you need to predict a feature, then split differently as shown in previous messages. + // Here we just clone the labels for the demonstration. + let targets = dataset.targets().mapv(|x| x as f64); + + (features, targets) + } + + #[test] + fn test_gradient_boost_with_iris() { + let (features, targets) = load_iris_data(); + + let mut gb_regressor = GBRegressor::new(50, 0.1, 4, 10, 3); + gb_regressor.fit(&features, &targets); + let predictions = gb_regressor.predict(&features); + + // Define a tolerance level + let tolerance = 0.01; // Tolerance level for correct classification + let mut correct = 0; + let mut incorrect = 0; + + // Count correct and incorrect predictions + for (&actual, &predicted) in targets.iter().zip(predictions.iter()) { + if (predicted - actual).abs() < tolerance { + correct += 1; + } else { + incorrect += 1; + } + } + + println!("Correct predictions: {}", correct); + println!("Incorrect predictions: {}", incorrect); + + let rmse = (&predictions - &targets) + .mapv(|a| a.powi(2)) + .mean() + .unwrap() + .sqrt(); + + println!("RMSE: {:?}", rmse); + } +} diff --git a/algorithms/linfa-ensemble/src/gradient_boost/mod.rs b/algorithms/linfa-ensemble/src/gradient_boost/mod.rs new file mode 100644 index 000000000..d46b6269f --- /dev/null +++ b/algorithms/linfa-ensemble/src/gradient_boost/mod.rs @@ -0,0 +1,5 @@ +mod algorithm; +// mod hyperparams; + +pub use algorithm::*; +// pub use hyperparams::*; diff --git a/algorithms/linfa-ensemble/src/lib.rs b/algorithms/linfa-ensemble/src/lib.rs index 3364ca14e..de31e5cc8 100644 --- a/algorithms/linfa-ensemble/src/lib.rs +++ b/algorithms/linfa-ensemble/src/lib.rs @@ -1,6 +1,11 @@ -mod random_forest; mod adaboost; -pub use random_forest::*; +mod random_forest; +// mod random_forest_regressor; +mod gradient_boost; + pub use adaboost::*; +pub use random_forest::*; +// pub use random_forest_regressor::*; +pub use gradient_boost::*; pub use linfa::error::Result; diff --git a/algorithms/linfa-ensemble/src/random_forest/algorithm.rs b/algorithms/linfa-ensemble/src/random_forest/algorithm.rs index 3f5d80b4e..2f7ef3108 100644 --- a/algorithms/linfa-ensemble/src/random_forest/algorithm.rs +++ b/algorithms/linfa-ensemble/src/random_forest/algorithm.rs @@ -3,17 +3,17 @@ use linfa::{ error::{Error, Result}, traits::*, DatasetBase, ParamGuard, - }; - use ndarray::{Array, Array2, Axis, Dimension}; - use 
rand::rngs::ThreadRng; - use rand::Rng; - use std::{cmp::Eq, collections::HashMap, hash::Hash}; - - pub struct EnsembleLearner { +}; +use ndarray::{Array, Array2, Axis, Dimension}; +use rand::rngs::ThreadRng; +use rand::Rng; +use std::{cmp::Eq, collections::HashMap, hash::Hash}; + +pub struct EnsembleLearner { pub models: Vec, - } - - impl EnsembleLearner { +} + +impl EnsembleLearner { // Generates prediction iterator returning predictions from each model pub fn generate_predictions<'b, R: Records, T>( &'b self, @@ -24,7 +24,7 @@ use linfa::{ { self.models.iter().map(move |m| m.predict(x)) } - + // Consumes prediction iterator to return all predictions pub fn aggregate_predictions( &self, @@ -43,11 +43,11 @@ use linfa::{ ::Elem: Copy + Eq + Hash, { let mut prediction_maps = Vec::new(); - + for y in ys { let targets = y.as_targets(); let no_targets = targets.shape()[0]; - + for i in 0..no_targets { if prediction_maps.len() == i { prediction_maps.push(HashMap::new()); @@ -57,21 +57,21 @@ use linfa::{ .or_insert(0) += 1; } } - + prediction_maps.into_iter().map(|xs| { let mut xs: Vec<_> = xs.into_iter().collect(); xs.sort_by(|(_, x), (_, y)| y.cmp(x)); xs }) } - } - - impl PredictInplace, T> for EnsembleLearner - where +} + +impl PredictInplace, T> for EnsembleLearner +where M: PredictInplace, T>, ::Elem: Copy + Eq + Hash, T: AsTargets + AsTargetsMut::Elem>, - { +{ fn predict_inplace(&self, x: &Array2, y: &mut T) { let mut y_array = y.as_targets_mut(); assert_eq!( @@ -79,10 +79,10 @@ use linfa::{ y_array.len_of(Axis(0)), "The number of data points must match the number of outputs." ); - + let mut predictions = self.generate_predictions(x); let aggregated_predictions = self.aggregate_predictions(&mut predictions); - + for (target, output) in y_array .axis_iter_mut(Axis(0)) .zip(aggregated_predictions.into_iter()) @@ -92,30 +92,30 @@ use linfa::{ } } } - + fn default_target(&self, x: &Array2) -> T { self.models[0].default_target(x) } - } - - #[derive(Clone, Copy, Debug, PartialEq)] - pub struct EnsembleLearnerValidParams { +} + +#[derive(Clone, Copy, Debug, PartialEq)] +pub struct EnsembleLearnerValidParams { pub ensemble_size: usize, pub bootstrap_proportion: f64, pub model_params: P, pub rng: R, - } - - #[derive(Clone, Copy, Debug, PartialEq)] - pub struct EnsembleLearnerParams(EnsembleLearnerValidParams); - - impl

EnsembleLearnerParams { +} + +#[derive(Clone, Copy, Debug, PartialEq)] +pub struct EnsembleLearnerParams(EnsembleLearnerValidParams); + +impl

EnsembleLearnerParams { pub fn new(model_params: P) -> EnsembleLearnerParams { return Self::new_fixed_rng(model_params, rand::thread_rng()); } - } - - impl EnsembleLearnerParams { +} + +impl EnsembleLearnerParams { pub fn new_fixed_rng(model_params: P, rng: R) -> EnsembleLearnerParams { Self(EnsembleLearnerValidParams { ensemble_size: 1, @@ -124,22 +124,22 @@ use linfa::{ rng: rng, }) } - + pub fn ensemble_size(mut self, size: usize) -> Self { self.0.ensemble_size = size; self } - + pub fn bootstrap_proportion(mut self, proportion: f64) -> Self { self.0.bootstrap_proportion = proportion; self } - } - - impl ParamGuard for EnsembleLearnerParams { +} + +impl ParamGuard for EnsembleLearnerParams { type Checked = EnsembleLearnerValidParams; type Error = Error; - + fn check_ref(&self) -> Result<&Self::Checked> { if self.0.bootstrap_proportion > 1.0 || self.0.bootstrap_proportion <= 0.0 { Err(Error::Parameters(format!( @@ -155,45 +155,45 @@ use linfa::{ Ok(&self.0) } } - + fn check(self) -> Result { self.check_ref()?; Ok(self.0) } - } - - impl, T::Owned, Error>, R: Rng + Clone> Fit, T, Error> +} + +impl, T::Owned, Error>, R: Rng + Clone> Fit, T, Error> for EnsembleLearnerValidParams - where +where D: Clone, T: FromTargetArrayOwned, T::Elem: Copy + Eq + Hash, T::Owned: AsTargets, - { +{ type Object = EnsembleLearner; - + fn fit( &self, dataset: &DatasetBase, T>, ) -> core::result::Result { let mut models = Vec::new(); let mut rng = self.rng.clone(); - + let dataset_size = ((dataset.records.nrows() as f64) * self.bootstrap_proportion).ceil() as usize; - + let iter = dataset.bootstrap_samples(dataset_size, &mut rng); - // let mut count = 0; + // let mut count = 0; for train in iter { - // count += 1; + // count += 1; let model = self.model_params.fit(&train).unwrap(); models.push(model); - + if models.len() == self.ensemble_size { break; } } - + Ok(EnsembleLearner { models }) } - } \ No newline at end of file +} diff --git a/algorithms/linfa-ensemble/src/random_forest/mod.rs b/algorithms/linfa-ensemble/src/random_forest/mod.rs index e9f7752a0..1eb472a96 100644 --- a/algorithms/linfa-ensemble/src/random_forest/mod.rs +++ b/algorithms/linfa-ensemble/src/random_forest/mod.rs @@ -1,3 +1,5 @@ mod algorithm; +mod random_forest_regressor; -pub use algorithm::*; \ No newline at end of file +pub use algorithm::*; +pub use random_forest_regressor::*; diff --git a/algorithms/linfa-ensemble/src/random_forest/random_forest_regressor.rs b/algorithms/linfa-ensemble/src/random_forest/random_forest_regressor.rs new file mode 100644 index 000000000..7cfa7d8db --- /dev/null +++ b/algorithms/linfa-ensemble/src/random_forest/random_forest_regressor.rs @@ -0,0 +1,325 @@ +use linfa::prelude::*; +use linfa_datasets::iris; +use ndarray::{Array1, Array2, Axis}; +use rand::rngs::ThreadRng; +use rand::seq::SliceRandom; +use rand::Rng; + +pub struct DecisionTreeRegressor { + max_depth: usize, + min_samples_split: usize, + tree: Option, +} + +#[derive(Debug)] +pub struct TreeNode { + feature: usize, + value: f64, + left: Box>, + right: Box>, + output: Option, +} + +impl DecisionTreeRegressor { + pub fn new(max_depth: usize, min_samples_split: usize) -> Self { + DecisionTreeRegressor { + max_depth, + min_samples_split, + tree: None, + } + } + + pub fn fit(&mut self, features: &Array2, targets: &Array1) { + self.tree = Some(self.build_tree(features, targets, 0)); + } + + fn build_tree(&self, features: &Array2, targets: &Array1, depth: usize) -> TreeNode { + if depth >= self.max_depth || features.nrows() < self.min_samples_split { 
+ return TreeNode { + feature: 0, + value: 0.0, + left: Box::new(None), + right: Box::new(None), + output: Some(Self::mean(targets)), + }; + } + + let (best_feature, best_value) = self.best_split(features, targets); + let (left_idxs, right_idxs) = Self::split_dataset(features, best_feature, best_value); + + let left_tree = if !left_idxs.is_empty() { + let left_features = features.select(Axis(0), &left_idxs); + let left_targets = targets.select(Axis(0), &left_idxs); + Some(self.build_tree(&left_features, &left_targets, depth + 1)) + } else { + None + }; + + let right_tree = if !right_idxs.is_empty() { + let right_features = features.select(Axis(0), &right_idxs); + let right_targets = targets.select(Axis(0), &right_idxs); + Some(self.build_tree(&right_features, &right_targets, depth + 1)) + } else { + None + }; + + TreeNode { + feature: best_feature, + value: best_value, + left: Box::new(left_tree), + right: Box::new(right_tree), + output: None, // Output is now only assigned at leaves + } + } + + fn best_split(&self, features: &Array2, targets: &Array1) -> (usize, f64) { + let mut best_feature = 0; + let mut best_value = 0.0; + let mut best_mse = f64::INFINITY; + + for feature_idx in 0..features.ncols() { + let mut possible_values = features.column(feature_idx).to_vec(); + possible_values.sort_unstable_by(|a, b| a.partial_cmp(b).unwrap()); + possible_values.dedup(); + + for &value in possible_values.iter() { + let (left_idxs, right_idxs) = Self::split_dataset(features, feature_idx, value); + if !left_idxs.is_empty() && !right_idxs.is_empty() { + let left_targets = targets.select(Axis(0), &left_idxs); + let right_targets = targets.select(Axis(0), &right_idxs); + let mse = Self::calculate_mse(&left_targets, &right_targets); + println!("Feature: {}, Value: {}, MSE: {}", feature_idx, value, mse); // Debug statement + if mse < best_mse { + best_mse = mse; + best_feature = feature_idx; + best_value = value; + } + } + } + } + + (best_feature, best_value) + } + + pub fn predict(&self, features: &Array2) -> Array1 { + let mut predictions = Array1::::zeros(features.nrows()); + for (i, feature_row) in features.axis_iter(Axis(0)).enumerate() { + let mut node = &self.tree; + while let Some(ref n) = *node { + if let Some(output) = n.output { + predictions[i] = output; + break; // Break if a leaf node with output is reached. 
+ } + node = if feature_row[n.feature] <= n.value { + &n.left + } else { + &n.right + }; + } + } + predictions + } + + fn split_dataset( + features: &Array2, + feature_idx: usize, + value: f64, + ) -> (Vec, Vec) { + let mut left_idxs = Vec::new(); + let mut right_idxs = Vec::new(); + + for (idx, feature_row) in features.axis_iter(Axis(0)).enumerate() { + if feature_row[feature_idx] <= value { + left_idxs.push(idx); + } else { + right_idxs.push(idx); + } + } + + (left_idxs, right_idxs) + } + + fn calculate_mse(left_targets: &Array1, right_targets: &Array1) -> f64 { + let left_mean = Self::mean(left_targets); + let right_mean = Self::mean(right_targets); + let left_mse: f64 = left_targets.iter().map(|&x| (x - left_mean).powi(2)).sum(); + let right_mse: f64 = right_targets + .iter() + .map(|&x| (x - right_mean).powi(2)) + .sum(); + (left_mse + right_mse) / (left_targets.len() + right_targets.len()) as f64 + } + + fn mean(values: &Array1) -> f64 { + values.mean().unwrap_or(0.0) + } +} + +// impl DecisionTreeRegressor { +// pub fn predict(&self, features: &Array2) -> Array1 { +// let mut predictions = Array1::::zeros(features.nrows()); +// for (i, feature_row) in features.axis_iter(Axis(0)).enumerate() { +// let mut node = &self.tree; +// while let Some(ref n) = *node { +// if feature_row[n.feature] <= n.value { +// node = &n.left; +// } else { +// node = &n.right; +// } +// } +// predictions[i] = if let Some(ref n) = *node { n.output } else { 0.0 }; +// } +// predictions +// } +// } + +pub struct RandomForestRegressor { + trees: Vec, + num_trees: usize, + max_features: usize, // Maximum number of features to consider for each split + max_depth: usize, + min_samples_split: usize, + rng: ThreadRng, +} + +impl RandomForestRegressor { + pub fn new( + num_trees: usize, + max_features: usize, + max_depth: usize, + min_samples_split: usize, + ) -> Self { + let rng = rand::thread_rng(); + RandomForestRegressor { + trees: Vec::with_capacity(num_trees), + num_trees, + max_features, + max_depth, + min_samples_split, + rng, + } + } + + pub fn fit(&mut self, features: &Array2, targets: &Array1) { + for _ in 0..self.num_trees { + let tree = self.create_and_train_tree(features, targets); + self.trees.push(tree); + } + } + + fn create_and_train_tree( + &mut self, + features: &Array2, + targets: &Array1, + ) -> DecisionTreeRegressor { + let num_samples = features.nrows(); + let indices: Vec = (0..num_samples).collect(); + let sampled_indices = indices + .as_slice() + .choose_multiple(&mut self.rng, num_samples) + .cloned() + .collect::>(); + + let sampled_features = features.select(Axis(0), &sampled_indices); + let sampled_targets = targets.select(Axis(0), &sampled_indices); + + let mut tree = DecisionTreeRegressor::new(self.max_depth, self.min_samples_split); + tree.fit(&sampled_features, &sampled_targets); + tree + } + + pub fn predict(&self, features: &Array2) -> Array1 { + let mut predictions: Vec> = Vec::new(); + for tree in &self.trees { + predictions.push(tree.predict(features)); + } + + let num_samples = features.nrows(); + let mut final_predictions = Array1::::zeros(num_samples); + for prediction in predictions { + final_predictions += &prediction; + } + final_predictions /= self.num_trees as f64; + final_predictions + } +} + +#[cfg(test)] +mod tests { + use super::*; + use approx::assert_relative_eq; + use ndarray::{array, Array1, Array2}; // For floating-point assertions + + #[test] + fn test_decision_tree_regressor() { + let features = Array2::from_shape_vec((4, 1), vec![1.0, 2.0, 4.0, 
5.0]).unwrap(); + let targets = Array1::from_vec(vec![1.1, 1.9, 3.9, 5.1]); + let mut regressor = DecisionTreeRegressor::new(3, 1); + + if features.nrows() != targets.len() { + panic!("Feature and target count mismatch"); + } + + regressor.fit(&features, &targets); + let predictions = regressor.predict(&features); + let rmse = calculate_rmse(&targets, &predictions); + + println!("RMSE: {:?}", rmse); + assert_relative_eq!(rmse, 0.0, epsilon = 0.1); + } + + fn calculate_rmse(actual: &Array1, predicted: &Array1) -> f64 { + let errors = actual - predicted; + let mse = errors.mapv(|e| e.powi(2)).mean().unwrap(); + mse.sqrt() + } + + fn load_iris_data() -> (Array2, Array1) { + // Load the dataset + let dataset = iris(); + + // Extract features; assuming all rows and all but the last column if last is target + let features = dataset.records().clone(); + + // Assuming the target are the labels, we need to convert them or use them as is depending on the use case. + // If you need to predict a feature, then split differently as shown in previous messages. + // Here we just clone the labels for the demonstration. + let targets = dataset.targets().mapv(|x| x as f64); + + (features, targets) + } + + #[test] + fn test_random_forest_with_iris() { + let (features, targets) = load_iris_data(); + + let mut forest = RandomForestRegressor::new(100, 4, 10, 3); + forest.fit(&features, &targets); + let predictions = forest.predict(&features); + + // Define a tolerance level + let tolerance = 0.1; // Tolerance level for correct classification + let mut correct = 0; + let mut incorrect = 0; + + // Count correct and incorrect predictions + for (&actual, &predicted) in targets.iter().zip(predictions.iter()) { + if (predicted - actual).abs() < tolerance { + correct += 1; + } else { + incorrect += 1; + } + } + + println!("Correct predictions: {}", correct); + println!("Incorrect predictions: {}", incorrect); + + let rmse = (&predictions - &targets) + .mapv(|a| a.powi(2)) + .mean() + .unwrap() + .sqrt(); + + println!("RMSE: {:?}", rmse); + } +} diff --git a/algorithms/linfa-ensemble/src/xgboost/README.md b/algorithms/linfa-ensemble/src/xgboost/README.md new file mode 100644 index 000000000..5e7cd5f42 --- /dev/null +++ b/algorithms/linfa-ensemble/src/xgboost/README.md @@ -0,0 +1,27 @@ +# XGBoost Tree Regression Based on [Video](https://www.youtube.com/watch?v=OtD8wVaFm6E&ab_channel=StatQuestwithJoshStarmer) + +- XGBoost Trees (different from our regular trees) +- Most common Regression for XGBoost: + ### First Epoch + - Initial prediction is made (by default 0.5) + - Each tree starts as single leaf + - Similarity score of residuals: (sum of residuals)^2 / (no. of residuals + lambda) + - lambda: regularization param (starts with 0) + ### Subsequent Epochs + - Take two datapoints (start with extreme label vals and move towards other end), find the avg + - Split the data based on the average of label vals and find simliarity of left, and right + - Find Gain using formula: Left_similarity + Right_similarity - Root_similarity + - The avg of the two datapoints which results in the largest gain will be used as the split. + - We continue to do this split for multiple levels deep (default max depth is 6). + #### Tree Pruning + - We keep a gamma value and on each branch of the (starting from the bottom most) tree, we compare the gain with gamma. If (gain - gamma < 0) on a branch, prune that branch. + > Side NOTE: + >- A lambda (regularization) helps removing the outliers in the data, preventing overfitting. 
It decreases the gain value on each branch. + >- Regularization also helps in decreasing the similarity score. And the amount of decrease in scores is inversely proportional to the number of residuals in the node. + >- In the video example, if lambda is 1 and using a gamma even of value 0 leads to pruning of the tree branch since the gain becomes negative. That means when lambda + - Output value for the leaf nodes is calculated using formula (`sum of all residuals / (number of residuals + lambda)`). + - Learning rate (default 0.3) is applied to each leaf node and values are updated from the predicted value (`leaf_residual = prediced_val + eta x leaf_output`). + +# XGBoost Tree Classification +- Start with a prediction probability of 0.5. +- In each epoch, \ No newline at end of file diff --git a/datasets/data/iris.csv.gz b/datasets/data/iris.csv.gz index 700e93baf..97c2de270 100644 Binary files a/datasets/data/iris.csv.gz and b/datasets/data/iris.csv.gz differ diff --git a/datasets/data/winequality-red.csv.gz b/datasets/data/winequality-red.csv.gz index 7fc89bd71..32ca7d23d 100644 Binary files a/datasets/data/winequality-red.csv.gz and b/datasets/data/winequality-red.csv.gz differ diff --git a/decision_tree_example.tex b/decision_tree_example.tex new file mode 100644 index 000000000..be96ddd8e --- /dev/null +++ b/decision_tree_example.tex @@ -0,0 +1,53 @@ + +\documentclass[margin=10pt]{standalone} +\usepackage{tikz,forest} +\usetikzlibrary{arrows.meta} +\forestset{ +default preamble={ +before typesetting nodes={ + !r.replace by={[, coordinate, append]} +}, +where n children=0{ + tier=word, +}{ + %diamond, aspect=2, +}, +where level=0{}{ + if n=1{ + edge label={node[pos=.2, above] {Y}}, + }{ + edge label={node[pos=.2, above] {N}}, + } +}, +for tree={ + edge+={thick, -Latex}, + s sep'+=2cm, + draw, + thick, + edge path'={ (!u) -| (.parent)}, + align=center, +} +} +}\begin{document}\begin{forest}[Val($2$) $ \leq 2.45$ \\ Imp. $0.33$ + [Label: 0] + [Val($2$) $ \leq 4.85$ \\ Imp. $0.41$ + [Val($3$) $ \leq 1.65$ \\ Imp. $0.02$ + [Label: 1] + [Val($0$) $ \leq 5.95$ \\ Imp. $0.50$ + [Label: 1] + [Label: 2]]] + [Val($3$) $ \leq 1.75$ \\ Imp. $0.06$ + [Val($2$) $ \leq 5.05$ \\ Imp. $0.25$ + [Val($1$) $ \leq 2.55$ \\ Imp. $0.12$ + [Val($2$) $ \leq 4.95$ \\ Imp. 
$0.50$ + [Label: 1] + [Label: 2]] + [Label: 1]] + [Label: 2]] + [Label: 2]]]] +\node [anchor=north west] at (current bounding box.north east) {% + \begin{tabular}{c c c} + \multicolumn{3}{@{}l@{}}{Legend:}\\ + Imp.&:&Impurity decrease\\Var(2)&:&feature-2\\Var(3)&:&feature-3\\Var(1)&:&feature-1\\Var(0)&:&feature-0\\\end{tabular}}; + \end{forest} +\end{document} \ No newline at end of file diff --git a/src/dataset/impl_dataset.rs b/src/dataset/impl_dataset.rs index 837c47eff..bf5e04738 100644 --- a/src/dataset/impl_dataset.rs +++ b/src/dataset/impl_dataset.rs @@ -498,7 +498,7 @@ where .map(|_| rng.gen_range(0..self.nsamples())) .collect::>(); - let records = self.records().select(Axis(0), &indices); + let records = self.records().select(Axis(0), &indices); let targets = T::new_targets(self.as_targets().select(Axis(0), &indices)); DatasetBase::new(records, targets) diff --git a/src/dataset/impl_targets.rs b/src/dataset/impl_targets.rs index 8952ab56c..09b734ebd 100644 --- a/src/dataset/impl_targets.rs +++ b/src/dataset/impl_targets.rs @@ -2,8 +2,8 @@ use std::collections::HashMap; use super::{ AsMultiTargets, AsMultiTargetsMut, AsProbabilities, AsSingleTargets, AsSingleTargetsMut, - AsTargets, AsTargetsMut, CountedTargets, DatasetBase, FromTargetArray, FromTargetArrayOwned, Label, Labels, Pr, - TargetDim, + AsTargets, AsTargetsMut, CountedTargets, DatasetBase, FromTargetArray, FromTargetArrayOwned, + Label, Labels, Pr, TargetDim, }; use ndarray::{ Array, Array1, Array2, ArrayBase, ArrayView, ArrayViewMut, Axis, CowArray, Data, DataMut, @@ -43,7 +43,6 @@ impl<'a, L: Clone + 'a, S: Data, I: TargetDim> FromTargetArray<'a> for } } - impl, I: TargetDim> AsTargetsMut for ArrayBase { type Elem = L; type Ix = I; @@ -99,7 +98,6 @@ where } } - impl<'a, L: Label + 'a, T> FromTargetArray<'a> for CountedTargets where T: FromTargetArray<'a, Elem = L>, @@ -243,4 +241,4 @@ where feature_names: self.feature_names.clone(), } } -} \ No newline at end of file +} diff --git a/src/dataset/mod.rs b/src/dataset/mod.rs index 2164cdf8a..9db22a632 100644 --- a/src/dataset/mod.rs +++ b/src/dataset/mod.rs @@ -998,4 +998,4 @@ mod tests { let prob = -0.5; assert_abs_diff_eq!(Pr::new_unchecked(prob).0, prob); } -} \ No newline at end of file +}
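
The XGBoost notes added in `algorithms/linfa-ensemble/src/xgboost/README.md` state the similarity-score, gain, pruning, and leaf-output rules in prose only. A minimal standalone Rust sketch of those formulas follows; the function names, residual values, and the `lambda = 0.0` / `gamma = 0.0` choices are illustrative assumptions, not part of this patch:

```rust
/// Similarity score from the README: (sum of residuals)^2 / (number of residuals + lambda).
fn similarity_score(residuals: &[f64], lambda: f64) -> f64 {
    let sum: f64 = residuals.iter().sum();
    sum * sum / (residuals.len() as f64 + lambda)
}

/// Gain of a candidate split: left similarity + right similarity - root (parent) similarity.
fn gain(left: &[f64], right: &[f64], lambda: f64) -> f64 {
    let parent: Vec<f64> = left.iter().chain(right.iter()).copied().collect();
    similarity_score(left, lambda) + similarity_score(right, lambda)
        - similarity_score(&parent, lambda)
}

fn main() {
    // Residuals against the default initial prediction of 0.5, split by a candidate threshold.
    let left = [-0.5, -0.4];
    let right = [0.6, 0.7];
    let lambda = 0.0; // larger lambda shrinks the similarity scores and therefore the gain

    let g = gain(&left, &right, lambda);
    let gamma = 0.0;
    // The README's pruning rule: drop the branch when gain - gamma < 0.
    println!("gain = {g:.3}, prune = {}", g - gamma < 0.0);

    // Leaf output: sum of residuals / (number of residuals + lambda).
    let leaf_output = right.iter().sum::<f64>() / (right.len() as f64 + lambda);
    println!("leaf output = {leaf_output:.3}");
}
```

With a non-zero `lambda`, both similarity scores and the resulting gain shrink, which is why even `gamma = 0` can prune a branch, matching the side note in the README.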
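
Similarly, the stump weight computed in `adaboost/algorithm.rs` follows the SAMME rule, `alpha = ln(K - 1) + learning_rate * ln((1 - err) / err)`, after which misclassified sample weights are rescaled by `exp(alpha)` and renormalized. A small numeric sketch of that update, assuming three classes, the default `learning_rate = 0.5` from `AdaboostParams::new`, and an invented weighted error of `0.2`:

```rust
fn main() {
    let num_classes = 3_usize;   // e.g. the three iris classes used in the example
    let learning_rate = 0.5_f32; // default set in AdaboostParams::new
    let error = 0.2_f32;         // illustrative weighted error of one fitted stump

    // Stump weight (alpha), as computed in the fit loop above.
    let alpha = ((num_classes - 1) as f32).ln() + learning_rate * ((1.0 - error) / error).ln();

    // Up-weight the misclassified samples by exp(alpha), then renormalize so weights sum to 1.
    let weights = [0.25_f32; 4];
    let misclassified = [false, true, false, false];
    let mut updated: Vec<f32> = weights
        .iter()
        .zip(misclassified)
        .map(|(w, wrong)| if wrong { (w.ln() + alpha).exp() } else { *w })
        .collect();
    let total: f32 = updated.iter().sum();
    for w in &mut updated {
        *w /= total;
    }

    println!("alpha = {alpha:.3}");
    println!("updated weights = {updated:?}");
}
```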