Implementation of AdaBoost Regressor, Random Forest Regressor, Decision Tree Regressor, and Visualizations #7

Merged 6 commits on May 7, 2024
1 change: 1 addition & 0 deletions Cargo.toml
@@ -64,6 +64,7 @@ linfa-datasets = { path = "datasets", features = [
     "diabetes",
     "generate",
     "mnist",
+    "boston",
 ] }
 statrs = "0.16.0"

8 changes: 6 additions & 2 deletions algorithms/linfa-ensemble/Cargo.toml
@@ -25,17 +25,21 @@ features = ["std", "derive"]
 [dependencies]
 linfa = { version = "0.7.0", path = "../.." }
 linfa-trees = { version = "0.7.0", path = "../linfa-trees"}
-linfa-datasets = { version = "0.7.0", path = "../../datasets/", features = ["iris", "mnist"] }
+serde = { version = "1.0", features = ["derive","std"] }
+linfa-datasets = { version = "0.7.0", path = "../../datasets/", features = ["iris", "mnist", "boston"] }
 ndarray = { version = "0.15" , features = ["rayon", "approx"]}
 ndarray-rand = "0.14"
 rand = { version = "0.8", features = ["small_rng"] }
 pyo3 = { version = "0.21.2", features = ["extension-module"] }
 rayon = {version = "1.10.0"}
 approx = {version = "0.5"}
+plotters = "0.3"
+ndarray-csv = "0.5" # Check for the latest or required version on crates.io
+csv = "1.1"

 [dev-dependencies]
 rand = { version = "0.8", features = ["small_rng"] }
-linfa-datasets = { version = "0.7.0", path = "../../datasets/", features = ["iris", "mnist"] }
+linfa-datasets = { version = "0.7.0", path = "../../datasets/", features = ["iris", "mnist", "boston"] }
 rayon = {version = "1.10.0"}
 approx = {version = "0.5"}

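With the `boston` feature enabled in both manifests, the Boston Housing data loads like any other linfa-datasets bundle. A minimal sketch, assuming only the `linfa_datasets::boston()` entry point that the tests below rely on:

use linfa_datasets::boston;

fn main() {
    // Load the Boston Housing dataset gated behind the new `boston` feature.
    let dataset = boston();
    // records() is the feature matrix; targets() holds the target values.
    println!("samples x features: {:?}", dataset.records().dim());
    println!("targets: {}", dataset.targets().len());
}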
74 changes: 74 additions & 0 deletions algorithms/linfa-ensemble/examples/adaboost_regressor.rs
@@ -0,0 +1,74 @@
#[cfg(test)]
mod tests {
    use linfa_datasets::{boston, diabetes};
    use linfa_ensemble::AdaBoostRegressor;

    #[test]
    fn test_adaboost_with_boston_housing() {
        // Load the dataset (features and targets)
        let dataset = boston();

        // Parameters for AdaBoost
        let n_estimators = 50;
        let learning_rate = 1.0;
        let max_depth = 4;
        let min_samples_split = 10;
        let random_state = 42; // Random state for reproducibility

        // Create AdaBoostRegressor instance
        let mut regressor =
            AdaBoostRegressor::new(n_estimators, learning_rate, random_state, max_depth, min_samples_split);

        // Fit the regressor to the Boston Housing dataset
        regressor.fit(dataset.records(), dataset.targets());

        // Make predictions
        let predictions = regressor.predict(dataset.records());

        // Calculate root mean squared error
        let mse = (dataset.targets() - &predictions)
            .mapv(|a| a.powi(2))
            .mean()
            .unwrap_or(0.0);
        let rmse = mse.sqrt();
        println!("Root Mean Squared Error: {}", rmse);

        // Check that the RMSE is below the acceptance threshold
        assert!(rmse < 25.0, "The RMSE should be lower than 25.0, but it was {}", rmse);
    }

    #[test]
    fn test_adaboost_with_diabetes() {
        // Load the dataset
        let dataset = diabetes();

        // Parameters for AdaBoost
        let n_estimators = 100;
        let learning_rate = 0.5;
        let max_depth = 3;
        let min_samples_split = 5;
        let random_state = 42;

        // Create AdaBoostRegressor instance
        let mut regressor =
            AdaBoostRegressor::new(n_estimators, learning_rate, random_state, max_depth, min_samples_split);

        // Fit the regressor to the Diabetes dataset
        regressor.fit(dataset.records(), dataset.targets());

        // Make predictions
        let predictions = regressor.predict(dataset.records());

        // Calculate root mean squared error
        let mse = (dataset.targets() - &predictions)
            .mapv(|a| a.powi(2))
            .mean()
            .unwrap_or(0.0);
        let rmse = mse.sqrt();
        println!("Root Mean Squared Error: {}", rmse);

        // Check that the RMSE is below the acceptance threshold
        assert!(rmse < 200.0, "The RMSE should be lower than 200.0, but it was {}", rmse);
    }
}
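Both tests inline the same MSE/RMSE computation. A small helper, mirroring the `calculate_rmse` used in the random-forest tests below, could factor it out; this is a sketch using only `ndarray`:

use ndarray::Array1;

// Root mean squared error, equivalent to the inline computation above.
fn calculate_rmse(actual: &Array1<f64>, predicted: &Array1<f64>) -> f64 {
    let mse = (actual - predicted).mapv(|e| e.powi(2)).mean().unwrap_or(0.0);
    mse.sqrt()
}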
164 changes: 164 additions & 0 deletions algorithms/linfa-ensemble/examples/random_forest_regressor.rs
@@ -0,0 +1,164 @@
// use linfa_ensemble::RandomForestRegressor;
// use ndarray::{Array1, Axis};
// use rand::seq::SliceRandom;
// use rand::thread_rng;

// fn main() {
//     // Number of trees in the forest
//     let num_trees = 100;
//     // Number of features to consider for each split
//     let max_features = 4; // Set to the number of features in your dataset or adjust as needed
//     // Maximum depth of each tree
//     let max_depth = 10;
//     // Minimum number of samples required to split a node
//     let min_samples_split = 5;

//     // Load the Diabetes dataset
//     let dataset = linfa_datasets::diabetes();
//     let dataset_cloned = dataset.clone();

//     // Extract features and targets
//     let features = dataset_cloned.records();
//     let targets = dataset.targets().mapv(|x| x as f64);

//     // Shuffle and split the data into train and test
//     let mut rng = thread_rng();
//     let mut indices: Vec<usize> = (0..features.nrows()).collect();
//     indices.shuffle(&mut rng);
//     let split_index = (features.nrows() as f64 * 0.8) as usize; // 80% train, 20% test
//     let train_indices = &indices[..split_index];
//     let test_indices = &indices[split_index..];

//     let train_features = features.select(Axis(0), train_indices);
//     let train_targets = targets.select(Axis(0), train_indices);
//     let test_features = features.select(Axis(0), test_indices);
//     let test_targets = targets.select(Axis(0), test_indices);

//     // Train the random forest regressor
//     let mut forest = RandomForestRegressor::new(num_trees, max_features, max_depth, min_samples_split);
//     forest.fit(&train_features, &train_targets);

//     // Predict on the test dataset
//     let predictions = forest.predict(&test_features);

//     // Evaluate performance
//     let mse = mean_squared_error(&test_targets, &predictions);
//     println!("Mean Squared Error: {}", mse);
// }

// fn mean_squared_error(actual: &Array1<f64>, predicted: &Array1<f64>) -> f64 {
//     let errors = actual - predicted;
//     let squared_errors = errors.mapv(|x| x.powi(2));
//     squared_errors.mean().unwrap()
// }

#[cfg(test)]
mod tests {
    use linfa_datasets::{diabetes, iris};
    use linfa_ensemble::{visualization, RandomForestRegressor};
    use ndarray::{Array1, Array2, Axis};

    /// Root mean squared error between actual and predicted targets.
    fn calculate_rmse(actual: &Array1<f64>, predicted: &Array1<f64>) -> f64 {
        let errors = actual - predicted;
        let mse = errors.mapv(|e| e.powi(2)).mean().unwrap();
        mse.sqrt()
    }

    fn load_iris_data() -> (Array2<f64>, Array1<f64>) {
        // Load the dataset
        let dataset = iris();

        // Use all columns of the records as features; the class labels are the targets
        let features = dataset.records().clone();
        let targets = dataset.targets().mapv(|x| x as f64);

        (features, targets)
    }

    fn load_diabetes_data() -> (Array2<f64>, Array1<f64>) {
        let dataset = diabetes();

        let features = dataset.records().clone();
        let targets = dataset.targets().mapv(|x| x as f64);

        (features, targets)
    }

    #[test]
    fn test_random_forest_with_diabetes() {
        let (features, targets) = load_diabetes_data();

        // Split data into training and testing sets
        let split_ratio = 0.7; // Use 70% of the data for training
        let split_index = (features.nrows() as f64 * split_ratio) as usize;
        let (train_features, test_features) = features.view().split_at(Axis(0), split_index);
        let (train_targets, test_targets) = targets.view().split_at(Axis(0), split_index);

        // (num_trees, max_features, max_depth, min_samples_split)
        let mut forest = RandomForestRegressor::new(100, 10, 5, 10);
        // Convert views to owned arrays before passing to fit
        forest.fit(&train_features.to_owned(), &train_targets.to_owned());
        let train_predictions = forest.predict(&train_features.to_owned());
        let test_predictions = forest.predict(&test_features.to_owned());

        // Evaluate the performance on the test set
        let test_rmse = calculate_rmse(&test_targets.to_owned(), &test_predictions);
        println!("Test RMSE for Diabetes Dataset: {:?}", test_rmse);

        // Assert that the RMSE is below an acceptable threshold
        assert!(test_rmse < 70.0, "The RMSE should be lower than 70.0, but it was {}", test_rmse);

        // Visualization of training and testing results
        visualization::plot_scatter(
            &train_targets.to_owned(),
            &train_predictions,
            &test_targets.to_owned(),
            &test_predictions,
            "diabetes_rf_scatter.png",
        )
        .unwrap();
    }


    #[test]
    fn test_random_forest_with_iris() {
        let (features, targets) = load_iris_data();

        let mut forest = RandomForestRegressor::new(100, 10, 3, 10);
        forest.fit(&features, &targets);
        let predictions = forest.predict(&features);

        // Tolerance level for counting a prediction as a correct classification
        let tolerance = 0.1;
        let mut correct = 0;
        let mut incorrect = 0;

        // Count correct and incorrect predictions
        for (&actual, &predicted) in targets.iter().zip(predictions.iter()) {
            if (predicted - actual).abs() < tolerance {
                correct += 1;
            } else {
                incorrect += 1;
            }
        }

        println!("Correct predictions: {}", correct);
        println!("Incorrect predictions: {}", incorrect);

        let rmse = (&predictions - &targets)
            .mapv(|a| a.powi(2))
            .mean()
            .unwrap()
            .sqrt();

        println!("RMSE: {:?}", rmse);
    }
}
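As committed, this examples/ file contains only a test module, so `cargo run --example random_forest_regressor` has nothing to execute. A minimal `main` sketch, assuming only the public API the tests exercise (`RandomForestRegressor::new(num_trees, max_features, max_depth, min_samples_split)`, `fit`, and `predict`):

use linfa_ensemble::RandomForestRegressor;
use ndarray::Axis;

fn main() {
    // Hold out the last 30% of the Diabetes dataset for testing.
    let dataset = linfa_datasets::diabetes();
    let split = (dataset.records().nrows() as f64 * 0.7) as usize;
    let (train_x, test_x) = dataset.records().view().split_at(Axis(0), split);
    let (train_y, test_y) = dataset.targets().view().split_at(Axis(0), split);

    // (num_trees, max_features, max_depth, min_samples_split)
    let mut forest = RandomForestRegressor::new(100, 10, 5, 10);
    forest.fit(&train_x.to_owned(), &train_y.to_owned());

    let predictions = forest.predict(&test_x.to_owned());
    let rmse = (&test_y.to_owned() - &predictions)
        .mapv(|e| e.powi(2))
        .mean()
        .unwrap()
        .sqrt();
    println!("Test RMSE: {:.3}", rmse);
}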