Skip to content

Commit

Permalink
Merge pull request #4 from Sin317/km1139
Browse files Browse the repository at this point in the history
Introduced a Python API via PyO3, TikZ export, and parallelization for the AdaBoost and random-forest classifiers
  • Loading branch information
Sin317 authored May 6, 2024
2 parents 51fb2f7 + f24d601 commit bb5c2ab
Show file tree
Hide file tree
Showing 18 changed files with 574 additions and 60 deletions.
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ Cargo.lock
**/*.rs.bk
# vscode
.vscode

.DS_Store
# ctags
tags
*.npy
Expand Down
2 changes: 2 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ thiserror = "1.0"

criterion = { version = "0.4.0", optional = true }

rayon = "1.10.0"
[dependencies.serde_crate]
package = "serde"
optional = true
Expand All @@ -62,6 +63,7 @@ linfa-datasets = { path = "datasets", features = [
"iris",
"diabetes",
"generate",
"mnist",
] }
statrs = "0.16.0"

Expand Down
72 changes: 72 additions & 0 deletions algorithms/linfa-ensemble/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
/target

# Byte-compiled / optimized / DLL files
__pycache__/
.pytest_cache/
*.py[cod]

# C extensions
*.so

# Distribution / packaging
.Python
.venv/
env/
bin/
build/
develop-eggs/
dist/
eggs/
lib/
lib64/
parts/
sdist/
var/
include/
man/
venv/
*.egg-info/
.installed.cfg
*.egg

# Installer logs
pip-log.txt
pip-delete-this-directory.txt
pip-selfcheck.json

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.cache
nosetests.xml
coverage.xml

# Translations
*.mo

# Mr Developer
.mr.developer.cfg
.project
.pydevproject

# Rope
.ropeproject

# Django stuff:
*.log
*.pot

.DS_Store

# Sphinx documentation
docs/_build/

# PyCharm
.idea/

# VSCode
.vscode/

# Pyenv
.python-version
20 changes: 15 additions & 5 deletions algorithms/linfa-ensemble/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,13 @@ description = "A general method for creating ensemble classifiers"
license = "MIT/Apache-2.0"

repository = "https://github.com/rust-ml/linfa"
readme = "README.md"

keywords = ["machine-learning", "linfa", "ensemble"]
categories = ["algorithms", "mathematics", "science"]

# "cdylib" is necessary to produce a shared library for Python to import from.
crate-type = ["cdylib"]

[features]
default = []
serde = ["serde_crate", "ndarray/serde"]
Expand All @@ -26,14 +28,22 @@ features = ["std", "derive"]
[dependencies]
linfa = { version = "0.7.0", path = "../.." }
linfa-trees = { version = "0.7.0", path = "../linfa-trees"}
linfa-datasets = { version = "0.7.0", path = "../../datasets/", features = ["iris"] }
linfa-datasets = { version = "0.7.0", path = "../../datasets/", features = ["iris", "mnist"] }
ndarray = { version = "0.15" , features = ["rayon", "approx"]}
ndarray-rand = "0.14"
rand = { version = "0.8", features = ["small_rng"] }
pyo3 = { version = "0.21.2", features = ["extension-module"] }
rayon = {version = "1.10.0"}
approx = {version = "0.5"}

[dev-dependencies]
rand = { version = "0.8", features = ["small_rng"] }
linfa-datasets = { version = "0.7.0", path = "../../datasets/", features = ["iris"] }
ndarray = { version = "0.15" , features = ["rayon", "approx"]}
approx = {version = "0.5"}
linfa-datasets = { version = "0.7.0", path = "../../datasets/", features = ["iris", "mnist"] }
rayon = {version = "1.10.0"}
approx = {version = "0.5"}

[lib]
# The name of the native library. This is the name which will be used in Python to import the
# library (i.e. `import string_sum`). If you change this, you must also change the name of the
# `#[pymodule]` in `src/lib.rs`.
name = "linfa_ensemble"
18 changes: 14 additions & 4 deletions algorithms/linfa-ensemble/examples/adaboost.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
use std::fs::File;
use std::io::Write;
use std::{fs::File, io::Write};

use linfa_trees::DecisionTreeParams;
use ndarray_rand::rand::SeedableRng;
Expand All @@ -15,7 +14,8 @@ fn main() -> Result<()> {
.shuffle(&mut rng)
.split_with_ratio(0.8);

println!("Training model with Adaboost ...");
println!("IRIS DATA: Training model with Adaboost ...");

let ada_model = Adaboost::<f64, usize>::params()
.n_estimators(10)
.d_tree_params(
Expand All @@ -32,9 +32,19 @@ fn main() -> Result<()> {
println!("{:?}", cm);

println!(
"Test accuracy with Adaboost : {:.2}%",
"IRIS DATA: Test accuracy with Adaboost : {:.2}%",
100.0 * cm.accuracy()
);

let mut tikz = File::create("adaboost_example.tex").unwrap();
tikz.write_all(
ada_model
.export_to_tikz()
.with_legend()
.to_string()
.as_bytes(),
)
.unwrap();

Ok(())
}
16 changes: 9 additions & 7 deletions algorithms/linfa-ensemble/examples/random_forest.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
//! Random Forest
use linfa::prelude::{Predict, ToConfusionMatrix};
use linfa::traits::Fit;
use linfa::prelude::*;
use linfa_ensemble::EnsembleLearnerParams;
use linfa_trees::DecisionTree;
use ndarray_rand::rand::SeedableRng;
Expand All @@ -14,18 +13,21 @@ fn main() {

//Load dataset
let mut rng = SmallRng::seed_from_u64(42);
let (train, test) = linfa_datasets::iris()
.shuffle(&mut rng)
.split_with_ratio(0.7);

let (train, test) = linfa_datasets::mnist();

train.shuffle(&mut rng);
test.shuffle(&mut rng);

//Train ensemble learner model
let model = EnsembleLearnerParams::new(DecisionTree::params())
let model = EnsembleLearnerParams::new(DecisionTree::<f64, usize>::params())
.ensemble_size(ensemble_size)
.bootstrap_proportion(bootstrap_proportion)
.fit(&train)
.unwrap();
// println!("Done with Fit");
// //Return highest ranking predictions

//Return highest ranking predictions
let final_predictions_ensemble = model.predict(&test);
println!("Final Predictions: \n{:?}", final_predictions_ensemble);

Expand Down
15 changes: 15 additions & 0 deletions algorithms/linfa-ensemble/pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
[build-system]
requires = ["maturin>=1.5,<2.0"]
build-backend = "maturin"

[project]
name = "linfa-ensemble"
requires-python = ">=3.8"
classifiers = [
"Programming Language :: Rust",
"Programming Language :: Python :: Implementation :: CPython",
"Programming Language :: Python :: Implementation :: PyPy",
]
dynamic = ["version"]
[tool.maturin]
features = ["pyo3/extension-module"]
50 changes: 43 additions & 7 deletions algorithms/linfa-ensemble/src/adaboost/algorithm.rs
Original file line number Diff line number Diff line change
@@ -1,13 +1,10 @@
use std::{collections::HashMap, iter::zip};

use linfa::{dataset::Labels, error::Error, error::Result, traits::*, DatasetBase, Float, Label};
use linfa_trees::DecisionTree;

use super::AdaboostValidParams;
use super::Tikz;
use linfa::dataset::AsSingleTargets;
use linfa::{dataset::Labels, error::Error, error::Result, traits::*, DatasetBase, Float, Label};
use linfa_trees::DecisionTree;
use ndarray::{Array1, ArrayBase, Data, Ix2};
#[cfg(feature = "serde")]
use serde_crate::{Deserialize, Serialize};
use std::{collections::HashMap, iter::zip};
// adaboost will be a vector of stumps

// stump will contain a decision tree and a weight associated with that stump
Expand All @@ -25,6 +22,10 @@ pub struct Stump<F: Float, L: Label> {
}

impl<F: Float, L: Label + std::fmt::Debug> Stump<F, L> {
/// Returns a shared reference to the decision tree wrapped by this stump.
///
/// Exposed so callers (e.g. the TikZ exporter) can inspect the fitted weak
/// learner without taking ownership of it.
pub fn tree(&self) -> &DecisionTree<F, L> {
    &self.tree
}

/// Bundles a fitted decision tree with its associated weight into a `Stump`.
/// (Per the type's comment, a stump is a tree plus the weight the boosting
/// loop assigned to it.)
fn make_stump(tree: DecisionTree<F, L>, weight: f32) -> Self {
    Stump { tree, weight }
}
Expand Down Expand Up @@ -84,6 +85,15 @@ impl<F: Float, L: Label + Default, D: Data<Elem = F>> PredictInplace<ArrayBase<D
}
}

impl<F: Float, L: Label> Adaboost<F, L> {
    /// Returns the fitted weak learners (weighted stumps) of this ensemble.
    pub fn stumps(&self) -> &Vec<Stump<F, L>> {
        &self.stumps
    }

    /// Builds a TikZ/LaTeX exporter that borrows this fitted model, for
    /// rendering the ensemble's trees (see `export_to_tikz().to_string()`).
    pub fn export_to_tikz(&self) -> Tikz<'_, F, L> {
        // `self` is already `&Adaboost<F, L>`; writing `&self` here created a
        // needless `&&Adaboost` that only compiled via auto-deref coercion
        // (clippy::needless_borrow). Pass the reference through directly.
        Tikz::new(self)
    }
}

impl<'a, F: Float, L: Label + 'a + std::fmt::Debug, D, T> Fit<ArrayBase<D, Ix2>, T, Error>
for AdaboostValidParams<F, L>
where
Expand Down Expand Up @@ -218,4 +228,30 @@ mod tests {

Ok(())
}

/// Smoke test: fit a small AdaBoost ensemble on MNIST and run prediction on
/// the held-out split. No accuracy assertion — this only guards against
/// panics in `fit`/`predict` on a real dataset.
#[test]
fn mnist_test() {
    use ndarray_rand::rand::SeedableRng;
    use rand::rngs::SmallRng;

    // Fixed seed so the shuffle (and therefore the test) is reproducible.
    let mut rng = SmallRng::seed_from_u64(42);

    let (train, test) = linfa_datasets::mnist();
    // BUG FIX: `shuffle` consumes the dataset and returns the shuffled copy
    // (see the chained `.shuffle(&mut rng).split_with_ratio(..)` usage in the
    // iris test above). Calling it as a bare statement discarded the result
    // and moved out of a non-`mut` binding; the returned value must be rebound.
    let train = train.shuffle(&mut rng);
    let test = test.shuffle(&mut rng);

    println!("MNIST DATA: Training model with Adaboost ...");
    // Only 2 estimators to keep the test's runtime bounded; deep trees with
    // tiny weight thresholds let each weak learner fit MNIST at all.
    let ada_model = Adaboost::<f64, usize>::params()
        .n_estimators(2)
        .d_tree_params(
            DecisionTreeParams::new()
                .max_depth(Some(25))
                .min_weight_leaf(0.00001)
                .min_weight_split(0.00001),
        )
        .fit(&train)
        .unwrap();

    // Prediction output is intentionally unused — success is "did not panic".
    let _ada_pred_y = ada_model.predict(&test);
}
}
3 changes: 2 additions & 1 deletion algorithms/linfa-ensemble/src/adaboost/mod.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
//! AdaBoost: core algorithm, hyperparameter types, and TikZ export.

// Private submodules, grouped together...
mod algorithm;
mod hyperparams;
mod tikz;

// ...with their public items re-exported flat at the `adaboost` module root.
pub use algorithm::*;
pub use hyperparams::*;
pub use tikz::*;
Loading

0 comments on commit bb5c2ab

Please sign in to comment.