Merge pull request #1 from smartcorelib/development
update
z1queue committed Feb 26, 2021
2 parents cd44f1d + 1b42f8a commit 023b449
Showing 75 changed files with 6,158 additions and 790 deletions.
20 changes: 18 additions & 2 deletions .circleci/config.yml
@@ -6,6 +6,8 @@ workflows:
jobs:
- build
- clippy
+- coverage

jobs:
build:
docker:
@@ -21,10 +23,10 @@ jobs:
command: cargo fmt -- --check
- run:
name: Stable Build
-command: cargo build --features "nalgebra-bindings ndarray-bindings"
+command: cargo build --all-features
- run:
name: Test
-command: cargo test --features "nalgebra-bindings ndarray-bindings"
+command: cargo test --all-features
- save_cache:
key: project-cache
paths:
@@ -41,3 +43,17 @@ jobs:
- run:
name: Run cargo clippy
command: cargo clippy --all-features -- -Drust-2018-idioms -Dwarnings

+coverage:
+machine: true
+steps:
+- checkout
+- run:
+name: Generate report
+command: >
+docker run --security-opt seccomp=unconfined -v $PWD:/volume
+xd009642/tarpaulin:latest-nightly cargo tarpaulin -v --ciserver circle-ci
+--out Lcov --all-features -- --test-threads 1
+- run:
+name: Upload
+command: bash <(curl -s https://codecov.io/bash) -Z -f
16 changes: 10 additions & 6 deletions Cargo.toml
@@ -2,7 +2,7 @@
name = "smartcore"
description = "The most advanced machine learning library in rust."
homepage = "https://smartcorelib.org"
version = "0.1.0"
version = "0.2.0"
authors = ["SmartCore Developers"]
edition = "2018"
license = "Apache-2.0"
@@ -19,14 +19,13 @@ nalgebra-bindings = ["nalgebra"]
datasets = []

[dependencies]
ndarray = { version = "0.13", optional = true }
nalgebra = { version = "0.22.0", optional = true }
ndarray = { version = "0.14", optional = true }
nalgebra = { version = "0.23.0", optional = true }
num-traits = "0.2.12"
num = "0.3.0"
rand = "0.7.3"
rand_distr = "0.3.0"
-serde = { version = "1.0.115", features = ["derive"] }
-serde_derive = "1.0.115"
+serde = { version = "1.0.115", features = ["derive"], optional = true }

[dev-dependencies]
criterion = "0.3"
@@ -35,4 +34,9 @@ bincode = "1.3.1"

[[bench]]
name = "distance"
-harness = false
+harness = false
+
+[[bench]]
+name = "naive_bayes"
+harness = false
+required-features = ["ndarray-bindings", "nalgebra-bindings"]
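
Note: serde is now an optional dependency and serde_derive is dropped, so serialization support becomes an opt-in feature. A minimal sketch of the gating pattern (the same one the source files below adopt):

#[cfg(feature = "serde")]
use serde::{Deserialize, Serialize};

// The serde derives are compiled only when the `serde` feature is enabled;
// Debug is derived unconditionally.
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[derive(Debug)]
pub struct Example {
    value: f64,
}
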
73 changes: 73 additions & 0 deletions benches/naive_bayes.rs
@@ -0,0 +1,73 @@
use criterion::BenchmarkId;
use criterion::{black_box, criterion_group, criterion_main, Criterion};

use nalgebra::DMatrix;
use ndarray::Array2;
use smartcore::linalg::naive::dense_matrix::DenseMatrix;
use smartcore::linalg::BaseMatrix;
use smartcore::linalg::BaseVector;
use smartcore::naive_bayes::gaussian::GaussianNB;

pub fn gaussian_naive_bayes_fit_benchmark(c: &mut Criterion) {
let mut group = c.benchmark_group("GaussianNB::fit");

for n_samples in [100_usize, 1000_usize, 10000_usize].iter() {
for n_features in [10_usize, 100_usize, 1000_usize].iter() {
let x = DenseMatrix::<f64>::rand(*n_samples, *n_features);
let y: Vec<f64> = (0..*n_samples)
.map(|i| (i % *n_samples / 5_usize) as f64)
.collect::<Vec<f64>>();
group.bench_with_input(
BenchmarkId::from_parameter(format!(
"n_samples: {}, n_features: {}",
n_samples, n_features
)),
n_samples,
|b, _| {
b.iter(|| {
GaussianNB::fit(black_box(&x), black_box(&y), Default::default()).unwrap();
})
},
);
}
}
group.finish();
}

pub fn gaussian_naive_matrix_datastructure(c: &mut Criterion) {
let mut group = c.benchmark_group("GaussianNB");
let classes = (0..10000).map(|i| (i % 25) as f64).collect::<Vec<f64>>();

group.bench_function("DenseMatrix", |b| {
let x = DenseMatrix::<f64>::rand(10000, 500);
let y = <DenseMatrix<f64> as BaseMatrix<f64>>::RowVector::from_array(&classes);

b.iter(|| {
GaussianNB::fit(black_box(&x), black_box(&y), Default::default()).unwrap();
})
});

group.bench_function("ndarray", |b| {
let x = Array2::<f64>::rand(10000, 500);
let y = <Array2<f64> as BaseMatrix<f64>>::RowVector::from_array(&classes);

b.iter(|| {
GaussianNB::fit(black_box(&x), black_box(&y), Default::default()).unwrap();
})
});

group.bench_function("ndalgebra", |b| {
let x = DMatrix::<f64>::rand(10000, 500);
let y = <DMatrix<f64> as BaseMatrix<f64>>::RowVector::from_array(&classes);

b.iter(|| {
GaussianNB::fit(black_box(&x), black_box(&y), Default::default()).unwrap();
})
});
}
criterion_group!(
benches,
gaussian_naive_bayes_fit_benchmark,
gaussian_naive_matrix_datastructure
);
criterion_main!(benches);
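
Note: the benchmark fits the same GaussianNB model through three matrix backends. For reference, a minimal fit/predict sketch with the DenseMatrix backend; it assumes the 0.2 API used above, where a plain Vec<f64> serves as the label vector, and a predict method returning the same vector type:

use smartcore::linalg::naive::dense_matrix::DenseMatrix;
use smartcore::naive_bayes::gaussian::GaussianNB;

fn main() {
    // Two small 2-D blobs, one per class.
    let x = DenseMatrix::from_2d_array(&[
        &[1.0, 2.0],
        &[2.0, 1.0],
        &[8.0, 9.0],
        &[9.0, 8.0],
    ]);
    let y = vec![0.0, 0.0, 1.0, 1.0];
    let nb = GaussianNB::fit(&x, &y, Default::default()).unwrap();
    // Predict the training points back.
    let labels = nb.predict(&x).unwrap();
    println!("{:?}", labels);
}
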
33 changes: 15 additions & 18 deletions src/algorithm/neighbour/bbd_tree.rs
@@ -44,10 +44,7 @@ impl<T: RealNumber> BBDTree<T> {

let (n, _) = data.shape();

-let mut index = vec![0; n];
-for i in 0..n {
-index[i] = i;
-}
+let index = (0..n).collect::<Vec<_>>();

let mut tree = BBDTree {
nodes,
Expand All @@ -64,7 +61,7 @@ impl<T: RealNumber> BBDTree<T> {

pub(in crate) fn clustering(
&self,
-centroids: &Vec<Vec<T>>,
+centroids: &[Vec<T>],
sums: &mut Vec<Vec<T>>,
counts: &mut Vec<usize>,
membership: &mut Vec<usize>,
@@ -92,8 +89,8 @@ impl<T: RealNumber> BBDTree<T> {
fn filter(
&self,
node: usize,
-centroids: &Vec<Vec<T>>,
-candidates: &Vec<usize>,
+centroids: &[Vec<T>],
+candidates: &[usize],
k: usize,
sums: &mut Vec<Vec<T>>,
counts: &mut Vec<usize>,
Expand All @@ -117,15 +114,15 @@ impl<T: RealNumber> BBDTree<T> {
let mut new_candidates = vec![0; k];
let mut newk = 0;

-for i in 0..k {
+for candidate in candidates.iter().take(k) {
if !BBDTree::prune(
&self.nodes[node].center,
&self.nodes[node].radius,
centroids,
closest,
-candidates[i],
+*candidate,
) {
-new_candidates[newk] = candidates[i];
+new_candidates[newk] = *candidate;
newk += 1;
}
}
@@ -166,9 +163,9 @@ impl<T: RealNumber> BBDTree<T> {
}

fn prune(
-center: &Vec<T>,
-radius: &Vec<T>,
-centroids: &Vec<Vec<T>>,
+center: &[T],
+radius: &[T],
+centroids: &[Vec<T>],
best_index: usize,
test_index: usize,
) -> bool {
@@ -285,8 +282,8 @@ impl<T: RealNumber> BBDTree<T> {
}

let mut mean = vec![T::zero(); d];
-for i in 0..d {
-mean[i] = node.sum[i] / T::from(node.count).unwrap();
+for (i, mean_i) in mean.iter_mut().enumerate().take(d) {
+*mean_i = node.sum[i] / T::from(node.count).unwrap();
}

node.cost = BBDTree::node_cost(&self.nodes[node.lower.unwrap()], &mean)
@@ -295,11 +292,11 @@ impl<T: RealNumber> BBDTree<T> {
self.add_node(node)
}

-fn node_cost(node: &BBDTreeNode<T>, center: &Vec<T>) -> T {
+fn node_cost(node: &BBDTreeNode<T>, center: &[T]) -> T {
let d = center.len();
let mut scatter = T::zero();
-for i in 0..d {
-let x = (node.sum[i] / T::from(node.count).unwrap()) - center[i];
+for (i, center_i) in center.iter().enumerate().take(d) {
+let x = (node.sum[i] / T::from(node.count).unwrap()) - *center_i;
scatter += x * x;
}
node.cost + T::from(node.count).unwrap() * scatter
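
Note: every change in this file applies one clippy idiom (the commit builds with -Dwarnings, and mod.rs below touches the ptr_arg lint): take &[T] instead of &Vec<T>, and iterate instead of indexing. A self-contained sketch of why the slice signature is the more flexible one:

// A &[f64] parameter accepts both Vecs (via deref coercion) and plain slices;
// iterating instead of indexing also sidesteps per-element bounds checks
// that indexed loops can incur.
fn scaled_sum(values: &[f64], factor: f64) -> f64 {
    values.iter().map(|v| v * factor).sum()
}

fn main() {
    let owned: Vec<f64> = vec![1.0, 2.0, 3.0];
    assert_eq!(scaled_sum(&owned, 2.0), 12.0); // a Vec coerces to &[f64]
    assert_eq!(scaled_sum(&[1.0, 2.0, 3.0], 2.0), 12.0); // a slice works directly
    println!("ok");
}
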
16 changes: 11 additions & 5 deletions src/algorithm/neighbour/cover_tree.rs
@@ -6,6 +6,8 @@
//! use smartcore::algorithm::neighbour::cover_tree::*;
//! use smartcore::math::distance::Distance;
//!
+//! #[derive(Clone)]
//! struct SimpleDistance {} // Our distance function
//!
//! impl Distance<i32, f64> for SimpleDistance {
@@ -23,6 +24,7 @@
//! ```
use std::fmt::Debug;

+#[cfg(feature = "serde")]
use serde::{Deserialize, Serialize};

use crate::algorithm::sort::heap_select::HeapSelection;
@@ -31,7 +33,8 @@ use crate::math::distance::Distance;
use crate::math::num::RealNumber;

/// Implements Cover Tree algorithm
-#[derive(Serialize, Deserialize, Debug)]
+#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
+#[derive(Debug)]
pub struct CoverTree<T, F: RealNumber, D: Distance<T, F>> {
base: F,
inv_log_base: F,
@@ -55,7 +58,8 @@ impl<T, F: RealNumber, D: Distance<T, F>> PartialEq for CoverTree<T, F, D> {
}
}

-#[derive(Debug, Serialize, Deserialize)]
+#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
+#[derive(Debug)]
struct Node<F: RealNumber> {
idx: usize,
max_dist: F,
@@ -64,7 +68,7 @@ struct Node<F: RealNumber> {
scale: i64,
}

-#[derive(Debug, Serialize, Deserialize)]
+#[derive(Debug)]
struct DistanceSet<F: RealNumber> {
idx: usize,
dist: Vec<F>,
@@ -436,7 +440,7 @@ impl<T: Debug + PartialEq, F: RealNumber, D: Distance<T, F>> CoverTree<T, F, D>
}
}

-fn max(&self, distance_set: &Vec<DistanceSet<F>>) -> F {
+fn max(&self, distance_set: &[DistanceSet<F>]) -> F {
let mut max = F::zero();
for n in distance_set {
if max < n.dist[n.dist.len() - 1] {
Expand All @@ -453,7 +457,8 @@ mod tests {
use super::*;
use crate::math::distance::Distances;

-#[derive(Debug, Serialize, Deserialize)]
+#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
+#[derive(Debug, Clone)]
struct SimpleDistance {}

impl Distance<i32, f64> for SimpleDistance {
Expand Down Expand Up @@ -499,6 +504,7 @@ mod tests {
}

#[test]
+#[cfg(feature = "serde")]
fn serde() {
let data = vec![1, 2, 3, 4, 5, 6, 7, 8, 9];

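
Note: the doc example and the test struct gain #[derive(Clone)], presumably because Distance implementors must now be Clone. A self-contained sketch in the spirit of the module's doc example; the CoverTree::new and find(&query, k) calls are assumed from that example rather than shown in full in this diff:

use smartcore::algorithm::neighbour::cover_tree::CoverTree;
use smartcore::math::distance::Distance;

// Our distance function, as in the doc example; Clone is now required.
#[derive(Clone)]
struct SimpleDistance {}

impl Distance<i32, f64> for SimpleDistance {
    fn distance(&self, a: &i32, b: &i32) -> f64 {
        (a - b).abs() as f64
    }
}

fn main() {
    let data = vec![1, 2, 3, 4, 5, 6, 7, 8, 9];
    let tree = CoverTree::new(data, SimpleDistance {}).unwrap();
    // The three nearest neighbours of 5.
    let neighbours = tree.find(&5, 3).unwrap();
    println!("{:?}", neighbours);
}
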
7 changes: 6 additions & 1 deletion src/algorithm/neighbour/linear_search.rs
@@ -5,6 +5,7 @@
//! use smartcore::algorithm::neighbour::linear_search::*;
//! use smartcore::math::distance::Distance;
//!
+//! #[derive(Clone)]
//! struct SimpleDistance {} // Our distance function
//!
//! impl Distance<i32, f64> for SimpleDistance {
@@ -21,6 +22,7 @@
//!
//! ```

+#[cfg(feature = "serde")]
use serde::{Deserialize, Serialize};
use std::cmp::{Ordering, PartialOrd};
use std::marker::PhantomData;
@@ -31,7 +33,8 @@ use crate::math::distance::Distance;
use crate::math::num::RealNumber;

/// Implements Linear Search algorithm, see [KNN algorithms](../index.html)
-#[derive(Serialize, Deserialize, Debug)]
+#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
+#[derive(Debug)]
pub struct LinearKNNSearch<T, F: RealNumber, D: Distance<T, F>> {
distance: D,
data: Vec<T>,
@@ -137,6 +140,8 @@ mod tests {
use super::*;
use crate::math::distance::Distances;

+#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
+#[derive(Debug, Clone)]
struct SimpleDistance {}

impl Distance<i32, f64> for SimpleDistance {
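
Note: like cover_tree.rs above, the tests here gate their serde derives on the feature, so any serde round-trip test must be feature-gated too. A minimal sketch of such a test, using bincode from the dev-dependencies in the Cargo.toml above (the Vec<i32> payload is a stand-in for the crate's own types):

// Compiled only for test builds that also enable the `serde` feature,
// matching the #[cfg(feature = "serde")] gate added to the cover tree test.
#[cfg(all(test, feature = "serde"))]
mod serde_tests {
    #[test]
    fn roundtrip() {
        // Serialize, deserialize, and compare for equality.
        let original: Vec<i32> = vec![1, 2, 3];
        let bytes = bincode::serialize(&original).unwrap();
        let restored: Vec<i32> = bincode::deserialize(&bytes).unwrap();
        assert_eq!(original, restored);
    }
}
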
8 changes: 6 additions & 2 deletions src/algorithm/neighbour/mod.rs
@@ -1,3 +1,4 @@
+#![allow(clippy::ptr_arg)]
//! # Nearest Neighbors Search Algorithms and Data Structures
//!
//! Nearest neighbor search is a basic computational tool that is particularly relevant to machine learning,
@@ -34,6 +35,7 @@ use crate::algorithm::neighbour::linear_search::LinearKNNSearch;
use crate::error::Failed;
use crate::math::distance::Distance;
use crate::math::num::RealNumber;
+#[cfg(feature = "serde")]
use serde::{Deserialize, Serialize};

pub(crate) mod bbd_tree;
@@ -44,15 +46,17 @@ pub mod linear_search;

/// Both the KNN classifier and regressor benefit from underlying search algorithms that help to speed up queries.
/// `KNNAlgorithmName` maintains a list of supported search algorithms, see [KNN algorithms](../algorithm/neighbour/index.html)
-#[derive(Serialize, Deserialize, Debug, Clone)]
+#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
+#[derive(Debug, Clone)]
pub enum KNNAlgorithmName {
/// Heap Search algorithm, see [`LinearSearch`](../algorithm/neighbour/linear_search/index.html)
LinearSearch,
/// Cover Tree Search algorithm, see [`CoverTree`](../algorithm/neighbour/cover_tree/index.html)
CoverTree,
}

-#[derive(Serialize, Deserialize, Debug)]
+#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
+#[derive(Debug)]
pub(crate) enum KNNAlgorithm<T: RealNumber, D: Distance<Vec<T>, T>> {
LinearSearch(LinearKNNSearch<Vec<T>, T, D>),
CoverTree(CoverTree<Vec<T>, T, D>),
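
Note: the public KNNAlgorithmName enum selects which search structure backs a KNN model, while the pub(crate) KNNAlgorithm wraps the concrete instance. A conceptual, self-contained mirror of that dispatch; the init method and the stand-in index types are illustrative only, not the crate's exact internals (which this page truncates):

// Stand-ins for the real LinearKNNSearch and CoverTree structures.
#[derive(Debug)]
struct LinearSearchIndex(Vec<f64>);
#[derive(Debug)]
struct CoverTreeIndex(Vec<f64>);

// The user-facing switch, mirroring KNNAlgorithmName above.
#[derive(Debug, Clone)]
enum AlgorithmName {
    LinearSearch,
    CoverTree,
}

// The internal wrapper, mirroring the pub(crate) KNNAlgorithm enum.
#[derive(Debug)]
enum Algorithm {
    LinearSearch(LinearSearchIndex),
    CoverTree(CoverTreeIndex),
}

impl AlgorithmName {
    // The name resolves to a concrete, data-holding search structure.
    fn init(&self, data: Vec<f64>) -> Algorithm {
        match self {
            AlgorithmName::LinearSearch => Algorithm::LinearSearch(LinearSearchIndex(data)),
            AlgorithmName::CoverTree => Algorithm::CoverTree(CoverTreeIndex(data)),
        }
    }
}

fn main() {
    let index = AlgorithmName::CoverTree.init(vec![1.0, 2.0, 3.0]);
    println!("{:?}", index);
}
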
[Diff truncated: the remaining changed files are not shown.]
