Skip to content

Commit

Permalink
Outlier scores for HDBSCAN (#73)
Browse files Browse the repository at this point in the history
Implemented GLOSH (Global-Local Outlier Score from Hierarchies) for `HDbscan`.
  • Loading branch information
azizkayumov authored Nov 29, 2024
1 parent f20c908 commit 205e7e9
Show file tree
Hide file tree
Showing 2 changed files with 156 additions and 9 deletions.
4 changes: 2 additions & 2 deletions examples/hdbscan.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ fn main() {
let data: Vec<f64> = rdr
.deserialize()
.map(|v| {
let r: Vec<f64> = v.expect("corruptted data");
let r: Vec<f64> = v.expect("corrupted data");
if nfeatures < 1 {
nfeatures = r.len();
}
Expand All @@ -41,7 +41,7 @@ fn main() {
metric: Euclidean::default(),
boruvka: true,
};
let (clusters, outliers) = clustering.fit(&data.view());
let (clusters, outliers, _outlier_scores) = clustering.fit(&data.view());
println!("========= Report =========");
println!("# of events processed: {}", data.nrows());
println!("# of features provided: {}", data.ncols());
Expand Down
161 changes: 154 additions & 7 deletions src/hdbscan.rs
Original file line number Diff line number Diff line change
Expand Up @@ -45,15 +45,19 @@ where
}
}

impl<S, A, M> Fit<ArrayBase<S, Ix2>, (HashMap<usize, Vec<usize>>, Vec<usize>)> for HDbscan<A, M>
impl<S, A, M> Fit<ArrayBase<S, Ix2>, (HashMap<usize, Vec<usize>>, Vec<usize>, Vec<A>)>
for HDbscan<A, M>
where
A: AddAssign + DivAssign + FloatCore + FromPrimitive + Sync + Send,
S: Data<Elem = A>,
M: Metric<A> + Clone + Sync + Send,
{
fn fit(&mut self, input: &ArrayBase<S, Ix2>) -> (HashMap<usize, Vec<usize>>, Vec<usize>) {
fn fit(
&mut self,
input: &ArrayBase<S, Ix2>,
) -> (HashMap<usize, Vec<usize>>, Vec<usize>, Vec<A>) {
if input.is_empty() {
return (HashMap::new(), Vec::new());
return (HashMap::new(), Vec::new(), Vec::new());
}
let input = input.as_standard_layout();
let db = BallTree::new(input.view(), self.metric.clone()).expect("non-empty array");
Expand Down Expand Up @@ -87,8 +91,10 @@ where
mst.sort_unstable_by(|a, b| a.2.partial_cmp(&(b.2)).expect("invalid distance"));
let sorted_mst = Array1::from_vec(mst);
let labeled = label(sorted_mst);
let condensed = Array1::from_vec(condense_mst(labeled.view(), self.min_cluster_size));
find_clusters(&condensed.view())
let condensed = condense_mst(labeled.view(), self.min_cluster_size);
let outlier_scores = glosh(&condensed, self.min_cluster_size);
let (clusters, outliers) = find_clusters(&Array1::from_vec(condensed).view());
(clusters, outliers, outlier_scores)
}
}

Expand Down Expand Up @@ -387,6 +393,84 @@ fn find_clusters<A: FloatCore + FromPrimitive + AddAssign + Sub>(
(res_clusters, outliers)
}

// GLOSH: Global-Local Outlier Score from Hierarchies
// Reference: https://dl.acm.org/doi/10.1145/2733381
//
// Given the following hierarchy (min_cluster_size = 3),
// Root
// / \
// A ...
// eps_x -> / \
// x A
// / \
// y A
// /|\ <- eps_A: A is still a cluster w.r.t. min_cluster_size at this level
// a b c
//
// To compute the outlier score of point x, we need:
// - eps_x: the eps at which x joins cluster A (A is the first cluster that x joins)
// - eps_A: the lowest eps at which A or any of A's child clusters survives w.r.t. min_cluster_size.
// Then, the outlier score of x is defined as:
// score(x) = 1 - eps_A / eps_x
//
// Since we are working with density lambda values (where lambda = 1/eps):
// lambda_x = 1 / eps_x
// lambda_A = 1 / eps_A
// score(x) = 1 - lambda_x / lambda_A
fn glosh<A: FloatCore>(
    condensed_mst: &[(usize, usize, A, usize)],
    min_cluster_size: usize,
) -> Vec<A> {
    // An empty hierarchy has no points to score; bail out early so the
    // parent-id lookup inside `max_lambdas` cannot panic on an empty slice.
    if condensed_mst.is_empty() {
        return Vec::new();
    }
    // deaths[c]: the max lambda at which cluster c (or one of its child
    // clusters) still satisfies min_cluster_size.
    let deaths = max_lambdas(condensed_mst, min_cluster_size);
    // Data points occupy ids 0..num_events; cluster nodes use larger ids and
    // are skipped in the scoring loop below.
    let num_events = condensed_mst
        .iter()
        .map(|(_, child, _, _)| *child)
        .max()
        .map_or(0, |max_child| max_child + 1);

    let mut scores = vec![A::zero(); num_events];
    for (parent, child, lambda, _) in condensed_mst {
        if *child >= num_events {
            // `child` is a cluster node, not a data point — nothing to score.
            continue;
        }
        let lambda_max = deaths[*parent];
        if lambda_max == A::zero() {
            // The parent never survived min_cluster_size: avoid division by
            // zero and treat the point as a perfect inlier.
            scores[*child] = A::zero();
        } else {
            // score = 1 - lambda_x / lambda_A, rearranged to a single division.
            scores[*child] = (lambda_max - *lambda) / lambda_max;
        }
    }
    scores
}

// Return the maximum lambda value (i.e. the minimum eps) for each cluster C such that
// the cluster or any of its child clusters has at least min_cluster_size points.
fn max_lambdas<A: FloatCore>(
    condensed_mst: &[(usize, usize, A, usize)],
    min_cluster_size: usize,
) -> Vec<A> {
    // An empty hierarchy has no clusters; returning an empty Vec here keeps
    // the function total instead of panicking on `max()` of an empty iterator.
    let largest_parent = match condensed_mst.iter().map(|(parent, _, _, _)| *parent).max() {
        Some(parent) => parent,
        None => return Vec::new(),
    };

    // bottom-up traverse the hierarchy to keep track of the maximum lambda values
    // (same as iterating the condensed_mst in reverse order)
    let mut parent_sizes: Vec<usize> = vec![0; largest_parent + 1];
    let mut deaths_arr: Vec<A> = vec![A::zero(); largest_parent + 1];
    for (parent, child, lambda, child_size) in condensed_mst.iter().rev() {
        parent_sizes[*parent] += *child_size;
        if parent_sizes[*parent] >= min_cluster_size {
            // The parent itself still holds enough points at this lambda.
            deaths_arr[*parent] = deaths_arr[*parent].max(*lambda);
        }
        if *child_size >= min_cluster_size {
            // Propagate the best (largest) death lambda up from a surviving child cluster.
            deaths_arr[*parent] = deaths_arr[*parent].max(deaths_arr[*child]);
        }
    }
    deaths_arr
}

fn bfs_tree(tree: &[(usize, usize)], root: usize) -> Vec<usize> {
let mut result = vec![];
let mut to_process = HashSet::new();
Expand Down Expand Up @@ -936,7 +1020,7 @@ mod test {
metric: Euclidean::default(),
boruvka: false,
};
let (clusters, outliers) = hdbscan.fit(&data);
let (clusters, outliers, _) = hdbscan.fit(&data);
assert_eq!(clusters.len(), 2);
assert_eq!(
outliers.len(),
Expand Down Expand Up @@ -967,7 +1051,7 @@ mod test {
metric: Euclidean::default(),
boruvka: false,
};
let (clusters, outliers) = hdbscan.fit(&data);
let (clusters, outliers, _) = hdbscan.fit(&data);
assert_eq!(clusters.len(), 2);
assert_eq!(
outliers.len(),
Expand Down Expand Up @@ -1044,6 +1128,69 @@ mod test {
assert_eq!(answer, mst);
}

#[test]
fn outlier_scores() {
    use ndarray::array;
    use petal_neighbors::distance::Euclidean;

    use crate::Fit;

    // Three tight clusters plus two isolated points.
    let data = array![
        // cluster1:
        [1.0, 1.0],
        [1.0, 2.0],
        [2.0, 1.0],
        [2.0, 2.0],
        // cluster2:
        [4.0, 1.0],
        [4.0, 2.0],
        [5.0, 1.0],
        [5.0, 2.0],
        // cluster3:
        [9.0, 1.0],
        [9.0, 2.0],
        [10.0, 1.0],
        [10.0, 2.0],
        [11.0, 1.0],
        [11.0, 2.0],
        // outlier1:
        [2.0, 5.0],
        // outlier2:
        [10.0, 8.0],
    ];
    let mut hdbscan = super::HDbscan {
        eps: 0.5,
        alpha: 1.,
        min_samples: 4,
        min_cluster_size: 4,
        metric: Euclidean::default(),
        boruvka: false,
    };
    let (_, _, outlier_scores) = hdbscan.fit(&data);

    // The first 14 data objects immediately form their clusters at eps = √2,
    // so each of them gets a zero outlier score:
    // glosh(x) = 1 - √2 / √2 = 0
    for &score in outlier_scores.iter().take(14) {
        assert_eq!(score, 0.0);
    }

    // Outlier1 joins the cluster C = {cluster1 ∪ cluster2} at:
    // eps_outlier1 = √13
    // The lowest eps at which C or any of its child clusters survives
    // w.r.t. min_cluster_size = 4 is:
    // eps_C = √2 (due to cluster1 or cluster2)
    // Hence:
    // glosh(outlier1) = 1 - √2 / √13 = 0.60776772972
    assert_eq!(outlier_scores[14], 1.0 - 2.0_f64.sqrt() / 13.0_f64.sqrt());

    // Outlier2 joins the root cluster at eps = √37.
    // The lowest eps at which the root cluster survives
    // w.r.t. min_cluster_size = 4 is:
    // eps_root = √2
    // Hence:
    // glosh(outlier2) = 1 - √2 / √37 = 0.76750472251
    assert_eq!(outlier_scores[15], 1.0 - 2.0_f64.sqrt() / 37.0_f64.sqrt());
}

#[test]
fn tree_union_find() {
use succinct::{BitVecMut, BitVector};
Expand Down

0 comments on commit 205e7e9

Please sign in to comment.