Skip to content

Commit

Permalink
feat!: Implement a DBScan clustering algorithm directly in ACTS (#2863)
Browse files Browse the repository at this point in the history
With this branch we removes all dependencies on MLPack for the clustering used in the ML seed filtering algorithm and replaces them with a custom implementation of the DBScan density-based clustering algorithm.

This new algorithm is far more powerful than the previous MLPack implementation and supports the full C++. With it, we observe a speed-up of the seed filtering algorithm by a factor of 2 and remove external dependency. All tracking performances should stay unchanged even if some minor difference might appear at the border of 2 nearby clusters (since the entries are not processed in the same order compared to the old mlpack implementation)
  • Loading branch information
Corentin-Allaire authored Jan 15, 2024
1 parent 72d3f88 commit 7f6f131
Show file tree
Hide file tree
Showing 21 changed files with 425 additions and 228 deletions.
5 changes: 0 additions & 5 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,6 @@ option(ACTS_BUILD_PLUGIN_JSON "Build json plugin" OFF)
option(ACTS_USE_SYSTEM_NLOHMANN_JSON "Use nlohmann::json provided by the system instead of the bundled version" ${ACTS_USE_SYSTEM_LIBS})
option(ACTS_BUILD_PLUGIN_LEGACY "Build legacy plugin" OFF)
option(ACTS_BUILD_PLUGIN_ONNX "Build ONNX plugin" OFF)
option(ACTS_BUILD_PLUGIN_MLPACK "Build MLpack plugin" OFF)
option(ACTS_SETUP_VECMEM "Explicitly set up vecmem for the project" OFF)
option(ACTS_USE_SYSTEM_VECMEM "Use a system-provided vecmem installation" ${ACTS_USE_SYSTEM_LIBS})
option(ACTS_BUILD_PLUGIN_SYCL "Build SYCL plugin" OFF)
Expand Down Expand Up @@ -197,7 +196,6 @@ set(_acts_eigen3_version 3.3.7)
set(_acts_hepmc3_version 3.2.1)
set(_acts_nlohmanjson_version 3.2.0)
set(_acts_onnxruntime_version 1.12.0)
set(_acts_mlpack_version 3.1.1)
set(_acts_root_version 6.20)
set(_acts_tbb_version 2020.1)

Expand Down Expand Up @@ -309,9 +307,6 @@ if(ACTS_BUILD_PLUGIN_JSON)
add_subdirectory(thirdparty/nlohmann_json)
endif()
endif()
if(ACTS_BUILD_PLUGIN_MLPACK)
find_package(mlpack ${_acts_mlpack_version} REQUIRED)
endif()
if(ACTS_BUILD_PLUGIN_SYCL)
find_package(SYCL REQUIRED)
endif()
Expand Down
233 changes: 233 additions & 0 deletions Core/include/Acts/Utilities/DBScan.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,233 @@
// This file is part of the Acts project.
//
// Copyright (C) 2024 CERN for the benefit of the Acts project
//
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at http://mozilla.org/MPL/2.0/.

#pragma once

#include "Acts/Utilities/KDTree.hpp"

#include <algorithm>
#include <array>
#include <cmath>
#include <functional>
#include <iostream>
#include <memory>
#include <string>
#include <vector>

namespace Acts {
/// @brief A general implementation of an N dimensional DBScan clustering algorithm.
///
/// This is a general implementation of an N dimensional DBScan clustering
/// algorithm. The DBScan algorithm uses density information to cluster together
/// points that are close to each other.
///
/// For each point, we will look for the neighbours that are within the epsilon
/// radius. If the number of neighbours is greater than the minimum number of
/// points, we will start a new cluster and assign the current point to it. We
/// will then look for the neighbours of the neighbours and assign them to the
/// current cluster if they are not already assigned to a cluster. If the
/// neighbours have itself more than the minimum number of points as neighbours,
/// we will repeat the process on those neighbours.
///
/// To speed up the search for the neighbours, we use the KDTree implemented in
/// ACTS. It performs a range search in the orthogonal hypercube with a length
/// of 2 epsilon. An extra cut is used to only keep the neighbours that are
/// within the epsilon radius.
///
/// @tparam kDims The number of dimensions.
/// @tparam scalar_t The scalar type used to construct position vectors.
/// @tparam kLeafSize The maximum number of points in a leaf node of the KDTree.
template <std::size_t kDims, typename scalar_t = double,
std::size_t kLeafSize = 4>
class DBScan {
public:
// The type of coordinates for points.
using Point = std::array<scalar_t, kDims>;

// The type of a vector of coordinate.
using VectorPoints = std::vector<Point>;

// The type to pair the points with an ID.
using Pair = std::pair<Point, std::size_t>;

// The type of a vector of coordinate-ID pairs.
using VectorPairs = std::vector<Pair>;

// KDTree used before the DBScan algorithm to find the neighbours.
using Tree = KDTree<kDims, std::size_t, scalar_t, std::array, kLeafSize>;

// Remove the default constructor.
DBScan() = delete;

/// @brief Construct the DBScan algorithm with a given epsilon and minPoints.
///
/// @param epsilon The epsilon radius used to find the neighbours.
/// @param minPoints The minimum number of points to form a cluster.
/// @param onePointCluster If true, all the noise points are considered as
/// individual one point clusters.
DBScan(scalar_t epsilon = 1.0, std::size_t minPoints = 1,
bool onePointCluster = false)
: m_eps(epsilon),
m_minPoints(minPoints),
m_onePointCluster(onePointCluster) {}

/// @brief Cluster the input points.
///
/// This function implements the main loop of the DBScan algorithm.
/// It loops over all the point and will try to start new cluster
/// if it finds points that have yet to be clustered.
///
/// @param inputPoints The input points to cluster.
/// @param clusteredPoints Vector containing the cluster ID of each point.
/// @return The number of clusters (excluding noise if onePointCluster==False).
///
int cluster(const VectorPoints& inputPoints,
std::vector<int>& clusteredPoints) {
// Transform the initial vector of input point to a vector of pairs
// with the index of the point in the initial vector.
VectorPairs inputPointsWithIndex;
for (std::size_t id = 0; id < inputPoints.size(); id++) {
inputPointsWithIndex.push_back(std::make_pair(inputPoints[id], id));
}
// Build the KDTree with the input points.
Tree tree = Tree(std::move(inputPointsWithIndex));

// Initialize the cluster ID to 0.
int clusterID = 0;
// By default all the points are considered as noise.
clusteredPoints = std::vector<int>(inputPoints.size(), -1);

// Loop over all the points
for (std::size_t id = 0; id < inputPoints.size(); id++) {
// If the point is already assigned to a cluster, skip it.
if (clusteredPoints[id] != -1) {
continue;
}
// If not we try to build a new cluster
std::vector<std::size_t> pointToProcess{id};
expandCluster(tree, inputPoints, pointToProcess, clusteredPoints,
clusterID);
// If the cluster has been created, increment the cluster ID.
if (clusteredPoints[id] != -1) {
clusterID++;
}
}
if (m_onePointCluster) {
// If noise is present and onePointCluster is true, all the noise points
// are considered as individual one point clusters. Loop over all the
// points in the KDTree.
for (auto& cluster : clusteredPoints) {
// If the point is assigned to noise, assign it to a new cluster.
if (cluster == -1) {
cluster = clusterID;
clusterID++;
}
}
}
return clusterID;
}

private:
/// @brief Extend the cluster.
///
/// This function will extend the cluster by finding all the neighbours of the
/// current point and assign them to the current cluster.
/// The KDTree is used to find the neighbours and an extra cut is used to only
/// keep the neighbours that are within the epsilon radius.
///
/// @param tree The KDTree containing all the points.
/// @param inputPoints The vector containing the input points.
/// @param pointsToProcess The vector containing the ids of the points that need to be
/// processed.
/// @param clusteredPoints Vector containing the cluster ID of each point.
/// @param clusterID The ID of the current cluster.
///
void expandCluster(const Tree& tree, const VectorPoints& inputPoints,
const std::vector<std::size_t>& pointsToProcess,
std::vector<int>& clusteredPoints, const int clusterID) {
// Loop over all the points that need to be process.
for (const auto id : pointsToProcess) {
// Lets look for the neighbours of the current point.
const Point currentPoint = inputPoints[id];
std::vector<std::size_t> neighbours;
// We create the range in which we will look for the neighbours (an
// hypercube with a length of 2 epsilon).
typename Tree::range_t range;
for (std::size_t dim = 0; dim < kDims; dim++) {
range[dim] = std::make_pair(currentPoint[dim] - m_eps,
currentPoint[dim] + m_eps);
}
// We use the KDTree to find the neighbours.
// An extra cut needs to be applied to only keep the neighbours that
// are within the epsilon radius.
tree.rangeSearchMapDiscard(
range, [this, &neighbours, currentPoint](
const typename Tree::coordinate_t& pos,
const typename Tree::value_t& val) {
scalar_t distance = 0;
for (std::size_t dim = 0; dim < kDims; dim++) {
distance += (pos[dim] - currentPoint[dim]) *
(pos[dim] - currentPoint[dim]);
}
if (distance <= m_eps * m_eps) {
neighbours.push_back(val);
}
});
std::size_t nNeighbours = neighbours.size();
// If a cluster has already been started we add the neighbours to it
if (clusteredPoints[id] != -1) {
updateNeighbours(neighbours, clusteredPoints, clusterID);
}
if (nNeighbours >= m_minPoints) {
// If the cluster has not been started yet and we have enough
// neighbours, we start the cluster and assign the current point and its
// neighbours to it.
if (clusteredPoints[id] == -1) {
clusteredPoints[id] = clusterID;
updateNeighbours(neighbours, clusteredPoints, clusterID);
}
// Try to extend the cluster with the neighbours.
expandCluster(tree, inputPoints, neighbours, clusteredPoints,
clusterID);
}
}
}

/// @brief Update the neighbours.
///
/// This function will remove the neighbours that are already assigned to a
/// cluster and assign the remaining ones to the current cluster.
///
/// @param neighbours The vector containing the ids of the neighbours.
/// @param clusteredPoints Vector containing the cluster ID of each point.
/// @param clusterID The ID of the current cluster.
///
void updateNeighbours(std::vector<std::size_t>& neighbours,
std::vector<int>& clusteredPoints,
const int clusterID) {
neighbours.erase(std::remove_if(neighbours.begin(), neighbours.end(),
[&clusteredPoints](int i) {
return clusteredPoints[i] != -1;
}),
neighbours.end());

for (const auto& neighbour : neighbours) {
clusteredPoints[neighbour] = clusterID;
}
}

// The epsilon radius used to find the neighbours.
scalar_t m_eps;
// The minimum number of points to form a cluster.
std::size_t m_minPoints = 1;
// If true, all the noise points are considered as individual one point
// clusters.
bool m_onePointCluster = false;
};

} // namespace Acts
14 changes: 2 additions & 12 deletions Examples/Algorithms/TrackFindingML/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,15 +1,10 @@
set(SOURCES
src/AmbiguityResolutionML.cpp
src/AmbiguityResolutionMLAlgorithm.cpp
src/AmbiguityResolutionMLDBScanAlgorithm.cpp
src/SeedFilterMLAlgorithm.cpp
)

if(ACTS_BUILD_PLUGIN_MLPACK)
list(APPEND SOURCES
src/AmbiguityResolutionMLDBScanAlgorithm.cpp
src/SeedFilterMLAlgorithm.cpp
)
endif()

add_library(
ActsExamplesTrackFindingML SHARED
${SOURCES}
Expand All @@ -28,11 +23,6 @@ target_link_libraries(
ActsExamplesFramework
)

if(ACTS_BUILD_PLUGIN_MLPACK)
target_link_libraries(
ActsExamplesTrackFindingML PUBLIC ActsPluginMlpack)
endif()

install(
TARGETS ActsExamplesTrackFindingML
LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
// This file is part of the Acts project.
//
// Copyright (C) 2023 CERN for the benefit of the Acts project
// Copyright (C) 2023-2024 CERN for the benefit of the Acts project
//
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
Expand All @@ -10,21 +10,20 @@

#include "Acts/EventData/TrackContainer.hpp"
#include "Acts/TrackFinding/detail/AmbiguityTrackClustering.hpp"
#include "Acts/Utilities/DBScan.hpp"

#include <map>
#include <unordered_map>
#include <vector>

#include "mlpack/methods/dbscan.hpp"

namespace Acts {

/// Clusterise tracks based on shared hits
///
/// @param trackMap : Multimap storing pair of track ID and vector of measurement ID. The keys are the number of measurement and are just there to facilitate the ordering.
/// @param tracks : Track container with all the track to be clustered
/// @param epsilon : Maximum distance between 2 tracks to be clustered
/// @param minPoints : Minimum number of tracks to create a cluster
/// @param trackMap Multimap storing pair of track ID and vector of measurement ID. The keys are the number of measurement and are just there to facilitate the ordering.
/// @param tracks Track container with all the track to be clustered
/// @param epsilon Maximum distance between 2 tracks to be clustered
/// @param minPoints Minimum number of tracks to create a cluster
/// @return an unordered map representing the clusters, the keys the ID of the primary track of each cluster and the store a vector of track IDs.
template <typename track_container_t, typename traj_t,
template <typename> class holder_t>
Expand All @@ -40,34 +39,29 @@ std::unordered_map<std::size_t, std::vector<std::size_t>> dbscanTrackClustering(
// different clusters.
std::unordered_map<std::size_t, std::size_t> hitToTrack;

// DBSCAN algorithm from MLpack used in the track clustering
mlpack::DBSCAN dbscan(epsilon, minPoints);
// Initialize a DBScan of dimension 4 (phi, eta, z, Pt)
using DBSCAN = Acts::DBScan<4, double, 4>;
DBSCAN dbscan(epsilon, minPoints, true);

arma::mat data(2, trackMap.size());
std::vector<std::array<double, 4>> data;
std::size_t trackID = 0;
arma::Row<std::size_t> assignments;
std::vector<int> clusterAssignments;

// Get the input feature of the network for all the tracks
for (const auto& [key, val] : trackMap) {
auto traj = tracks.getTrack(val.first);
data(0, trackID) = Acts::VectorHelpers::eta(traj.momentum());
data(1, trackID) = Acts::VectorHelpers::phi(traj.momentum());
trackID++;
data.push_back({Acts::VectorHelpers::eta(traj.momentum()),
Acts::VectorHelpers::phi(traj.momentum())});
}
std::size_t clusterNb = dbscan.Cluster(data, assignments);
trackID = 0;
std::size_t clusterNb = dbscan.cluster(data, clusterAssignments);

// Cluster track with DBScan
std::vector<
std::multimap<int, std::pair<std::size_t, std::vector<std::size_t>>>>
dbscanClusters(clusterNb);
for (const auto& [key, val] : trackMap) {
std::size_t clusterID = assignments(trackID);
if (assignments(trackID) == SIZE_MAX) {
cluster.emplace(val.first, std::vector<std::size_t>(1, val.first));
} else {
dbscanClusters[clusterID].emplace(key, val);
}
std::size_t clusterID = clusterAssignments[trackID];
dbscanClusters[clusterID].emplace(key, val);
trackID++;
}

Expand Down
Loading

0 comments on commit 7f6f131

Please sign in to comment.