diff --git a/Core/include/Acts/TrackFinding/detail/AmbiguityTrackClustering.hpp b/Core/include/Acts/TrackFinding/detail/AmbiguityTrackClustering.hpp
index 5985f37a6a3..fb2c11f21ba 100644
--- a/Core/include/Acts/TrackFinding/detail/AmbiguityTrackClustering.hpp
+++ b/Core/include/Acts/TrackFinding/detail/AmbiguityTrackClustering.hpp
@@ -20,8 +20,10 @@ namespace detail {
 ///
 /// @param trackMap : Multimap storing pair of track ID and vector of measurement ID. The keys are the number of measurement and are just there to facilitate the ordering.
 /// @return an unordered map representing the clusters, the keys the ID of the primary track of each cluster and the store a vector of track IDs.
-std::unordered_map<int, std::vector<int>> clusterDuplicateTracks(
-    const std::multimap<int, std::pair<int, std::vector<int>>>& trackMap);
+std::unordered_map<std::size_t, std::vector<std::size_t>>
+clusterDuplicateTracks(
+    const std::multimap<int, std::pair<std::size_t, std::vector<std::size_t>>>&
+        trackMap);
 
 }  // namespace detail
 }  // namespace Acts
diff --git a/Core/src/TrackFinding/AmbiguityTrackClustering.cpp b/Core/src/TrackFinding/AmbiguityTrackClustering.cpp
index 6e31a39a4dc..b9cf526df7e 100644
--- a/Core/src/TrackFinding/AmbiguityTrackClustering.cpp
+++ b/Core/src/TrackFinding/AmbiguityTrackClustering.cpp
@@ -10,18 +10,20 @@
 
 #include
 
-std::unordered_map<int, std::vector<int>> Acts::detail::clusterDuplicateTracks(
-    const std::multimap<int, std::pair<int, std::vector<int>>>& trackMap) {
+std::unordered_map<std::size_t, std::vector<std::size_t>>
+Acts::detail::clusterDuplicateTracks(
+    const std::multimap<int, std::pair<std::size_t, std::vector<std::size_t>>>&
+        trackMap) {
   // Unordered map associating a vector with all the track ID of a cluster to
   // the ID of the first track of the cluster
-  std::unordered_map<int, std::vector<int>> cluster;
+  std::unordered_map<std::size_t, std::vector<std::size_t>> cluster;
   // Unordered map associating hits to the ID of the first track of the
   // different clusters.
-  std::unordered_map<int, int> hitToTrack;
+  std::unordered_map<std::size_t, std::size_t> hitToTrack;
   // Loop over all the tracks
   for (auto track = trackMap.rbegin(); track != trackMap.rend(); ++track) {
-    std::vector<int> hits = track->second.second;
+    std::vector<std::size_t> hits = track->second.second;
     auto matchedTrack = hitToTrack.end();
     // Loop over all the hits in the track
     for (auto hit = hits.begin(); hit != hits.end(); hit++) {
@@ -36,7 +38,7 @@ std::unordered_map<int, std::vector<int>> Acts::detail::clusterDuplicateTracks(
     // None of the hits have been matched to a track create a new cluster
     if (matchedTrack == hitToTrack.end()) {
       cluster.emplace(track->second.first,
-                      std::vector<int>(1, track->second.first));
+                      std::vector<std::size_t>(1, track->second.first));
       for (const auto& hit : hits) {
         // Add the hits of the new cluster to the hitToTrack
         hitToTrack.emplace(hit, track->second.first);
diff --git a/Examples/Algorithms/TrackFindingML/CMakeLists.txt b/Examples/Algorithms/TrackFindingML/CMakeLists.txt
index fce4f979a72..e517d0060e8 100644
--- a/Examples/Algorithms/TrackFindingML/CMakeLists.txt
+++ b/Examples/Algorithms/TrackFindingML/CMakeLists.txt
@@ -6,6 +6,7 @@ set(SOURCES
 
 if(ACTS_BUILD_PLUGIN_MLPACK)
   list(APPEND SOURCES
     src/AmbiguityResolutionMLDBScanAlgorithm.cpp
+    src/SeedFilterMLAlgorithm.cpp
   )
 endif()
diff --git a/Examples/Algorithms/TrackFindingML/include/ActsExamples/TrackFindingML/AmbiguityResolutionML.hpp b/Examples/Algorithms/TrackFindingML/include/ActsExamples/TrackFindingML/AmbiguityResolutionML.hpp
index d9bffb31a83..aafc5c403fc 100644
--- a/Examples/Algorithms/TrackFindingML/include/ActsExamples/TrackFindingML/AmbiguityResolutionML.hpp
+++ b/Examples/Algorithms/TrackFindingML/include/ActsExamples/TrackFindingML/AmbiguityResolutionML.hpp
@@ -33,15 +33,16 @@ class AmbiguityResolutionML : public IAlgorithm {
   /// @param tracks is the input track container
   /// @param nMeasurementsMin minimum number of measurement per
track /// @return an ordered list containing pairs of track ID and associated measurement ID - std::multimap>> mapTrackHits( - const ConstTrackContainer& tracks, int nMeasurementsMin) const; + std::multimap>> + mapTrackHits(const ConstTrackContainer& tracks, int nMeasurementsMin) const; /// Prepare the output track container to be written /// /// @param tracks is the input track container /// @param goodTracks is list of the IDs of all the tracks we want to keep - ConstTrackContainer prepareOutputTrack(const ConstTrackContainer& tracks, - std::vector& goodTracks) const; + ConstTrackContainer prepareOutputTrack( + const ConstTrackContainer& tracks, + std::vector& goodTracks) const; }; } // namespace ActsExamples diff --git a/Examples/Algorithms/TrackFindingML/include/ActsExamples/TrackFindingML/SeedFilterMLAlgorithm.hpp b/Examples/Algorithms/TrackFindingML/include/ActsExamples/TrackFindingML/SeedFilterMLAlgorithm.hpp new file mode 100644 index 00000000000..122feb693c7 --- /dev/null +++ b/Examples/Algorithms/TrackFindingML/include/ActsExamples/TrackFindingML/SeedFilterMLAlgorithm.hpp @@ -0,0 +1,83 @@ +// This file is part of the Acts project. +// +// Copyright (C) 2023 CERN for the benefit of the Acts project +// +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#pragma once + +#include "Acts/Plugins/Onnx/SeedClassifier.hpp" +#include "ActsExamples/EventData/SimSeed.hpp" +#include "ActsExamples/EventData/Track.hpp" +#include "ActsExamples/Framework/DataHandle.hpp" +#include "ActsExamples/TrackFindingML/AmbiguityResolutionML.hpp" + +#include + +namespace ActsExamples { + +/// Removes seeds that seem to be duplicated and fake. +/// +/// The implementation works as follows: +/// 1) Cluster together nearby seeds using a DBScan +/// 2) For each seed use a neural network to compute a score +/// 3) In each cluster keep the seed with the highest score +class SeedFilterMLAlgorithm : public IAlgorithm { + public: + struct Config { + /// Input estimated track parameters collection. + std::string inputTrackParameters; + /// Input seeds collection. + std::string inputSimSeeds; + /// Path to the ONNX model for the duplicate neural network + std::string inputSeedFilterNN; + /// Output estimated track parameters collection. + std::string outputTrackParameters; + /// Output seeds collection. + std::string outputSimSeeds; + /// Maximum distance between 2 tracks to be clustered in the DBScan + float epsilonDBScan = 0.03; + /// Minimum number of tracks to create a cluster in the DBScan + int minPointsDBScan = 2; + /// Minimum score a seed need to be selected + float minSeedScore = 0.1; + /// Clustering parameters weight for phi used before the DBSCAN + double clusteringWeighPhi = 1.0; + /// Clustering parameters weight for eta used before the DBSCAN + double clusteringWeighEta = 1.0; + /// Clustering parameters weight for z used before the DBSCAN + double clusteringWeighZ = 50.0; + /// Clustering parameters weight for pT used before the DBSCAN + double clusteringWeighPt = 1.0; + }; + + /// Construct the seed filter algorithm. + /// + /// @param cfg is the algorithm configuration + /// @param lvl is the logging level + SeedFilterMLAlgorithm(Config cfg, Acts::Logging::Level lvl); + + /// Run the seed filter algorithm. 
+ /// + /// @param cxt is the algorithm context with event information + /// @return a process code indication success or failure + ProcessCode execute(const AlgorithmContext& ctx) const final; + + /// Const access to the config + const Config& config() const { return m_cfg; } + + private: + Config m_cfg; + // ONNX model for track selection + Acts::SeedClassifier m_seedClassifier; + ReadDataHandle m_inputTrackParameters{ + this, "InputTrackParameters"}; + ReadDataHandle m_inputSimSeeds{this, "InputSimSeeds"}; + WriteDataHandle m_outputTrackParameters{ + this, "OutputTrackParameters"}; + WriteDataHandle m_outputSimSeeds{this, "OutputSimSeeds"}; +}; + +} // namespace ActsExamples diff --git a/Examples/Algorithms/TrackFindingML/src/AmbiguityResolutionML.cpp b/Examples/Algorithms/TrackFindingML/src/AmbiguityResolutionML.cpp index 5b4d80b25a5..7895d627cd6 100644 --- a/Examples/Algorithms/TrackFindingML/src/AmbiguityResolutionML.cpp +++ b/Examples/Algorithms/TrackFindingML/src/AmbiguityResolutionML.cpp @@ -15,23 +15,24 @@ ActsExamples::AmbiguityResolutionML::AmbiguityResolutionML( std::string name, Acts::Logging::Level lvl) : ActsExamples::IAlgorithm(name, lvl) {} -std::multimap>> +std::multimap>> ActsExamples::AmbiguityResolutionML::mapTrackHits( const ActsExamples::ConstTrackContainer& tracks, int nMeasurementsMin) const { - std::multimap>> trackMap; + std::multimap>> trackMap; // Loop over all the trajectories in the events for (const auto& track : tracks) { - std::vector hits; + std::vector hits; int nbMeasurements = 0; // Store the hits id for the trajectory and compute the number of // measurement tracks.trackStateContainer().visitBackwards( track.tipIndex(), [&](const auto& state) { if (state.typeFlags().test(Acts::TrackStateFlag::MeasurementFlag)) { - int indexHit = state.getUncalibratedSourceLink() - .template get() - .index(); + std::size_t indexHit = + state.getUncalibratedSourceLink() + .template get() + .index(); hits.emplace_back(indexHit); ++nbMeasurements; } @@ -47,7 +48,7 @@ ActsExamples::AmbiguityResolutionML::mapTrackHits( ActsExamples::ConstTrackContainer ActsExamples::AmbiguityResolutionML::prepareOutputTrack( const ActsExamples::ConstTrackContainer& tracks, - std::vector& goodTracks) const { + std::vector& goodTracks) const { std::shared_ptr trackStateContainer = tracks.trackStateContainerHolder(); auto trackContainer = std::make_shared(); diff --git a/Examples/Algorithms/TrackFindingML/src/AmbiguityResolutionMLAlgorithm.cpp b/Examples/Algorithms/TrackFindingML/src/AmbiguityResolutionMLAlgorithm.cpp index 3be5dd22bd4..0bdbe661c14 100644 --- a/Examples/Algorithms/TrackFindingML/src/AmbiguityResolutionMLAlgorithm.cpp +++ b/Examples/Algorithms/TrackFindingML/src/AmbiguityResolutionMLAlgorithm.cpp @@ -35,12 +35,12 @@ ActsExamples::ProcessCode ActsExamples::AmbiguityResolutionMLAlgorithm::execute( // Read input data const auto& tracks = m_inputTracks(ctx); // Associate measurement to their respective tracks - std::multimap>> trackMap = - mapTrackHits(tracks, m_cfg.nMeasurementsMin); + std::multimap>> + trackMap = mapTrackHits(tracks, m_cfg.nMeasurementsMin); auto cluster = Acts::detail::clusterDuplicateTracks(trackMap); // Select the ID of the track we want to keep - std::vector goodTracks = - m_duplicateClassifier.solveAmbuguity(cluster, tracks); + std::vector goodTracks = + m_duplicateClassifier.solveAmbiguity(cluster, tracks); // Prepare the output track collection from the IDs auto outputTracks = prepareOutputTrack(tracks, goodTracks); m_outputTracks(ctx, 
std::move(outputTracks)); diff --git a/Examples/Algorithms/TrackFindingML/src/AmbiguityResolutionMLDBScanAlgorithm.cpp b/Examples/Algorithms/TrackFindingML/src/AmbiguityResolutionMLDBScanAlgorithm.cpp index dd72d147839..4c0601c4ece 100644 --- a/Examples/Algorithms/TrackFindingML/src/AmbiguityResolutionMLDBScanAlgorithm.cpp +++ b/Examples/Algorithms/TrackFindingML/src/AmbiguityResolutionMLDBScanAlgorithm.cpp @@ -39,14 +39,14 @@ ActsExamples::AmbiguityResolutionMLDBScanAlgorithm::execute( // Read input data const auto& tracks = m_inputTracks(ctx); // Associate measurement to their respective tracks - std::multimap>> trackMap = - mapTrackHits(tracks, m_cfg.nMeasurementsMin); + std::multimap>> + trackMap = mapTrackHits(tracks, m_cfg.nMeasurementsMin); // Cluster the tracks using DBscan auto cluster = Acts::dbscanTrackClustering( trackMap, tracks, m_cfg.epsilonDBScan, m_cfg.minPointsDBScan); // Select the ID of the track we want to keep - std::vector goodTracks = - m_duplicateClassifier.solveAmbuguity(cluster, tracks); + std::vector goodTracks = + m_duplicateClassifier.solveAmbiguity(cluster, tracks); // Prepare the output track collection from the IDs auto outputTracks = prepareOutputTrack(tracks, goodTracks); m_outputTracks(ctx, std::move(outputTracks)); diff --git a/Examples/Algorithms/TrackFindingML/src/SeedFilterMLAlgorithm.cpp b/Examples/Algorithms/TrackFindingML/src/SeedFilterMLAlgorithm.cpp new file mode 100644 index 00000000000..05eb6a9a02e --- /dev/null +++ b/Examples/Algorithms/TrackFindingML/src/SeedFilterMLAlgorithm.cpp @@ -0,0 +1,102 @@ +// This file is part of the Acts project. +// +// Copyright (C) 2023 CERN for the benefit of the Acts project +// +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#include "ActsExamples/TrackFindingML/SeedFilterMLAlgorithm.hpp" + +#include "Acts/Plugins/Mlpack/SeedFilterDBScanClustering.hpp" +#include "ActsExamples/Framework/ProcessCode.hpp" +#include "ActsExamples/Framework/WhiteBoard.hpp" + +#include +#include + +ActsExamples::SeedFilterMLAlgorithm::SeedFilterMLAlgorithm( + ActsExamples::SeedFilterMLAlgorithm::Config cfg, Acts::Logging::Level lvl) + : ActsExamples::IAlgorithm("SeedFilterMLAlgorithm", lvl), + m_cfg(std::move(cfg)), + m_seedClassifier(m_cfg.inputSeedFilterNN.c_str()) { + if (m_cfg.inputTrackParameters.empty()) { + throw std::invalid_argument("Missing track parameters input collection"); + } + if (m_cfg.inputSimSeeds.empty()) { + throw std::invalid_argument("Missing seed input collection"); + } + if (m_cfg.outputTrackParameters.empty()) { + throw std::invalid_argument("Missing track parameters output collection"); + } + if (m_cfg.outputSimSeeds.empty()) { + throw std::invalid_argument("Missing seed output collection"); + } + m_inputTrackParameters.initialize(m_cfg.inputTrackParameters); + m_inputSimSeeds.initialize(m_cfg.inputSimSeeds); + m_outputTrackParameters.initialize(m_cfg.outputTrackParameters); + m_outputSimSeeds.initialize(m_cfg.outputSimSeeds); +} + +ActsExamples::ProcessCode ActsExamples::SeedFilterMLAlgorithm::execute( + const AlgorithmContext& ctx) const { + // Read input data + const auto& seeds = m_inputSimSeeds(ctx); + const auto& params = m_inputTrackParameters(ctx); + if (seeds.size() != params.size()) { + throw std::invalid_argument( + "The number of seeds and track parameters is different"); + } + + Eigen::Array + networkInput(seeds.size(), 14); + std::vector> clusteringParams; + // Loop over the seed and parameters to fill the input for the clustering + // and the NN + for (std::size_t i = 0; i < seeds.size(); i++) { + // Compute the track parameters + double pT = std::abs(1.0 / params[i].parameters()[Acts::eBoundQOverP]) * + std::sin(params[i].parameters()[Acts::eBoundTheta]); + double eta = + std::atanh(std::cos(params[i].parameters()[Acts::eBoundTheta])); + double phi = params[i].parameters()[Acts::eBoundPhi]; + + // Fill and weight the clustering inputs + clusteringParams.push_back( + {phi / m_cfg.clusteringWeighPhi, eta / m_cfg.clusteringWeighEta, + seeds[i].z() / m_cfg.clusteringWeighZ, pT / m_cfg.clusteringWeighPt}); + + // Fill the NN input + networkInput.row(i) << pT, eta, phi, seeds[i].sp()[0]->x(), + seeds[i].sp()[0]->y(), seeds[i].sp()[0]->z(), seeds[i].sp()[1]->x(), + seeds[i].sp()[1]->y(), seeds[i].sp()[1]->z(), seeds[i].sp()[2]->x(), + seeds[i].sp()[2]->y(), seeds[i].sp()[2]->z(), seeds[i].z(), + seeds[i].seedQuality(); + } + + // Cluster the tracks using DBscan + auto cluster = Acts::dbscanSeedClustering( + clusteringParams, m_cfg.epsilonDBScan, m_cfg.minPointsDBScan); + + // Select the ID of the track we want to keep + std::vector goodSeed = m_seedClassifier.solveAmbiguity( + cluster, networkInput, m_cfg.minSeedScore); + + // Create the output seed collection + SimSeedContainer outputSeeds; + outputSeeds.reserve(goodSeed.size()); + + // Create the output track parameters collection + TrackParametersContainer outputTrackParameters; + outputTrackParameters.reserve(goodSeed.size()); + + for (auto i : goodSeed) { + outputSeeds.push_back(seeds[i]); + outputTrackParameters.push_back(params[i]); + } + + m_outputSimSeeds(ctx, SimSeedContainer{outputSeeds}); + m_outputTrackParameters(ctx, TrackParametersContainer{outputTrackParameters}); + + return ActsExamples::ProcessCode::SUCCESS; +} diff --git 
a/Examples/Io/Csv/include/ActsExamples/Io/Csv/CsvSeedWriter.hpp b/Examples/Io/Csv/include/ActsExamples/Io/Csv/CsvSeedWriter.hpp index 2059bb63dc9..c182aa5bbf7 100644 --- a/Examples/Io/Csv/include/ActsExamples/Io/Csv/CsvSeedWriter.hpp +++ b/Examples/Io/Csv/include/ActsExamples/Io/Csv/CsvSeedWriter.hpp @@ -90,7 +90,7 @@ class CsvSeedWriter : public WriterT { /// @brief Struct for brief seed summary info /// struct SeedInfo { - std::size_t seedId = 0; + std::size_t seedID = 0; ActsFatras::Barcode particleId; float seedPt = -1; float seedPhi = 0; diff --git a/Examples/Io/Csv/include/ActsExamples/Io/Csv/CsvTrackWriter.hpp b/Examples/Io/Csv/include/ActsExamples/Io/Csv/CsvTrackWriter.hpp index 2bea2289973..c53c21f7ef7 100644 --- a/Examples/Io/Csv/include/ActsExamples/Io/Csv/CsvTrackWriter.hpp +++ b/Examples/Io/Csv/include/ActsExamples/Io/Csv/CsvTrackWriter.hpp @@ -89,7 +89,7 @@ class CsvTrackWriter : public WriterT { /// struct TrackInfo : public Acts::MultiTrajectoryHelpers::TrajectoryState { std::size_t trackId = 0; - unsigned int seedId = 0; + unsigned int seedID = 0; ActsFatras::Barcode particleId; std::size_t nMajorityHits = 0; std::string trackType; diff --git a/Examples/Io/Csv/src/CsvSeedWriter.cpp b/Examples/Io/Csv/src/CsvSeedWriter.cpp index 12361d694b3..2b3be226c48 100644 --- a/Examples/Io/Csv/src/CsvSeedWriter.cpp +++ b/Examples/Io/Csv/src/CsvSeedWriter.cpp @@ -147,7 +147,7 @@ ActsExamples::ProcessCode ActsExamples::CsvSeedWriter::writeT( // track info SeedInfo toAdd; - toAdd.seedId = iparams; + toAdd.seedID = iparams; toAdd.particleId = majorityParticleId; toAdd.seedPt = std::abs(1.0 / params[Acts::eBoundQOverP]) * std::sin(params[Acts::eBoundTheta]); @@ -160,7 +160,7 @@ ActsExamples::ProcessCode ActsExamples::CsvSeedWriter::writeT( toAdd.seedType = truthMatched ? 
"duplicate" : "fake"; toAdd.measurementsID = ptrack; - infoMap[toAdd.seedId] = toAdd; + infoMap[toAdd.seedID] = toAdd; } mos << "seed_id,particleId," @@ -177,7 +177,7 @@ ActsExamples::ProcessCode ActsExamples::CsvSeedWriter::writeT( info.seedType = "good"; } // write the track info - mos << info.seedId << ","; + mos << info.seedID << ","; mos << info.particleId << ","; mos << info.seedPt << ","; mos << info.seedEta << ","; diff --git a/Examples/Io/Csv/src/CsvTrackWriter.cpp b/Examples/Io/Csv/src/CsvTrackWriter.cpp index 2938724eab5..d6b3b336ce4 100644 --- a/Examples/Io/Csv/src/CsvTrackWriter.cpp +++ b/Examples/Io/Csv/src/CsvTrackWriter.cpp @@ -116,9 +116,9 @@ ProcessCode CsvTrackWriter::writeT(const AlgorithmContext& context, TrackInfo toAdd; toAdd.trackId = trackId; if (tracks.hasColumn(Acts::hashString("trackGroup"))) { - toAdd.seedId = seedNumber(track); + toAdd.seedID = seedNumber(track); } else { - toAdd.seedId = 0; + toAdd.seedID = 0; } toAdd.particleId = majorityParticleId; toAdd.nStates = track.nTrackStates(); @@ -199,7 +199,7 @@ ProcessCode CsvTrackWriter::writeT(const AlgorithmContext& context, // write the track info mos << trajState.trackId << ","; - mos << trajState.seedId << ","; + mos << trajState.seedID << ","; mos << trajState.particleId << ","; mos << trajState.nStates << ","; mos << trajState.nMajorityHits << ","; diff --git a/Examples/Python/python/acts/examples/reconstruction.py b/Examples/Python/python/acts/examples/reconstruction.py index d058f6aaac9..53e436001d2 100644 --- a/Examples/Python/python/acts/examples/reconstruction.py +++ b/Examples/Python/python/acts/examples/reconstruction.py @@ -144,6 +144,12 @@ defaults=[None] * 3, ) +SeedFilterMLDBScanConfig = namedtuple( + "SeedFilterMLDBScanConfig", + ["epsilonDBScan", "minPointsDBScan", "minSeedScore"], + defaults=[None] * 3, +) + class VertexFinder(Enum): Truth = (1,) @@ -933,6 +939,86 @@ def addSeedPerformanceWriters( ) +acts.examples.NamedTypeArgs( + config=SeedFilterMLDBScanConfig, +) + + +def addSeedFilterML( + s, + config: SeedFilterMLDBScanConfig = SeedFilterMLDBScanConfig(), + onnxModelFile: Optional[Union[Path, str]] = None, + logLevel: Optional[acts.logging.Level] = None, + outputDirRoot: Optional[Union[Path, str]] = None, + outputDirCsv: Optional[Union[Path, str]] = None, +) -> None: + customLogLevel = acts.examples.defaultLogging(s, logLevel)() + from acts.examples.onnx.mlpack import SeedFilterMLAlgorithm + + inputParticles = "particles" + selectedParticles = "truth_seeds_selected" + seeds = "seeds" + estParams = "estimatedparameters" + + filterML = SeedFilterMLAlgorithm( + level=customLogLevel, + inputTrackParameters="estimatedparameters", + inputSimSeeds="seeds", + inputSeedFilterNN=onnxModelFile, + outputTrackParameters="filtered-parameters", + outputSimSeeds="filtered-seeds", + **acts.examples.defaultKWArgs( + epsilonDBScan=config.epsilonDBScan, + minPointsDBScan=config.minPointsDBScan, + minSeedScore=config.minSeedScore, + ), + ) + s.addAlgorithm(filterML) + s.addWhiteboardAlias(seeds, "filtered-seeds") + s.addWhiteboardAlias("estimatedparameters", "filtered-parameters") + + prototracks = "seed-prototracks-ML" + s.addAlgorithm( + acts.examples.SeedsToPrototracks( + level=customLogLevel, + inputSeeds=seeds, + outputProtoTracks=prototracks, + ) + ) + + if outputDirRoot is not None: + addSeedPerformanceWriters( + s, + outputDirRoot, + seeds, + prototracks, + selectedParticles, + inputParticles, + estParams, + customLogLevel, + ) + + if outputDirCsv is not None: + outputDirCsv = Path(outputDirCsv) + + 
if not outputDirCsv.exists(): + outputDirCsv.mkdir() + + CsvSeedWriter = acts.examples.CsvSeedWriter( + level=customLogLevel, + inputTrackParameters=estParams, + inputSimSeeds=seeds, + inputSimHits="simhits", + inputMeasurementParticlesMap="measurement_particles_map", + inputMeasurementSimHitsMap="measurement_simhits_map", + outputDir=str(outputDirCsv), + fileName=str(f"seed.csv"), + ) + s.addWriter(CsvSeedWriter) + + return s + + def addKalmanTracks( s: acts.examples.Sequencer, trackingGeometry: acts.TrackingGeometry, diff --git a/Examples/Python/src/OnnxMlpack.cpp b/Examples/Python/src/OnnxMlpack.cpp index f27555b4c7f..2c2da4cfb7b 100644 --- a/Examples/Python/src/OnnxMlpack.cpp +++ b/Examples/Python/src/OnnxMlpack.cpp @@ -8,6 +8,7 @@ #include "Acts/Plugins/Python/Utilities.hpp" #include "ActsExamples/TrackFindingML/AmbiguityResolutionMLDBScanAlgorithm.hpp" +#include "ActsExamples/TrackFindingML/SeedFilterMLAlgorithm.hpp" #include #include @@ -27,5 +28,11 @@ void addOnnxMlpack(Context& ctx) { ActsExamples::AmbiguityResolutionMLDBScanAlgorithm, mlpack, "AmbiguityResolutionMLDBScanAlgorithm", inputTracks, inputDuplicateNN, outputTracks, nMeasurementsMin, epsilonDBScan, minPointsDBScan); + + ACTS_PYTHON_DECLARE_ALGORITHM(ActsExamples::SeedFilterMLAlgorithm, mlpack, + "SeedFilterMLAlgorithm", inputTrackParameters, + inputSimSeeds, inputSeedFilterNN, + outputTrackParameters, outputSimSeeds, + epsilonDBScan, minPointsDBScan, minSeedScore); } } // namespace Acts::Python diff --git a/Examples/Scripts/Python/MLAmbiguityResolution/ambiguity_solver_full_chain.py b/Examples/Scripts/Python/MLAmbiguityResolution/ambiguity_solver_full_chain.py index e748efa686c..290d38e6bf8 100644 --- a/Examples/Scripts/Python/MLAmbiguityResolution/ambiguity_solver_full_chain.py +++ b/Examples/Scripts/Python/MLAmbiguityResolution/ambiguity_solver_full_chain.py @@ -19,7 +19,6 @@ def readDataSet(CKS_files: list[str]) -> pd.DataFrame: @param[in] CKS_files: DataFrame contain the data from each track files (1 file per events usually) @return: combined DataFrame containing all the track, ordered by events and then by truth particle ID in each event """ - globalindex = 0 data = [] for f in CKS_files: datafile = pd.read_csv(f) diff --git a/Examples/Scripts/Python/MLAmbiguityResolution/ambiguity_solver_network.py b/Examples/Scripts/Python/MLAmbiguityResolution/ambiguity_solver_network.py index 5fbeb1d78eb..cb0a63e6e42 100644 --- a/Examples/Scripts/Python/MLAmbiguityResolution/ambiguity_solver_network.py +++ b/Examples/Scripts/Python/MLAmbiguityResolution/ambiguity_solver_network.py @@ -17,7 +17,7 @@ def prepareDataSet(data: pd.DataFrame) -> pd.DataFrame: data = data # Remove tracks with less than 7 measurements data = data[data["nMeasurements"] > 6] - data = data.sort_values("good/duplicate/fake", ascending=False) + # data = data.sort_values("good/duplicate/fake", ascending=False) # Remove pure duplicate (tracks purely identical) keep the ones good one if among them. 
data = data.drop_duplicates( subset=[ @@ -30,7 +30,7 @@ def prepareDataSet(data: pd.DataFrame) -> pd.DataFrame: ], keep="first", ) - data = data.sort_values("particleId") + # data = data.sort_values("particleId") # Set truth particle ID as index data = data.set_index("particleId") # Transform the hit list from a string to an actual list diff --git a/Examples/Scripts/Python/MLAmbiguityResolution/ambiguity_solver_perf.py b/Examples/Scripts/Python/MLAmbiguityResolution/ambiguity_solver_perf.py index fc2545d88ce..1f1ba26d7c7 100644 --- a/Examples/Scripts/Python/MLAmbiguityResolution/ambiguity_solver_perf.py +++ b/Examples/Scripts/Python/MLAmbiguityResolution/ambiguity_solver_perf.py @@ -14,7 +14,6 @@ def readDataSet(CKS_files: list[str]) -> pd.DataFrame: @param[in] CKS_files: DataFrame contain the data from each track files (1 file per events usually) @return: combined DataFrame containing all the track, ordered by events and then by truth particle ID in each event """ - globalindex = 0 data = [] for f in CKS_files: datafile = pd.read_csv(f) @@ -31,11 +30,16 @@ def readDataSet(CKS_files: list[str]) -> pd.DataFrame: glob.glob("odd_output" + "/event0000000[0-9][0-9]-tracks_ckf.csv") ) CKF_files_resolved = sorted( + glob.glob("odd_output" + "/event0000000[0-9][0-9]-tracks_ambi.csv") +) +ML_files_resolved = sorted( glob.glob("odd_output" + "/event0000000[0-9][0-9]-tracks_ambiML.csv") ) data_track = readDataSet(CKF_files_track) +data_ML_track = readDataSet(CKF_files_track) data_resolved = readDataSet(CKF_files_resolved) +data_ML_resolved = readDataSet(ML_files_resolved) # Compute the algorithm performances nb_part = 0 @@ -49,6 +53,12 @@ def readDataSet(CKS_files: list[str]) -> pd.DataFrame: nb_reco_duplicate = 0 nb_reco_track = 0 +nb_good_match_ML = 0 +nb_reco_part_ML = 0 +nb_reco_fake_ML = 0 +nb_reco_duplicate_ML = 0 +nb_reco_track_ML = 0 + # Compute the different efficiencies for trackEvent, resolvedEvent in zip(data_track, data_resolved): nb_part += trackEvent.loc[trackEvent["good/duplicate/fake"] == "good"].shape[0] @@ -91,13 +101,49 @@ def readDataSet(CKS_files: list[str]) -> pd.DataFrame: ].index.nunique() nb_reco_track += resolvedEvent.shape[0] +# Compute the different efficiencies for ML +for trackEvent, resolvedEvent in zip(data_ML_track, data_ML_resolved): + # Merge two dataFrames and add indicator column + merged_ML = pd.merge( + trackEvent.loc[trackEvent["good/duplicate/fake"] == "good"], + resolvedEvent, + on=[ + "particleId", + "nStates", + "nMeasurements", + "nOutliers", + "nHoles", + "ndf", + "chi2/ndf", + "good/duplicate/fake", + ], + how="left", + indicator="exists", + ) + + # Add column to show if each row in first DataFrame exists in second + merged_ML["exists"] = np.where(merged_ML.exists == "both", True, False) + merged_ML.to_csv(path_or_buf="merged_ML.csv") + + nb_good_match_ML += merged_ML.loc[merged_ML["exists"] == True].shape[0] + nb_reco_fake_ML += resolvedEvent.loc[ + resolvedEvent["good/duplicate/fake"] == "fake" + ].shape[0] + nb_reco_duplicate_ML += resolvedEvent.loc[ + resolvedEvent["good/duplicate/fake"] == "duplicate" + ].shape[0] + nb_reco_part_ML += resolvedEvent.loc[ + resolvedEvent["good/duplicate/fake"] != "fake" + ].index.nunique() + nb_reco_track_ML += resolvedEvent.shape[0] + print("===Initial efficiencies===") print("nb particles : ", nb_part) print("nb track : ", nb_track) print("duplicate rate: ", 100 * nb_duplicate / nb_track, " %") print("Fake rate: ", 100 * nb_fake / nb_track, " %") -print("===computed efficiencies===") +print("===computed 
efficiencies Greedy===") print("nb particles : ", nb_part) print("nb good match : ", nb_good_match) print("nb particle reco : ", nb_reco_part) @@ -106,3 +152,13 @@ def readDataSet(CKS_files: list[str]) -> pd.DataFrame: print("Efficiency (particle reco) : ", 100 * nb_reco_part / nb_part, " %") print("duplicate rate: ", 100 * nb_reco_duplicate / nb_reco_track, " %") print("Fake rate: ", 100 * nb_reco_fake / nb_reco_track, " %") + +print("===computed efficiencies ML===") +print("nb particles: ", nb_part) +print("nb good match: ", nb_good_match_ML) +print("nb particle reco: ", nb_reco_part_ML) +print("nb track reco: ", nb_reco_track_ML) +print("Efficiency (good track): ", 100 * nb_good_match_ML / nb_part, " %") +print("Efficiency (particle reco): ", 100 * nb_reco_part_ML / nb_part, " %") +print("duplicate rate: ", 100 * nb_reco_duplicate_ML / nb_reco_track_ML, " %") +print("Fake rate: ", 100 * nb_reco_fake_ML / nb_reco_track_ML, " %") diff --git a/Examples/Scripts/Python/MLAmbiguityResolution/match_good_track-seed.py b/Examples/Scripts/Python/MLAmbiguityResolution/match_good_track-seed.py index e5321040952..f917eaaf3cc 100644 --- a/Examples/Scripts/Python/MLAmbiguityResolution/match_good_track-seed.py +++ b/Examples/Scripts/Python/MLAmbiguityResolution/match_good_track-seed.py @@ -4,17 +4,15 @@ import numpy as np -def matchGood(seed_files: list[str], ckf_files: list[str]) -> pd.DataFrame: +def matchGood(seed_files: list[str], ckf_files: list[str]): """Read the dataset from the tracks and seeds files, then modify the seed dataset so that good seed correspond to the ones that lead to good tracks. Seed with truth id that do not lead to good tracks are considered as fake. Also create a new dataset with only truth particle associated to a good seeds.""" """ @param[in] Seed_files: List of files containing seeds data (1 file per events usually) @param[in] CKF_files: List of files containing tracks data (1 file per events usually) - @return: combined DataFrame containing all the seed, ordered by events and then by truth particle ID in each events """ data_seed = pd.DataFrame() data_track = pd.DataFrame() goodSeed = pd.DataFrame() - data = pd.DataFrame() # Loop over the different track files and collect the list of seed ID associated to the good tracks for f_ckf, f_seed in zip(ckf_files, seed_files): print("reading file: ", f_ckf, f_seed) @@ -52,7 +50,6 @@ def matchGood(seed_files: list[str], ckf_files: list[str]) -> pd.DataFrame: matchedData = matchedData.set_index("seed_id") matchedData = matchedData.drop(columns=["goodSeed"]) matchedData.to_csv(matched) - data = pd.concat([data, matchedData]) # Save the cleaned dataset for future use (the cleaning is time consuming) cleaned = f_seed[:-4] + "_cleaned.csv" @@ -61,11 +58,11 @@ def matchGood(seed_files: list[str], ckf_files: list[str]) -> pd.DataFrame: cleanedData = cleanedData.drop(columns=["goodSeed"]) cleanedData.to_csv(cleaned) - return data + return # Read the seed and track files and match them # This will allow us to determine which seeds leads to the best possible tracks seed_files = sorted(glob.glob("odd_output" + "/event*-seed.csv")) ckf_files = sorted(glob.glob("odd_output" + "/event*-tracks_ckf.csv")) -data = matchGood(seed_files, ckf_files) +matchGood(seed_files, ckf_files) diff --git a/Examples/Scripts/Python/MLAmbiguityResolution/seed_filter_full_chain.py b/Examples/Scripts/Python/MLAmbiguityResolution/seed_filter_full_chain.py new file mode 100644 index 00000000000..58aae10821e --- /dev/null +++ 
b/Examples/Scripts/Python/MLAmbiguityResolution/seed_filter_full_chain.py @@ -0,0 +1,444 @@ +import glob +import os +import math + +import pandas as pd +import numpy as np + +import torch.utils + +from sklearn.cluster import DBSCAN, KMeans + +from sklearn.preprocessing import LabelEncoder, OrdinalEncoder +from seed_solver_network import prepareDataSet, DuplicateClassifier, Normalise + + +def readDataSet(CKS_files: list[str]) -> pd.DataFrame: + """Read the dataset from the different files, remove the pure duplicate tracks and combine the datasets""" + """ + @param[in] CKS_files: DataFrame contain the data from each track files (1 file per events usually) + @return: combined DataFrame containing all the track, ordered by events and then by truth particle ID in each events + """ + data = [] + for f in CKS_files: + datafile = pd.read_csv(f) + datafile = prepareDataSet(datafile) + data.append(datafile) + return data + + +def prepareInferenceData(data: pd.DataFrame) -> tuple[np.ndarray, np.ndarray]: + """Prepare the data""" + """ + @param[in] data: input DataFrame to be prepared + @return: array of the network input and the corresponding truth + """ + # Remove truth and useless variable + target_column = "good/duplicate/fake" + # Separate the truth from the input variables + y = LabelEncoder().fit(data[target_column]).transform(data[target_column]) + input = data.drop( + columns=[ + target_column, + "seed_id", + "Hits_ID", + "cluster", + ] + ) + # Prepare the input feature + x_cat = OrdinalEncoder().fit_transform(input.select_dtypes("object")) + x = np.concatenate((x_cat, input), axis=1) + return x, y + + +def clusterSeed( + event: pd.DataFrame, DBSCAN_eps: float = 0.03, DBSCAN_min_samples: int = 2 +) -> pd.DataFrame: + """ + Cluster together all the track that appear to belong to the same truth particle + To cluster the tracks together, a DBSCAN is first used followed by a sub clustering based on hits shared by tracks. 
+ """ + """ + @param[in] event: input DataFrame that contain all track in one event + @param[in] DBSCAN_eps: minimum radius used by the DBSCAN to cluster track together + @param[in] DBSCAN_min_samples: minimum number of tracks needed for DBSCAN to create a cluster + @return: DataFrame identical to the output with an added column with the cluster + """ + # Perform the DBSCAN clustering and sort the Db by cluster ID + trackDir = event[["eta", "phi", "vertexZ", "pT"]].to_numpy() + trackDir[:, 2] = trackDir[:, 2] / 50 + # Perform the subclustering + clustering = DBSCAN(eps=DBSCAN_eps, min_samples=DBSCAN_min_samples).fit(trackDir) + clusterarray = renameCluster(clustering.labels_) + event["cluster"] = clusterarray + sorted = event.sort_values(["cluster"], ascending=True) + return sorted + + +def renameCluster(clusterarray: np.ndarray) -> np.ndarray: + """Rename the cluster IDs to be int starting from 0""" + """ + @param[in] clusterarray: numpy array containing the hits IDs and the cluster ID + @return: numpy array with updated cluster IDs + """ + new_id = len(set(clusterarray)) - (1 if -1 in clusterarray else 0) + for i, cluster in enumerate(clusterarray): + if cluster == -1: + clusterarray[i] = new_id + new_id = new_id + 1 + return clusterarray + + +# ================================================================== + +import time + +start = time.time() + +# ttbar events as test input +CKF_files = sorted(glob.glob("odd_output" + "/event0000000[0-1][0-9]-seed_matched.csv")) +data = readDataSet(CKF_files) + +# Data of each event after clustering +clusteredData = [] +# Data of each event after ambiguity resolution +cleanedData = [] + +t1 = time.time() + +# Cluster tracks belonging to the same particle +for event in data: + clustered = clusterSeed(event) + clusteredData.append(clustered) + +t2 = time.time() + +duplicateClassifier = torch.load("seedduplicateClassifier.pt") + +import matplotlib.pyplot as plt + +# Make a copy of the data to be plotted +plotData = [] +plotDF = pd.DataFrame() +for event in clusteredData: + plotData.append(event.copy()) + plotDF = pd.concat([plotDF, event.copy()]) + +# Plot the distribution of the 4 variable +plotDF["eta"].hist(bins=100) +plt.xlabel("eta") +plt.ylabel("nb seed") +plt.savefig("eta.png") +plt.clf() + +plotDF["phi"].hist(bins=100) +plt.xlabel("phi") +plt.ylabel("nb seed") +plt.savefig("phi.png") +plt.clf() + +plotDF["vertexZ"].hist(bins=100) +plt.xlabel("vertexZ") +plt.ylabel("nb seed") +plt.savefig("vertexZ.png") +plt.clf() + +plotDF["pT"].hist(bins=100, range=[0, 10]) +plt.xlabel("pT") +plt.ylabel("nb seed") +plt.savefig("pT.png") +plt.clf() + +plotDF.plot.scatter(x="eta", y="pT") +plt.xlabel("eta") +plt.ylabel("pT") +plt.savefig("pT_eta.png") +plt.clf() + + +plotDF2 = pd.DataFrame() +# Create histogram filled with the number of seeds per cluster +for event in plotData: + event["nb_seed"] = 0 + event["nb_fake"] = 0 + event["nb_duplicate"] = 0 + event["nb_good"] = 0 + event["nb_cluster"] = 0 + event["nb_truth"] = 0 + event["nb_seed_truth"] = 0 + event["nb_seed_removed"] = 0 + event["particleId"] = event.index + event["nb_seed"] = event.groupby(["cluster"])["cluster"].transform("size") + # Create histogram filled with the number of fake seeds per cluster + event.loc[event["good/duplicate/fake"] == "fake", "nb_fake"] = ( + event.loc[event["good/duplicate/fake"] == "fake"] + .groupby(["cluster"])["cluster"] + .transform("size") + ) + # Create histogram filled with the number of duplicate seeds per cluster + event.loc[event["good/duplicate/fake"] == 
"duplicate", "nb_duplicate"] = ( + event.loc[event["good/duplicate/fake"] == "duplicate"] + .groupby(["cluster"])["cluster"] + .transform("size") + ) + # Create histogram filled with the number of good seeds per cluster + event.loc[event["good/duplicate/fake"] == "good", "nb_good"] = ( + event.loc[event["good/duplicate/fake"] == "good"] + .groupby(["cluster"])["cluster"] + .transform("size") + ) + + plotDF2 = pd.concat([plotDF2, event]) + +plotDF2["nb_seed"].hist(bins=20, weights=1 / plotDF2["nb_seed"], range=[0, 20]) +plt.xlabel("nb seed/[cluster]") +plt.ylabel("Arbitrary unit") +plt.savefig("nb_seed.png") +plt.clf() + +plotDF2["nb_fake"].hist(bins=10, weights=1 / plotDF2["nb_seed"], range=[0, 10]) +plt.xlabel("nb fake/[cluster]") +plt.ylabel("Arbitrary unit") +plt.savefig("nb_fake.png") +plt.clf() + +plotDF2["nb_duplicate"].hist(bins=10, weights=1 / plotDF2["nb_seed"], range=[0, 10]) +plt.xlabel("nb duplicate/[cluster]") +plt.ylabel("Arbitrary unit") +plt.savefig("nb_duplicate.png") +plt.clf() + +plotDF2["nb_good"].hist(bins=5, weights=1 / plotDF2["nb_seed"], range=[0, 5]) +plt.xlabel("nb good/[cluster]") +plt.ylabel("Arbitrary unit") +plt.savefig("nb_good.png") +plt.clf() + +t3 = time.time() + +# Performed the MLP based ambiguity resolution +for clusteredEvent in clusteredData: + # Prepare the data + x_test, y_test = prepareInferenceData(clusteredEvent) + x = torch.tensor(x_test, dtype=torch.float32) + output_predict = duplicateClassifier(x).detach().numpy() + + # Create an array of random value between 0 and 1 of the same size as the output + # output_predict = np.random.rand(len(x_test)) + + clusteredEvent["score"] = output_predict + # Keep only the track in cluster of more than 1 track or with a score above 0.5 + idx = clusteredEvent["score"] > 0.1 + cleanedEvent = clusteredEvent[idx] + # For each cluster only keep the seed with the highest score + idx = ( + cleanedEvent.groupby(["cluster"])["score"].transform(max) + == cleanedEvent["score"] + ) + cleanedEvent = cleanedEvent[idx] + # For cluster with more than 1 seed, keep the one with the smallest seed_id + idx = ( + cleanedEvent.groupby(["cluster"])["seed_id"].transform(min) + == cleanedEvent["seed_id"] + ) + cleanedEvent = cleanedEvent[idx] + cleanedData.append(cleanedEvent) + +t4 = time.time() + +# Compute the algorithm performances +nb_part = 0 +nb_track = 0 +nb_fake = 0 +nb_duplicate = 0 + +nb_good_match = 0 +nb_reco_part = 0 +nb_reco_fake = 0 +nb_reco_duplicate = 0 +nb_reco_track = 0 + +for clusteredEvent, cleanedEvent in zip(clusteredData, cleanedData): + nb_part += clusteredEvent.loc[ + clusteredEvent["good/duplicate/fake"] != "fake" + ].index.nunique() + nb_track += clusteredEvent.shape[0] + nb_fake += clusteredEvent.loc[ + clusteredEvent["good/duplicate/fake"] == "fake" + ].shape[0] + nb_duplicate += clusteredEvent.loc[ + clusteredEvent["good/duplicate/fake"] == "duplicate" + ].shape[0] + + nb_good_match += cleanedEvent.loc[ + cleanedEvent["good/duplicate/fake"] == "good" + ].shape[0] + nb_reco_fake += cleanedEvent.loc[ + cleanedEvent["good/duplicate/fake"] == "fake" + ].shape[0] + nb_reco_duplicate += cleanedEvent.loc[ + cleanedEvent["good/duplicate/fake"] == "duplicate" + ].shape[0] + nb_reco_part += cleanedEvent.loc[ + cleanedEvent["good/duplicate/fake"] != "fake" + ].index.nunique() + nb_reco_track += cleanedEvent.shape[0] + +tend = time.time() + +print("===Initial efficiencies===") +print("nb particles: ", nb_part) +print("nb track: ", nb_track) +print("duplicate rate: ", 100 * nb_duplicate / nb_track, " %") 
+print("Fake rate: ", 100 * nb_fake / nb_track, " %") + +print("===computed efficiencies===") +print("nb particles: ", nb_part) +print("nb good match: ", nb_good_match) +print("nb particle reco: ", nb_reco_part) +print("nb track reco: ", nb_reco_track) +print("Efficiency (good track): ", 100 * nb_good_match / nb_part, " %") +print("Efficiency (particle reco): ", 100 * nb_reco_part / nb_part, " %") +print( + "duplicate rate: ", + 100 * ((nb_good_match + nb_reco_duplicate) - nb_reco_part) / nb_reco_track, + " %", +) +print("Fake rate: ", 100 * nb_reco_fake / nb_reco_track, " %") + +print("===computed speed===") +print("Load: ", (t1 - start) * 1000 / len(CKF_files), "ms") +print("Clustering: ", (t2 - t1) * 1000 / len(CKF_files), "ms") +print("Inference: ", (t4 - t3) * 1000 / len(CKF_files), "ms") +print("Perf: ", (tend - t4) * 1000 / len(CKF_files), "ms") +print("tot: ", (t4 - start) * 1000 / len(CKF_files), "ms") +print("Seed filter: ", (t4 - t1) * 1000 / len(CKF_files), "ms") + + +# ================================================================== +# Plotting + +# Combine the events to have a better statistic +clusteredDataPlots = pd.concat(clusteredData) + +cleanedDataPlots = pd.concat(cleanedData) +# cleanedDataPlots = cleanedData[0] + +import matplotlib.pyplot as plt + +# Plot the average score distribution for each type of track +plt.figure() +for tag in ["good", "duplicate", "fake"]: + weights = np.ones_like( + cleanedDataPlots.loc[cleanedDataPlots["good/duplicate/fake"] == tag]["score"] + ) / len( + cleanedDataPlots.loc[cleanedDataPlots["good/duplicate/fake"] == tag]["score"] + ) + plt.hist( + cleanedDataPlots.loc[cleanedDataPlots["good/duplicate/fake"] == tag]["score"], + bins=100, + weights=weights, + alpha=0.65, + label=tag, + ) +plt.legend() +plt.xlabel("score") +plt.ylabel("Fraction of good/duplicate/fake tracks") +plt.title("Score distribution for each type of track") +plt.savefig("score_distribution.png") +plt.yscale("log") +plt.savefig("score_distribution_log.png") + +# Average value of the score +averageCleanedDataPlots = cleanedDataPlots.loc[ + cleanedDataPlots["good/duplicate/fake"] == "good" +].groupby( + pd.cut( + cleanedDataPlots.loc[cleanedDataPlots["good/duplicate/fake"] == "good"]["eta"], + np.linspace(-3, 3, 100), + ) +) +plt.figure() +plt.plot( + np.linspace(-3, 3, 99), + averageCleanedDataPlots["score"].mean(), + label="average score", +) +plt.legend() +plt.xlabel("eta") +plt.ylabel("score") +plt.title("Average score for each eta bin") +plt.savefig("score_eta.png") + +# Plot the pT distribution for each type of track +plt.figure() +plt.hist( + [ + clusteredDataPlots.loc[clusteredDataPlots["good/duplicate/fake"] == "good"][ + "pT" + ], + clusteredDataPlots.loc[ + clusteredDataPlots["good/duplicate/fake"] == "duplicate" + ]["pT"], + clusteredDataPlots.loc[clusteredDataPlots["good/duplicate/fake"] == "fake"][ + "pT" + ], + ], + bins=100, + range=(0, 100), + stacked=False, + label=["good", "duplicate", "fake"], +) +plt.legend() +plt.xlabel("pT") +plt.ylabel("number of tracks") +plt.yscale("log") +plt.title("pT distribution for each type of track") +plt.savefig("pT_distribution.png") + +# Plot the eta distribution for each type of track +plt.figure() +plt.hist( + [ + clusteredDataPlots.loc[clusteredDataPlots["good/duplicate/fake"] == "good"][ + "eta" + ], + clusteredDataPlots.loc[ + clusteredDataPlots["good/duplicate/fake"] == "duplicate" + ]["eta"], + clusteredDataPlots.loc[clusteredDataPlots["good/duplicate/fake"] == "fake"][ + "eta" + ], + ], + bins=100, + 
range=(-3, 3), + stacked=False, + label=["good", "duplicate", "fake"], +) +plt.legend() +plt.xlabel("eta") +plt.ylabel("number of tracks") +plt.yscale("log") +plt.title("eta distribution for each type of track") +plt.savefig("eta_distribution.png") + +# Average value of the score for 50 pt bins +averageCleanedDataPlots = cleanedDataPlots.loc[ + cleanedDataPlots["good/duplicate/fake"] == "good" +].groupby( + pd.cut( + cleanedDataPlots.loc[cleanedDataPlots["good/duplicate/fake"] == "good"]["pT"], + np.linspace(0, 100, 50), + ) +) +plt.figure() +plt.plot( + np.linspace(0, 100, 49), + averageCleanedDataPlots["score"].mean(), + label="average score", +) +plt.legend() +plt.xlabel("pT") +plt.ylabel("score") +plt.title("Average score for each eta bin") +plt.savefig("score_pt.png") diff --git a/Examples/Scripts/Python/MLAmbiguityResolution/seed_solver_network.py b/Examples/Scripts/Python/MLAmbiguityResolution/seed_solver_network.py new file mode 100644 index 00000000000..36de3db37f8 --- /dev/null +++ b/Examples/Scripts/Python/MLAmbiguityResolution/seed_solver_network.py @@ -0,0 +1,65 @@ +import pandas as pd +import numpy as np + +import torch.nn as nn +import torch.nn.functional as F +import torch.utils + +import ast + + +def prepareDataSet(data: pd.DataFrame) -> pd.DataFrame: + """Format the dataset that have been written from the Csv file""" + """ + @param[in] data: input DataFrame containing 1 event + @return: Formatted DataFrame + """ + # Sort by particle ID + data = data.sort_values("particleId") + # Set truth particle ID as index + data = data.set_index("particleId") + # Transform the hit list from a string to an actual list + hitsIds = [] + mergedIds = [] + for list in data["Hits_ID"].values: + hitsIds.append(ast.literal_eval(list)) + data["Hits_ID"] = hitsIds + # Combine dataset + return data + + +class DuplicateClassifier(nn.Module): + """MLP model used to separate goods seed from duplicate seeds. 
Return one score per seed the higher one correspond to the good seed.""" + + def __init__(self, input_dim, n_layers): + """Four layer MLP, sigmoid activation for the last layer.""" + super(DuplicateClassifier, self).__init__() + self.linear1 = nn.Linear(input_dim, n_layers[0]) + self.linear2 = nn.Linear(n_layers[0], n_layers[1]) + self.linear3 = nn.Linear(n_layers[1], n_layers[2]) + self.linear4 = nn.Linear(n_layers[2], n_layers[3]) + self.linear5 = nn.Linear(n_layers[3], n_layers[4]) + self.output = nn.Linear(n_layers[4], 1) + self.sigmoid = nn.Sigmoid() + + def forward(self, z): + z = F.relu(self.linear1(z)) + z = F.relu(self.linear2(z)) + z = F.relu(self.linear3(z)) + z = F.relu(self.linear4(z)) + z = F.relu(self.linear5(z)) + return self.sigmoid(self.output(z)) + + +class Normalise(nn.Module): + """Normalisation of the input before the MLP model.""" + + def __init__(self, mean, std): + super(Normalise, self).__init__() + self.mean = torch.tensor(mean, dtype=torch.float32) + self.std = torch.tensor(std, dtype=torch.float32) + + def forward(self, z): + z = z - self.mean + z = z / self.std + return z diff --git a/Examples/Scripts/Python/MLAmbiguityResolution/seedduplicateClassifier.onnx b/Examples/Scripts/Python/MLAmbiguityResolution/seedduplicateClassifier.onnx new file mode 100755 index 00000000000..f8dd749ef4b Binary files /dev/null and b/Examples/Scripts/Python/MLAmbiguityResolution/seedduplicateClassifier.onnx differ diff --git a/Examples/Scripts/Python/MLAmbiguityResolution/train_ambiguity_solver.py b/Examples/Scripts/Python/MLAmbiguityResolution/train_ambiguity_solver.py index a94ea7cbbbb..c36972ec38b 100644 --- a/Examples/Scripts/Python/MLAmbiguityResolution/train_ambiguity_solver.py +++ b/Examples/Scripts/Python/MLAmbiguityResolution/train_ambiguity_solver.py @@ -15,6 +15,7 @@ avg_mean = [0, 0, 0, 0, 0, 0, 0, 0] avg_sdv = [0, 0, 0, 0, 0, 0, 0, 0] events = 0 +device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") def readDataSet(CKS_files: list[str]) -> pd.DataFrame: @@ -23,7 +24,6 @@ def readDataSet(CKS_files: list[str]) -> pd.DataFrame: @param[in] CKS_files: DataFrame contain the data from each track files (1 file per events usually) @return: combined DataFrame containing all the track, ordered by events and then by truth particle ID in each events """ - globalindex = 0 data = pd.DataFrame() for f in CKS_files: datafile = pd.read_csv(f) @@ -151,6 +151,7 @@ def scoringBatch(batch: list[pd.DataFrame], Optimiser=0) -> tuple[int, int, floa if Optimiser: Optimiser.zero_grad() input = torch.tensor(b_data[1], dtype=torch.float32) + input = input.to(device) prediction = duplicateClassifier(input) # loop over all the track in the batch for index, pred, truth in zip(b_data[0], prediction, b_data[2]): @@ -187,7 +188,7 @@ def scoringBatch(batch: list[pd.DataFrame], Optimiser=0) -> tuple[int, int, floa nb_part += 1 # Normalise the loss to the batch size batch_loss = batch_loss / len(b_data[0]) - loss += batch_loss + loss += batch_loss.item() # Perform the gradient descend if an optimiser was specified if Optimiser: batch_loss.backward() @@ -221,14 +222,14 @@ def train( val_batch = int(len(batch) * (1 - validation)) # Loop over all the epoch for epoch in range(epochs): - print("Epoch : ", epoch, " / ", epochs) + print("Epoch: ", epoch, " / ", epochs) loss = 0.0 nb_part = 0.0 nb_good_match = 0.0 # Loop over all the network over the training batch nb_part, nb_good_match, loss = scoringBatch(batch[:val_batch], Optimiser=opt) - print("Loss/train : ", loss, " Eff/train : ", 
nb_good_match / nb_part) + print("Loss/train: ", loss, " Eff/train: ", nb_good_match / nb_part) writer.add_scalar("Loss/train", loss, epoch) writer.add_scalar("Eff/train", nb_good_match / nb_part, epoch) @@ -237,7 +238,7 @@ def train( nb_part, nb_good_match, loss = scoringBatch(batch[val_batch:]) writer.add_scalar("Loss/val", loss, epoch) writer.add_scalar("Eff/val", nb_good_match / nb_part, epoch) - print("Loss/val : ", loss, " Eff/val : ", nb_good_match / nb_part) + print("Loss/val: ", loss, " Eff/val: ", nb_good_match / nb_part) writer.close() return duplicateClassifier @@ -262,6 +263,7 @@ def train( duplicateClassifier = nn.Sequential( Normalise(avg_mean, avg_sdv), DuplicateClassifier(input_dim, layers_dim) ) +duplicateClassifier = duplicateClassifier.to(device) # Train the model and save it input = data.index, x_train, y_train @@ -292,6 +294,7 @@ def train( output_predict = [] x_test = torch.tensor(x_test, dtype=torch.float32) +x_test = x_test.to(device) for x in x_test: output_predict.append(duplicateClassifier(x)) @@ -326,6 +329,6 @@ def train( if max_match == 1: nb_good_match += 1 -print("nb particles : ", nb_part) -print("nb good match : ", nb_good_match) +print("nb particles: ", nb_part) +print("nb good match: ", nb_good_match) print("Efficiency: ", 100 * nb_good_match / nb_part, " %") diff --git a/Examples/Scripts/Python/MLAmbiguityResolution/train_seed_solver.py b/Examples/Scripts/Python/MLAmbiguityResolution/train_seed_solver.py new file mode 100644 index 00000000000..f3f8b9930df --- /dev/null +++ b/Examples/Scripts/Python/MLAmbiguityResolution/train_seed_solver.py @@ -0,0 +1,393 @@ +import glob + +import pandas as pd +import numpy as np + +import torch.nn as nn +import torch.nn.functional as F +import torch.utils +from torch.utils.tensorboard import SummaryWriter + +from sklearn.preprocessing import LabelEncoder, StandardScaler, OrdinalEncoder + +from seed_solver_network import ( + prepareDataSet, + DuplicateClassifier, + Normalise, +) + +avg_mean = [0] * 14 +avg_sdv = [0] * 14 +events = 0 +device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") + + +def readDataSet(Seed_files: list[str]) -> pd.DataFrame: + """Read the dataset from the different files, remove the particle with only fakes and combine the datasets""" + """ + @param[in] Seed_files: DataFrame contain the data from each seed files (1 file per events usually) + @return: combined DataFrame containing all the seed, ordered by events and then by truth particle ID in each events + """ + data = pd.DataFrame() + for f in Seed_files: + datafile = pd.read_csv(f) + datafile = prepareDataSet(datafile) + data = pd.concat([data, datafile]) + return data + + +def prepareTrainingData(data: pd.DataFrame) -> tuple[np.ndarray, np.ndarray]: + """Prepare the data""" + """ + @param[in] data: input DataFrame to be prepared + @return: array of the network input and the corresponding truth + """ + # Remove truth and useless variable + target_column = "good/duplicate/fake" + # Separate the truth from the input variables + y = LabelEncoder().fit(data[target_column]).transform(data[target_column]) + input = data.drop( + columns=[ + target_column, + "seed_id", + "Hits_ID", + ] + ) + # Compute the normalisation factors + scale = StandardScaler() + scale.fit(input.select_dtypes("number")) + # Variables to compute the normalisation + global avg_mean + avg_mean = avg_mean + scale.mean_ + global avg_sdv + avg_sdv = avg_sdv + scale.var_ + global events + events = events + 1 + # Prepare the input feature + x_cat = 
OrdinalEncoder().fit_transform(input.select_dtypes("object")) + x = np.concatenate((x_cat, input), axis=1) + return x, y + + +def batchSplit(data: pd.DataFrame, batch_size: int) -> list[pd.DataFrame]: + """Split the data into batch each containing @batch_size truth particles (the number of corresponding seeds may vary)""" + """ + @param[in] data: input DataFrame to be cut into batch + @param[in] batch_size: Number of truth particles per batch + @return: list of DataFrame, each element correspond to a batch + """ + batch = [] + pid = data[0][0] + n_particle = 0 + id_prev = 0 + id = 0 + for index, row, truth in zip(data[0], data[1], data[2]): + if index != pid: + pid = index + n_particle += 1 + if n_particle == batch_size: + b = data[0][id_prev:id], data[1][id_prev:id], data[2][id_prev:id] + batch.append(b) + n_particle = 0 + id_prev = id + id += 1 + return batch + + +def computeLoss( + score_good: torch.Tensor, + score_duplicate: list[torch.Tensor], + score_fake: list[torch.Tensor], + batch_loss: torch.Tensor, + margin_duplicate: float = 0.3, + margin_fake: float = 0.9, +) -> torch.Tensor: + """Compute one loss for each duplicate seed associated with the particle""" + """ + @param[in] score_good: score return by the model for the good seed associated with this particle + @param[in] score_duplicate: list of the scores of all duplicate seed associated with this particle + @param[in] margin_duplicate: Margin used in the computation of the MarginRankingLoss for duplicate seeds + @param[in] margin_fake: Margin used in the computation of the MarginRankingLoss for fake seeds + @return: return the updated loss + """ + # Compute the losses using the MarginRankingLoss with respect to the good seed score + batch_loss = batch_loss + if score_duplicate: + for s in score_duplicate: + batch_loss += F.relu(s - score_good + margin_duplicate) / ( + len(score_duplicate) + len(score_fake) + 1 + ) + if score_fake: + for s in score_fake: + batch_loss += F.relu(s - score_good + margin_fake) / ( + len(score_duplicate) + len(score_fake) + 1 + ) + batch_loss += margin_fake / (len(score_duplicate) + len(score_fake) + 1) + + return batch_loss + + +def scoringBatch(batch: list[pd.DataFrame], Optimiser=0) -> tuple[int, int, float]: + """Run the MLP on a batch and compute the corresponding efficiency and loss. If an optimiser is specify train the MLP.""" + """ + @param[in] batch: list of DataFrame, each element correspond to a batch + @param[in] Optimiser: Optimiser for the MLP, if one is specify the network will be train on batch. 
+ @return: array containing the number of particles, the number of particle where the good seed was found and the loss + """ + # number of particles + nb_part = 0 + # number of particles associated with a good seed + nb_good_match = 0 + # number of particles associated with a best seed + nb_best_match = 0 + # loss for the batch + loss = 0 + # best seed score for a particle + max_score = 0 + # is the best score associated with the good seed + max_match = 1 + # loop over all the batch + for b_data in batch: + # ID of the current particle + pid = b_data[0][0] + # loss for the current batch + batch_loss = 0 + # score of the good seed + score_good = 1 + # score of the duplicate seeds + score_duplicate = [] + # score of the fake seeds + score_fake = [] + + if Optimiser: + Optimiser.zero_grad() + input = torch.tensor(b_data[1], dtype=torch.float32) + input = input.to(device) + prediction = duplicateClassifier(input) + # loop over all the seed in the batch + for index, pred, truth in zip(b_data[0], prediction, b_data[2]): + # If we are changing particle update the loss + if index != pid: + # Starting a new particles, compute the loss for the previous one + if max_match == 0 or max_match == 2: + nb_good_match += 1 + if max_match == 2: + nb_best_match += 1 + batch_loss = computeLoss( + score_good, + score_duplicate, + score_fake, + batch_loss, + margin_duplicate=0.2, + margin_fake=0.4, + ) + nb_part += 1 + # Reinitialise the variable for the next particle + pid = index + score_duplicate = [] + score_fake = [] + score_good = 1 + max_score = 0 + max_match = 1 + # Store seed scores + if truth == 2: + score_good = pred + elif truth == 0: + score_duplicate.append(pred) + else: + score_fake.append(pred) + # Prepare efficiency computation + if pred > max_score: + max_score = pred + max_match = truth + # Compute the loss for the last particle when reaching the end of the batch + if max_match == 0 or max_match == 2: + nb_good_match += 1 + if max_score == 2: + nb_best_match += 1 + batch_loss = computeLoss( + score_good, + score_duplicate, + score_fake, + batch_loss, + margin_duplicate=0.2, + margin_fake=0.4, + ) + nb_part += 1 + # Normalise the loss to the batch size + batch_loss = batch_loss / len(b_data[0]) + loss += batch_loss.item() + # Perform the gradient descent if an optimiser was specified + if Optimiser: + batch_loss.backward() + Optimiser.step() + loss = loss / len(batch) + return nb_part, nb_good_match, nb_best_match, loss + + +def train( + duplicateClassifier: DuplicateClassifier, + data: tuple[np.ndarray, np.ndarray, np.ndarray], + epochs: int = 20, + batch: int = 32, + validation: float = 0.3, +) -> DuplicateClassifier: + """Training of the MLP""" + """ + @param[in] duplicateClassifier: model to be trained. + @param[in] data: tuple containing three list. Each element of those list correspond to a given seed and represent : the truth particle ID, the seed parameters and the truth. + @param[in] epochs: number of epoch the model will be trained for. 
+ @param[in] batch: size of the batch used in the training + @param[in] validation: Fraction of the batches used for the validation + @return: trained model + """ + # Prepare tensorboard for the training plot + # use 'tensorboard --logdir=runs' to access the plot afterward + writer = SummaryWriter() + opt = torch.optim.Adam(duplicateClassifier.parameters()) + # Split the data into batches + batch = batchSplit(data, batch) + val_batch = int(len(batch) * (1 - validation)) + # Loop over all the epochs + for epoch in range(epochs): + print("Epoch: ", epoch, " / ", epochs) + loss = 0.0 + nb_part = 0.0 + nb_good_match = 0.0 + + # Train the network over the training batches + nb_part, nb_good_match, nb_best_match, loss = scoringBatch( + batch[:val_batch], Optimiser=opt + ) + print( + "Loss/train: ", + loss, + " Eff/train: ", + nb_good_match / nb_part, + " Eff_best/train: ", + nb_best_match / nb_part, + ) + writer.add_scalar("Loss/train", loss, epoch) + writer.add_scalar("Eff/train", nb_good_match / nb_part, epoch) + writer.add_scalar("Eff_best/train", nb_best_match / nb_part, epoch) + + # If using validation, compute the efficiency and loss over the validation batches + if validation > 0.0: + nb_part, nb_good_match, nb_best_match, loss = scoringBatch( + batch[val_batch:] + ) + writer.add_scalar("Loss/val", loss, epoch) + writer.add_scalar("Eff/val", nb_good_match / nb_part, epoch) + writer.add_scalar("Eff_best/val", nb_best_match / nb_part, epoch) + print( + "Loss/val: ", + loss, + " Eff/val: ", + nb_good_match / nb_part, + " Eff_best/val: ", + nb_best_match / nb_part, + ) + + writer.close() + return duplicateClassifier + + +# ================================================================== + +# ttbar events used as the training input, here we assume 1000 events are available +CKF_files = sorted( + glob.glob("odd_output" + "/event000000[0-9][0-9][0-9]-seed_cleaned.csv") +) +data = readDataSet(CKF_files) +# Prepare the data +x_train, y_train = prepareTrainingData(data) + +avg_mean = [x / events for x in avg_mean] +avg_sdv = [x / events for x in avg_sdv] + +# Create our model and choose the layer sizes +input_dim = np.shape(x_train)[1] +layers_dim = [80, 80, 100, 80, 80] + +duplicateClassifier = nn.Sequential( + Normalise(avg_mean, avg_sdv), DuplicateClassifier(input_dim, layers_dim) +) +duplicateClassifier = duplicateClassifier.to(device) + +# Train the model and save it +input = data.index, x_train, y_train +train(duplicateClassifier, input, epochs=30, batch=128, validation=0.3) +duplicateClassifier.eval() +input_test = torch.tensor(x_train, dtype=torch.float32) +torch.save(duplicateClassifier, "seedduplicateClassifier.pt") +torch.onnx.export( + duplicateClassifier, + input_test[0], + "seedduplicateClassifier.onnx", + input_names=["x"], + output_names=["y"], + dynamic_axes={"x": {0: "batch_size"}, "y": {0: "batch_size"}}, +) + +del CKF_files +del data +del x_train, y_train +del input, input_test +del duplicateClassifier +# ================================================================== + +# ttbar events for the test, here we assume 40 events are available +CKF_files_test = sorted( + glob.glob("odd_output" + "/event000001[0-0][0-9][0-9]-seed_cleaned.csv") +) + +test = readDataSet(CKF_files_test) + +# Prepare the data +x_test, y_test = prepareTrainingData(test) + +# Write the network score to a list +output_predict = [] + +model = torch.load("seedduplicateClassifier.pt") + +x_test = torch.tensor(x_test, dtype=torch.float32) +x_test = x_test.to(device) +for x in x_test: + output_predict.append(model(x))
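+
+# Optionally, the exported ONNX model can be cross-checked against the PyTorch scores.
+# This is only a minimal sketch: it assumes the onnxruntime Python package is installed
+# and reuses the input/output names "x" and "y" chosen in the torch.onnx.export call above.
+try:
+    import onnxruntime as ort
+
+    ort_session = ort.InferenceSession("seedduplicateClassifier.onnx")
+    onnx_score = ort_session.run(["y"], {"x": x_test[0].cpu().numpy()})[0]
+    torch_score = output_predict[0].detach().cpu().numpy()
+    print("ONNX score: ", onnx_score, " PyTorch score: ", torch_score)
+except ImportError:
+    print("onnxruntime not available, skipping the ONNX cross-check")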
+ +# For the first 100 seeds, print the particle ID, score and truth +for sample_test, sample_predict, sample_true in zip( + test.index[0:100], output_predict[0:100], y_test[0:100] +): + print(sample_test, sample_predict, sample_true) + +id = 0 +pid = test.index[0] +nb_part = 0 +nb_good_match = 0 +nb_best_match = 0 +max_match = 1 +max_score = 0 + +# Compute the efficiency +for index, pred, truth in zip(test.index, output_predict, y_test): + if index != pid: + nb_part += 1 + if max_match == 0 or max_match == 2: + nb_good_match += 1 + if max_match == 2: + nb_best_match += 1 + pid = index + max_match = 1 + max_score = 0 + + if pred > max_score: + max_score = pred + max_match = truth + +print("nb particles: ", nb_part) +print("nb good match: ", nb_good_match) +print("nb best match: ", nb_best_match) +print("Efficiency: ", 100 * nb_good_match / nb_part, " %") +print("Efficiency_best: ", 100 * nb_best_match / nb_part, " %") diff --git a/Examples/Scripts/Python/full_chain_odd.py b/Examples/Scripts/Python/full_chain_odd.py index e42e8bc69f4..66d53ddedf3 100755 --- a/Examples/Scripts/Python/full_chain_odd.py +++ b/Examples/Scripts/Python/full_chain_odd.py @@ -23,6 +23,8 @@ AmbiguityResolutionMLConfig, addVertexFitting, VertexFinder, + addSeedFilterML, + SeedFilterMLDBScanConfig, ) from common import getOpenDataDetectorDirectory from acts.examples.odd import getOpenDataDetector @@ -43,12 +45,18 @@ help="Use the Ml Ambiguity Solver instead of the classical one", action="store_true", ) +parser.add_argument( + "--MLSeedFilter", + help="Use the ML seed filter to select seeds after the seeding step", + action="store_true", +) args = vars(parser.parse_args()) ttbar = args["ttbar"] g4_simulation = args["geant4"] ambiguity_MLSolver = args["MLSolver"] +seedFilter_ML = args["MLSeedFilter"] u = acts.UnitConstants geoDir = getOpenDataDetectorDirectory() outputDir = pathlib.Path.cwd() / "odd_output" @@ -164,6 +172,17 @@ outputDirRoot=outputDir, # outputDirCsv=outputDir, ) +if seedFilter_ML: + addSeedFilterML( + s, + SeedFilterMLDBScanConfig( + epsilonDBScan=0.03, minPointsDBScan=2, minSeedScore=0.1 + ), + onnxModelFile=os.path.dirname(__file__) + + "/MLAmbiguityResolution/seedDuplicateClassifier.onnx", + outputDirRoot=outputDir, + # outputDirCsv=outputDir, + ) addCKFTracks( s, diff --git a/Plugins/Mlpack/CMakeLists.txt b/Plugins/Mlpack/CMakeLists.txt index 1fc9090e4f3..132921eb814 100644 --- a/Plugins/Mlpack/CMakeLists.txt +++ b/Plugins/Mlpack/CMakeLists.txt @@ -1,7 +1,8 @@ add_library( ActsPluginMlpack SHARED # header files - include/Acts/Plugins/Mlpack/AmbiguityDBScanClustering.hpp) + include/Acts/Plugins/Mlpack/AmbiguityDBScanClustering.hpp + include/Acts/Plugins/Mlpack/SeedFilterDBScanClustering.hpp) target_include_directories( ActsPluginMlpack diff --git a/Plugins/Mlpack/include/Acts/Plugins/Mlpack/AmbiguityDBScanClustering.hpp b/Plugins/Mlpack/include/Acts/Plugins/Mlpack/AmbiguityDBScanClustering.hpp index 273dd863846..d726a6db888 100644 --- a/Plugins/Mlpack/include/Acts/Plugins/Mlpack/AmbiguityDBScanClustering.hpp +++ b/Plugins/Mlpack/include/Acts/Plugins/Mlpack/AmbiguityDBScanClustering.hpp @@ -28,22 +28,23 @@ namespace Acts { /// @return an unordered map representing the clusters, the keys the ID of the primary track of each cluster and the store a vector of track IDs.
template class holder_t> -std::unordered_map> dbscanTrackClustering( - std::multimap>>& trackMap, +std::unordered_map> dbscanTrackClustering( + std::multimap>>& + trackMap, const Acts::TrackContainer& tracks, float epsilon = 0.07, int minPoints = 2) { // Unordered map associating a vector with all the track ID of a cluster to // the ID of the first track of the cluster - std::unordered_map> cluster; + std::unordered_map> cluster; // Unordered map associating hits to the ID of the first track of the // different clusters. - std::unordered_map hitToTrack; + std::unordered_map hitToTrack; // DBSCAN algorithm from MLpack used in the track clustering mlpack::DBSCAN dbscan(epsilon, minPoints); arma::mat data(2, trackMap.size()); - int trackID = 0; + std::size_t trackID = 0; arma::Row assignments; // Get the input feature of the network for all the tracks @@ -57,12 +58,13 @@ std::unordered_map> dbscanTrackClustering( trackID = 0; // Cluster track with DBScan - std::vector>>> + std::vector< + std::multimap>>> dbscanClusters(clusterNb); for (const auto& [key, val] : trackMap) { - int clusterID = assignments(trackID); + std::size_t clusterID = assignments(trackID); if (assignments(trackID) == SIZE_MAX) { - cluster.emplace(val.first, std::vector(1, val.first)); + cluster.emplace(val.first, std::vector(1, val.first)); } else { dbscanClusters[clusterID].emplace(key, val); } diff --git a/Plugins/Mlpack/include/Acts/Plugins/Mlpack/SeedFilterDBScanClustering.hpp b/Plugins/Mlpack/include/Acts/Plugins/Mlpack/SeedFilterDBScanClustering.hpp new file mode 100644 index 00000000000..369650ed3e5 --- /dev/null +++ b/Plugins/Mlpack/include/Acts/Plugins/Mlpack/SeedFilterDBScanClustering.hpp @@ -0,0 +1,62 @@ +// This file is part of the Acts project. +// +// Copyright (C) 2023 CERN for the benefit of the Acts project +// +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#pragma once + +#include +#include +#include + +#include "mlpack/methods/dbscan.hpp" + +namespace Acts { + +/// Clusters seeds based on their direction, their Z impact parameter and their +/// momentum using DBScan +/// +/// @param input : Input parameters for the clustering (phi, eta, z, Pt) +/// @param epsilon : Maximum distance between 2 seeds to be clustered +/// @param minPoints : Minimum number of seeds to create a cluster +/// @return a vector of clusters, where each cluster is a vector of seed IDs
+std::vector> dbscanSeedClustering( + const std::vector>& input, float epsilon = 0.03, + int minPoints = 2) { + // DBSCAN algorithm from MLpack used in the seed clustering + mlpack::DBSCAN dbscan(epsilon, minPoints); + + // Compute the space dimension of the input + int dim = input[0].size(); + + // Prepare the input for the DBScan + arma::mat data(dim, input.size()); + arma::Row assignments; + std::size_t trackID = 0; + for (const auto& param : input) { + for (int i = 0; i < dim; i++) { + data(i, trackID) = param[i]; + } + trackID++; + } + // Cluster track with DBScan + std::size_t clusterNb = dbscan.Cluster(data, assignments); + + // Prepare the output + std::vector> cluster(clusterNb, + std::vector()); + for (std::size_t iD = 0; iD < input.size(); iD++) { + std::size_t clusterID = assignments(iD); + if (assignments(iD) == SIZE_MAX) { + cluster.push_back(std::vector(1, iD)); + } else { + cluster[clusterID].push_back(iD); + } + } + return cluster; +} + +} // namespace Acts diff --git a/Plugins/Onnx/CMakeLists.txt b/Plugins/Onnx/CMakeLists.txt index 3e04aefad4c..f4b756fb38a 100644 --- a/Plugins/Onnx/CMakeLists.txt +++ b/Plugins/Onnx/CMakeLists.txt @@ -4,6 +4,7 @@ add_library( include/Acts/Plugins/Onnx/OnnxRuntimeBase.hpp include/Acts/Plugins/Onnx/MLTrackClassifier.hpp include/Acts/Plugins/Onnx/AmbiguityTrackClassifier.hpp + include/Acts/Plugins/Onnx/SeedClassifier.hpp # source files src/OnnxRuntimeBase.cpp src/MLTrackClassifier.cpp) diff --git a/Plugins/Onnx/include/Acts/Plugins/Onnx/AmbiguityTrackClassifier.hpp b/Plugins/Onnx/include/Acts/Plugins/Onnx/AmbiguityTrackClassifier.hpp index 24cf17db84d..8759450abd4 100644 --- a/Plugins/Onnx/include/Acts/Plugins/Onnx/AmbiguityTrackClassifier.hpp +++ b/Plugins/Onnx/include/Acts/Plugins/Onnx/AmbiguityTrackClassifier.hpp @@ -39,7 +39,7 @@ class AmbiguityTrackClassifier { template class holder_t> std::vector> inferScores( - std::unordered_map>& clusters, + std::unordered_map>& clusters, const Acts::TrackContainer& tracks) const { // Compute the number of entry (since it is smaller than the number of @@ -50,7 +50,7 @@ class AmbiguityTrackClassifier { } // Input of the neural network Acts::NetworkBatchInput networkInput(trackNb, 8); - int inputID = 0; + std::size_t inputID = 0; // Get the input feature of the network for all the tracks for (const auto& [key, val] : clusters) { for (const auto& trackID : val) { @@ -80,15 +80,15 @@ class AmbiguityTrackClassifier { /// @param clusters is a map of clusters, each cluster correspond to a vector of track ID /// @param outputTensor is the score vector obtained from inferScores. 
/// @return a vector of trackID corresponding tho the good tracks - std::vector trackSelection( - std::unordered_map>& clusters, + std::vector trackSelection( + std::unordered_map>& clusters, std::vector>& outputTensor) const { - std::vector goodTracks; - int iOut = 0; + std::vector goodTracks; + std::size_t iOut = 0; // Loop over all the cluster and only keep the track with the highest score // in each cluster for (const auto& [key, val] : clusters) { - int bestTrackID = 0; + std::size_t bestTrackID = 0; float bestTrackScore = 0; for (const auto& track : val) { if (outputTensor[iOut][0] > bestTrackScore) { @@ -109,13 +109,14 @@ class AmbiguityTrackClassifier { /// @return a vector of trackID corresponding tho the good tracks template class holder_t> - std::vector solveAmbuguity( - std::unordered_map>& clusters, + std::vector solveAmbiguity( + std::unordered_map>& clusters, const Acts::TrackContainer& tracks) const { std::vector> outputTensor = inferScores(clusters, tracks); - std::vector goodTracks = trackSelection(clusters, outputTensor); + std::vector goodTracks = + trackSelection(clusters, outputTensor); return goodTracks; } diff --git a/Plugins/Onnx/include/Acts/Plugins/Onnx/SeedClassifier.hpp b/Plugins/Onnx/include/Acts/Plugins/Onnx/SeedClassifier.hpp new file mode 100644 index 00000000000..77dfef6c51d --- /dev/null +++ b/Plugins/Onnx/include/Acts/Plugins/Onnx/SeedClassifier.hpp @@ -0,0 +1,92 @@ +// This file is part of the Acts project. +// +// Copyright (C) 2023 CERN for the benefit of the Acts project +// +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#pragma once + +#include "Acts/Plugins/Onnx/OnnxRuntimeBase.hpp" + +#include + +#include + +namespace Acts { + +/// Onnx model implementation for seed scoring and selection +class SeedClassifier { + public: + /// Construct the scoring algorithm. + /// + /// @param modelPath path to the model file + SeedClassifier(const char* modelPath) + : m_env(ORT_LOGGING_LEVEL_WARNING, "MLSeedClassifier"), + m_duplicateClassifier(m_env, modelPath){}; + + /// Compute a score for each seed to be used in the seed selection + /// + /// @param networkInput input of the network + /// @return a vector of vector of seed score. Due to the architecture of the network each seed only have a size 1 score vector. + std::vector> inferScores( + Acts::NetworkBatchInput& networkInput) const { + // Use the network to compute a score for all the Seeds. + std::vector> outputTensor = + m_duplicateClassifier.runONNXInference(networkInput); + return outputTensor; + } + + /// Select the seed associated with each cluster based on the score vector + /// + /// @param clusters is a vector of clusters, each cluster corresponds to a vector of seedIDs + /// @param outputTensor is the score vector obtained from inferScores. 
+ /// @param minSeedScore is the minimum score a seed needs to be selected + /// @return a vector of seedIDs corresponding to the good seeds + std::vector seedSelection( + std::vector>& clusters, + std::vector>& outputTensor, + float minSeedScore = 0.1) const { + std::vector goodSeeds; + // Loop over all the clusters and only keep the seed with the highest score + // in each cluster + for (const auto& cluster : clusters) { + std::size_t bestseedID = 0; + float bestSeedScore = 0; + for (const auto& seed : cluster) { + if (outputTensor[seed][0] > bestSeedScore) { + bestSeedScore = outputTensor[seed][0]; + bestseedID = seed; + } + } + if (bestSeedScore >= minSeedScore) { + goodSeeds.push_back(bestseedID); + } + } + return goodSeeds; + } + + /// Select the seed associated with each cluster + /// + /// @param clusters is a vector of clusters, each cluster corresponds to a vector of seed IDs + /// @param networkInput input of the network + /// @param minSeedScore is the minimum score a seed needs to be selected + /// @return a vector of seedIDs corresponding to the good seeds + std::vector solveAmbiguity( + std::vector>& clusters, + Acts::NetworkBatchInput& networkInput, float minSeedScore = 0.1) const { + std::vector> outputTensor = inferScores(networkInput); + std::vector goodSeeds = + seedSelection(clusters, outputTensor, minSeedScore); + return goodSeeds; + } + + private: + // ONNX environment + Ort::Env m_env; + // ONNX model for the duplicate neural network + Acts::OnnxRuntimeBase m_duplicateClassifier; +}; + +} // namespace Acts diff --git a/docs/plugins/MLAlgorithms.md b/docs/plugins/MLAlgorithms.md index 2534bcb4cfe..3b4f84e0af1 100644 --- a/docs/plugins/MLAlgorithms.md +++ b/docs/plugins/MLAlgorithms.md @@ -1,38 +1,66 @@ -# Machine leaning algorithms +# Machine learning algorithms -ACTS allows you to replace some components of the tracking chain by machine learning solutions. -For now a replacement to the ambiguity solver is available, but when others are implemented they will be explained here. +ACTS allows you to replace some components of the tracking chain with machine-learning solutions. +A replacement for the ambiguity solver and a filtering algorithm for seeds are available for now, but when others are implemented, they will be explained here. ## Onnx plugin -To be able to perform neural network models' inferences in C++ ACTS uses [onnxruntime](https://onnxruntime.ai/). An interface to use it has been implemented as an ACTS plugin, to use it you will need to compile ACTS with the `ACTS_PLUGIN_ONNX` option. For more detail on how to export your model to onnx please see the documentation on their [website](https://onnxruntime.ai/docs/) +To be able to perform neural network (NN) models' inferences in C++, ACTS uses [ONNX Runtime](https://onnxruntime.ai/). An interface to use ONNX Runtime has been implemented as an ACTS plugin; to use it, you will need to compile ACTS with the `ACTS_PLUGIN_ONNX` option. For more details on how to export your model to ONNX, please see the documentation on their [website](https://onnxruntime.ai/docs/). ### OnnxRuntimeBase -The `OnnxRuntimeBase` class implement the inference of a standard MLP via Onnx and just require a link to the `.onnx` file containing the model one wants to use. Please note that right now the implementation of the inference in ACTS only work with a single input node and a single output node. The inference can be both perform in single entry mode and in batch mode.
In single entry mode, the `runONNXInference` method takes a single vector as entry, each element of the vector corresponding to one of the input features of your network. In batch mode, the input is an `Eigen::Array` with the columns corresponding to the input features and rows to the different batch inputs. +The `OnnxRuntimeBase` class implements the inference of a standard MLP via ONNX. The class just requires a link to the `.onnx` file containing the model one wants to use. Please note, that the implementation of the inference in ACTS only works with a single input node and a single output node. The inference can be performed in single entry and batch modes. In single entry mode, the `runONNXInference` method takes a single vector as an entry, each element of the vector corresponding to one of the input features of your network. In batch mode, the input is an `Eigen::Array` with the columns corresponding to the input features and rows to the different batch inputs. ## AmbiguityResolutionMLAlgorithm -The goal of the ambiguity solver is to remove duplicated and fake tracks that remains after the CKF. To perform this cleaning, this algorithm works in three steps : +The goal of the ambiguity solver is to remove duplicated and fake tracks that remain after the CKF. To perform this cleaning, this algorithm works in three steps: - Clustering: tracks are clustered together, one cluster ~ one truth particle -- Ranking: tracks in each cluster are ranked, the best one is kept +- Ranking: tracks in each cluster are scored, the best one is kept - Cleaning: last pass over all the remaining tracks to remove duplicate and fake (not implemented yet) ### Clustering -The clustering is implemented with the `clusterTracks` function. Its input is a multimap of pair of track ID and vector of measurement ID. The multimap uses the number of measurement associated with the tracks as a key, this is only a trick to sort the tracks by the number of measurements efficiently. Then for each track, starting with the one which has the most measurements, we check if a cluster shares a hits with the track. If not, we create a new cluster and associate all the hits of the current track with the cluster. If yes, the track is added to that cluster (note that the hits associated to the cluster doesn't change here). After looping over all the tracks, each of them should have been associated with a cluster. +The clustering is implemented with the `clusterTracks` function. Its input is a multimap of a pair of track IDs and a vector of measurement IDs. The multimap uses the number of measurements associated with the tracks as a key, which is only a trick to sort the tracks efficiently by the number of measurements. Then, for each track, starting with the one with the most measurements, we check if a cluster shares a hit with the track. If not, we create a new cluster and associate all the hits of the current track with the cluster. If yes, the track is added to that cluster (note that the hits associated with the cluster don’t change here). After looping over all the tracks, each should have been associated with a cluster. ### Ranking -At this step we have multiple clusters of tracks. We use a NN to compute a score for each track, the closer the score is to 1, the more confident we are that the track is the best track (the best one associated with the truth particle). Then for each cluster, we select the track with the highest score. +At this step, we have multiple clusters of tracks. 
We use a NN to compute a score for each track; the closer the score is to 1, the more confident we are that the track is the best (the best one associated with the truth particle). Then, for each cluster, we select the track with the highest score. ### Cleaning -Finally, for the remaining tracks, there might be some fakes and duplicates. With the first tests it doesn't seem to be the case, so the current implementation stops here. But in the future if some configuration encounters too many fake/duplicate after the ranking a simple classification NN could be used to separate the good tracks from the others. +Finally, for the remaining tracks, there might be some fakes and duplicates. It doesn’t seem to be the case with the first tests, so the current implementation stops here. But in the future, if some configuration encounters too many fake/duplicates after the ranking, a simple classification NN could be used to separate the good tracks from the others. + +Running the ACTS Greedy Solver after the ML Solver can be helpful in some cases; it will likely remove most of the remaining fakes while being extremely fast and not affecting the reconstruction performances (since both algorithms are orthogonal). The default implementation in the full ODD chain uses this approach. + +### How to use + +The ML-based Ambiguity solver comes with a pre-trained model to be used with the ODD. Using it is extremely simple; just call the `full_chain_odd.py` with the `--MLSolver` option. If you want to try this solver with another detector, a few more steps are needed. First, you need to train a model. As an input, you will need the multitrajectory output from the CKF (run a full chain up to the CKF, then use the `CSVWriter`). At least 100 ttbar events with 200 PU are needed (1000 would be ideal). You can then run `Examples/Scripts/Python/MLAmbiguityResolution/train_ambiguity_solver.py` on our dataset, 2/3 of the data should be used for the training and 1/3 for the validation. You will receive a `duplicateClassifier.onnx` file as an output, which can now be used as an input for the `AmbiguityResolutionMLAlgorithm` with your detector. + +Additionally, two other Python files are present with the training. +- `ambiguity_solver_full_chain.py` performs the same job as the chain we presented here but in Python using CSV files as input. +- `ambiguity_solver_perf.py` can be used to study the performance of the ambiguity solver (with and without ML) and also takes CSV files as input. + +## MLSeedFiltering + +While the ambiguity solver can significantly improve the cleanliness of the output removing both duplicates and fakes from the final collection, it doesn't help with the speed of the full tracking chain. The MLSeedFiltering aims to use a NN to determine which seed will lead to the best trajectory before performing the track reconstruction. We can select the seed used in track reconstruction based on the NN score. Depending on the full tracking setup, this might help us improve the track reconstruction speed, the track reconstructed quality, or both. In the case of the ODD, a speed improvement of x10 was observed in track reconstruction speed for ttbar, mu=200 G4+Pythia8 events. 
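+
+The Python sketch below summarises the idea behind the clustering and ranking steps described in the following sections. It is only an illustration: the plugin itself uses mlpack's DBSCAN and an ONNX model, while this sketch relies on scikit-learn's DBSCAN; the helper name `filter_seeds` and the way the parameter weights are applied are placeholders that mirror the defaults of the `SeedFilterMLAlgorithm` configuration.
+
+```python
+import numpy as np
+from sklearn.cluster import DBSCAN
+
+
+def filter_seeds(params, scores, eps=0.03, min_samples=2, min_seed_score=0.1,
+                 weights=(1.0, 1.0, 50.0, 1.0)):
+    """params: (n_seeds, 4) array of (phi, eta, z, pT); scores: one NN score per seed."""
+    # Weight the seed parameters before clustering (z is weighted more strongly)
+    x = np.asarray(params) * np.asarray(weights)
+    labels = DBSCAN(eps=eps, min_samples=min_samples).fit_predict(x)
+    keep = []
+    for label in np.unique(labels):
+        ids = np.where(labels == label)[0]
+        # Noise points (label -1) are treated as single-seed clusters
+        clusters = [[i] for i in ids] if label == -1 else [ids]
+        for cluster in clusters:
+            best = max(cluster, key=lambda i: scores[i])
+            # Keep only the best seed of each cluster, and only if it passes the score cut
+            if scores[best] >= min_seed_score:
+                keep.append(best)
+    return keep
+```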
+ +It uses the same three steps as the ML ambiguity solver but with seeds instead of tracks: + +- Clustering: seeds are clustered together, one cluster ~ one truth particle +- Ranking: seeds in each cluster are scored, and the best one is kept +- Cleaning: last pass over all the remaining scores to remove fakes + +### Clustering + +The clustering is implemented with the `dbscanSeedClustering` function. Its input is a vector of vectors containing the seed parameters that will be used in the clustering. For the clustering itself, we used a 4D DBSCAN clustering algorithm, a density-based clustering technique. All the seeds are represented as points in a 4D space using their value of $\phi$, $\eta$, $z_{0}$, $p_{T}$ and are clustered together based on their proximity. The output of the clustering is a vector of vectors of ints; each element corresponds to one cluster with the inner vector listing the ID of all the seeds in that cluster. + +### Ranking + +At this step, we have multiple clusters of seeds. We use a NN to compute a score for each seed. Due to the loss function used in training, the fake seeds (coming from more than one truth particle) tend to have a score close to 0; we can thus remove them by cutting away the low-score seeds. The `minSeedScore` parameter is used to choose the cutoff value. For each cluster, we can then select the seed with the highest score to be kept. ### How to use -The ML based Ambiguity solver comes with a pre-trained model to be used with the ODD. Using it is extremely simple, just call the `full_chain_odd.py` with the `--MLSolver` option. If you want to try this solver with another detector, a few more step are needed. First you need to train a model. As an input you will need the multitrajectory output from the CKF (run a full chain up to the CKF then use the `CSVWriter`). At least 100 ttbar events with 200 PU are needed (1000 would be ideal). You can then run `Examples/Scripts/Python/MLAmbiguityResolution/train_ambiguity_solver.py ` on our dataset, 2/3 of the data should be used for the training and 1/3 for the validation. You will receive a `duplicateClassifier.onnx` file as an output which now can be used as an input for the `AmbiguityResolutionMLAlgorithm` with your detector. +The ML-based seed filter comes with a pre-trained model to be used with the ODD. Using it is extremely simple; just call the `full_chain_odd.py` with the `--MLSeedFilter` option. If you want to try this filter with another detector, a few more steps are needed. First, you need to train a model. As an input, you will need the seeding and multitrajectory output from the CKF (run a full chain up to the CKF, then use the `CSVWriter` for both the seeding and the CKF). At least 1000 ttbar events with 200 PU are needed. First, we need to match the seeds and the corresponding tracks together; for that, we can run `Examples/Scripts/Python/MLAmbiguityResolution/match_good_track-seed.py`. The network can then be trained with `Examples/Scripts/Python/MLAmbiguityResolution/train_seed_solver.py` on your dataset; 2/3 of the data should be used for the training and 1/3 for the validation. You will receive a `seedduplicateClassifier.onnx` file as an output, which can now be used as an input for the `SeedFilterMLAlgorithm` with your detector. -Additionally, two other Python files are present with the training. One called `ambiguity_solver_full_chain.py` this one perform the same job as the chain we presented here but in python using Csv files in input.
The difference is that they also perform an additional pre-clustering using a DBScan. The other one `ambiguity_solver_perf.py` can be used to study the performance of the ambiguity solver (ML and not) and also takes Csv files as input. \ No newline at end of file +Additionally, another Python file is present with the training: `seed_filter_full_chain.py` performs the same job as the chain we presented here, but in Python, using CSV files as input; it also plots many helpful distributions that will help you understand how the filter is performing.
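+
+For reference, enabling the filter in a Python chain looks like the snippet below, which mirrors what `full_chain_odd.py` does when the `--MLSeedFilter` flag is passed. Here `s` is the sequencer, `outputDir` the output directory, and `addSeedFilterML` and `SeedFilterMLDBScanConfig` are imported as in that script; the DBScan parameters and the model path are the defaults used there.
+
+```python
+addSeedFilterML(
+    s,
+    SeedFilterMLDBScanConfig(
+        epsilonDBScan=0.03, minPointsDBScan=2, minSeedScore=0.1
+    ),
+    onnxModelFile=os.path.dirname(__file__)
+    + "/MLAmbiguityResolution/seedDuplicateClassifier.onnx",
+    outputDirRoot=outputDir,
+)
+```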