Skip to content

Commit

Permalink
Add an RDataSource for podio files and collections (#593)
Browse files Browse the repository at this point in the history
* Moving RDataSource closer to Podio/EDM4hep

* Adding podio::ROOTDataSource class to the rootmap

* Separating datasource into standalone library

* Adding ON flag to all tests

* The headers should install now

* Other suggested adjustment

* Installing also utilities directory

* Cleanup setup code slightly to avoid unnecessary copies

* Adding missing podioDataSourceDict target

---------

Co-authored-by: Thomas Madlener <thomas.madlener@desy.de>
  • Loading branch information
kjvbrt and tmadlener authored Aug 28, 2024
1 parent 7495b33 commit cfe4836
Show file tree
Hide file tree
Showing 12 changed files with 476 additions and 15 deletions.
1 change: 1 addition & 0 deletions .github/workflows/key4hep.yml
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ jobs:
-DCMAKE_CXX_FLAGS=" -fdiagnostics-color=always -Werror -Wno-error=deprecated-declarations " \
-DUSE_EXTERNAL_CATCH2=AUTO \
-DENABLE_RNTUPLE=ON \
-DENABLE_DATASOURCE=ON \
-G Ninja ..
echo "::endgroup::"
echo "::group::Build"
Expand Down
1 change: 1 addition & 0 deletions .github/workflows/pre-commit.yml
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ jobs:
cmake .. -DENABLE_SIO=ON \
-DENABLE_JULIA=ON \
-DENABLE_RNTUPLE=ON \
-DENABLE_DATASOURCE=ON \
-DCMAKE_CXX_STANDARD=20 \
-DCMAKE_CXX_FLAGS=" -fdiagnostics-color=always -Werror "\
-DCMAKE_EXPORT_COMPILE_COMMANDS=ON \
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/publish-docs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ jobs:
echo -e "::endgroup::\n::group::Build podio"
cmake -B build . --install-prefix=$(pwd)/install \
-GNinja -DENABLE_SIO=ON -DENABLE_RNTUPLE=ON \
-DBUILD_TESTING=OFF \
-DENABLE_DATASOURCE=ON -DBUILD_TESTING=OFF \
-DCMAKE_CXX_STANDARD=20
cmake --build build --target install
source ./init.sh && source ./env.sh
Expand Down
1 change: 1 addition & 0 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ jobs:
cmake -DENABLE_SIO=ON \
-DENABLE_JULIA=ON \
-DENABLE_RNTUPLE=ON \
-DENABLE_DATASOURCE=ON \
-DCMAKE_INSTALL_PREFIX=../install \
-DCMAKE_CXX_STANDARD=20 \
-DCMAKE_CXX_FLAGS=" -fdiagnostics-color=always -Werror -Wno-error=deprecated-declarations " \
Expand Down
1 change: 1 addition & 0 deletions .github/workflows/ubuntu.yml
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ jobs:
cd build
cmake -DENABLE_SIO=ON \
-DENABLE_JULIA=ON \
-DENABLE_DATASOURCE=ON \
-DCMAKE_INSTALL_PREFIX=../install \
-DCMAKE_CXX_STANDARD=17 \
-DCMAKE_CXX_FLAGS=" -fdiagnostics-color=always -Werror -Wno-error=deprecated-declarations " \
Expand Down
16 changes: 10 additions & 6 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -69,20 +69,24 @@ option(CREATE_DOC "Whether or not to create doxygen doc target." OFF)
# Optional features and build-time switches; all of them default to OFF and
# have to be enabled explicitly on the cmake command line
option(ENABLE_SIO "Build SIO I/O support" OFF)
option(PODIO_RELAX_PYVER "Do not require exact python version match with ROOT" OFF)
# Requires the ROOTNTuple component of ROOT
option(ENABLE_RNTUPLE "Build with support for the new ROOT NTuple format" OFF)
# Requires the ROOTDataFrame component of ROOT
option(ENABLE_DATASOURCE "Build podio's ROOT DataSource" OFF)
option(PODIO_USE_CLANG_FORMAT "Use clang-format to format the code" OFF)
option(ENABLE_JULIA "Enable Julia support. When enabled, Julia datamodels will be generated, and Julia tests will run." OFF)


#--- Declare ROOT dependency ---------------------------------------------------
list(APPEND CMAKE_PREFIX_PATH $ENV{ROOTSYS})
set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake)
if(NOT ENABLE_RNTUPLE)
find_package(ROOT REQUIRED COMPONENTS RIO Tree)
else()
find_package(ROOT REQUIRED COMPONENTS RIO Tree ROOTNTuple)
if(${ROOT_VERSION} VERSION_LESS 6.28.02)
# Collect the ROOT components required by the enabled features first, so that
# ROOT is looked up exactly once with everything that is needed
set(root_components_needed RIO Tree)
if(ENABLE_RNTUPLE)
  list(APPEND root_components_needed ROOTNTuple)
endif()
if(ENABLE_DATASOURCE)
  list(APPEND root_components_needed ROOTDataFrame)
endif()
find_package(ROOT REQUIRED COMPONENTS ${root_components_needed})

# ROOT_VERSION is only available after ROOT has been found. The variable name
# is passed to if() directly (instead of an unquoted ${ROOT_VERSION}), so that
# an empty value cannot turn the condition into a syntax error
if(ENABLE_RNTUPLE AND ROOT_VERSION VERSION_LESS 6.28.02)
  message(FATAL_ERROR "You are trying to build podio with support for the new ROOT NTuple format, but your ROOT version is too old. Please update ROOT to at least version 6.28.02")
endif()
endif()

# ROOT_CXX_STANDARD was introduced in https://github.com/root-project/root/pull/6466
Expand Down
160 changes: 160 additions & 0 deletions include/podio/DataSource.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,160 @@
#ifndef PODIO_DATASOURCE_H
#define PODIO_DATASOURCE_H

// Podio
#include <podio/CollectionBase.h>
#include <podio/Frame.h>
#include <podio/Reader.h>

// ROOT
#include <ROOT/RDataFrame.hxx>
#include <ROOT/RDataSource.hxx>

// STL
#include <memory>
#include <string>
#include <string_view>
#include <typeinfo>
#include <utility>
#include <vector>

namespace podio {
/// ROOT RDataSource implementation for podio files and collections. It
/// overrides the RDataSource interface so that it can be used as the data
/// source of a ROOT::RDataFrame (see podio::CreateDataFrame).
class DataSource : public ROOT::RDF::RDataSource {
public:
  ///
  /// @brief Construct the podio::DataSource from the provided file.
  ///
  /// @param[in] filePath Path of the input file.
  /// @param[in] nEvents  Number of events. NOTE(review): the default of -1
  ///                     presumably selects all available events; confirm
  ///                     against the implementation.
  ///
  explicit DataSource(const std::string& filePath, int nEvents = -1);

  ///
  /// @brief Construct the podio::DataSource from the provided file list.
  ///
  /// @param[in] filePathList Paths of the input files.
  /// @param[in] nEvents      Number of events. NOTE(review): the default of
  ///                         -1 presumably selects all available events;
  ///                         confirm against the implementation.
  ///
  explicit DataSource(const std::vector<std::string>& filePathList, int nEvents = -1);

  ///
  /// @brief Inform the podio::DataSource of the desired level of parallelism.
  ///
  void SetNSlots(unsigned int nSlots) override;

  ///
  /// @brief Inform podio::DataSource that an event-loop is about to start.
  ///
  void Initialize() override;

  ///
  /// @brief Retrieve from podio::DataSource a set of ranges of entries that
  ///        can be processed concurrently.
  ///
  std::vector<std::pair<ULong64_t, ULong64_t>> GetEntryRanges() override;

  ///
  /// @brief Inform podio::DataSource that a certain thread is about to start
  ///        working on a certain range of entries.
  ///
  void InitSlot(unsigned int slot, ULong64_t firstEntry) override;

  ///
  /// @brief Inform podio::DataSource that a certain thread is about to start
  ///        working on a certain entry.
  ///
  bool SetEntry(unsigned int slot, ULong64_t entry) override;

  ///
  /// @brief Inform podio::DataSource that a certain thread finished working
  ///        on a certain range of entries.
  ///
  void FinalizeSlot(unsigned int slot) override;

  ///
  /// @brief Inform podio::DataSource that an event-loop finished.
  ///
  void Finalize() override;

  ///
  /// @brief Returns a reference to the collection of the dataset's column
  ///        names
  ///
  const std::vector<std::string>& GetColumnNames() const override;

  ///
  /// @brief Checks if the dataset has a certain column.
  ///
  bool HasColumn(std::string_view columnName) const override;

  ///
  /// @brief Type of a column as a string. Required for JITting.
  ///
  std::string GetTypeName(std::string_view columnName) const override;

protected:
  ///
  /// @brief Type-erased vector of pointers to pointers to column
  ///        values --- one per slot.
  ///
  std::vector<void*> GetColumnReadersImpl(std::string_view name, const std::type_info& typeInfo) override;

  ///
  /// @brief Short, human readable label of this data source.
  ///
  std::string AsString() override {
    return "Podio data source";
  }

private:
  /// Number of slots/threads
  unsigned int m_nSlots = 1;

  /// Input file paths
  std::vector<std::string> m_filePathList = {};

  /// Total number of events
  ULong64_t m_nEvents = 0;

  /// Ranges of events available to be processed
  std::vector<std::pair<ULong64_t, ULong64_t>> m_rangesAvailable = {};

  /// All ranges of events that have ever been created
  std::vector<std::pair<ULong64_t, ULong64_t>> m_rangesAll = {};

  /// Column names
  std::vector<std::string> m_columnNames = {};

  /// Column types
  std::vector<std::string> m_columnTypes = {};

  /// Collections, m_Collections[columnIndex][slotIndex]
  std::vector<std::vector<const podio::CollectionBase*>> m_Collections = {};

  /// Active collections
  std::vector<unsigned int> m_activeCollections = {};

  /// Root podio readers
  std::vector<std::unique_ptr<podio::Reader>> m_podioReaders = {};

  /// Podio frames
  std::vector<std::unique_ptr<podio::Frame>> m_frames = {};

  ///
  /// @brief Setup input for the podio::DataSource.
  ///
  /// @param[in] nEvents Number of events.
  ///
  void SetupInput(int nEvents);
};

///
/// @brief Create RDataFrame from multiple Podio files.
///
/// @param[in] filePathList List of file paths from which the RDataFrame
///                         will be created.
/// @return RDataFrame created from the input file list.
///
ROOT::RDataFrame CreateDataFrame(const std::vector<std::string>& filePathList);

///
/// @brief Create RDataFrame from a single Podio file.
///
/// @param[in] filePath File path from which the RDataFrame will be created.
/// @return RDataFrame created from the input file.
///
ROOT::RDataFrame CreateDataFrame(const std::string& filePath);
} // namespace podio

#endif /* PODIO_DATASOURCE_H */
69 changes: 61 additions & 8 deletions src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,7 @@ if(ENABLE_SIO)
LIST(APPEND INSTALL_LIBRARIES podioSioIO podioSioIODict)
endif()


# --- IO
set(io_sources
Writer.cc
Expand All @@ -160,19 +161,63 @@ if(ENABLE_SIO)
target_link_libraries(podioIO PUBLIC podio::podioSioIO)
endif()


# --- DataSource
# The DataSource library is optional and only configured on request
if(ENABLE_DATASOURCE)
# Implementation file(s) that make up the library
set(rds_sources
DataSource.cc
)

# Header(s) passed to podio_add_lib_and_dict alongside the selection file
set(rds_headers
${PROJECT_SOURCE_DIR}/include/podio/DataSource.h
)

# Both lists are quoted so that each one is passed as a single argument.
# NOTE(review): judging by the podioDataSourceDict target and the
# rootmap / pcm files that are installed further down, this call presumably
# also creates the dictionary target; confirm in podio_add_lib_and_dict
podio_add_lib_and_dict(podioDataSource "${rds_headers}" "${rds_sources}" rds_selection.xml)
# PUBLIC: the link dependencies are propagated to consumers of podioDataSource
target_link_libraries(podioDataSource PUBLIC podio::podio
podio::podioIO
podio::podioRootIO
ROOT::Core
ROOT::RIO
ROOT::Tree
ROOT::ROOTVecOps
ROOT::ROOTDataFrame
)
# PUBLIC: consumers of podioDataSource see PODIO_ENABLE_DATASOURCE=1 as well
target_compile_definitions(podioDataSource PUBLIC PODIO_ENABLE_DATASOURCE=1)
endif()


# --- Install everything
install(TARGETS podio podioDict podioRootIO podioRootIODict podioIO ${INSTALL_LIBRARIES}
EXPORT podioTargets
DESTINATION "${CMAKE_INSTALL_LIBDIR}")
# Install all libraries with one install() call. The DataSource library and
# its dictionary only exist if ENABLE_DATASOURCE is set, so they are added to
# the target list conditionally
set(datasource_install_targets "")
if(ENABLE_DATASOURCE)
  set(datasource_install_targets podioDataSource podioDataSourceDict)
endif()
install(TARGETS podio podioDict podioRootIO podioRootIODict podioIO ${datasource_install_targets} ${INSTALL_LIBRARIES}
  EXPORT podioTargets
  DESTINATION "${CMAKE_INSTALL_LIBDIR}")

# Only install the necessary headers
if (ENABLE_SIO)
install(DIRECTORY ${PROJECT_SOURCE_DIR}/include/podio DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}")
else()
install(DIRECTORY ${PROJECT_SOURCE_DIR}/include/podio DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}"
REGEX SIO.*\\.h$ EXCLUDE )
# Only install the headers that belong to the enabled features.
#
# file(GLOB) yields absolute paths, so every exclusion pattern is anchored to
# the file name (the last path component). Without the anchoring a source
# directory whose path happens to contain e.g. "SIO" would match the pattern
# for every header and nothing would be installed.
file(GLOB headers_necessary
  "${PROJECT_SOURCE_DIR}/include/podio/*.h")

if(NOT ENABLE_SIO)
  # Any header with "SIO" in its file name
  list(FILTER headers_necessary EXCLUDE REGEX "/[^/]*SIO[^/]*\\.h$")
endif()
if(NOT ENABLE_RNTUPLE)
  # Any header with "RNTuple" in its file name
  list(FILTER headers_necessary EXCLUDE REGEX "/[^/]*RNTuple[^/]*\\.h$")
endif()
if(NOT ENABLE_DATASOURCE)
  # Exactly DataSource.h; the dot is escaped so that it is matched literally
  list(FILTER headers_necessary EXCLUDE REGEX "/DataSource\\.h$")
endif()

install(FILES ${headers_necessary}
  DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/podio"
)
# The utilities headers live in a subdirectory that the glob above does not
# descend into, so the directory is installed separately
install(DIRECTORY "${PROJECT_SOURCE_DIR}/include/podio/utilities"
  DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/podio"
)

install(FILES
${CMAKE_CURRENT_BINARY_DIR}/podioDictDict.rootmap
${CMAKE_CURRENT_BINARY_DIR}/libpodioDict_rdict.pcm
Expand All @@ -188,6 +233,14 @@ if (ENABLE_SIO)
)
endif()

# Install the rootmap and pcm files of the DataSource dictionary into the
# library directory, next to the libraries themselves
if(ENABLE_DATASOURCE)
  set(datasource_dict_files
    "${CMAKE_CURRENT_BINARY_DIR}/podioDataSourceDictDict.rootmap"
    "${CMAKE_CURRENT_BINARY_DIR}/libpodioDataSourceDict_rdict.pcm"
  )
  install(FILES ${datasource_dict_files} DESTINATION "${CMAKE_INSTALL_LIBDIR}")
endif()

add_executable(podio_test_hashes test_hashes.cpp)
target_link_libraries(podio_test_hashes PRIVATE podio::podio)
install(TARGETS podio_test_hashes
Expand Down
Loading

0 comments on commit cfe4836

Please sign in to comment.