Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Make it possible to read only a subset of available collections in readers #504

Open
wants to merge 19 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 14 additions & 6 deletions include/podio/RNTupleReader.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,6 @@

namespace podio {

/**
This class has the function to read available data from disk
and to prepare collections and buffers.
**/
/// The RNTupleReader can be used to read files that have been written with the
/// RNTuple backend.
///
Expand Down Expand Up @@ -61,20 +57,32 @@ class RNTupleReader {
/// Read the next data entry for a given category.
///
/// @param name The category name for which to read the next entry
/// @param collsToRead (optional) the collection names that should be read. If
/// not provided (or empty) all collections will be read
///
/// @returns FrameData from which a podio::Frame can be constructed if the
/// category exists and if there are still entries left to read.
/// Otherwise a nullptr
std::unique_ptr<podio::ROOTFrameData> readNextEntry(const std::string& name);
///
/// @throws std::invalid_argument in case collsToRead contains collection
/// names that are not available
std::unique_ptr<podio::ROOTFrameData> readNextEntry(const std::string& name,
const std::vector<std::string>& collsToRead = {});

/// Read the desired data entry for a given category.
///
/// @param name The category name for which to read the desired entry
/// @param entry The entry number to read
/// @param collsToRead (optional) the collection names that should be read. If
/// not provided (or empty) all collections will be read
///
/// @returns FrameData from which a podio::Frame can be constructed if the
/// category and the desired entry exist. Otherwise a nullptr
std::unique_ptr<podio::ROOTFrameData> readEntry(const std::string& name, const unsigned entry);
///
/// @throws std::invalid_argument in case collsToRead contains collection
/// names that are not available
std::unique_ptr<podio::ROOTFrameData> readEntry(const std::string& name, const unsigned entry,
const std::vector<std::string>& collsToRead = {});

/// Get the names of all the available Frame categories in the current file(s).
///
Expand Down
35 changes: 26 additions & 9 deletions include/podio/ROOTReader.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,10 @@ namespace detail {
// vector
using CollectionInfo = std::tuple<std::string, bool, SchemaVersionT, size_t>;

/// Pairing of a collection name with the CollectionInfo that describes it.
/// Replaces the std::pair<std::string, detail::CollectionInfo> previously used
/// for the stored collections of a category, giving the two parts descriptive
/// member names.
struct NamedCollInfo {
std::string name{}; ///< The name of the collection
CollectionInfo info{}; ///< The CollectionInfo tuple stored for this collection
};
} // namespace detail

class CollectionBase;
Expand Down Expand Up @@ -74,20 +78,32 @@ class ROOTReader {
/// Read the next data entry for a given category.
///
/// @param name The category name for which to read the next entry
/// @param collsToRead (optional) the collection names that should be read. If
/// not provided (or empty) all collections will be read
///
/// @returns FrameData from which a podio::Frame can be constructed if the
/// category exists and if there are still entries left to read.
/// Otherwise a nullptr
std::unique_ptr<podio::ROOTFrameData> readNextEntry(const std::string& name);
///
/// @throws std::invalid_argument in case collsToRead contains collection
/// names that are not available
std::unique_ptr<podio::ROOTFrameData> readNextEntry(const std::string& name,
const std::vector<std::string>& collsToRead = {});

/// Read the desired data entry for a given category.
///
/// @param name The category name for which to read the desired entry
/// @param entry The entry number to read
/// @param collsToRead (optional) the collection names that should be read. If
/// not provided (or empty) all collections will be read
///
/// @returns FrameData from which a podio::Frame can be constructed if the
/// category and the desired entry exist. Otherwise a nullptr
std::unique_ptr<podio::ROOTFrameData> readEntry(const std::string& name, const unsigned entry);
///
/// @throws std::invalid_argument in case collsToRead contains collection
/// names that are not available
std::unique_ptr<podio::ROOTFrameData> readEntry(const std::string& name, const unsigned entry,
const std::vector<std::string>& collsToRead = {});

/// Get the number of entries for the given name
///
Expand Down Expand Up @@ -146,12 +162,12 @@ class ROOTReader {
/// constructor from chain for more convenient map insertion
CategoryInfo(std::unique_ptr<TChain>&& c) : chain(std::move(c)) {
}
std::unique_ptr<TChain> chain{nullptr}; ///< The TChain with the data
unsigned entry{0}; ///< The next entry to read
std::vector<std::pair<std::string, detail::CollectionInfo>> storedClasses{}; ///< The stored collections in this
///< category
std::vector<root_utils::CollectionBranches> branches{}; ///< The branches for this category
std::shared_ptr<CollectionIDTable> table{nullptr}; ///< The collection ID table for this category
std::unique_ptr<TChain> chain{nullptr}; ///< The TChain with the data
unsigned entry{0}; ///< The next entry to read
std::vector<detail::NamedCollInfo> storedClasses{}; ///< The stored collections in this
///< category
std::vector<root_utils::CollectionBranches> branches{}; ///< The branches for this category
std::shared_ptr<CollectionIDTable> table{nullptr}; ///< The collection ID table for this category
};

/// Initialize the passed CategoryInfo by setting up the necessary branches,
Expand All @@ -174,7 +190,8 @@ class ROOTReader {
/// Read the data entry specified in the passed CategoryInfo, and increase the
/// counter afterwards. In case the requested entry is larger than the
/// available number of entries, return a nullptr.
std::unique_ptr<podio::ROOTFrameData> readEntry(ROOTReader::CategoryInfo& catInfo);
std::unique_ptr<podio::ROOTFrameData> readEntry(ROOTReader::CategoryInfo& catInfo,
const std::vector<std::string>& collsToRead);

/// Get / read the buffers at index iColl in the passed category information
podio::CollectionReadBuffers getCollectionBuffers(CategoryInfo& catInfo, size_t iColl, bool reloadBranches,
Expand Down
38 changes: 24 additions & 14 deletions include/podio/Reader.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,8 @@ class Reader {
struct ReaderConcept {
virtual ~ReaderConcept() = default;

virtual podio::Frame readNextFrame(const std::string& name) = 0;
virtual podio::Frame readFrame(const std::string& name, size_t index) = 0;
virtual podio::Frame readNextFrame(const std::string& name, const std::vector<std::string>&) = 0;
virtual podio::Frame readFrame(const std::string& name, size_t index, const std::vector<std::string>&) = 0;
virtual size_t getEntries(const std::string& name) const = 0;
virtual podio::version::Version currentFileVersion() const = 0;
virtual std::optional<podio::version::Version> currentFileVersion(const std::string& name) const = 0;
Expand All @@ -44,16 +44,17 @@ class Reader {

~ReaderModel() = default;

podio::Frame readNextFrame(const std::string& name) override {
auto maybeFrame = m_reader->readNextEntry(name);
podio::Frame readNextFrame(const std::string& name, const std::vector<std::string>& collsToRead) override {
auto maybeFrame = m_reader->readNextEntry(name, collsToRead);
if (maybeFrame) {
return maybeFrame;
}
throw std::runtime_error("Failed reading category " + name + " (reading beyond bounds?)");
}

podio::Frame readFrame(const std::string& name, size_t index) override {
auto maybeFrame = m_reader->readEntry(name, index);
podio::Frame readFrame(const std::string& name, size_t index,
const std::vector<std::string>& collsToRead) override {
auto maybeFrame = m_reader->readEntry(name, index, collsToRead);
if (maybeFrame) {
return maybeFrame;
}
Expand Down Expand Up @@ -105,46 +106,55 @@ class Reader {
/// Read the next frame of a given category
///
/// @param name The category name for which to read the next frame
/// @param collsToRead (optional) the collection names that should be read. If
/// not provided (or empty) all collections will be read
///
/// @returns A fully constructed Frame with the contents read from file
///
/// @throws std::runtime_error in case the category is not available or in
/// case no more entries are available
podio::Frame readNextFrame(const std::string& name) {
return m_self->readNextFrame(name);
podio::Frame readNextFrame(const std::string& name, const std::vector<std::string>& collsToRead = {}) {
return m_self->readNextFrame(name, collsToRead);
}

/// Read the next frame of the "events" category
///
/// @param collsToRead (optional) the collection names that should be read. If
/// not provided (or empty) all collections will be read
///
/// @returns A fully constructed Frame with the contents read from file
///
/// @throws std::runtime_error in case no (more) events are available
podio::Frame readNextEvent() {
return readNextFrame(podio::Category::Event);
podio::Frame readNextEvent(const std::vector<std::string>& collsToRead = {}) {
return readNextFrame(podio::Category::Event, collsToRead);
}

/// Read a specific frame for a given category
///
/// @param name The category name for which to read the desired frame
/// @param index The entry number to read
/// @param collsToRead (optional) the collection names that should be read. If
/// not provided (or empty) all collections will be read
///
/// @returns A fully constructed Frame with the contents read from file
///
/// @throws std::invalid_argument in case the category is not available or in
/// case the specified entry is not available
podio::Frame readFrame(const std::string& name, size_t index) {
return m_self->readFrame(name, index);
podio::Frame readFrame(const std::string& name, size_t index, const std::vector<std::string>& collsToRead = {}) {
return m_self->readFrame(name, index, collsToRead);
}

/// Read a specific frame of the "events" category
///
/// @param index The event number to read
/// @param collsToRead (optional) the collection names that should be read. If
/// not provided (or empty) all collections will be read
///
/// @returns A fully constructed Frame with the contents read from file
///
/// @throws std::invalid_argument in case the desired event is not available
podio::Frame readEvent(size_t index) {
return readFrame(podio::Category::Event, index);
podio::Frame readEvent(size_t index, const std::vector<std::string>& collsToRead = {}) {
return readFrame(podio::Category::Event, index, collsToRead);
}

/// Get the number of entries for the given name
Expand Down
21 changes: 9 additions & 12 deletions include/podio/SIOFrameData.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,10 @@
#include "podio/CollectionBuffers.h"
#include "podio/CollectionIDTable.h"
#include "podio/GenericParameters.h"
#include "podio/SIOBlock.h"

#include <sio/buffer.h>
#include <sio/definitions.h>

#include <memory>
#include <numeric>
#include <optional>
#include <string>
#include <vector>
Expand All @@ -35,19 +32,15 @@ class SIOFrameData {
/// tableBuffer containing the necessary information for unpacking the
/// collections. The two size parameters denote the uncompressed size of the
/// respective buffers.
SIOFrameData(sio::buffer&& collBuffers, std::size_t dataSize, sio::buffer&& tableBuffer, std::size_t tableSize) :
m_recBuffer(std::move(collBuffers)),
m_tableBuffer(std::move(tableBuffer)),
m_dataSize(dataSize),
m_tableSize(tableSize) {
}
///
/// In case the limitColls contain a collection name that is not available
/// from the idTable names this throws an exception
SIOFrameData(sio::buffer&& collBuffers, std::size_t dataSize, sio::buffer&& tableBuffer, std::size_t tableSize,
std::vector<std::string> limitColls = {});

std::optional<podio::CollectionReadBuffers> getCollectionBuffers(const std::string& name);

podio::CollectionIDTable getIDTable() {
if (m_idTable.empty()) {
readIdTable();
}
return {m_idTable.ids(), m_idTable.names()};
}

Expand Down Expand Up @@ -79,6 +72,10 @@ class SIOFrameData {
std::vector<short> m_subsetCollectionBits{};

podio::GenericParameters m_parameters{};

/// The collections that should be made available for a Frame constructed from
/// this (if non-empty)
std::vector<std::string> m_limitColls{};
};
} // namespace podio

Expand Down
22 changes: 20 additions & 2 deletions include/podio/SIOReader.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,21 +37,39 @@ class SIOReader {

/// Read the next data entry for a given category.
///
/// @note Given how the SIO files are currently laid out it is in fact not
/// possible to only read a subset of a Frame. Rather the subset of
/// collections to read will be an artificial limit on the returned
/// SIOFrameData. Limiting the collections to read will not improve I/O
/// performance.
///
/// @param name The category name for which to read the next entry
/// @param collsToRead (optional) the collection names that should be read. If
/// not provided (or empty) all collections will be read
///
/// @returns FrameData from which a podio::Frame can be constructed if the
/// category exists and if there are still entries left to read.
/// Otherwise a nullptr
std::unique_ptr<podio::SIOFrameData> readNextEntry(const std::string& name);
std::unique_ptr<podio::SIOFrameData> readNextEntry(const std::string& name,
const std::vector<std::string>& collsToRead = {});

/// Read the desired data entry for a given category.
///
/// @note Given how the SIO files are currently laid out it is in fact not
/// possible to only read a subset of a Frame. Rather the subset of
/// collections to read will be an artificial limit on the returned
/// SIOFrameData. Limiting the collections to read will not improve I/O
/// performance.
///
/// @param name The category name for which to read the desired entry
/// @param entry The entry number to read
/// @param collsToRead (optional) the collection names that should be read. If
/// not provided (or empty) all collections will be read
///
/// @returns FrameData from which a podio::Frame can be constructed if the
/// category and the desired entry exist. Otherwise a nullptr
std::unique_ptr<podio::SIOFrameData> readEntry(const std::string& name, const unsigned entry);
std::unique_ptr<podio::SIOFrameData> readEntry(const std::string& name, const unsigned entry,
const std::vector<std::string>& collsToRead = {});

/// Get the number of entries for the given name
///
Expand Down
23 changes: 18 additions & 5 deletions src/RNTupleReader.cc
Original file line number Diff line number Diff line change
Expand Up @@ -138,11 +138,13 @@ std::vector<std::string_view> RNTupleReader::getAvailableCategories() const {
return cats;
}

std::unique_ptr<ROOTFrameData> RNTupleReader::readNextEntry(const std::string& name) {
return readEntry(name, m_entries[name]);
std::unique_ptr<ROOTFrameData> RNTupleReader::readNextEntry(const std::string& name,
const std::vector<std::string>& collsToRead) {
return readEntry(name, m_entries[name], collsToRead);
}

std::unique_ptr<ROOTFrameData> RNTupleReader::readEntry(const std::string& category, const unsigned entNum) {
std::unique_ptr<ROOTFrameData> RNTupleReader::readEntry(const std::string& category, const unsigned entNum,
const std::vector<std::string>& collsToRead) {
if (m_totalEntries.find(category) == m_totalEntries.end()) {
getEntries(category);
}
Expand All @@ -156,6 +158,16 @@ std::unique_ptr<ROOTFrameData> RNTupleReader::readEntry(const std::string& categ
}
}

const auto& collInfo = m_collectionInfo[category];
// Make sure to not silently ignore non-existent but requested collections
if (!collsToRead.empty()) {
for (const auto& name : collsToRead) {
if (std::ranges::find(collInfo.name, name) == collInfo.name.end()) {
throw std::invalid_argument(name + " is not available from Frame");
}
}
}

m_entries[category] = entNum + 1;

// m_readerEntries contains the accumulated entries for all the readers
Expand All @@ -176,9 +188,10 @@ std::unique_ptr<ROOTFrameData> RNTupleReader::readEntry(const std::string& categ
auto dentry = m_readers[category][readerIndex]->GetModel()->GetDefaultEntry();
#endif

const auto& collInfo = m_collectionInfo[category];

for (size_t i = 0; i < collInfo.id.size(); ++i) {
if (!collsToRead.empty() && std::ranges::find(collsToRead, collInfo.name[i]) == collsToRead.end()) {
continue;
}
const auto& collType = collInfo.type[i];
const auto& bufferFactory = podio::CollectionBufferFactory::instance();
auto maybeBuffers =
Expand Down
Loading
Loading