diff --git a/src/parser/Iri.cpp b/src/parser/Iri.cpp index 9b5a70d91a..13edaecccb 100644 --- a/src/parser/Iri.cpp +++ b/src/parser/Iri.cpp @@ -1,6 +1,7 @@ -// Copyright 2023, University of Freiburg, -// Chair of Algorithms and Data Structures. -// Author: Benedikt Maria Beckermann +// Copyright 2023 - 2024, University of Freiburg +// Chair of Algorithms and Data Structures +// Authors: Benedikt Maria Beckermann +// Hannah Bast #include "parser/Iri.h" @@ -47,6 +48,60 @@ Iri Iri::fromPrefixAndSuffix(const Iri& prefix, std::string_view suffix) { return Iri{prefix, asNormalizedStringViewUnsafe(suffixNormalized)}; } +// ____________________________________________________________________________ +Iri Iri::getBaseIri(bool domainOnly) const { + AD_CORRECTNESS_CHECK(iri_.starts_with('<') && iri_.ends_with('>'), iri_); + // Check if we have a scheme and find the first `/` after that (or the first + // `/` at all if there is no scheme). + size_t pos = iri_.find(schemePattern); + if (pos == std::string::npos) { + LOG(WARN) << "No scheme found in base IRI: \"" << iri_ << "\"" + << " (but we accept it anyway)" << std::endl; + pos = 1; + } else { + pos += schemePattern.size(); + } + pos = iri_.find('/', pos); + // Return the IRI with `/` appended in the following two cases: the IRI has + // the empty path, or `domainOnly` is false and the final `/` is missing. + if (pos == std::string::npos || + (!domainOnly && iri_[iri_.size() - 2] != '/')) { + return fromIrirefWithoutBrackets( + absl::StrCat(std::string_view(iri_).substr(1, iri_.size() - 2), "/"sv)); + } + // If `domainOnly` is true, remove the path part. + if (domainOnly) { + return fromIrirefWithoutBrackets(std::string_view(iri_).substr(1, pos)); + } + // Otherwise, return the IRI as is. 
+ return *this; +} + +// ____________________________________________________________________________ +Iri Iri::fromIrirefConsiderBase(std::string_view iriStringWithBrackets, + const Iri& basePrefixForRelativeIris, + const Iri& basePrefixForAbsoluteIris) { + auto iriSv = iriStringWithBrackets; + AD_CORRECTNESS_CHECK(iriSv.size() >= 2); + AD_CORRECTNESS_CHECK(iriSv[0] == '<' && iriSv[iriSv.size() - 1] == '>'); + if (iriSv.find("://") != std::string_view::npos || + basePrefixForAbsoluteIris.empty()) { + // Case 1: IRI with scheme (like `<http://example.org/path>`) or `BASE_IRI_FOR_TESTING` + // (which is `<@>`, and no valid base IRI has length 3). + return TripleComponent::Iri::fromIriref(iriSv); + } else if (iriSv[1] == '/') { + // Case 2: Absolute IRI without scheme (like `</path/file>`). + AD_CORRECTNESS_CHECK(!basePrefixForAbsoluteIris.empty()); + return TripleComponent::Iri::fromPrefixAndSuffix( + basePrefixForAbsoluteIris, iriSv.substr(2, iriSv.size() - 3)); + } else { + // Case 3: Relative IRI (like `<path/file>`). + AD_CORRECTNESS_CHECK(!basePrefixForRelativeIris.empty()); + return TripleComponent::Iri::fromPrefixAndSuffix( + basePrefixForRelativeIris, iriSv.substr(1, iriSv.size() - 2)); + } +} + // ____________________________________________________________________________ Iri Iri::fromStringRepresentation(std::string s) { AD_CORRECTNESS_CHECK(s.starts_with("<") || s.starts_with("@")); diff --git a/src/parser/Iri.h b/src/parser/Iri.h index 208c4a9f06..f302ae26cc 100644 --- a/src/parser/Iri.h +++ b/src/parser/Iri.h @@ -1,6 +1,7 @@ -// Copyright 2023, University of Freiburg, -// Chair of Algorithms and Data Structures. -// Author: Benedikt Maria Beckermann +// Copyright 2023 - 2024, University of Freiburg +// Chair of Algorithms and Data Structures +// Authors: Benedikt Maria Beckermann +// Hannah Bast #pragma once @@ -14,15 +15,18 @@ namespace ad_utility::triple_component { class Iri { private: // Store the string value of the IRI including the angle brackets. - // brackets. 
std::string iri_; - // Create a new iri object + // Create a new `Iri` object explicit Iri(std::string iri); - // Create a new iri using a prefix + // Create a new `Iri` using a prefix Iri(const Iri& prefix, NormalizedStringView suffix); + // Pattern used to identify the scheme in an IRI. Note that we do not + // check the validity of the part before the `://` according to RFC 3987. + static constexpr std::string_view schemePattern = "://"; + public: // A default constructed IRI is empty. Iri() = default; @@ -36,15 +40,32 @@ class Iri { const std::string& toStringRepresentation() const; std::string& toStringRepresentation(); - // Create a new `Iri` given an iri string with brackets. + // Create a new `Iri` given an IRI string with brackets. static Iri fromIriref(std::string_view stringWithBrackets); - // Create a new `Iri` given an iri string without brackets. + // Create a new `Iri` given an IRI string without brackets. static Iri fromIrirefWithoutBrackets(std::string_view stringWithoutBrackets); - // Create a new iri given a prefix iri and its suffix + // Create a new `Iri` given a prefix IRI and its suffix static Iri fromPrefixAndSuffix(const Iri& prefix, std::string_view suffix); + // Create a new `Iri` object, considering the base IRI. For IRIs with a scheme + // (like `<http://example.org/path>`), this is the same as `fromIriref`. For IRIs without a + // scheme, prepend the base prefix for relative IRIs (like `<path/file>`) + // or for absolute IRIs (like `</path/file>`). + static Iri fromIrirefConsiderBase(std::string_view iriStringWithBrackets, + const Iri& basePrefixForRelativeIris, + const Iri& basePrefixForAbsoluteIris); + + // Get the base IRI from this `Iri` object. The returned `Iri` always has a + // `/` at the end. If `domainOnly` is true, remove the path part, for + // example, for `<http://example.org/path/file>` the method returns + // `<http://example.org/>`. + Iri getBaseIri(bool domainOnly) const; + + // Return true iff the IRI is empty. 
+ bool empty() const { return iri_.empty(); } + // Return the string value of the iri object without any leading or trailing // angled brackets. NormalizedStringView getContent() const; diff --git a/src/parser/ParallelBuffer.cpp b/src/parser/ParallelBuffer.cpp index 6b15fa78a8..c2bc5c985a 100644 --- a/src/parser/ParallelBuffer.cpp +++ b/src/parser/ParallelBuffer.cpp @@ -72,7 +72,12 @@ std::optional ParallelBufferWithEndRegex::findRegexNearEnd( // _____________________________________________________________________________ std::optional ParallelBufferWithEndRegex::getNextBlock() { + // Get the block of data read asynchronously after the previous call + // to `getNextBlock`. auto rawInput = rawBuffer_.getNextBlock(); + + // If there was no more data, return the remainder or `std::nullopt` if + // it is empty. if (!rawInput || exhausted_) { exhausted_ = true; if (remainder_.empty()) { @@ -85,7 +90,15 @@ ParallelBufferWithEndRegex::getNextBlock() { return copy; } + // Find `endRegex_` in the data (searching from the back, in chunks of + // exponentially increasing size). Note that this does not necessarily + // find the last match of `endRegex_` in the data, but the first match in the + // last chunk (from the back), where there is a match. auto endPosition = findRegexNearEnd(rawInput.value(), endRegex_); + + // If no match was found at all, report an error, except when this is the + // last block (then `getNextBlock` will return `std::nullopt`, and we simply + // concatenate it to the remainder). 
if (!endPosition) { if (rawBuffer_.getNextBlock()) { throw std::runtime_error(absl::StrCat( @@ -95,10 +108,13 @@ ParallelBufferWithEndRegex::getNextBlock() { "increase the FILE_BUFFER_SIZE " "or set \"parallel-parsing: false\" in the settings file.")); } - // This was the last (possibly incomplete) block, simply concatenate endPosition = rawInput->size(); exhausted_ = true; } + + // Concatenate the remainder (part after `endRegex_`) of the block from the + // previous round with the part of the block until `endRegex_` from this + // round. BufferType result; result.reserve(remainder_.size() + *endPosition); result.insert(result.end(), remainder_.begin(), remainder_.end()); diff --git a/src/parser/ParallelBuffer.h b/src/parser/ParallelBuffer.h index 79eb8f55c1..81bfac84c6 100644 --- a/src/parser/ParallelBuffer.h +++ b/src/parser/ParallelBuffer.h @@ -76,9 +76,8 @@ class ParallelFileBuffer : public ParallelBuffer { std::future fut_; }; -/// A parallel buffer, where each of the blocks except for the last one has to -/// end with a certain regex (e.g. a full stop followed by whitespace and a -/// newline to denote the end of a triple in a .ttl file). +// A parallel buffer that reads input from the file in blocks, where each block, +// except possibly the last, ends with `endRegex`. class ParallelBufferWithEndRegex : public ParallelBuffer { public: ParallelBufferWithEndRegex(size_t blocksize, std::string endRegex) @@ -86,7 +85,10 @@ class ParallelBufferWithEndRegex : public ParallelBuffer { endRegex_{endRegex}, endRegexAsString_{std::move(endRegex)} {} - // __________________________________________________________________________ + // Get the data that was read asynchronously after the previous call to this + // function. Returns the part of the data until `endRegex` is found, with the + // part after `endRegex` from the previous call prepended. If `endRegex` is + // not found, simply return the rest of the data. 
std::optional getNextBlock() override; // Open the file from which the blocks are read. diff --git a/src/parser/RdfParser.cpp b/src/parser/RdfParser.cpp index 9cba9f1455..503919f372 100644 --- a/src/parser/RdfParser.cpp +++ b/src/parser/RdfParser.cpp @@ -1,6 +1,7 @@ -// Copyright 2018, University of Freiburg, -// Chair of Algorithms and Data Structures. -// Author: Johannes Kalmbach(joka921) +// Copyright 2018 - 2024, University of Freiburg +// Chair of Algorithms and Data Structures +// Authors: Johannes Kalmbach +// Hannah Bast #include "parser/RdfParser.h" @@ -55,7 +56,9 @@ template bool TurtleParser::base() { if (skip()) { if (iriref() && check(skip())) { - prefixMap_[""] = lastParseResult_.getIri(); + const auto& iri = lastParseResult_.getIri(); + prefixMap_[baseForRelativeIriKey_] = iri.getBaseIri(false); + prefixMap_[baseForAbsoluteIriKey_] = iri.getBaseIri(true); return true; } else { raise("Parsing @base definition failed"); @@ -85,7 +88,9 @@ template bool TurtleParser::sparqlBase() { if (skip()) { if (iriref()) { - prefixMap_[""] = lastParseResult_.getIri(); + auto iri = lastParseResult_.getIri(); + prefixMap_[baseForRelativeIriKey_] = iri.getBaseIri(false); + prefixMap_[baseForAbsoluteIriKey_] = iri.getBaseIri(true); return true; } else { raise("Parsing BASE definition failed"); @@ -745,8 +750,8 @@ bool TurtleParser::iriref() { // more relaxed way. 
if constexpr (UseRelaxedParsing) { tok_.remove_prefix(endPos + 1); - lastParseResult_ = - TripleComponent::Iri::fromIriref(view.substr(0, endPos + 1)); + lastParseResult_ = TripleComponent::Iri::fromIrirefConsiderBase( + view.substr(0, endPos + 1), baseForRelativeIri(), baseForAbsoluteIri()); return true; } else { if (!parseTerminal()) { @@ -756,8 +761,9 @@ bool TurtleParser::iriref() { return false; } } - lastParseResult_ = - TripleComponent::Iri::fromIriref(lastParseResult_.getString()); + lastParseResult_ = TripleComponent::Iri::fromIrirefConsiderBase( + lastParseResult_.getString(), baseForRelativeIri(), + baseForAbsoluteIri()); return true; } } @@ -817,7 +823,19 @@ bool RdfStreamParser::resetStateAndRead( template void RdfStreamParser::initialize(const string& filename) { this->clear(); - fileBuffer_ = std::make_unique(bufferSize_); + // Make sure that a block of data ends with a newline. This is important for + // two reasons: + // + // 1. A block of data must not end in the middle of a comment. Otherwise the + // remaining part of the comment, which is prepended to the next block, is + // not recognized as a comment. + // + // 2. A block of data must not end with a `.` (without subsequent newline). + // The reason is that with a `.` at the end, we cannot decide whether we are + // in the middle of a `PN_LOCAL` (that continues in the next buffer) or at the + // end of a statement. + fileBuffer_ = + std::make_unique(bufferSize_, "([\\r\\n]+)"); fileBuffer_->open(filename); byteVec_.resize(bufferSize_); // decompress the first block and initialize Tokenizer @@ -847,8 +865,6 @@ bool RdfStreamParser::getLineImpl(TurtleTriple* triple) { // immediately rethrown. If we are reading from a stream in chunks of // bytes, we can try again with a larger buffer. 
try { - // variable parsedStatement will be true iff a statement can - // successfully be parsed parsedStatement = T::statement(); } catch (const typename T::ParseException& p) { parsedStatement = false; diff --git a/src/parser/RdfParser.h b/src/parser/RdfParser.h index ca55c61993..76929a44bb 100644 --- a/src/parser/RdfParser.h +++ b/src/parser/RdfParser.h @@ -169,6 +169,12 @@ class TurtleParser : public RdfParserBase { static constexpr std::array floatDatatypes_ = { XSD_DECIMAL_TYPE, XSD_DOUBLE_TYPE, XSD_FLOAT_TYPE}; + // The keys for storing the base prefix (for relative and absolute IRIs) in + // the prefix map. The only thing that is important about these keys is that + // they are different from each other and from any valid prefix name. + static constexpr const char* baseForRelativeIriKey_ = "@"; + static constexpr const char* baseForAbsoluteIriKey_ = "@@"; + protected: // Data members. @@ -187,9 +193,23 @@ class TurtleParser : public RdfParserBase { // `TripleComponent` since it can hold any parsing result, not only objects. TripleComponent lastParseResult_; - // Maps prefixes to their expanded form, initialized with the empty base - // (i.e. the prefix ":" maps to the empty IRI). - ad_utility::HashMap prefixMap_{{{}, {}}}; + // Map that maps prefix names to their IRI. For our tests, it is important + // that without any BASE declaration, the two base prefixes are mapped to the + // empty IRI. + static const inline ad_utility::HashMap + prefixMapDefault_{{baseForRelativeIriKey_, TripleComponent::Iri{}}, + {baseForAbsoluteIriKey_, TripleComponent::Iri{}}}; + ad_utility::HashMap prefixMap_ = + prefixMapDefault_; + + // Getters for the two base prefixes. Without BASE declaration, these will + // both return the empty IRI. 
+ const TripleComponent::Iri& baseForRelativeIri() { + return prefixMap_.at(baseForRelativeIriKey_); + } + const TripleComponent::Iri& baseForAbsoluteIri() { + return prefixMap_.at(baseForAbsoluteIriKey_); + } // There are turtle constructs that reuse prefixes, subjects and predicates // so we have to save the last seen ones. @@ -222,7 +242,7 @@ class TurtleParser : public RdfParserBase { activePredicate_ = TripleComponent::Iri::fromIriref("<>"); activePrefix_.clear(); - prefixMap_.clear(); + prefixMap_ = prefixMapDefault_; tok_.reset(nullptr, 0); triples_.clear(); @@ -400,6 +420,8 @@ class TurtleParser : public RdfParserBase { FRIEND_TEST(RdfParserTest, predicateObjectList); FRIEND_TEST(RdfParserTest, objectList); FRIEND_TEST(RdfParserTest, object); + FRIEND_TEST(RdfParserTest, base); + FRIEND_TEST(RdfParserTest, sparqlBase); FRIEND_TEST(RdfParserTest, blankNode); FRIEND_TEST(RdfParserTest, blankNodePropertyList); FRIEND_TEST(RdfParserTest, numericLiteral); @@ -516,8 +538,6 @@ class RdfStringParser : public Parser { this->tok_.reset(tmpToParse_.data(), tmpToParse_.size()); } - void setPrefixMap(decltype(prefixMap_) m) { prefixMap_ = std::move(m); } - const auto& getPrefixMap() const { return prefixMap_; } // __________________________________________________________ @@ -604,10 +624,10 @@ class RdfStreamParser : public Parser { // that's why we need the backupState() and resetStateAndRead() methods ParallelBuffer::BufferType byteVec_; - std::unique_ptr fileBuffer_; + size_t bufferSize_ = FILE_BUFFER_SIZE; + std::unique_ptr fileBuffer_; // this many characters will be buffered at once, // defaults to a global constant - size_t bufferSize_ = FILE_BUFFER_SIZE; // that many bytes were already parsed before dealing with the current batch // in member byteVec_ diff --git a/test/RdfParserTest.cpp b/test/RdfParserTest.cpp index f917745d3c..6969961003 100644 --- a/test/RdfParserTest.cpp +++ b/test/RdfParserTest.cpp @@ -1,7 +1,7 @@ // Copyright 2018, University of Freiburg, 
// Chair of Algorithms and Data Structures. // Author: Johannes Kalmbach(joka921) -// + #include #include @@ -337,6 +337,36 @@ TEST(RdfParserTest, blankNodePropertyList) { testPropertyListAsSubject(CtreParser{}); } +TEST(RdfParserTest, base) { + auto testForGivenParser = [](auto parser) { + parser.setInputStream("@base ."); + ASSERT_TRUE(parser.base()); + ASSERT_EQ(parser.baseForRelativeIri().toStringRepresentation(), + ""); + ASSERT_EQ(parser.baseForAbsoluteIri().toStringRepresentation(), + ""); + parser.setInputStream("@base \"no iriref\" ."); + ASSERT_THROW(parser.base(), TurtleParser::ParseException); + }; + testForGivenParser(Re2Parser{}); + testForGivenParser(CtreParser{}); +} + +TEST(RdfParserTest, sparqlBase) { + auto testForGivenParser = [](auto parser) { + parser.setInputStream("BASE ."); + ASSERT_TRUE(parser.sparqlBase()); + ASSERT_EQ(parser.baseForRelativeIri().toStringRepresentation(), + ""); + ASSERT_EQ(parser.baseForAbsoluteIri().toStringRepresentation(), + ""); + parser.setInputStream("BASE \"no iriref\" ."); + ASSERT_THROW(parser.sparqlBase(), TurtleParser::ParseException); + }; + testForGivenParser(Re2Parser{}); + testForGivenParser(CtreParser{}); +} + TEST(RdfParserTest, object) { auto runCommonTests = [](auto p) { auto sub = iri(""); @@ -887,8 +917,8 @@ TEST(RdfParserTest, multilineComments) { ad_utility::deleteFile(filename); }; - // Test an input with many lines that only comments and whitespace. There - // was a bug for this case in a previous version of the parser. + // Test an input with many lines that contain only comments and whitespace. + // There was a bug for this case in a previous version of the parser. 
std::string input = R"(#Comments #at ##the beginning diff --git a/test/parser/LiteralOrIriTest.cpp b/test/parser/LiteralOrIriTest.cpp index 0c5486fc7b..3c7ad863af 100644 --- a/test/parser/LiteralOrIriTest.cpp +++ b/test/parser/LiteralOrIriTest.cpp @@ -1,6 +1,7 @@ -// Copyright 2023, University of Freiburg, -// Chair of Algorithms and Data Structures. -// Author: Benedikt Maria Beckermann +// Copyright 2023 - 2024, University of Freiburg +// Chair of Algorithms and Data Structures +// Authors: Benedikt Maria Beckermann +// Hannah Bast #include #include @@ -21,6 +22,74 @@ TEST(IriTest, IriCreation) { asStringViewUnsafe(iri.getContent())); } +TEST(IriTest, getBaseIri) { + // Helper lambda that calls `Iri::getBaseIri` and returns the result as a + // string (including the angle brackets). + auto getBaseIri = [](std::string_view iriSv, bool domainOnly) { + return Iri::fromIriref(iriSv) + .getBaseIri(domainOnly) + .toStringRepresentation(); + }; + // IRI with path. + EXPECT_EQ(getBaseIri("", false), + ""); + EXPECT_EQ(getBaseIri("", false), + ""); + EXPECT_EQ(getBaseIri("", true), + ""); + EXPECT_EQ(getBaseIri("", true), + ""); + // IRI with domain only. + EXPECT_EQ(getBaseIri("", false), + ""); + EXPECT_EQ(getBaseIri("", false), + ""); + EXPECT_EQ(getBaseIri("", true), + ""); + EXPECT_EQ(getBaseIri("", true), + ""); + // IRI without scheme. + EXPECT_EQ(getBaseIri("", false), ""); + EXPECT_EQ(getBaseIri("", true), ""); +} + +TEST(IriTest, emptyIri) { + EXPECT_TRUE(Iri{}.empty()); + EXPECT_FALSE(Iri::fromIriref("").empty()); +} + +TEST(IriTest, fromIrirefConsiderBase) { + // Helper lambda that calls `Iri::fromIrirefConsiderBase` with the two base + // IRIs and returns the results as a string (including the angle brackets). 
+ Iri baseForRelativeIris; + Iri baseForAbsoluteIris; + auto fromIrirefConsiderBase = [&baseForRelativeIris, &baseForAbsoluteIris]( + std::string_view iriStringWithBrackets) { + return Iri::fromIrirefConsiderBase(iriStringWithBrackets, + baseForRelativeIris, baseForAbsoluteIris) + .toStringRepresentation(); + }; + + // Check that it works for "real" base IRIs. + baseForRelativeIris = Iri::fromIriref(""); + baseForAbsoluteIris = Iri::fromIriref(""); + EXPECT_EQ(fromIrirefConsiderBase(""), + ""); + EXPECT_EQ(fromIrirefConsiderBase(""), + ""); + EXPECT_EQ(fromIrirefConsiderBase(""), + ""); + + // Check that with the default base, all IRIs remain unchanged. + baseForRelativeIris = Iri{}; + baseForAbsoluteIris = Iri{}; + EXPECT_THAT(fromIrirefConsiderBase(""), + ""); + EXPECT_THAT(fromIrirefConsiderBase(""), ""); + EXPECT_THAT(fromIrirefConsiderBase(""), ""); + EXPECT_THAT(fromIrirefConsiderBase("<>"), "<>"); +} + TEST(LiteralTest, LiteralTest) { Literal literal = Literal::literalWithoutQuotes("Hello World");