Skip to content

Commit

Permalink
Make the relaxed parsing even more relaxed
Browse files Browse the repository at this point in the history
For `"ascii-prefixes-only": "false"`, also allow `"` in IRI references.
This does no harm, and it lets us load DBpedia, which contains many
such IRI references.
  • Loading branch information
Hannah Bast committed Dec 10, 2024
1 parent 0400f90 commit f6d74f1
Show file tree
Hide file tree
Showing 4 changed files with 8 additions and 8 deletions.
6 changes: 3 additions & 3 deletions src/parser/RdfParser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -739,11 +739,11 @@ bool TurtleParser<T>::iriref() {
if (!view.starts_with('<')) {
return false;
}
auto endPos = view.find_first_of("<>\"\n", 1);
auto endPos = view.find_first_of(">\n", 1);
if (endPos == string::npos || view[endPos] != '>') {
raise(
"Unterminated IRI reference (found '<' but no '>' before "
"one of the following characters: <, \", newline)");
"Unterminated IRI reference (found '<' but no matching `>` "
"on the same line)");
}
// In relaxed mode, that is all we check. Otherwise, we check if the IRI is
// standard-compliant. If not, we output a warning and try to parse it in a
Expand Down
2 changes: 1 addition & 1 deletion src/parser/Tokenizer.h
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,7 @@ struct TurtleToken {
"<([^\\x00-\\x20<>\"{}|^`\\\\]|"s + UcharString + ")*>";
const RE2 Iriref;
const string IrirefStringRelaxed =
"<([^\\x00-\\x19<>\"\\\\]|"s + UcharString + ")*>";
"<([^\\x00-\\x19<>\\\\]|"s + UcharString + ")*>";
const RE2 IrirefRelaxed;

const string PercentString = "%" + cls(HexString) + "{2}";
Expand Down
6 changes: 3 additions & 3 deletions test/RdfParserTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -751,7 +751,7 @@ TEST(RdfParserTest, iriref) {
std::string iriref_1 = "<fine>";
std::string iriref_2 = "<okay ish>";
std::string iriref_3 = "<not\x19okay_for_RE2>";
std::string iriref_4 = "<throws\"exception>";
std::string iriref_4 = "<throws exception";
std::string iriref_5 = "no iriref at all";
// The first IRI ref is fine for both parsers.
parser.setInputStream(iriref_1);
Expand Down Expand Up @@ -779,8 +779,8 @@ TEST(RdfParserTest, iriref) {
} else {
ASSERT_FALSE(parser.iriref());
}
// The fourth IRI ref throws an exception when parsed (because " is
// encountered before the closing >).
// The fourth IRI ref throws an exception when parsed (because the opening
// `<` is never followed by a closing `>`).
parser.setInputStream(iriref_4);
ASSERT_THROW(parser.iriref(), TurtleParser<Tokenizer>::ParseException);
// The fifth IRI ref is not recognized as an IRI ref.
Expand Down
2 changes: 1 addition & 1 deletion test/TokenTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -189,8 +189,8 @@ TEST(TokenizerTest, Entities) {
string iriref4 = "<escaped\\uAA34\\U000ABC34end>";
string noIriref1 = "< >";
string noIriref2 = "<{}|^`>";
string noIriref4 = "<\">";
string noIriref3 = "<\n>";
string noIriref4 = "<abc";

// Strict Iriref parsing.
ASSERT_TRUE(RE2::FullMatch(iriref1, t.Iriref, nullptr));
Expand Down

0 comments on commit f6d74f1

Please sign in to comment.