Make the relaxed parsing even more relaxed

For `"ascii-prefixes-only": "false"`, also allow `"` in IRI references. This does no harm, and it lets us load DBpedia, which contains many such IRI references.
ad-freiburg · Dec 10, 2024 · f6d74f1 · f6d74f1
1 parent 0400f90
commit f6d74f1
Show file tree

Hide file tree

Showing 4 changed files with 8 additions and 8 deletions.
diff --git a/src/parser/RdfParser.cpp b/src/parser/RdfParser.cpp
@@ -739,11 +739,11 @@ bool TurtleParser<T>::iriref() {
   if (!view.starts_with('<')) {
     return false;
   }
-  auto endPos = view.find_first_of("<>\"\n", 1);
+  auto endPos = view.find_first_of(">\n", 1);
   if (endPos == string::npos || view[endPos] != '>') {
     raise(
-        "Unterminated IRI reference (found '<' but no '>' before "
-        "one of the following characters: <, \", newline)");
+        "Unterminated IRI reference (found '<' but no matching `>` "
+        "on the same line)");
   }
   // In relaxed mode, that is all we check. Otherwise, we check if the IRI is
   // standard-compliant. If not, we output a warning and try to parse it in a

diff --git a/src/parser/Tokenizer.h b/src/parser/Tokenizer.h
@@ -128,7 +128,7 @@ struct TurtleToken {
       "<([^\\x00-\\x20<>\"{}|^`\\\\]|"s + UcharString + ")*>";
   const RE2 Iriref;
   const string IrirefStringRelaxed =
-      "<([^\\x00-\\x19<>\"\\\\]|"s + UcharString + ")*>";
+      "<([^\\x00-\\x19<>\\\\]|"s + UcharString + ")*>";
   const RE2 IrirefRelaxed;
 
   const string PercentString = "%" + cls(HexString) + "{2}";

diff --git a/test/RdfParserTest.cpp b/test/RdfParserTest.cpp
@@ -751,7 +751,7 @@ TEST(RdfParserTest, iriref) {
     std::string iriref_1 = "<fine>";
     std::string iriref_2 = "<okay ish>";
     std::string iriref_3 = "<not\x19okay_for_RE2>";
-    std::string iriref_4 = "<throws\"exception>";
+    std::string iriref_4 = "<throws exception";
     std::string iriref_5 = "no iriref at all";
     // The first IRI ref is fine for both parsers.
     parser.setInputStream(iriref_1);
@@ -779,8 +779,8 @@ TEST(RdfParserTest, iriref) {
     } else {
       ASSERT_FALSE(parser.iriref());
     }
-    // The fourth IRI ref throws an exception when parsed (because " is
-    // encountered before the closing >).
+    // The fourth IRI ref throws an exception when parsed (because there is no
+    // closing `>` on the same line).
     parser.setInputStream(iriref_4);
     ASSERT_THROW(parser.iriref(), TurtleParser<Tokenizer>::ParseException);
     // The fifth IRI ref is not recognized as an IRI ref.

diff --git a/test/TokenTest.cpp b/test/TokenTest.cpp
@@ -189,8 +189,8 @@ TEST(TokenizerTest, Entities) {
   string iriref4 = "<escaped\\uAA34\\U000ABC34end>";
   string noIriref1 = "< >";
   string noIriref2 = "<{}|^`>";
-  string noIriref4 = "<\">";
   string noIriref3 = "<\n>";
+  string noIriref4 = "<abc";
 
   // Strict Iriref parsing.
   ASSERT_TRUE(RE2::FullMatch(iriref1, t.Iriref, nullptr));