diff --git a/.gitignore b/.gitignore index 75b520bf014..574d708ce85 100644 --- a/.gitignore +++ b/.gitignore @@ -52,3 +52,4 @@ org.eclipse.dash.licenses-1.0.2.jar e2e/node_modules e2e/playwright-report e2e/test-results +.aider* diff --git a/core/repository/sparql/src/main/java/org/eclipse/rdf4j/repository/sparql/SPARQLConnection.java b/core/repository/sparql/src/main/java/org/eclipse/rdf4j/repository/sparql/SPARQLConnection.java index a7073f0eef0..7a2d40346d1 100644 --- a/core/repository/sparql/src/main/java/org/eclipse/rdf4j/repository/sparql/SPARQLConnection.java +++ b/core/repository/sparql/src/main/java/org/eclipse/rdf4j/repository/sparql/SPARQLConnection.java @@ -17,7 +17,9 @@ import java.io.InputStream; import java.io.Reader; import java.net.URL; +import java.util.Arrays; import java.util.Objects; +import java.util.stream.Collectors; import org.apache.http.client.HttpClient; import org.eclipse.rdf4j.common.iteration.CloseableIteration; @@ -39,6 +41,8 @@ import org.eclipse.rdf4j.model.impl.DynamicModelFactory; import org.eclipse.rdf4j.model.impl.SimpleValueFactory; import org.eclipse.rdf4j.model.util.Literals; +import org.eclipse.rdf4j.model.vocabulary.RDF4J; +import org.eclipse.rdf4j.model.vocabulary.SESAME; import org.eclipse.rdf4j.query.BindingSet; import org.eclipse.rdf4j.query.BooleanQuery; import org.eclipse.rdf4j.query.GraphQuery; @@ -79,6 +83,8 @@ */ public class SPARQLConnection extends AbstractRepositoryConnection implements HttpClientDependent { + private static final String COUNT_EVERYTHING = "SELECT (COUNT(*) AS ?count) WHERE { ?s ?p ?o }"; + private static final String EVERYTHING = "CONSTRUCT { ?s ?p ?o } WHERE { ?s ?p ?o }"; private static final String EVERYTHING_WITH_GRAPH = "SELECT * WHERE { ?s ?p ?o . OPTIONAL { GRAPH ?ctx { ?s ?p ?o } } }"; @@ -281,16 +287,61 @@ public boolean isEmpty() throws RepositoryException { @Override public long size(Resource... contexts) throws RepositoryException { - try (RepositoryResult stmts = getStatements(null, null, null, true, contexts)) { - long i = 0; - while (stmts.hasNext()) { - stmts.next(); - i++; + String query = sizeAsTupleQuery(contexts); + TupleQuery tq = prepareTupleQuery(SPARQL, query); + try (TupleQueryResult res = tq.evaluate()) { + if (res.hasNext()) { + + Value value = res.next().getBinding("count").getValue(); + if (value instanceof Literal) { + return ((Literal) value).longValue(); + } else { + return 0; + } + } + } catch (QueryEvaluationException e) { + throw new RepositoryException(e); + } + return 0; + } + + String sizeAsTupleQuery(Resource... contexts) { + + // in case the context is null we want the + // default graph of the remote store i.e. ask without graph/from. + if (contexts != null && isQuadMode() && contexts.length > 0) { + // this is an optimization for the case that we can use a GRAPH instead of a FROM. + if (contexts.length == 1 && isExposableGraphIri(contexts[0])) { + return "SELECT (COUNT(*) AS ?count) WHERE { GRAPH <" + contexts[0].stringValue() + + "> { ?s ?p ?o}}"; + } else { + // If we had an default graph setting that is sesame/rdf4j specific + // we must drop it before sending it over the wire. Otherwise + // gather up the given contexts and send them as a from clauses + // to make the matching dataset. + String graphs = Arrays.stream(contexts) + .filter(SPARQLConnection::isExposableGraphIri) + .map(Resource::stringValue) + .map(s -> "FROM <" + s + ">") + .collect(Collectors.joining(" ")); + return "SELECT (COUNT(*) AS ?count) " + graphs + "WHERE { ?s ?p ?o}"; } - return i; + } else { + return COUNT_EVERYTHING; } } + /** + * For the sparql protocol a context must be an IRI However we can't send out the RDF4j internal default graph IRIs + * + * @param resource to test if it can be the IRI for a named graph + * @return true if it the input can be a foreign named graph. + */ + private static boolean isExposableGraphIri(Resource resource) { + // We use the instanceof test to avoid any issue with a null pointer. + return resource instanceof IRI && !RDF4J.NIL.equals(resource) && !SESAME.NIL.equals(resource); + } + @Override public RepositoryResult getStatements(Resource subj, IRI pred, Value obj, boolean includeInferred, Resource... contexts) throws RepositoryException { diff --git a/core/repository/sparql/src/test/java/org/eclipse/rdf4j/repository/sparql/SPARQLConnectionTest.java b/core/repository/sparql/src/test/java/org/eclipse/rdf4j/repository/sparql/SPARQLConnectionTest.java index 081b9d09e07..fb133c58997 100644 --- a/core/repository/sparql/src/test/java/org/eclipse/rdf4j/repository/sparql/SPARQLConnectionTest.java +++ b/core/repository/sparql/src/test/java/org/eclipse/rdf4j/repository/sparql/SPARQLConnectionTest.java @@ -12,13 +12,20 @@ import static org.assertj.core.api.Assertions.assertThat; import static org.eclipse.rdf4j.model.util.Values.iri; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertNotNull; import static org.mockito.ArgumentMatchers.any; import static org.mockito.ArgumentMatchers.anyBoolean; import static org.mockito.ArgumentMatchers.anyInt; +import static org.mockito.Mockito.atLeastOnce; import static org.mockito.Mockito.mock; import static org.mockito.Mockito.never; import static org.mockito.Mockito.times; import static org.mockito.Mockito.verify; +import static org.mockito.Mockito.when; + +import java.lang.ref.WeakReference; import org.eclipse.rdf4j.http.client.SPARQLProtocolSession; import org.eclipse.rdf4j.model.IRI; @@ -26,11 +33,20 @@ import org.eclipse.rdf4j.model.impl.SimpleValueFactory; import org.eclipse.rdf4j.model.vocabulary.FOAF; import org.eclipse.rdf4j.model.vocabulary.RDF; +import org.eclipse.rdf4j.model.vocabulary.RDF4J; import org.eclipse.rdf4j.model.vocabulary.RDFS; +import org.eclipse.rdf4j.query.impl.MapBindingSet; +import org.eclipse.rdf4j.query.impl.SimpleBinding; +import org.eclipse.rdf4j.query.impl.TupleQueryResultBuilder; +import org.eclipse.rdf4j.query.parser.ParsedQuery; +import org.eclipse.rdf4j.query.parser.sparql.SPARQLParser; +import org.eclipse.rdf4j.query.parser.sparql.SPARQLParserFactory; import org.eclipse.rdf4j.rio.ParserConfig; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; import org.mockito.ArgumentCaptor; +import org.mockito.Mock; +import org.mockito.invocation.InvocationOnMock; public class SPARQLConnectionTest { @@ -100,6 +116,36 @@ public void testAddSingleContextHandling() throws Exception { assertThat(sparqlUpdate).containsPattern(expectedAddPattern).containsPattern(expectedRemovePattern); } + @Test + public void testSizeQuery() throws Exception { + + String sizeAsTupleQuery = subject.sizeAsTupleQuery(); + ParsedQuery query = new SPARQLParserFactory().getParser().parseQuery(sizeAsTupleQuery, "http://example.org/"); + assertNotNull(query); + + sizeAsTupleQuery = subject.sizeAsTupleQuery(vf.createIRI("urn:g1")); + query = new SPARQLParserFactory().getParser().parseQuery(sizeAsTupleQuery, "http://example.org/"); + assertNotNull(query); + + sizeAsTupleQuery = subject.sizeAsTupleQuery(vf.createIRI("urn:g1"), vf.createIRI("urn:g2")); + query = new SPARQLParserFactory().getParser().parseQuery(sizeAsTupleQuery, "http://example.org/"); + assertNotNull(query); + + sizeAsTupleQuery = subject.sizeAsTupleQuery(vf.createIRI("urn:g1"), vf.createBNode()); + query = new SPARQLParserFactory().getParser().parseQuery(sizeAsTupleQuery, "http://example.org/"); + assertNotNull(query); + + sizeAsTupleQuery = subject.sizeAsTupleQuery(RDF4J.NIL); + query = new SPARQLParserFactory().getParser().parseQuery(sizeAsTupleQuery, "http://example.org/"); + assertNotNull(query); + assertFalse(sizeAsTupleQuery.contains("nil")); + + sizeAsTupleQuery = subject.sizeAsTupleQuery(null); + query = new SPARQLParserFactory().getParser().parseQuery(sizeAsTupleQuery, "http://example.org/"); + + assertNotNull(query); + } + @Test public void testAddMultipleContextHandling() throws Exception { ArgumentCaptor sparqlUpdateCaptor = ArgumentCaptor.forClass(String.class); diff --git a/core/sail/nativerdf/src/main/java/org/eclipse/rdf4j/sail/nativerdf/NativeStatementIterator.java b/core/sail/nativerdf/src/main/java/org/eclipse/rdf4j/sail/nativerdf/NativeStatementIterator.java index 6d8c84cfa9c..29b803e6cb5 100644 --- a/core/sail/nativerdf/src/main/java/org/eclipse/rdf4j/sail/nativerdf/NativeStatementIterator.java +++ b/core/sail/nativerdf/src/main/java/org/eclipse/rdf4j/sail/nativerdf/NativeStatementIterator.java @@ -10,6 +10,8 @@ *******************************************************************************/ package org.eclipse.rdf4j.sail.nativerdf; +import static org.eclipse.rdf4j.sail.nativerdf.NativeStore.SOFT_FAIL_ON_CORRUPT_DATA_AND_REPAIR_INDEXES; + import java.io.IOException; import org.eclipse.rdf4j.common.io.ByteArrayUtil; @@ -20,6 +22,11 @@ import org.eclipse.rdf4j.model.Value; import org.eclipse.rdf4j.sail.SailException; import org.eclipse.rdf4j.sail.nativerdf.btree.RecordIterator; +import org.eclipse.rdf4j.sail.nativerdf.model.CorruptIRI; +import org.eclipse.rdf4j.sail.nativerdf.model.CorruptIRIOrBNode; +import org.eclipse.rdf4j.sail.nativerdf.model.CorruptUnknownValue; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** * A statement iterator that wraps a RecordIterator containing statement records and translates these records to @@ -27,6 +34,8 @@ */ class NativeStatementIterator extends LookAheadIteration { + private static final Logger logger = LoggerFactory.getLogger(NativeStatementIterator.class); + /*-----------* * Variables * *-----------*/ @@ -54,17 +63,23 @@ public NativeStatementIterator(RecordIterator btreeIter, ValueStore valueStore) @Override public Statement getNextElement() throws SailException { try { - byte[] nextValue = btreeIter.next(); + byte[] nextValue; + try { + nextValue = btreeIter.next(); + } catch (AssertionError | Exception e) { + logger.error("Error while reading next value from btree iterator for {}", btreeIter.toString(), e); + throw e; + } if (nextValue == null) { return null; } int subjID = ByteArrayUtil.getInt(nextValue, TripleStore.SUBJ_IDX); - Resource subj = (Resource) valueStore.getValue(subjID); + Resource subj = valueStore.getResource(subjID); int predID = ByteArrayUtil.getInt(nextValue, TripleStore.PRED_IDX); - IRI pred = (IRI) valueStore.getValue(predID); + IRI pred = valueStore.getIRI(predID); int objID = ByteArrayUtil.getInt(nextValue, TripleStore.OBJ_IDX); Value obj = valueStore.getValue(objID); @@ -72,7 +87,18 @@ public Statement getNextElement() throws SailException { Resource context = null; int contextID = ByteArrayUtil.getInt(nextValue, TripleStore.CONTEXT_IDX); if (contextID != 0) { - context = (Resource) valueStore.getValue(contextID); + context = valueStore.getResource(contextID); + } + if (SOFT_FAIL_ON_CORRUPT_DATA_AND_REPAIR_INDEXES) { + if (subj == null) { + subj = new CorruptIRIOrBNode(valueStore.getRevision(), subjID, null); + } + if (pred == null) { + pred = new CorruptIRI(valueStore.getRevision(), predID, null, null); + } + if (obj == null) { + obj = new CorruptUnknownValue(valueStore.getRevision(), objID, null); + } } return valueStore.createStatement(subj, pred, obj, context); diff --git a/core/sail/nativerdf/src/main/java/org/eclipse/rdf4j/sail/nativerdf/NativeStore.java b/core/sail/nativerdf/src/main/java/org/eclipse/rdf4j/sail/nativerdf/NativeStore.java index c82bca9d4d9..e156083b8ef 100644 --- a/core/sail/nativerdf/src/main/java/org/eclipse/rdf4j/sail/nativerdf/NativeStore.java +++ b/core/sail/nativerdf/src/main/java/org/eclipse/rdf4j/sail/nativerdf/NativeStore.java @@ -24,6 +24,7 @@ import org.apache.commons.io.FileUtils; import org.eclipse.rdf4j.collection.factory.api.CollectionFactory; import org.eclipse.rdf4j.collection.factory.mapdb.MapDb3CollectionFactory; +import org.eclipse.rdf4j.common.annotation.InternalUseOnly; import org.eclipse.rdf4j.common.concurrent.locks.Lock; import org.eclipse.rdf4j.common.concurrent.locks.LockManager; import org.eclipse.rdf4j.common.io.MavenUtil; @@ -62,6 +63,17 @@ public class NativeStore extends AbstractNotifyingSail implements FederatedServi private static final String VERSION = MavenUtil.loadVersion("org.eclipse.rdf4j", "rdf4j-sail-nativerdf", "devel"); + /** + * Do not throw an exception when corrupt data is detected. Instead, try to return as much data as possible. + * + * Variable can be set through the system property + * org.eclipse.rdf4j.sail.nativerdf.softFailOnCorruptDataAndRepairIndexes. + */ + @InternalUseOnly + public static boolean SOFT_FAIL_ON_CORRUPT_DATA_AND_REPAIR_INDEXES = "true" + .equalsIgnoreCase( + System.getProperty("org.eclipse.rdf4j.sail.nativerdf.softFailOnCorruptDataAndRepairIndexes"));; + private static final Cleaner REMOVE_STORES_USED_FOR_MEMORY_OVERFLOW = Cleaner.create(); /** diff --git a/core/sail/nativerdf/src/main/java/org/eclipse/rdf4j/sail/nativerdf/TripleStore.java b/core/sail/nativerdf/src/main/java/org/eclipse/rdf4j/sail/nativerdf/TripleStore.java index c4bc52cd318..3c060af663d 100644 --- a/core/sail/nativerdf/src/main/java/org/eclipse/rdf4j/sail/nativerdf/TripleStore.java +++ b/core/sail/nativerdf/src/main/java/org/eclipse/rdf4j/sail/nativerdf/TripleStore.java @@ -287,9 +287,71 @@ private Set parseIndexSpecList(String indexSpecStr) throws SailException } private void initIndexes(Set indexSpecs) throws IOException { + + HashSet invalidIndexes = new HashSet<>(); + for (String fieldSeq : indexSpecs) { logger.trace("Initializing index '{}'...", fieldSeq); - indexes.add(new TripleIndex(fieldSeq)); + try { + indexes.add(new TripleIndex(fieldSeq, false)); + } catch (Exception e) { + if (NativeStore.SOFT_FAIL_ON_CORRUPT_DATA_AND_REPAIR_INDEXES) { + invalidIndexes.add(fieldSeq); + logger.warn("Ignoring index because it failed to initialize index '{}'", fieldSeq, e); + } else { + logger.error( + "Failed to initialize index '{}', consider setting org.eclipse.rdf4j.sail.nativerdf.softFailOnCorruptDataAndRepairIndexes to true.", + fieldSeq, e); + throw e; + } + + } + + } + + if (NativeStore.SOFT_FAIL_ON_CORRUPT_DATA_AND_REPAIR_INDEXES) { + indexSpecs.removeAll(invalidIndexes); + } + + List emptyIndexes = new ArrayList<>(); + List nonEmptyIndexes = new ArrayList<>(); + + checkIfIndexesAreEmptyOrNot(nonEmptyIndexes, emptyIndexes); + + if (!emptyIndexes.isEmpty() && !nonEmptyIndexes.isEmpty()) { + if (NativeStore.SOFT_FAIL_ON_CORRUPT_DATA_AND_REPAIR_INDEXES) { + indexes.removeAll(emptyIndexes); + } else { + for (TripleIndex index : emptyIndexes) { + throw new IOException("Index '" + new String(index.getFieldSeq()) + + "' is unexpectedly empty while other indexes are not. Consider setting the system property org.eclipse.rdf4j.sail.nativerdf.softFailOnCorruptDataAndRepairIndexes to true. Index file: " + + index.getBTree().getFile().getAbsolutePath()); + } + } + } + + } + + private void checkIfIndexesAreEmptyOrNot(List nonEmptyIndexes, List emptyIndexes) + throws IOException { + for (TripleIndex index : indexes) { + try (RecordIterator recordIterator = index.getBTree().iterateAll()) { + try { + byte[] next = recordIterator.next(); + if (next != null) { + next = recordIterator.next(); + if (next != null) { + nonEmptyIndexes.add(index); + } else { + emptyIndexes.add(index); + } + } else { + emptyIndexes.add(index); + } + } catch (Throwable ignored) { + emptyIndexes.add(index); + } + } } } @@ -355,7 +417,7 @@ private void reindex(Set currentIndexSpecs, Set newIndexSpecs) t for (String fieldSeq : addedIndexSpecs) { logger.debug("Initializing new index '{}'...", fieldSeq); - TripleIndex addedIndex = new TripleIndex(fieldSeq); + TripleIndex addedIndex = new TripleIndex(fieldSeq, true); BTree addedBTree = null; RecordIterator sourceIter = null; try { @@ -1122,7 +1184,17 @@ private class TripleIndex { private final BTree btree; - public TripleIndex(String fieldSeq) throws IOException { + public TripleIndex(String fieldSeq, boolean deleteExistingIndexFile) throws IOException { + if (deleteExistingIndexFile) { + File indexFile = new File(dir, getFilenamePrefix(fieldSeq) + ".dat"); + if (indexFile.exists()) { + indexFile.delete(); + } + File alloxFile = new File(dir, getFilenamePrefix(fieldSeq) + ".alloc"); + if (alloxFile.exists()) { + alloxFile.delete(); + } + } tripleComparator = new TripleComparator(fieldSeq); btree = new BTree(dir, getFilenamePrefix(fieldSeq), 2048, RECORD_LENGTH, tripleComparator, forceSync); } diff --git a/core/sail/nativerdf/src/main/java/org/eclipse/rdf4j/sail/nativerdf/ValueStore.java b/core/sail/nativerdf/src/main/java/org/eclipse/rdf4j/sail/nativerdf/ValueStore.java index 7193c7e8342..37787ac610c 100644 --- a/core/sail/nativerdf/src/main/java/org/eclipse/rdf4j/sail/nativerdf/ValueStore.java +++ b/core/sail/nativerdf/src/main/java/org/eclipse/rdf4j/sail/nativerdf/ValueStore.java @@ -10,6 +10,8 @@ *******************************************************************************/ package org.eclipse.rdf4j.sail.nativerdf; +import static org.eclipse.rdf4j.sail.nativerdf.NativeStore.SOFT_FAIL_ON_CORRUPT_DATA_AND_REPAIR_INDEXES; + import java.io.File; import java.io.IOException; import java.io.UnsupportedEncodingException; @@ -33,11 +35,18 @@ import org.eclipse.rdf4j.model.vocabulary.XSD; import org.eclipse.rdf4j.sail.SailException; import org.eclipse.rdf4j.sail.nativerdf.datastore.DataStore; +import org.eclipse.rdf4j.sail.nativerdf.model.CorruptIRI; +import org.eclipse.rdf4j.sail.nativerdf.model.CorruptIRIOrBNode; +import org.eclipse.rdf4j.sail.nativerdf.model.CorruptLiteral; +import org.eclipse.rdf4j.sail.nativerdf.model.CorruptUnknownValue; +import org.eclipse.rdf4j.sail.nativerdf.model.CorruptValue; import org.eclipse.rdf4j.sail.nativerdf.model.NativeBNode; import org.eclipse.rdf4j.sail.nativerdf.model.NativeIRI; import org.eclipse.rdf4j.sail.nativerdf.model.NativeLiteral; import org.eclipse.rdf4j.sail.nativerdf.model.NativeResource; import org.eclipse.rdf4j.sail.nativerdf.model.NativeValue; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** * File-based indexed storage and retrieval of RDF values. ValueStore maps RDF values to integer IDs and vice-versa. @@ -49,9 +58,7 @@ @InternalUseOnly public class ValueStore extends SimpleValueFactory { - /*-----------* - * Constants * - *-----------*/ + private static final Logger logger = LoggerFactory.getLogger(ValueStore.class); /** * The default value cache size. @@ -146,6 +153,7 @@ public ValueStore(File dataDir, boolean forceSync, int valueCacheSize, int value namespaceIDCache = new ConcurrentCache<>(namespaceIDCacheSize); setNewRevision(); + } /*---------* @@ -180,6 +188,7 @@ public Lock getReadLock() throws InterruptedException { * @throws IOException If an I/O error occurred. */ public NativeValue getValue(int id) throws IOException { + // Check value cache Integer cacheID = id; NativeValue resultValue = valueCache.get(cacheID); @@ -191,12 +200,62 @@ public NativeValue getValue(int id) throws IOException { if (data != null) { resultValue = data2value(id, data); - // Store value in cache - valueCache.put(cacheID, resultValue); + if (!(resultValue instanceof CorruptValue)) { + // Store value in cache + valueCache.put(cacheID, resultValue); + } } } return resultValue; + + } + + /** + * Gets the Resource for the specified ID. + * + * @param id A value ID. + * @return The Resource for the ID, or null no such value could be found. + * @throws IOException If an I/O error occurred. + */ + public T getResource(int id) throws IOException { + + NativeValue resultValue = getValue(id); + + if (resultValue != null && !(resultValue instanceof Resource)) { + if (SOFT_FAIL_ON_CORRUPT_DATA_AND_REPAIR_INDEXES && resultValue instanceof CorruptValue) { + return (T) new CorruptIRIOrBNode(revision, id, ((CorruptValue) resultValue).getData()); + } + logger.warn( + "NativeStore is possibly corrupt. To attempt to repair or retrieve the data, read the documentation on http://rdf4j.org about the system property org.eclipse.rdf4j.sail.nativerdf.softFailOnCorruptDataAndRepairIndexes"); + } + + return (T) resultValue; + } + + /** + * Gets the IRI for the specified ID. + * + * @param id A value ID. + * @return The IRI for the ID, or null no such value could be found. + * @throws IOException If an I/O error occurred. + */ + public T getIRI(int id) throws IOException { + + NativeValue resultValue = getValue(id); + + if (resultValue != null && !(resultValue instanceof IRI)) { + if (SOFT_FAIL_ON_CORRUPT_DATA_AND_REPAIR_INDEXES && resultValue instanceof CorruptValue) { + if (resultValue instanceof CorruptIRI) { + return (T) resultValue; + } + return (T) new CorruptIRI(revision, id, null, ((CorruptValue) resultValue).getData()); + } + logger.warn( + "NativeStore is possibly corrupt. To attempt to repair or retrieve the data, read the documentation on http://rdf4j.org about the system property org.eclipse.rdf4j.sail.nativerdf.softFailOnCorruptDataAndRepairIndexes"); + } + + return (T) resultValue; } /** @@ -526,6 +585,14 @@ private boolean isNamespaceData(byte[] data) { } private NativeValue data2value(int id, byte[] data) throws IOException { + if (data.length == 0) { + if (SOFT_FAIL_ON_CORRUPT_DATA_AND_REPAIR_INDEXES) { + logger.error("Soft fail on corrupt data: Empty data array for value with id {}", id); + return new CorruptUnknownValue(revision, id, data); + } + throw new SailException("Empty data array for value with id " + id + + " consider setting the system property org.eclipse.rdf4j.sail.nativerdf.softFailOnCorruptDataAndRepairIndexes to true"); + } switch (data[0]) { case URI_VALUE: return data2uri(id, data); @@ -534,17 +601,35 @@ private NativeValue data2value(int id, byte[] data) throws IOException { case LITERAL_VALUE: return data2literal(id, data); default: - throw new IllegalArgumentException("Invalid type " + data[0] + " for value with id " + id); + if (SOFT_FAIL_ON_CORRUPT_DATA_AND_REPAIR_INDEXES) { + logger.error("Soft fail on corrupt data: Invalid type {} for value with id {}", data[0], id); + return new CorruptUnknownValue(revision, id, data); + } + throw new SailException("Invalid type " + data[0] + " for value with id " + id + + " consider setting the system property org.eclipse.rdf4j.sail.nativerdf.softFailOnCorruptDataAndRepairIndexes to true"); } } - private NativeIRI data2uri(int id, byte[] data) throws IOException { - int nsID = ByteArrayUtil.getInt(data, 1); - String namespace = getNamespace(nsID); + private T data2uri(int id, byte[] data) throws IOException { + String namespace = null; - String localName = new String(data, 5, data.length - 5, StandardCharsets.UTF_8); + try { + int nsID = ByteArrayUtil.getInt(data, 1); + namespace = getNamespace(nsID); + + String localName = new String(data, 5, data.length - 5, StandardCharsets.UTF_8); + + return (T) new NativeIRI(revision, namespace, localName, id); + } catch (Throwable e) { + if (SOFT_FAIL_ON_CORRUPT_DATA_AND_REPAIR_INDEXES + && (e instanceof Exception || e instanceof AssertionError)) { + return (T) new CorruptIRI(revision, id, namespace, data); + } + logger.warn( + "NativeStore is possibly corrupt. To attempt to repair or retrieve the data, read the documentation on http://rdf4j.org about the system property org.eclipse.rdf4j.sail.nativerdf.softFailOnCorruptDataAndRepairIndexes"); + throw e; + } - return new NativeIRI(revision, namespace, localName, id); } private NativeBNode data2bnode(int id, byte[] data) { @@ -552,31 +637,40 @@ private NativeBNode data2bnode(int id, byte[] data) { return new NativeBNode(revision, nodeID, id); } - private NativeLiteral data2literal(int id, byte[] data) throws IOException { - // Get datatype - int datatypeID = ByteArrayUtil.getInt(data, 1); - IRI datatype = null; - if (datatypeID != NativeValue.UNKNOWN_ID) { - datatype = (IRI) getValue(datatypeID); - } + private T data2literal(int id, byte[] data) throws IOException { + try { + // Get datatype + int datatypeID = ByteArrayUtil.getInt(data, 1); + IRI datatype = null; + if (datatypeID != NativeValue.UNKNOWN_ID) { + datatype = (IRI) getValue(datatypeID); + } - // Get language tag - String lang = null; - int langLength = data[5]; - if (langLength > 0) { - lang = new String(data, 6, langLength, StandardCharsets.UTF_8); - } + // Get language tag + String lang = null; + int langLength = data[5]; + if (langLength > 0) { + lang = new String(data, 6, langLength, StandardCharsets.UTF_8); + } - // Get label - String label = new String(data, 6 + langLength, data.length - 6 - langLength, StandardCharsets.UTF_8); + // Get label + String label = new String(data, 6 + langLength, data.length - 6 - langLength, StandardCharsets.UTF_8); - if (lang != null) { - return new NativeLiteral(revision, label, lang, id); - } else if (datatype != null) { - return new NativeLiteral(revision, label, datatype, id); - } else { - return new NativeLiteral(revision, label, CoreDatatype.XSD.STRING, id); + if (lang != null) { + return (T) new NativeLiteral(revision, label, lang, id); + } else if (datatype != null) { + return (T) new NativeLiteral(revision, label, datatype, id); + } else { + return (T) new NativeLiteral(revision, label, CoreDatatype.XSD.STRING, id); + } + } catch (Throwable e) { + if (SOFT_FAIL_ON_CORRUPT_DATA_AND_REPAIR_INDEXES + && (e instanceof Exception || e instanceof AssertionError)) { + return (T) new CorruptLiteral(revision, id, data); + } + throw e; } + } private String data2namespace(byte[] data) { diff --git a/core/sail/nativerdf/src/main/java/org/eclipse/rdf4j/sail/nativerdf/btree/BTree.java b/core/sail/nativerdf/src/main/java/org/eclipse/rdf4j/sail/nativerdf/btree/BTree.java index 078f0f8601d..b297d940ea3 100644 --- a/core/sail/nativerdf/src/main/java/org/eclipse/rdf4j/sail/nativerdf/btree/BTree.java +++ b/core/sail/nativerdf/src/main/java/org/eclipse/rdf4j/sail/nativerdf/btree/BTree.java @@ -25,6 +25,7 @@ import org.eclipse.rdf4j.common.io.ByteArrayUtil; import org.eclipse.rdf4j.common.io.NioFile; import org.eclipse.rdf4j.sail.SailException; +import org.eclipse.rdf4j.sail.nativerdf.NativeStore; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -292,6 +293,13 @@ public BTree(File dataDir, String filenamePrefix, int blockSize, int valueSize, this.valueSize = buf.getInt(); this.rootNodeID = buf.getInt(); + if (rootNodeID == 0 && NativeStore.SOFT_FAIL_ON_CORRUPT_DATA_AND_REPAIR_INDEXES) { + if (nioFile.size() > blockSize) { + throw new SailException("Root node ID is 0 but file is not empty. Btree may be corrupt. File: " + + file.getAbsolutePath()); + } + } + if (Arrays.equals(MAGIC_NUMBER, magicNumber)) { if (version > FILE_FORMAT_VERSION) { throw new IOException("Unable to read BTree file " + file + "; it uses a newer file format"); @@ -1117,4 +1125,11 @@ public void print(PrintStream out) throws IOException { out.println("#values = " + valueCount); out.println("---end of BTree file---"); } + + @Override + public String toString() { + return "BTree{" + + "file=" + getFile() + + '}'; + } } diff --git a/core/sail/nativerdf/src/main/java/org/eclipse/rdf4j/sail/nativerdf/btree/RangeIterator.java b/core/sail/nativerdf/src/main/java/org/eclipse/rdf4j/sail/nativerdf/btree/RangeIterator.java index d7f1617b292..e6a6a3847e6 100644 --- a/core/sail/nativerdf/src/main/java/org/eclipse/rdf4j/sail/nativerdf/btree/RangeIterator.java +++ b/core/sail/nativerdf/src/main/java/org/eclipse/rdf4j/sail/nativerdf/btree/RangeIterator.java @@ -422,4 +422,11 @@ public boolean nodeMergedWith(Node sourceNode, Node targetNode, int mergeIdx) th return deregister; } + + @Override + public String toString() { + return "RangeIterator{" + + "tree=" + tree + + '}'; + } } diff --git a/core/sail/nativerdf/src/main/java/org/eclipse/rdf4j/sail/nativerdf/datastore/DataFile.java b/core/sail/nativerdf/src/main/java/org/eclipse/rdf4j/sail/nativerdf/datastore/DataFile.java index d369c1649cf..73e9c349de7 100644 --- a/core/sail/nativerdf/src/main/java/org/eclipse/rdf4j/sail/nativerdf/datastore/DataFile.java +++ b/core/sail/nativerdf/src/main/java/org/eclipse/rdf4j/sail/nativerdf/datastore/DataFile.java @@ -10,6 +10,8 @@ *******************************************************************************/ package org.eclipse.rdf4j.sail.nativerdf.datastore; +import static org.eclipse.rdf4j.sail.nativerdf.NativeStore.SOFT_FAIL_ON_CORRUPT_DATA_AND_REPAIR_INDEXES; + import java.io.Closeable; import java.io.File; import java.io.IOException; @@ -18,6 +20,8 @@ import java.util.NoSuchElementException; import org.eclipse.rdf4j.common.io.NioFile; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** * Class supplying access to a data file. A data file stores data sequentially. Each entry starts with the entry's @@ -27,6 +31,8 @@ */ public class DataFile implements Closeable { + private static final Logger logger = LoggerFactory.getLogger(DataFile.class); + /*-----------* * Constants * *-----------*/ @@ -197,29 +203,51 @@ public byte[] getData(long offset) throws IOException { (data[2] << 8) & 0x0000ff00 | (data[3]) & 0x000000ff; - // We have either managed to read enough data and can return the required subset of the data, or we have read - // too little so we need to execute another read to get the correct data. - if (dataLength <= data.length - 4) { + // If the data length is larger than 750MB, we are likely reading the wrong data. Probably data corruption. The + // limit of 750MB was chosen based on results from experimenting in the NativeSailStoreCorruptionTest class. + if (dataLength > 128 * 1024 * 1024) { + if (SOFT_FAIL_ON_CORRUPT_DATA_AND_REPAIR_INDEXES) { + logger.error( + "Data length is {}MB which is larger than 750MB. This is likely data corruption. Truncating length to 32 MB.", + dataLength / ((1024 * 1024))); + dataLength = 32 * 1024 * 1024; + } + } - // adjust the approximate average with 1 part actual length and 99 parts previous average up to a sensible - // max of 200 - dataLengthApproximateAverage = (int) (Math.min(200, - ((dataLengthApproximateAverage / 100.0) * 99) + (dataLength / 100.0))); + try { - return Arrays.copyOfRange(data, 4, dataLength + 4); + // We have either managed to read enough data and can return the required subset of the data, or we have + // read + // too little so we need to execute another read to get the correct data. + if (dataLength <= data.length - 4) { - } else { + // adjust the approximate average with 1 part actual length and 99 parts previous average up to a + // sensible + // max of 200 + dataLengthApproximateAverage = (int) (Math.min(200, + ((dataLengthApproximateAverage / 100.0) * 99) + (dataLength / 100.0))); - // adjust the approximate average, but favour the actual dataLength since dataLength predictions misses are - // costly - dataLengthApproximateAverage = Math.min(200, (dataLengthApproximateAverage + dataLength) / 2); + return Arrays.copyOfRange(data, 4, dataLength + 4); - // we didn't read enough data so we need to execute a new read - data = new byte[dataLength]; - buf = ByteBuffer.wrap(data); - nioFile.read(buf, offset + 4L); + } else { - return data; + // adjust the approximate average, but favour the actual dataLength since dataLength predictions misses + // are costly + dataLengthApproximateAverage = Math.min(200, (dataLengthApproximateAverage + dataLength) / 2); + + // we didn't read enough data so we need to execute a new read + data = new byte[dataLength]; + buf = ByteBuffer.wrap(data); + nioFile.read(buf, offset + 4L); + + return data; + } + } catch (OutOfMemoryError e) { + if (dataLength > 128 * 1024 * 1024) { + logger.error( + "Trying to read large amounts of data may be a sign of data corruption. Consider setting the system property org.eclipse.rdf4j.sail.nativerdf.softFailOnCorruptDataAndRepairIndexes to true"); + } + throw e; } } diff --git a/core/sail/nativerdf/src/main/java/org/eclipse/rdf4j/sail/nativerdf/model/CorruptIRI.java b/core/sail/nativerdf/src/main/java/org/eclipse/rdf4j/sail/nativerdf/model/CorruptIRI.java new file mode 100644 index 00000000000..71816d29e4d --- /dev/null +++ b/core/sail/nativerdf/src/main/java/org/eclipse/rdf4j/sail/nativerdf/model/CorruptIRI.java @@ -0,0 +1,97 @@ +/******************************************************************************* + * Copyright (c) 2024 Eclipse RDF4J contributors. + * + * All rights reserved. This program and the accompanying materials + * are made available under the terms of the Eclipse Distribution License v1.0 + * which accompanies this distribution, and is available at + * http://www.eclipse.org/org/documents/edl-v10.php. + * + * SPDX-License-Identifier: BSD-3-Clause + ******************************************************************************/ + +package org.eclipse.rdf4j.sail.nativerdf.model; + +import java.nio.charset.StandardCharsets; + +import org.apache.commons.codec.binary.Hex; +import org.eclipse.rdf4j.model.IRI; +import org.eclipse.rdf4j.sail.nativerdf.NativeStore; +import org.eclipse.rdf4j.sail.nativerdf.ValueStoreRevision; + +import com.google.common.net.UrlEscapers; + +/** + * CorruptIRI is used when a NativeValue cannot be read from the ValueStore and if soft failure is enabled + * + * @see NativeStore#SOFT_FAIL_ON_CORRUPT_DATA_AND_REPAIR_INDEXES . + * + * @author Håvard M. Ottestad + */ +public class CorruptIRI extends CorruptValue implements IRI { + + private static final long serialVersionUID = -6995615243794525852L; + private final String namespace; + + public CorruptIRI(ValueStoreRevision revision, int internalID, String namespace, byte[] data) { + super(revision, internalID, data); + this.namespace = namespace; + } + + @Override + public String toString() { + return stringValue(); + } + + public String stringValue() { + try { + return getNamespace() + ":" + getLocalName(); + } catch (Throwable ignored) { + } + + return "CorruptIRI_with_ID_" + getInternalID(); + } + + @Override + public String getNamespace() { + if (namespace != null && !namespace.isEmpty()) { + return namespace; + } + return "urn:CorruptIRI:"; + } + + @Override + public String getLocalName() { + byte[] data = getData(); + if (data != null && data.length < 1024) { + try { + String localName = new String(data, 5, data.length - 5, StandardCharsets.UTF_8); + return "CORRUPT_" + UrlEscapers.urlPathSegmentEscaper().escape(localName); + } catch (Throwable ignored) { + } + + return "CORRUPT_" + Hex.encodeHexString(data); + } + + return "CORRUPT"; + } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + + if (o instanceof CorruptIRI && getInternalID() != NativeValue.UNKNOWN_ID) { + CorruptIRI otherCorruptValue = (CorruptIRI) o; + + if (otherCorruptValue.getInternalID() != NativeValue.UNKNOWN_ID + && getValueStoreRevision().equals(otherCorruptValue.getValueStoreRevision())) { + // CorruptValue is from the same revision of the same native store with both IDs set + return getInternalID() == otherCorruptValue.getInternalID(); + } + } + + return super.equals(o); + } + +} diff --git a/core/sail/nativerdf/src/main/java/org/eclipse/rdf4j/sail/nativerdf/model/CorruptIRIOrBNode.java b/core/sail/nativerdf/src/main/java/org/eclipse/rdf4j/sail/nativerdf/model/CorruptIRIOrBNode.java new file mode 100644 index 00000000000..83cdb9e6658 --- /dev/null +++ b/core/sail/nativerdf/src/main/java/org/eclipse/rdf4j/sail/nativerdf/model/CorruptIRIOrBNode.java @@ -0,0 +1,98 @@ +/******************************************************************************* + * Copyright (c) 2024 Eclipse RDF4J contributors. + * + * All rights reserved. This program and the accompanying materials + * are made available under the terms of the Eclipse Distribution License v1.0 + * which accompanies this distribution, and is available at + * http://www.eclipse.org/org/documents/edl-v10.php. + * + * SPDX-License-Identifier: BSD-3-Clause + ******************************************************************************/ + +package org.eclipse.rdf4j.sail.nativerdf.model; + +import java.nio.charset.StandardCharsets; + +import org.apache.commons.codec.binary.Hex; +import org.eclipse.rdf4j.model.BNode; +import org.eclipse.rdf4j.model.IRI; +import org.eclipse.rdf4j.sail.nativerdf.NativeStore; +import org.eclipse.rdf4j.sail.nativerdf.ValueStoreRevision; + +import com.google.common.net.UrlEscapers; + +/** + * CorruptIRIOrBNode is used when a NativeValue cannot be read from the ValueStore and if soft failure is enabled + * + * @see NativeStore#SOFT_FAIL_ON_CORRUPT_DATA_AND_REPAIR_INDEXES . + * + * @author Håvard M. Ottestad + */ +public class CorruptIRIOrBNode extends CorruptValue implements IRI, BNode { + + private static final long serialVersionUID = 3709784393454516043L; + + public CorruptIRIOrBNode(ValueStoreRevision revision, int internalID, byte[] data) { + super(revision, internalID, data); + } + + @Override + public String toString() { + return stringValue(); + } + + public String stringValue() { + try { + return getNamespace() + ":" + getLocalName(); + } catch (Throwable ignored) { + } + + return "CorruptIRIOrBNode_with_ID_" + getInternalID(); + } + + @Override + public String getNamespace() { + return "urn:CorruptIRIOrBNode:"; + } + + @Override + public String getLocalName() { + byte[] data = getData(); + if (data != null && data.length < 1024) { + try { + String localName = new String(data, 5, data.length - 5, StandardCharsets.UTF_8); + return "CORRUPT_" + UrlEscapers.urlPathSegmentEscaper().escape(localName); + } catch (Throwable ignored) { + } + + return "CORRUPT_" + Hex.encodeHexString(data); + } + + return "CORRUPT"; + } + + @Override + public String getID() { + return ""; + } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + + if (o instanceof CorruptIRIOrBNode && getInternalID() != NativeValue.UNKNOWN_ID) { + CorruptIRIOrBNode otherCorruptValue = (CorruptIRIOrBNode) o; + + if (otherCorruptValue.getInternalID() != NativeValue.UNKNOWN_ID + && getValueStoreRevision().equals(otherCorruptValue.getValueStoreRevision())) { + // CorruptValue is from the same revision of the same native store with both IDs set + return getInternalID() == otherCorruptValue.getInternalID(); + } + } + + return super.equals(o); + } + +} diff --git a/core/sail/nativerdf/src/main/java/org/eclipse/rdf4j/sail/nativerdf/model/CorruptLiteral.java b/core/sail/nativerdf/src/main/java/org/eclipse/rdf4j/sail/nativerdf/model/CorruptLiteral.java new file mode 100644 index 00000000000..eb6b2587c25 --- /dev/null +++ b/core/sail/nativerdf/src/main/java/org/eclipse/rdf4j/sail/nativerdf/model/CorruptLiteral.java @@ -0,0 +1,146 @@ +/******************************************************************************* + * Copyright (c) 2024 Eclipse RDF4J contributors. + * + * All rights reserved. This program and the accompanying materials + * are made available under the terms of the Eclipse Distribution License v1.0 + * which accompanies this distribution, and is available at + * http://www.eclipse.org/org/documents/edl-v10.php. + * + * SPDX-License-Identifier: BSD-3-Clause + ******************************************************************************/ + +package org.eclipse.rdf4j.sail.nativerdf.model; + +import java.math.BigDecimal; +import java.math.BigInteger; +import java.nio.charset.StandardCharsets; +import java.util.Optional; + +import javax.xml.datatype.XMLGregorianCalendar; + +import org.eclipse.rdf4j.model.IRI; +import org.eclipse.rdf4j.model.Literal; +import org.eclipse.rdf4j.model.base.CoreDatatype; +import org.eclipse.rdf4j.model.util.Values; +import org.eclipse.rdf4j.sail.nativerdf.NativeStore; +import org.eclipse.rdf4j.sail.nativerdf.ValueStoreRevision; + +/** + * CorruptLiteral is used when a NativeValue cannot be read from the ValueStore and if soft failure is enabled + * + * @see NativeStore#SOFT_FAIL_ON_CORRUPT_DATA_AND_REPAIR_INDEXES . + * + * @author Håvard M. Ottestad + */ +public class CorruptLiteral extends CorruptValue implements Literal { + + private static final long serialVersionUID = -2510885288827542623L; + + private static final IRI CORRUPT = Values.iri("urn:corrupt"); + + public CorruptLiteral(ValueStoreRevision revision, int internalID, byte[] data) { + super(revision, internalID, data); + } + + public String stringValue() { + return "CorruptLiteral_with_ID_" + getInternalID(); + } + + @Override + public String getLabel() { + byte[] data = getData(); + try { + if (data != null && data.length < 1024) { + return "CorruptUnknownValue with ID " + getInternalID() + " with possible data: " + + new String(data, StandardCharsets.UTF_8); + } + } catch (Throwable ignored) { + } + return "CorruptUnknownValue_with_ID_" + getInternalID(); + } + + @Override + public Optional getLanguage() { + return Optional.empty(); + } + + @Override + public IRI getDatatype() { + return CORRUPT; + } + + @Override + public boolean booleanValue() { + return false; + } + + @Override + public byte byteValue() { + return 0; + } + + @Override + public short shortValue() { + return 0; + } + + @Override + public int intValue() { + return 0; + } + + @Override + public long longValue() { + return 0; + } + + @Override + public BigInteger integerValue() { + return null; + } + + @Override + public BigDecimal decimalValue() { + return null; + } + + @Override + public float floatValue() { + return 0; + } + + @Override + public double doubleValue() { + return 0; + } + + @Override + public XMLGregorianCalendar calendarValue() { + return null; + } + + @Override + public CoreDatatype getCoreDatatype() { + return null; + } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + + if (o instanceof CorruptLiteral && getInternalID() != NativeValue.UNKNOWN_ID) { + CorruptLiteral otherCorruptValue = (CorruptLiteral) o; + + if (otherCorruptValue.getInternalID() != NativeValue.UNKNOWN_ID + && getValueStoreRevision().equals(otherCorruptValue.getValueStoreRevision())) { + // CorruptValue is from the same revision of the same native store with both IDs set + return getInternalID() == otherCorruptValue.getInternalID(); + } + } + + return super.equals(o); + } + +} diff --git a/core/sail/nativerdf/src/main/java/org/eclipse/rdf4j/sail/nativerdf/model/CorruptUnknownValue.java b/core/sail/nativerdf/src/main/java/org/eclipse/rdf4j/sail/nativerdf/model/CorruptUnknownValue.java new file mode 100644 index 00000000000..ea200b55fa5 --- /dev/null +++ b/core/sail/nativerdf/src/main/java/org/eclipse/rdf4j/sail/nativerdf/model/CorruptUnknownValue.java @@ -0,0 +1,140 @@ +/******************************************************************************* + * Copyright (c) 2024 Eclipse RDF4J contributors. + * + * All rights reserved. This program and the accompanying materials + * are made available under the terms of the Eclipse Distribution License v1.0 + * which accompanies this distribution, and is available at + * http://www.eclipse.org/org/documents/edl-v10.php. + * + * SPDX-License-Identifier: BSD-3-Clause + ******************************************************************************/ + +package org.eclipse.rdf4j.sail.nativerdf.model; + +import java.math.BigDecimal; +import java.math.BigInteger; +import java.nio.charset.StandardCharsets; +import java.util.Optional; + +import javax.xml.datatype.XMLGregorianCalendar; + +import org.eclipse.rdf4j.model.IRI; +import org.eclipse.rdf4j.model.Literal; +import org.eclipse.rdf4j.model.base.CoreDatatype; +import org.eclipse.rdf4j.model.vocabulary.XSD; +import org.eclipse.rdf4j.sail.nativerdf.NativeStore; +import org.eclipse.rdf4j.sail.nativerdf.ValueStoreRevision; + +/** + * CorruptUnknownValue is used when a NativeValue cannot be read from the ValueStore and if soft failure is enabled + * + * @see NativeStore#SOFT_FAIL_ON_CORRUPT_DATA_AND_REPAIR_INDEXES . + * + * @author Håvard M. Ottestad + */ +public class CorruptUnknownValue extends CorruptValue implements Literal { + + private static final long serialVersionUID = -6650510290226676279L; + + public CorruptUnknownValue(ValueStoreRevision revision, int internalID, byte[] data) { + super(revision, internalID, data); + } + + @Override + public String getLabel() { + byte[] data = getData(); + try { + if (data != null && data.length < 1024) { + return "CorruptUnknownValue with ID " + getInternalID() + " with possible data: " + + new String(data, StandardCharsets.UTF_8); + } + } catch (Throwable ignored) { + } + return "CorruptUnknownValue_with_ID_" + getInternalID(); + } + + @Override + public Optional getLanguage() { + return Optional.empty(); + } + + @Override + public IRI getDatatype() { + return XSD.STRING; + } + + @Override + public boolean booleanValue() { + return false; + } + + @Override + public byte byteValue() { + return 0; + } + + @Override + public short shortValue() { + return 0; + } + + @Override + public int intValue() { + return 0; + } + + @Override + public long longValue() { + return 0; + } + + @Override + public BigInteger integerValue() { + return null; + } + + @Override + public BigDecimal decimalValue() { + return null; + } + + @Override + public float floatValue() { + return 0; + } + + @Override + public double doubleValue() { + return 0; + } + + @Override + public XMLGregorianCalendar calendarValue() { + return null; + } + + @Override + public CoreDatatype getCoreDatatype() { + return null; + } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + + if (o instanceof CorruptUnknownValue && getInternalID() != NativeValue.UNKNOWN_ID) { + CorruptUnknownValue otherCorruptValue = (CorruptUnknownValue) o; + + if (otherCorruptValue.getInternalID() != NativeValue.UNKNOWN_ID + && getValueStoreRevision().equals(otherCorruptValue.getValueStoreRevision())) { + // CorruptValue is from the same revision of the same native store with both IDs set + return getInternalID() == otherCorruptValue.getInternalID(); + } + } + + return super.equals(o); + } + +} diff --git a/core/sail/nativerdf/src/main/java/org/eclipse/rdf4j/sail/nativerdf/model/CorruptValue.java b/core/sail/nativerdf/src/main/java/org/eclipse/rdf4j/sail/nativerdf/model/CorruptValue.java new file mode 100644 index 00000000000..94028b5c579 --- /dev/null +++ b/core/sail/nativerdf/src/main/java/org/eclipse/rdf4j/sail/nativerdf/model/CorruptValue.java @@ -0,0 +1,89 @@ +/******************************************************************************* + * Copyright (c) 2024 Eclipse RDF4J contributors. + * + * All rights reserved. This program and the accompanying materials + * are made available under the terms of the Eclipse Distribution License v1.0 + * which accompanies this distribution, and is available at + * http://www.eclipse.org/org/documents/edl-v10.php. + * + * SPDX-License-Identifier: BSD-3-Clause + ******************************************************************************/ + +package org.eclipse.rdf4j.sail.nativerdf.model; + +import org.eclipse.rdf4j.sail.nativerdf.NativeStore; +import org.eclipse.rdf4j.sail.nativerdf.ValueStoreRevision; + +/** + * CorruptValue is used when a NativeValue cannot be read from the ValueStore and if soft failure is enabled + * + * @see NativeStore#SOFT_FAIL_ON_CORRUPT_DATA_AND_REPAIR_INDEXES . + *

+ * There is no method isCorruptValue() as it would exist for a "regular" implementation of NativeValue. Since + * CorruptValue is only to be used in exceptional situations, the recommended way of checking for it is using + * "instanceof". + * + * @author Hannes Ebner + */ +public class CorruptValue implements NativeValue { + + private static final long serialVersionUID = 8829067881854394802L; + + private final byte[] data; + private volatile ValueStoreRevision revision; + private volatile int internalID; + + public CorruptValue(ValueStoreRevision revision, int internalID, byte[] data) { + setInternalID(internalID, revision); + this.data = data; + } + + @Override + public void setInternalID(int internalID, ValueStoreRevision revision) { + this.internalID = internalID; + this.revision = revision; + } + + @Override + public ValueStoreRevision getValueStoreRevision() { + return revision; + } + + @Override + public int getInternalID() { + return internalID; + } + + public String stringValue() { + return "CorruptValue_with_ID_" + internalID; + } + + /** + * Returns the bytes that were read from the ValueStore for this value's internalID. Since the value is corrupt the + * data may be null or an empty array. + * + * @return null, empty array or corrupt data + */ + public byte[] getData() { + return data; + } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + + if (o instanceof CorruptValue && internalID != NativeValue.UNKNOWN_ID) { + CorruptValue otherCorruptValue = (CorruptValue) o; + + if (otherCorruptValue.internalID != NativeValue.UNKNOWN_ID && revision.equals(otherCorruptValue.revision)) { + // CorruptValue is from the same revision of the same native store with both IDs set + return internalID == otherCorruptValue.internalID; + } + } + + return super.equals(o); + } + +} diff --git a/core/sail/nativerdf/src/test/java/org/eclipse/rdf4j/sail/nativerdf/NativeSailStoreCorruptionTest.java b/core/sail/nativerdf/src/test/java/org/eclipse/rdf4j/sail/nativerdf/NativeSailStoreCorruptionTest.java new file mode 100644 index 00000000000..12119ceb50b --- /dev/null +++ b/core/sail/nativerdf/src/test/java/org/eclipse/rdf4j/sail/nativerdf/NativeSailStoreCorruptionTest.java @@ -0,0 +1,397 @@ +/******************************************************************************* + * Copyright (c) 2024 Eclipse RDF4J contributors. + * + * All rights reserved. This program and the accompanying materials + * are made available under the terms of the Eclipse Distribution License v1.0 + * which accompanies this distribution, and is available at + * http://www.eclipse.org/org/documents/edl-v10.php. + * + * SPDX-License-Identifier: BSD-3-Clause + ******************************************************************************/ + +package org.eclipse.rdf4j.sail.nativerdf; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +import java.io.File; +import java.io.IOException; +import java.io.RandomAccessFile; +import java.io.StringWriter; +import java.nio.file.Files; +import java.nio.file.StandardCopyOption; +import java.util.ArrayList; +import java.util.List; + +import org.eclipse.rdf4j.model.IRI; +import org.eclipse.rdf4j.model.Statement; +import org.eclipse.rdf4j.model.ValueFactory; +import org.eclipse.rdf4j.model.impl.SimpleValueFactory; +import org.eclipse.rdf4j.model.util.Values; +import org.eclipse.rdf4j.model.vocabulary.RDF; +import org.eclipse.rdf4j.model.vocabulary.RDFS; +import org.eclipse.rdf4j.repository.Repository; +import org.eclipse.rdf4j.repository.RepositoryConnection; +import org.eclipse.rdf4j.repository.RepositoryResult; +import org.eclipse.rdf4j.repository.sail.SailRepository; +import org.eclipse.rdf4j.rio.RDFFormat; +import org.eclipse.rdf4j.rio.RDFWriter; +import org.eclipse.rdf4j.rio.Rio; +import org.jetbrains.annotations.NotNull; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Tests how the NativeStore handles corruption in the data files. + */ +public class NativeSailStoreCorruptionTest { + + private static final Logger logger = LoggerFactory.getLogger(NativeSailStoreCorruptionTest.class); + + @TempDir + File tempFolder; + + protected Repository repo; + + protected final ValueFactory F = SimpleValueFactory.getInstance(); + + private File dataDir; + + @BeforeEach + public void before() throws IOException { + this.dataDir = new File(tempFolder, "dbmodel"); + dataDir.mkdir(); + repo = new SailRepository(new NativeStore(dataDir, "spoc,posc")); + repo.init(); + + IRI CTX_1 = F.createIRI("urn:one"); + IRI CTX_2 = F.createIRI("urn:two"); + + Statement S0 = F.createStatement(F.createIRI("http://example.org/a0"), RDFS.LABEL, F.createLiteral("zero")); + Statement S1 = F.createStatement(F.createIRI("http://example.org/b1"), RDFS.LABEL, F.createLiteral("one")); + Statement S2 = F.createStatement(F.createIRI("http://example.org/c2"), RDFS.LABEL, F.createLiteral("two")); + Statement S3 = F.createStatement(Values.bnode(), RDF.TYPE, Values.bnode()); + Statement S4 = F.createStatement(F.createIRI("http://example.org/c2"), RDFS.LABEL, + F.createLiteral("two", "en")); + Statement S5 = F.createStatement(F.createIRI("http://example.org/c2"), RDFS.LABEL, F.createLiteral(1.2)); + + try (RepositoryConnection conn = repo.getConnection()) { + conn.add(S0); + conn.add(S1, CTX_1); + conn.add(S2, CTX_2); + conn.add(S2, CTX_2); + conn.add(S3, CTX_2); + conn.add(S4, CTX_2); + conn.add(S5, CTX_2); + } + backupFile(dataDir, "values.dat"); + backupFile(dataDir, "values.id"); + backupFile(dataDir, "values.hash"); + backupFile(dataDir, "namespaces.dat"); + backupFile(dataDir, "contexts.dat"); + backupFile(dataDir, "triples-posc.alloc"); + backupFile(dataDir, "triples-posc.dat"); + backupFile(dataDir, "triples-spoc.alloc"); + backupFile(dataDir, "triples-spoc.dat"); + + NativeStore.SOFT_FAIL_ON_CORRUPT_DATA_AND_REPAIR_INDEXES = true; + + } + + public static void overwriteByteInFile(File valuesFile, long pos, int newVal) throws IOException { + + // Use RandomAccessFile in "rw" mode to read and write to the file + try (RandomAccessFile raf = new RandomAccessFile(valuesFile, "rw")) { + // Get the length of the file + long fileLength = raf.length(); + + // Check if the position is within the file bounds + if (pos >= fileLength) { + throw new IOException( + "Attempt to write outside the existing file bounds: " + pos + " >= " + fileLength); + } + + // Move the file pointer to byte position 32 + raf.seek(pos); + + // Write the byte value 0x0 at the current position + raf.writeByte(newVal); + } + } + + public static void backupFile(File dataDir, String s) throws IOException { + File valuesFile = new File(dataDir, s); + File backupFile = new File(dataDir, s + ".bak"); + + if (!valuesFile.exists()) { + throw new IOException(s + " does not exist and cannot be backed up."); + } + + // Copy values.dat to values.dat.bak + Files.copy(valuesFile.toPath(), backupFile.toPath(), StandardCopyOption.REPLACE_EXISTING); + } + + public static void restoreFile(File dataDir, String s) throws IOException { + File valuesFile = new File(dataDir, s); + File backupFile = new File(dataDir, s + ".bak"); + + if (!backupFile.exists()) { + throw new IOException("Backup file " + s + ".bak does not exist."); + } + + // Copy values.dat.bak back to values.dat + Files.copy(backupFile.toPath(), valuesFile.toPath(), StandardCopyOption.REPLACE_EXISTING); + } + + @Test + public void testCorruptValuesDatFileNamespace() throws IOException { + repo.shutDown(); + + overwriteByteInFile(new File(dataDir, "values.dat"), 12, 0x0); + + repo.init(); + + List list = getStatements(); + assertEquals(6, list.size()); + } + + @Test + public void testCorruptValuesDatFileNamespaceDatatype() throws IOException { + repo.shutDown(); + + overwriteByteInFile(new File(dataDir, "values.dat"), 96, 0x0); + + repo.init(); + + List list = getStatements(); + assertEquals(6, list.size()); + } + + @Test + public void testCorruptValuesDatFileEmptyDataArrayError() throws IOException { + repo.shutDown(); + + overwriteByteInFile(new File(dataDir, "values.dat"), 173, 0x0); + + repo.init(); + + List list = getStatements(); + assertEquals(6, list.size()); + } + + @Test + public void testCorruptValuesDatFileInvalidTypeError() throws IOException { + repo.shutDown(); + + overwriteByteInFile(new File(dataDir, "values.dat"), 174, 0x0); + + repo.init(); + + List list = getStatements(); + assertEquals(6, list.size()); + } + + @Test + public void testCorruptValuesDatFileEntireValuesDatFile() throws IOException { + for (int i = 4; i < 437; i++) { + logger.debug("Corrupting byte at position " + i); + repo.shutDown(); + restoreFile(dataDir, "values.dat"); + + overwriteByteInFile(new File(dataDir, "values.dat"), i, 0x0); + + repo.init(); + + List list = getStatements(); + assertEquals(6, list.size()); + } + } + + @Test + public void testCorruptLastByteOfValuesDatFile() throws IOException { + repo.shutDown(); + File valuesFile = new File(dataDir, "values.dat"); + long fileSize = valuesFile.length(); + + overwriteByteInFile(valuesFile, fileSize - 1, 0x0); + + repo.init(); + + List list = getStatements(); + assertEquals(6, list.size()); + } + + @Test + public void testCorruptValuesIdFile() throws IOException { + repo.shutDown(); + File valuesIdFile = new File(dataDir, "values.id"); + long fileSize = valuesIdFile.length(); + + for (long i = 4; i < fileSize; i++) { + restoreFile(dataDir, "values.id"); + overwriteByteInFile(valuesIdFile, i, 0x0); + repo.init(); + List list = getStatements(); + assertEquals(6, list.size(), "Failed at byte position " + i); + repo.shutDown(); + } + } + + @Test + public void testCorruptValuesHashFile() throws IOException { + repo.shutDown(); + String file = "values.hash"; + File nativeStoreFile = new File(dataDir, file); + long fileSize = nativeStoreFile.length(); + + for (long i = 4; i < fileSize; i++) { + restoreFile(dataDir, file); + overwriteByteInFile(nativeStoreFile, i, 0x0); + repo.init(); + List list = getStatements(); + assertEquals(6, list.size(), "Failed at byte position " + i); + repo.shutDown(); + } + } + + @Test + public void testCorruptValuesNamespacesFile() throws IOException { + repo.shutDown(); + String file = "namespaces.dat"; + File nativeStoreFile = new File(dataDir, file); + long fileSize = nativeStoreFile.length(); + + for (long i = 4; i < fileSize; i++) { + restoreFile(dataDir, file); + overwriteByteInFile(nativeStoreFile, i, 0x0); + repo.init(); + List list = getStatements(); + assertEquals(6, list.size(), "Failed at byte position " + i); + repo.shutDown(); + } + } + + @Test + public void testCorruptValuesContextsFile() throws IOException { + repo.shutDown(); + String file = "contexts.dat"; + File nativeStoreFile = new File(dataDir, file); + long fileSize = nativeStoreFile.length(); + + for (long i = 4; i < fileSize; i++) { + restoreFile(dataDir, file); + overwriteByteInFile(nativeStoreFile, i, 0x0); + repo.init(); + List list = getStatements(); + assertEquals(6, list.size(), "Failed at byte position " + i); + repo.shutDown(); + } + } + + @Test + public void testCorruptValuesPoscAllocFile() throws IOException { + repo.shutDown(); + String file = "triples-posc.alloc"; + File nativeStoreFile = new File(dataDir, file); + long fileSize = nativeStoreFile.length(); + + for (long i = 4; i < fileSize; i++) { + restoreFile(dataDir, file); + overwriteByteInFile(nativeStoreFile, i, 0x0); + repo.init(); + List list = getStatements(); + assertEquals(6, list.size(), "Failed at byte position " + i); + repo.shutDown(); + } + } + + @Test + public void testCorruptValuesPoscDataFile() throws IOException { + repo.shutDown(); + String file = "triples-posc.dat"; + File nativeStoreFile = new File(dataDir, file); + long fileSize = nativeStoreFile.length(); + + for (long i = 4; i < fileSize; i++) { + NativeStore.SOFT_FAIL_ON_CORRUPT_DATA_AND_REPAIR_INDEXES = true; + restoreFile(dataDir, file); + overwriteByteInFile(nativeStoreFile, i, 0x0); + repo.init(); + List list = getStatements(); + assertEquals(6, list.size(), "Failed at byte position " + i); + repo.shutDown(); + } + } + + @Test + public void testCorruptValuesSpocAllocFile() throws IOException { + repo.shutDown(); + String file = "triples-spoc.alloc"; + File nativeStoreFile = new File(dataDir, file); + long fileSize = nativeStoreFile.length(); + + for (long i = 4; i < fileSize; i++) { + restoreFile(dataDir, file); + overwriteByteInFile(nativeStoreFile, i, 0x0); + repo.init(); + List list = getStatements(); + assertEquals(6, list.size(), "Failed at byte position " + i); + repo.shutDown(); + } + } + + @Test + public void testCorruptValuesSpocDataFile() throws IOException { + repo.shutDown(); + String file = "triples-spoc.dat"; + File nativeStoreFile = new File(dataDir, file); + long fileSize = nativeStoreFile.length(); + + for (long i = 4; i < fileSize; i++) { + restoreFile(dataDir, file); + overwriteByteInFile(nativeStoreFile, i, 0x0); + repo.init(); + try { + List list = getStatements(); + assertEquals(6, list.size(), "Failed at byte position " + i); + } catch (Throwable ignored) { + repo.shutDown(); + nativeStoreFile.delete(); + repo.init(); + List list = getStatements(); + assertEquals(6, list.size(), "Failed at byte position " + i); + } + + repo.shutDown(); + } + } + + @NotNull + private List getStatements() { + List list = new ArrayList<>(); + + try (RepositoryConnection conn = repo.getConnection()) { + StringWriter stringWriter = new StringWriter(); + RDFWriter writer = Rio.createWriter(RDFFormat.NQUADS, stringWriter); + conn.export(writer); + logger.debug(stringWriter.toString()); + try (RepositoryResult statements = conn.getStatements(null, null, null, false)) { + while (statements.hasNext()) { + Statement next = statements.next(); + list.add(next); + logger.debug(next.toString()); + } + } + return list; + } + } + + @AfterEach + public void after() throws IOException { + NativeStore.SOFT_FAIL_ON_CORRUPT_DATA_AND_REPAIR_INDEXES = false; + repo.shutDown(); + } +} diff --git a/core/sail/shacl/src/test/java/org/eclipse/rdf4j/sail/shacl/MultithreadedTest.java b/core/sail/shacl/src/test/java/org/eclipse/rdf4j/sail/shacl/MultithreadedTest.java index 89aea92cac9..be87b248131 100644 --- a/core/sail/shacl/src/test/java/org/eclipse/rdf4j/sail/shacl/MultithreadedTest.java +++ b/core/sail/shacl/src/test/java/org/eclipse/rdf4j/sail/shacl/MultithreadedTest.java @@ -46,6 +46,7 @@ import org.junit.jupiter.api.AfterAll; import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.Timeout; import org.junit.jupiter.api.parallel.Isolated; import org.slf4j.LoggerFactory; @@ -67,6 +68,7 @@ public static void afterAll() { } @Test + @Timeout(value = 30, unit = TimeUnit.MINUTES) public void testDataAndShapes() { System.out.println("testDataAndShapes"); @@ -339,6 +341,7 @@ private void remove(String turtle, IRI graph) { } @Test + @Timeout(value = 30, unit = TimeUnit.MINUTES) public void testLotsOfValidationFailuresSnapshot() throws IOException { System.out.println("testLotsOfValidationFailuresSnapshot"); ShaclSail sail = new ShaclSail(getBaseSail()); @@ -354,6 +357,7 @@ public void testLotsOfValidationFailuresSnapshot() throws IOException { } @Test + @Timeout(value = 30, unit = TimeUnit.MINUTES) public void testLotsOfValidationFailuresSerializableValidation() throws IOException { System.out.println("testLotsOfValidationFailuresSerializableValidation"); Logger root = (Logger) LoggerFactory.getLogger(ShaclSailBaseConfiguration.class.getName()); @@ -371,6 +375,7 @@ public void testLotsOfValidationFailuresSerializableValidation() throws IOExcept } @Test + @Timeout(value = 30, unit = TimeUnit.MINUTES) public void testLotsOfValidationFailuresSerializable() throws IOException { System.out.println("testLotsOfValidationFailuresSerializable"); @@ -389,6 +394,7 @@ public void testLotsOfValidationFailuresSerializable() throws IOException { } @Test + @Timeout(value = 30, unit = TimeUnit.MINUTES) public void testLotsOfValidationFailuresReadCommitted() throws IOException { System.out.println("testLotsOfValidationFailuresReadCommitted"); ShaclSail sail = new ShaclSail(getBaseSail()); @@ -403,6 +409,7 @@ public void testLotsOfValidationFailuresReadCommitted() throws IOException { } @Test + @Timeout(value = 30, unit = TimeUnit.MINUTES) public void testLotsOfValidationFailuresReadUncommitted() throws IOException { System.out.println("testLotsOfValidationFailuresReadUncommitted"); ShaclSail sail = new ShaclSail(getBaseSail()); diff --git a/site/content/documentation/programming/repository.md b/site/content/documentation/programming/repository.md index d1fa4f8324c..691a839c74e 100644 --- a/site/content/documentation/programming/repository.md +++ b/site/content/documentation/programming/repository.md @@ -98,6 +98,10 @@ import org.eclipse.rdf4j.sail.nativerdf.NativeStore; Repository repo = new SailRepository(new NativeStore()); ``` +In the unlikely event of corruption the system property `org.eclipse.rdf4j.sail.nativerdf.softFailOnCorruptDataAndRepairIndexes` can be set to `true` to +allow the NativeStore to output CorruptValue/CorruptIRI/CorruptIRIOrBNode/CorruptLiteral objects. Take a backup of all data before setting +this property as it allows the NativeStore to delete corrupt indexes in an attempt to recreate them. Consider this feature experimental and use with caution. + ### Elasticsearch RDF Repository {{< tag " New in RDF4J 3.1" >}} diff --git a/testsuites/sail/src/main/java/org/eclipse/rdf4j/testsuite/sail/SailConcurrencyTest.java b/testsuites/sail/src/main/java/org/eclipse/rdf4j/testsuite/sail/SailConcurrencyTest.java index d47793cc3ec..4ee406b0e7f 100644 --- a/testsuites/sail/src/main/java/org/eclipse/rdf4j/testsuite/sail/SailConcurrencyTest.java +++ b/testsuites/sail/src/main/java/org/eclipse/rdf4j/testsuite/sail/SailConcurrencyTest.java @@ -37,6 +37,7 @@ import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.Timeout; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -155,6 +156,7 @@ public int getSize() { * @see https://github.com/eclipse/rdf4j/issues/693 */ @Test + @Timeout(value = 30, unit = TimeUnit.MINUTES) public void testConcurrentAddLargeTxn() throws Exception { logger.info("executing two large concurrent transactions"); final CountDownLatch runnersDone = new CountDownLatch(2); @@ -196,6 +198,7 @@ public void testConcurrentAddLargeTxn() throws Exception { * one of the transactions rolls back at the end. */ @Test + @Timeout(value = 30, unit = TimeUnit.MINUTES) public void testConcurrentAddLargeTxnRollback() throws Exception { logger.info("executing two large concurrent transactions"); final CountDownLatch runnersDone = new CountDownLatch(2); @@ -237,6 +240,7 @@ public void testConcurrentAddLargeTxnRollback() throws Exception { } @Test + @Timeout(value = 30, unit = TimeUnit.MINUTES) @Disabled("This test takes a long time and accomplishes little extra") public void testGetContextIDs() throws Exception { // Create one thread which writes statements to the repository, on a @@ -314,6 +318,7 @@ public void testGetContextIDs() throws Exception { } @Test + @Timeout(value = 30, unit = TimeUnit.MINUTES) public void testConcurrentConnectionsShutdown() throws InterruptedException { if (store instanceof AbstractSail) { ((AbstractSail) store).setConnectionTimeOut(200); @@ -356,8 +361,9 @@ public void testConcurrentConnectionsShutdown() throws InterruptedException { } -// @Disabled + // @Disabled @Test + @Timeout(value = 30, unit = TimeUnit.MINUTES) public void testSerialThreads() throws InterruptedException { if (store instanceof AbstractSail) { ((AbstractSail) store).setConnectionTimeOut(200); @@ -438,6 +444,7 @@ public void testSerialThreads() throws InterruptedException { } @Test + @Timeout(value = 30, unit = TimeUnit.MINUTES) public void testConcurrentConnectionsShutdownReadCommitted() throws InterruptedException { if (store instanceof AbstractSail) { ((AbstractSail) store).setConnectionTimeOut(200); @@ -493,6 +500,7 @@ public void testConcurrentConnectionsShutdownReadCommitted() throws InterruptedE } @Test + @Timeout(value = 30, unit = TimeUnit.MINUTES) public void testConcurrentConnectionsShutdownAndClose() throws InterruptedException { if (store instanceof AbstractSail) { ((AbstractSail) store).setConnectionTimeOut(200); @@ -568,6 +576,7 @@ public void testConcurrentConnectionsShutdownAndClose() throws InterruptedExcept } @Test + @Timeout(value = 30, unit = TimeUnit.MINUTES) public void testConcurrentConnectionsShutdownAndCloseRollback() throws InterruptedException { if (store instanceof AbstractSail) { ((AbstractSail) store).setConnectionTimeOut(200);