diff --git a/CHANGES.md b/CHANGES.md index 598c1ccf..dfea0a5d 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,6 +1,12 @@ **NOTE** Generally, we only add terms to the Solr schema, so it should usually be compatible with previous versions (i.e. clients should be able to query across both without modification). However, there are been a small number of fixes which unfortunately required breaking changes you may need to be aware of or work-around. e.g. [hash becomes single-valued](https://github.com/ukwa/webarchive-discovery/issues/95)... TBA... +3.3.1 +----- + (UNRELEASED) + +source_file and source_file_path are again set correctly, as they were in version 3.2.0. + 3.3.0 ----- diff --git a/warc-indexer/src/main/java/uk/bl/wa/indexer/WARCIndexer.java b/warc-indexer/src/main/java/uk/bl/wa/indexer/WARCIndexer.java index 78957c1e..62338ca8 100644 --- a/warc-indexer/src/main/java/uk/bl/wa/indexer/WARCIndexer.java +++ b/warc-indexer/src/main/java/uk/bl/wa/indexer/WARCIndexer.java @@ -514,7 +514,7 @@ private void processEnvelopeHeader(SolrRecord solr, ArchiveRecordHeader header, //Will convert windows path to linux path. Linux paths will not be modified. 
final String linuxFilePath = FilenameUtils.separatorsToUnix(filePath); - solr.setField(SolrFields.SOURCE_FILE, linuxFilePath); + solr.setField(SolrFields.SOURCE_FILE_PATH, linuxFilePath); byte[] url_md5digest = md5 .digest(Normalisation.sanitiseWARCHeaderValue(header.getUrl()).getBytes(StandardCharsets.UTF_8)); diff --git a/warc-indexer/src/main/java/uk/bl/wa/indexer/WARCIndexerCommand.java b/warc-indexer/src/main/java/uk/bl/wa/indexer/WARCIndexerCommand.java index bfb8e204..f4255155 100644 --- a/warc-indexer/src/main/java/uk/bl/wa/indexer/WARCIndexerCommand.java +++ b/warc-indexer/src/main/java/uk/bl/wa/indexer/WARCIndexerCommand.java @@ -244,8 +244,8 @@ public static void parseWarcFiles( WARCIndexerCommandOptions opts, boolean isTex recordCount++; } } else { - log.info("No document produced by record: " + type + " for url " + url + " from " + - inFile.getName() + " @" + rec.getHeader().getOffset()); + log.debug("No document produced by record: " + type + " for url " + url + " from " + + inFile.getName() + " @" + rec.getHeader().getOffset()); //All request records will log this. It is expected there is no document. 
} } docConsumer.endWARC(); diff --git a/warc-indexer/src/test/java/uk/bl/wa/indexer/WARCIndexerEmbeddedSolrTest.java b/warc-indexer/src/test/java/uk/bl/wa/indexer/WARCIndexerEmbeddedSolrTest.java index c3a8ad81..cc68889e 100644 --- a/warc-indexer/src/test/java/uk/bl/wa/indexer/WARCIndexerEmbeddedSolrTest.java +++ b/warc-indexer/src/test/java/uk/bl/wa/indexer/WARCIndexerEmbeddedSolrTest.java @@ -28,6 +28,7 @@ */ import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; import java.io.File; import java.io.IOException; @@ -61,6 +62,7 @@ import uk.bl.wa.annotation.Annotations; import uk.bl.wa.annotation.AnnotationsTest; import uk.bl.wa.annotation.Annotator; +import uk.bl.wa.solr.SolrFields; import uk.bl.wa.solr.SolrRecord; import uk.bl.wa.util.Normalisation; @@ -191,6 +193,12 @@ public void testEmbeddedServer() throws SolrServerException, IOException, NoSuch } assertEquals(21L, response.getResults().getNumFound()); + //Test source_file and source_file_path + SolrDocument doc = response.getResults().get(0); + assertEquals(doc.getFieldValue(SolrFields.SOURCE_FILE),"flashfrozen-jwat-recompressed.warc.gz"); + String source_file_path = (String) doc.getFieldValue(SolrFields.SOURCE_FILE_PATH); + assertTrue(source_file_path.endsWith("wikipedia-mona-lisa/flashfrozen-jwat-recompressed.warc.gz")); //First part of the path depends on the project location. + } }