From be29b86bc928993593a9a7d3a3d8c6ccbfa85e63 Mon Sep 17 00:00:00 2001 From: Thomas Egense Date: Sat, 23 Dec 2023 10:03:16 +0100 Subject: [PATCH 1/6] Reverting source_file_path back to value it was in 3.2. Now both source_file and source_file_path will be set --- warc-indexer/src/main/java/uk/bl/wa/indexer/WARCIndexer.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/warc-indexer/src/main/java/uk/bl/wa/indexer/WARCIndexer.java b/warc-indexer/src/main/java/uk/bl/wa/indexer/WARCIndexer.java index 78957c1e..62338ca8 100644 --- a/warc-indexer/src/main/java/uk/bl/wa/indexer/WARCIndexer.java +++ b/warc-indexer/src/main/java/uk/bl/wa/indexer/WARCIndexer.java @@ -514,7 +514,7 @@ private void processEnvelopeHeader(SolrRecord solr, ArchiveRecordHeader header, //Will convert windows path to linux path. Linux paths will not be modified. final String linuxFilePath = FilenameUtils.separatorsToUnix(filePath); - solr.setField(SolrFields.SOURCE_FILE, linuxFilePath); + solr.setField(SolrFields.SOURCE_FILE_PATH, linuxFilePath); byte[] url_md5digest = md5 .digest(Normalisation.sanitiseWARCHeaderValue(header.getUrl()).getBytes(StandardCharsets.UTF_8)); From ac4e36254d826b1717045368c7663f63d2129f91 Mon Sep 17 00:00:00 2001 From: Thomas Egense Date: Sat, 23 Dec 2023 10:14:16 +0100 Subject: [PATCH 2/6] changes added --- CHANGES.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/CHANGES.md b/CHANGES.md index 598c1ccf..6cbd536a 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,6 +1,12 @@ **NOTE** Generally, we only add terms to the Solr schema, so it should usually be compatible with previous versions (i.e. clients should be able to query across both without modification). However, there are been a small number of fixes which unfortunately required breaking changes you may need to be aware of or work-around. e.g. [hash becomes single-valued](https://github.com/ukwa/webarchive-discovery/issues/95)... TBA... 
+3.3.1 +----- + (UNRELEASED) + +source_file and source_file_path are not set correct as there were in version 3.2.0. + 3.3.0 ----- From efe0ad2ef4486c4bc040466cb373b10f7976b858 Mon Sep 17 00:00:00 2001 From: Thomas Egense Date: Sat, 23 Dec 2023 10:43:49 +0100 Subject: [PATCH 3/6] Added unittest for source_file and source_file_path, but the embedder-solr unittest is disabled. When fixed the unittest will be active. --- .../uk/bl/wa/indexer/WARCIndexerEmbeddedSolrTest.java | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/warc-indexer/src/test/java/uk/bl/wa/indexer/WARCIndexerEmbeddedSolrTest.java b/warc-indexer/src/test/java/uk/bl/wa/indexer/WARCIndexerEmbeddedSolrTest.java index c3a8ad81..cc68889e 100644 --- a/warc-indexer/src/test/java/uk/bl/wa/indexer/WARCIndexerEmbeddedSolrTest.java +++ b/warc-indexer/src/test/java/uk/bl/wa/indexer/WARCIndexerEmbeddedSolrTest.java @@ -28,6 +28,7 @@ */ import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; import java.io.File; import java.io.IOException; @@ -61,6 +62,7 @@ import uk.bl.wa.annotation.Annotations; import uk.bl.wa.annotation.AnnotationsTest; import uk.bl.wa.annotation.Annotator; +import uk.bl.wa.solr.SolrFields; import uk.bl.wa.solr.SolrRecord; import uk.bl.wa.util.Normalisation; @@ -191,6 +193,12 @@ public void testEmbeddedServer() throws SolrServerException, IOException, NoSuch } assertEquals(21L, response.getResults().getNumFound()); + //Test source_file and source_file_path + SolrDocument doc = response.getResults().get(0); + assertEquals(doc.getFieldValue(SolrFields.SOURCE_FILE),"flashfrozen-jwat-recompressed.warc.gz"); + String source_file_path = (String) doc.getFieldValue(SolrFields.SOURCE_FILE_PATH); + assertTrue(source_file_path.endsWith("wikipedia-mona-lisa/flashfrozen-jwat-recompressed.warc.gz")); //First part of the path depends on the project location. 
+ } } From 9f7e9105841a1aa64613cf39c8be0b9edd1b5947 Mon Sep 17 00:00:00 2001 From: Thomas Egense Date: Sun, 24 Dec 2023 09:59:23 +0100 Subject: [PATCH 4/6] Changed to debug. Some harvest tools generate a request record for every response record. The request record would result in this log. For WARC-files with 50.000 response records , it would give 50000 log lines. --- .../src/main/java/uk/bl/wa/indexer/WARCIndexerCommand.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/warc-indexer/src/main/java/uk/bl/wa/indexer/WARCIndexerCommand.java b/warc-indexer/src/main/java/uk/bl/wa/indexer/WARCIndexerCommand.java index bfb8e204..dcb3ad9f 100644 --- a/warc-indexer/src/main/java/uk/bl/wa/indexer/WARCIndexerCommand.java +++ b/warc-indexer/src/main/java/uk/bl/wa/indexer/WARCIndexerCommand.java @@ -244,7 +244,7 @@ public static void parseWarcFiles( WARCIndexerCommandOptions opts, boolean isTex recordCount++; } } else { - log.info("No document produced by record: " + type + " for url " + url + " from " + + log.debug("No document produced by record: " + type + " for url " + url + " from " + inFile.getName() + " @" + rec.getHeader().getOffset()); } } From c7873c9a60e7029b70c57a3836690699dd74fa34 Mon Sep 17 00:00:00 2001 From: Thomas Egense Date: Sun, 24 Dec 2023 10:02:03 +0100 Subject: [PATCH 5/6] Added comment --- .../src/main/java/uk/bl/wa/indexer/WARCIndexerCommand.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/warc-indexer/src/main/java/uk/bl/wa/indexer/WARCIndexerCommand.java b/warc-indexer/src/main/java/uk/bl/wa/indexer/WARCIndexerCommand.java index dcb3ad9f..f4255155 100644 --- a/warc-indexer/src/main/java/uk/bl/wa/indexer/WARCIndexerCommand.java +++ b/warc-indexer/src/main/java/uk/bl/wa/indexer/WARCIndexerCommand.java @@ -245,7 +245,7 @@ public static void parseWarcFiles( WARCIndexerCommandOptions opts, boolean isTex } } else { log.debug("No document produced by record: " + type + " for url " + url + " from " + - 
 inFile.getName() + " @" + rec.getHeader().getOffset()); + inFile.getName() + " @" + rec.getHeader().getOffset()); //All request records will log this. It is expected there is no document. } } docConsumer.endWARC(); From f98deaddfde179051ee3ba67adb3263b8111fc81 Mon Sep 17 00:00:00 2001 From: Thomas Egense Date: Tue, 26 Dec 2023 10:58:01 +0100 Subject: [PATCH 6/6] typo fix --- CHANGES.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGES.md b/CHANGES.md index 6cbd536a..dfea0a5d 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -5,7 +5,7 @@ ----- (UNRELEASED) -source_file and source_file_path are not set correct as there were in version 3.2.0. +source_file and source_file_path are again set correctly, as they were in version 3.2.0. 3.3.0 -----