From 8ae61db57b471d6a1e7f70ed6cd70c90eefbd5d7 Mon Sep 17 00:00:00 2001 From: Tim Hennekey Date: Tue, 14 Jan 2020 15:49:33 -0500 Subject: [PATCH] Replace A_TIMESTAMP with A_FETCH_BEGAN_TIME It seems that A_TIMESTAMP went out of favor quite a long time ago. FetchHistoryProcessor uses A_FETCH_BEGAN_TIME instead and, as the code stands, throws an exception when that key is not set. --- .../archive/modules/recrawl/FetchHistoryHelper.java | 11 ++++------- .../modules/recrawl/wbm/WbmPersistLoadProcessor.java | 6 +----- .../recrawl/wbm/WbmPersistLoadProcessorTest.java | 6 ++---- 3 files changed, 7 insertions(+), 16 deletions(-) diff --git a/contrib/src/main/java/org/archive/modules/recrawl/FetchHistoryHelper.java b/contrib/src/main/java/org/archive/modules/recrawl/FetchHistoryHelper.java index 4705a9f65..a761ddce5 100644 --- a/contrib/src/main/java/org/archive/modules/recrawl/FetchHistoryHelper.java +++ b/contrib/src/main/java/org/archive/modules/recrawl/FetchHistoryHelper.java @@ -27,6 +27,7 @@ import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; +import org.archive.modules.CoreAttributeConstants; import org.archive.modules.CrawlURI; /** @@ -37,10 +38,6 @@ */ public class FetchHistoryHelper { private static final Log logger = LogFactory.getLog(FetchHistoryHelper.class); - /** - * key for storing timestamp in crawl history map. - */ - public static final String A_TIMESTAMP = ".ts"; /** * returns a Map to store recrawl data, positioned properly in CrawlURI's @@ -64,10 +61,10 @@ public static Map getFetchHistory(CrawlURI uri, long timestamp, for (int i = 0; i < history.length; i++) { if (history[i] == null) { history[i] = new HashMap(); - history[i].put(A_TIMESTAMP, timestamp); + history[i].put(CoreAttributeConstants.A_FETCH_BEGAN_TIME, timestamp); return history[i]; } - Object ts = history[i].get(A_TIMESTAMP); + Object ts = history[i].get(CoreAttributeConstants.A_FETCH_BEGAN_TIME); // no timestamp value is regarded as older than anything. 
if (!(ts instanceof Long) || timestamp > (Long)ts) { if (i < history.length - 2) { @@ -76,7 +73,7 @@ public static Map getFetchHistory(CrawlURI uri, long timestamp, history[i + 1] = history[i]; } history[i] = new HashMap(); - history[i].put(A_TIMESTAMP, timestamp); + history[i].put(CoreAttributeConstants.A_FETCH_BEGAN_TIME, timestamp); return history[i]; } } diff --git a/contrib/src/main/java/org/archive/modules/recrawl/wbm/WbmPersistLoadProcessor.java b/contrib/src/main/java/org/archive/modules/recrawl/wbm/WbmPersistLoadProcessor.java index b3748c3f1..124fa75cc 100644 --- a/contrib/src/main/java/org/archive/modules/recrawl/wbm/WbmPersistLoadProcessor.java +++ b/contrib/src/main/java/org/archive/modules/recrawl/wbm/WbmPersistLoadProcessor.java @@ -455,7 +455,7 @@ protected ProcessResult innerProcessResult(CrawlURI curi) throws InterruptedExce } if (info != null) { Map history = FetchHistoryHelper.getFetchHistory(curi, - (Long)info.get(FetchHistoryHelper.A_TIMESTAMP), historyLength); + (Long)info.get(CoreAttributeConstants.A_FETCH_BEGAN_TIME), historyLength); if (history != null) history.putAll(info); loadedCount.incrementAndGet(); @@ -516,10 +516,6 @@ protected HashMap getLastCrawl(InputStream is) throws IOExceptio if (tsbuffer.remaining() == 0) { try { long ts = DateUtils.parse14DigitDate(new String(tsbuffer.array())).getTime(); - // A_TIMESTAMP has been used for sorting history long before A_FETCH_BEGAN_TIME - // field was introduced. Now FetchHistoryProcessor fails if A_FETCH_BEGAN_TIME is - // not set. We could stop storing A_TIMESTAMP and sort by A_FETCH_BEGAN_TIME. 
- info.put(FetchHistoryHelper.A_TIMESTAMP, ts); info.put(CoreAttributeConstants.A_FETCH_BEGAN_TIME, ts); } catch (ParseException ex) { } diff --git a/contrib/src/test/java/org/archive/modules/recrawl/wbm/WbmPersistLoadProcessorTest.java b/contrib/src/test/java/org/archive/modules/recrawl/wbm/WbmPersistLoadProcessorTest.java index 88d1c0cf9..3e8aece0c 100644 --- a/contrib/src/test/java/org/archive/modules/recrawl/wbm/WbmPersistLoadProcessorTest.java +++ b/contrib/src/test/java/org/archive/modules/recrawl/wbm/WbmPersistLoadProcessorTest.java @@ -25,7 +25,6 @@ import org.archive.modules.CoreAttributeConstants; import org.archive.modules.CrawlURI; import org.archive.modules.ProcessResult; -import org.archive.modules.recrawl.FetchHistoryHelper; import org.archive.modules.recrawl.FetchHistoryProcessor; import org.archive.modules.recrawl.RecrawlAttributeConstants; import org.archive.net.UURIFactory; @@ -122,12 +121,11 @@ public void _testInnerProcessResultSingleShotWithMock() throws Exception { final byte[] digestValue0 = sha1Digest("0"); final byte[] digestValue1 = sha1Digest("1"); fetchHistory[0] = new HashMap(); - fetchHistory[0].put(FetchHistoryHelper.A_TIMESTAMP, expected_ts + 2000); fetchHistory[0].put(CoreAttributeConstants.A_FETCH_BEGAN_TIME, expected_ts + 2000); fetchHistory[0].put(RecrawlAttributeConstants.A_CONTENT_DIGEST, CONTENT_DIGEST_SCHEME + Base32.encode(digestValue0)); fetchHistory[1] = new HashMap(); - fetchHistory[1].put(FetchHistoryHelper.A_TIMESTAMP, expected_ts - 2000); + fetchHistory[1].put(CoreAttributeConstants.A_FETCH_BEGAN_TIME, expected_ts - 2000); fetchHistory[1].put(RecrawlAttributeConstants.A_CONTENT_DIGEST, CONTENT_DIGEST_SCHEME + Base32.encode(digestValue1)); @@ -140,7 +138,7 @@ public void _testInnerProcessResultSingleShotWithMock() throws Exception { String hash = (String)history.get(RecrawlAttributeConstants.A_CONTENT_DIGEST); assertEquals("CONTENT_DIGEST", CONTENT_DIGEST_SCHEME+TestNormalHttpResponse.EXPECTED_HASH, hash); - Long ts = 
(Long)history.get(FetchHistoryHelper.A_TIMESTAMP); + Long ts = (Long)history.get(CoreAttributeConstants.A_FETCH_BEGAN_TIME); assertNotNull("ts is non-null", ts); assertEquals("'ts' has expected timestamp", expected_ts, ts.longValue());