Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Replace A_TIMESTAMP with A_FETCH_BEGAN_TIME #292

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.archive.modules.CoreAttributeConstants;
import org.archive.modules.CrawlURI;

/**
Expand All @@ -37,10 +38,6 @@
*/
public class FetchHistoryHelper {
private static final Log logger = LogFactory.getLog(FetchHistoryHelper.class);
/**
* key for storing timestamp in crawl history map.
*/
public static final String A_TIMESTAMP = ".ts";

/**
* returns a Map to store recrawl data, positioned properly in CrawlURI's
Expand All @@ -64,10 +61,10 @@ public static Map<String, Object> getFetchHistory(CrawlURI uri, long timestamp,
for (int i = 0; i < history.length; i++) {
if (history[i] == null) {
history[i] = new HashMap<String, Object>();
history[i].put(A_TIMESTAMP, timestamp);
history[i].put(CoreAttributeConstants.A_FETCH_BEGAN_TIME, timestamp);
return history[i];
}
Object ts = history[i].get(A_TIMESTAMP);
Object ts = history[i].get(CoreAttributeConstants.A_FETCH_BEGAN_TIME);
// an entry with a missing (or non-Long) timestamp is regarded as older than any given timestamp.
if (!(ts instanceof Long) || timestamp > (Long)ts) {
if (i < history.length - 2) {
Expand All @@ -76,7 +73,7 @@ public static Map<String, Object> getFetchHistory(CrawlURI uri, long timestamp,
history[i + 1] = history[i];
}
history[i] = new HashMap<String, Object>();
history[i].put(A_TIMESTAMP, timestamp);
history[i].put(CoreAttributeConstants.A_FETCH_BEGAN_TIME, timestamp);
return history[i];
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -455,7 +455,7 @@ protected ProcessResult innerProcessResult(CrawlURI curi) throws InterruptedExce
}
if (info != null) {
Map<String, Object> history = FetchHistoryHelper.getFetchHistory(curi,
(Long)info.get(FetchHistoryHelper.A_TIMESTAMP), historyLength);
(Long)info.get(CoreAttributeConstants.A_FETCH_BEGAN_TIME), historyLength);
if (history != null)
history.putAll(info);
loadedCount.incrementAndGet();
Expand Down Expand Up @@ -516,10 +516,6 @@ protected HashMap<String, Object> getLastCrawl(InputStream is) throws IOExceptio
if (tsbuffer.remaining() == 0) {
try {
long ts = DateUtils.parse14DigitDate(new String(tsbuffer.array())).getTime();
// A_TIMESTAMP was used for sorting history long before the A_FETCH_BEGAN_TIME
// field was introduced. Now FetchHistoryProcessor fails if A_FETCH_BEGAN_TIME is
// not set. We could stop storing A_TIMESTAMP and sort by A_FETCH_BEGAN_TIME instead.
info.put(FetchHistoryHelper.A_TIMESTAMP, ts);
info.put(CoreAttributeConstants.A_FETCH_BEGAN_TIME, ts);
} catch (ParseException ex) {
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@
import org.archive.modules.CoreAttributeConstants;
import org.archive.modules.CrawlURI;
import org.archive.modules.ProcessResult;
import org.archive.modules.recrawl.FetchHistoryHelper;
import org.archive.modules.recrawl.FetchHistoryProcessor;
import org.archive.modules.recrawl.RecrawlAttributeConstants;
import org.archive.net.UURIFactory;
Expand Down Expand Up @@ -122,12 +121,11 @@ public void _testInnerProcessResultSingleShotWithMock() throws Exception {
final byte[] digestValue0 = sha1Digest("0");
final byte[] digestValue1 = sha1Digest("1");
fetchHistory[0] = new HashMap<String, Object>();
fetchHistory[0].put(FetchHistoryHelper.A_TIMESTAMP, expected_ts + 2000);
fetchHistory[0].put(CoreAttributeConstants.A_FETCH_BEGAN_TIME, expected_ts + 2000);
fetchHistory[0].put(RecrawlAttributeConstants.A_CONTENT_DIGEST,
CONTENT_DIGEST_SCHEME + Base32.encode(digestValue0));
fetchHistory[1] = new HashMap<String, Object>();
fetchHistory[1].put(FetchHistoryHelper.A_TIMESTAMP, expected_ts - 2000);
fetchHistory[1].put(CoreAttributeConstants.A_FETCH_BEGAN_TIME, expected_ts - 2000);
fetchHistory[1].put(RecrawlAttributeConstants.A_CONTENT_DIGEST,
CONTENT_DIGEST_SCHEME + Base32.encode(digestValue1));

Expand All @@ -140,7 +138,7 @@ public void _testInnerProcessResultSingleShotWithMock() throws Exception {
String hash = (String)history.get(RecrawlAttributeConstants.A_CONTENT_DIGEST);
assertEquals("CONTENT_DIGEST", CONTENT_DIGEST_SCHEME+TestNormalHttpResponse.EXPECTED_HASH, hash);

Long ts = (Long)history.get(FetchHistoryHelper.A_TIMESTAMP);
Long ts = (Long)history.get(CoreAttributeConstants.A_FETCH_BEGAN_TIME);
assertNotNull("ts is non-null", ts);
assertEquals("'ts' has expected timestamp", expected_ts, ts.longValue());

Expand Down