Skip to content

Commit

Permalink
- Show the "Retry-After" value, if it exists, when receiving a "429-HT…
Browse files Browse the repository at this point in the history
…TP-error". Add TODOs for handling it.

- Update dependencies.
  • Loading branch information
LSmyrnaios committed Jan 15, 2024
1 parent db41c5b commit 97ed774
Show file tree
Hide file tree
Showing 3 changed files with 17 additions and 5 deletions.
8 changes: 4 additions & 4 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.11.0</version>
<version>3.12.1</version>
<configuration>
<source>1.8</source>
<target>1.8</target>
Expand Down Expand Up @@ -61,7 +61,7 @@
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-surefire-plugin</artifactId>
<version>3.2.3</version>
<version>3.2.5</version>
<configuration>
<!--<excludes>
<exclude>some test to exclude here</exclude>
Expand Down Expand Up @@ -106,14 +106,14 @@
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.17.1</version>
<version>1.17.2</version>
</dependency>

<!-- https://mvnrepository.com/artifact/com.google.guava/guava -->
<dependency>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
<version>32.1.3-jre</version>
<version>33.0.0-jre</version>
</dependency>

<!-- https://mvnrepository.com/artifact/org.apache.commons/commons-compress -->
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -616,7 +616,7 @@ public static DocFileData getDocFileAndHandleExisting(String docFileName, String
try {
if ( !hasUnretrievableDocName ) // If we retrieved the fileName, go check if it's a duplicate.
{
if ( (curDuplicateNum = numbersOfDuplicateDocFileNames.get(docFileName)) != null ) // Since this datastructure is accessed inside the SYNCHRONIZED BLOCK, it can simpy be a HashMap without any internal synch, in order to speed it up.
if ( (curDuplicateNum = numbersOfDuplicateDocFileNames.get(docFileName)) != null ) // Since this data-structure is accessed inside the SYNCHRONIZED BLOCK, it can simpy be a HashMap without any internal sync, in order to speed it up.
curDuplicateNum += 1;
else if ( docFile.exists() ) // If it's not an already-known duplicate (this is the first duplicate-case for this file), go check if it exists in the fileSystem.
curDuplicateNum = 1; // It was "null", after the "ConcurrentHashMap.get()" check.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -494,6 +494,18 @@ public static String onErrorStatusCode(String urlStr, String domainStr, int erro
}
if ( errorStatusCode == 403 )
on403ErrorCode(urlStr, domainStr, calledForPageUrl); // The "DomainBlockedException" will go up-method by its own, if thrown inside this one.
else if ( errorStatusCode == 429 ) {
String retryAfterTime = conn.getHeaderField("Retry-After"); // Get the "Retry-After" header, if it exists.
if ( retryAfterTime != null ) {
errorLogMessage += " | Retry-After:" + retryAfterTime;
// TODO - Add this domain in a special hashMap, having the retry-after time as a value.
// TODO - Upon deciding the delay between requests of the same domain, lookup each domain and take into account this "retry-after" time.
// TODO - One possible problem with this: we may get our threads starved, waiting even a day, for a couple of domains.
// Ideally, we should put those urls aside somehow and retry them in the end, if the waiting time is above some threshold.
// (Just put them in a list and go to the next url. In the end, take care of all the urls in that list. If the retry-time is huge (e.g. > 1 day), set a could-retry status, with a nice error-msg, and finish the current-batch.)
// TODO - Check the syntax of this header here: https://www.geeksforgeeks.org/http-headers-retry-after/
}
}
}
else { // Other errorCodes. Retrieve the domain and make the required actions.
if ( (domainStr == null) || !urlStr.contains(domainStr) ) // The domain might have changed after redirections.
Expand Down

0 comments on commit 97ed774

Please sign in to comment.