Skip to content

Commit

Permalink
- Fix missing changes.
Browse files Browse the repository at this point in the history
- Show special warning when a DOI-url has an invalid DOI-ID, as reported by the "doi.org" domain.
- Update user-agent.
- Code polishing.
  • Loading branch information
LSmyrnaios committed Nov 28, 2024
1 parent d886bf8 commit 8a58df7
Show file tree
Hide file tree
Showing 3 changed files with 24 additions and 18 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -699,20 +699,19 @@ else if ( docFile.exists() ) // If it's not an already-known duplicate (this is

/**
 * Picks the maximum waiting time for storing a file, based on its reported content size.
 * Larger files are given progressively more time.
 *
 * @param contentSize the content size in bytes, or {@code -2} when the server did not provide the "Content Length" header.
 * @return the waiting time in milliseconds (45 seconds up to 5 minutes).
 */
private static int getMaxStoringWaitingTime(int contentSize)
{
	if ( contentSize == -2 )	// In case the server did not provide the "Content Length" header.
		return 45_000;	// 45 seconds

	// A single threshold chain; each branch returns, so no nesting or trailing "else" is needed.
	if ( contentSize <= fiftyMBInBytes )
		return 45_000;	// 45 seconds
	else if ( contentSize <= oneHundredMBInBytes )
		return 60_000;	// 1 min.
	else if ( contentSize <= twoHundredMBInBytes )
		return 120_000;	// 2 mins.
	else if ( contentSize <= threeHundredMBInBytes )
		return 180_000;	// 3 mins.
	else
		return 300_000;	// 5 mins.
}


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ public class ConnSupportUtils

public static final ConcurrentHashMap<String, DomainConnectionData> domainsWithConnectionData = new ConcurrentHashMap<>();

public static String userAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:130.0) Gecko/20100101 Firefox/130.0"; // This should not be "final", another program, using this software as a library, should be able to set its own "UserAgent".
public static String userAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:133.0) Gecko/20100101 Firefox/133.0"; // This should not be "final", another program, using this software as a library, should be able to set its own "UserAgent".
public static String acceptLanguage = "en-US,en;q=0.5";


Expand Down Expand Up @@ -351,8 +351,8 @@ public static String getPlainMimeType(String mimeType)
* @return
* @throws FileNotRetrievedException
*/
public static DocFileData downloadAndStoreDocFile(HttpURLConnection conn, String id, String domainStr, String docUrl, boolean calledForPageUrl)
throws DocFileNotRetrievedException
public static FileData downloadAndStoreDocFile(HttpURLConnection conn, String id, String domainStr, String docUrl, boolean calledForPageUrl)
throws FileNotRetrievedException
{
boolean reconnected = false;
try {
Expand Down Expand Up @@ -387,9 +387,9 @@ public static DocFileData downloadAndStoreDocFile(HttpURLConnection conn, String
throw new FileNotRetrievedException(errMsg);
}

File docFile = docFileData.getDocFile();
try { // In the "S3"-mode, we don't keep the files locally.
File docFile = fileData.getFile();
if ( ArgsUtils.shouldUploadFilesToS3 ) {
try { // In the "S3"-mode, we don't keep the files locally, after they get transferred.
FileDeleteStrategy.FORCE.delete(docFile); // We don't need the local file anymore..
} catch (Exception e) {
logger.warn("The file \"" + docFile.getName() + "\" could not be deleted after being uploaded to S3 ObjectStore!");
Expand Down Expand Up @@ -528,6 +528,12 @@ public static String onErrorStatusCode(String urlStr, String domainStr, int erro
if ( calledForPageUrl && (errorStatusCode != 404) && (errorStatusCode != 410) ) {
String errorText = getErrorMessageFromResponseBody(conn);
if ( errorText != null ) {

if ( domainStr.contains("doi.org") && errorText.contains("Not a DOI") ) {
logger.warn("Found a \"doi.org\" url with an invalid DOI: " + urlStr);
// In this case it is highly likely that the "DOI" in the url is a DOI-LINK.
}

errorLogMessage += " Error-text: " + errorText;
/*if ( errorStatusCode == 403 && errorText.toLowerCase().contains("javascript") ) {
// Use selenium to execute the JS.
Expand Down
1 change: 1 addition & 0 deletions src/main/resources/logback.xml
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
</appender>

<logger name="crawlercommons" level="warn"/>
<logger name="eu.openaire.publications_retriever.crawler.MetadataHandler" level="debug"/>

<root level="debug">
<appender-ref ref="RollingFile" />
Expand Down

0 comments on commit 8a58df7

Please sign in to comment.