From 8a58df775d0ecc515114248bd7fbbe2f3b4389c3 Mon Sep 17 00:00:00 2001
From: LSmyrnaios
Date: Thu, 28 Nov 2024 18:01:23 +0200
Subject: [PATCH] - Fix missing changes. - Show special warning when a DOI-url has an invalid DOI-ID, as reported by the "doi.org" domain. - Update user-agent. - Code polishing.
---
 .../util/file/FileUtils.java | 25 +++++++++----------
 .../util/http/ConnSupportUtils.java | 16 ++++++++----
 src/main/resources/logback.xml | 1 +
 3 files changed, 24 insertions(+), 18 deletions(-)
diff --git a/src/main/java/eu/openaire/publications_retriever/util/file/FileUtils.java b/src/main/java/eu/openaire/publications_retriever/util/file/FileUtils.java
index 0794b08..83460d3 100644
--- a/src/main/java/eu/openaire/publications_retriever/util/file/FileUtils.java
+++ b/src/main/java/eu/openaire/publications_retriever/util/file/FileUtils.java
@@ -699,20 +699,19 @@ else if ( docFile.exists() ) // If it's not an already-known duplicate (this is
 private static int getMaxStoringWaitingTime(int contentSize) {
- if ( contentSize != -2 ) {
- if ( contentSize <= fiftyMBInBytes )
- return 45_000; // 45 seconds
- else if ( contentSize <= oneHundredMBInBytes )
- return 60_000; // 1 min.
- else if ( contentSize <= twoHundredMBInBytes )
- return 120_000; // 2 mins.
- else if ( contentSize <= threeHundredMBInBytes )
- return 180_000; // 3 mins.
- else
- return 300_000; // 5 mins.
- }
- else // In case the server did not provide the "Content Length" header.
+ if ( contentSize == -2 ) // In case the server did not provide the "Content Length" header.
 return 45_000; // 45 seconds
+
+ if ( contentSize <= fiftyMBInBytes )
+ return 45_000; // 45 seconds
+ else if ( contentSize <= oneHundredMBInBytes )
+ return 60_000; // 1 min.
+ else if ( contentSize <= twoHundredMBInBytes )
+ return 120_000; // 2 mins.
+ else if ( contentSize <= threeHundredMBInBytes )
+ return 180_000; // 3 mins.
+ else
+ return 300_000; // 5 mins.
 }
diff --git a/src/main/java/eu/openaire/publications_retriever/util/http/ConnSupportUtils.java b/src/main/java/eu/openaire/publications_retriever/util/http/ConnSupportUtils.java
index d83163b..5b9932a 100644
--- a/src/main/java/eu/openaire/publications_retriever/util/http/ConnSupportUtils.java
+++ b/src/main/java/eu/openaire/publications_retriever/util/http/ConnSupportUtils.java
@@ -89,7 +89,7 @@ public class ConnSupportUtils
 public static final ConcurrentHashMap domainsWithConnectionData = new ConcurrentHashMap<>();
- public static String userAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:130.0) Gecko/20100101 Firefox/130.0"; // This should not be "final", another program, using this software as a library, should be able to set its own "UserAgent".
+ public static String userAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:133.0) Gecko/20100101 Firefox/133.0"; // This should not be "final", another program, using this software as a library, should be able to set its own "UserAgent".
 public static String acceptLanguage = "en-US,en;q=0.5";
@@ -351,8 +351,8 @@ public static String getPlainMimeType(String mimeType)
 * @return
 * @throws FileNotRetrievedException
 */
- public static DocFileData downloadAndStoreDocFile(HttpURLConnection conn, String id, String domainStr, String docUrl, boolean calledForPageUrl)
- throws DocFileNotRetrievedException
+ public static FileData downloadAndStoreDocFile(HttpURLConnection conn, String id, String domainStr, String docUrl, boolean calledForPageUrl)
+ throws FileNotRetrievedException
 {
 boolean reconnected = false;
 try {
@@ -387,9 +387,9 @@ public static DocFileData downloadAndStoreDocFile(HttpURLConnection conn, String
 throw new FileNotRetrievedException(errMsg);
 }
- File docFile = docFileData.getDocFile();
- try { // In the "S3"-mode, we don't keep the files locally.
+ File docFile = fileData.getFile();
 if ( ArgsUtils.shouldUploadFilesToS3 ) {
+ try { // In the "S3"-mode, we don't keep the files locally, after they get transferred.
 FileDeleteStrategy.FORCE.delete(docFile); // We don't need the local file anymore..
 } catch (Exception e) {
 logger.warn("The file \"" + docFile.getName() + "\" could not be deleted after being uploaded to S3 ObjectStore!");
@@ -528,6 +528,12 @@ public static String onErrorStatusCode(String urlStr, String domainStr, int erro
 if ( calledForPageUrl && (errorStatusCode != 404) && (errorStatusCode != 410) ) {
 String errorText = getErrorMessageFromResponseBody(conn);
 if ( errorText != null ) {
+
+ if ( domainStr.contains("doi.org") && errorText.contains("Not a DOI") ) {
+ logger.warn("Found a \"doi.org\" url with an invalid DOI: " + urlStr);
+ // In this case it is highly likely that the "DOI" in the url is a DOI-LINK.
+ }
+
 errorLogMessage += " Error-text: " + errorText;
 /*if ( errorStatusCode == 403 && errorText.toLowerCase().contains("javascript") ) { // Use selenium to execute the JS.
diff --git a/src/main/resources/logback.xml b/src/main/resources/logback.xml
index 8348d7a..ff0cc00 100644
--- a/src/main/resources/logback.xml
+++ b/src/main/resources/logback.xml
@@ -28,6 +28,7 @@
+