From db72f2a9ab6e8251962b61f224736cc85fdac954 Mon Sep 17 00:00:00 2001 From: LSmyrnaios Date: Thu, 7 Dec 2023 03:19:44 +0200 Subject: [PATCH] - Update the "LoaderAndChecker.getWasValidAndCouldRetry()" to return an error-message as well, depending on the given Exception. - Replace double quotes with single ones, in an error-message which was written to a json-file. - Code grouping: move the duplicated task-execution and results-handling code of the loading-methods into the new "LoaderAndChecker.executeTasksAndHandleResults()" method. --- .../crawler/SpecialUrlsHandler.java | 3 +- .../util/url/LoaderAndChecker.java | 133 ++++++++---------- 2 files changed, 60 insertions(+), 76 deletions(-) diff --git a/src/main/java/eu/openaire/publications_retriever/crawler/SpecialUrlsHandler.java b/src/main/java/eu/openaire/publications_retriever/crawler/SpecialUrlsHandler.java index d2ef23a..15ab70f 100644 --- a/src/main/java/eu/openaire/publications_retriever/crawler/SpecialUrlsHandler.java +++ b/src/main/java/eu/openaire/publications_retriever/crawler/SpecialUrlsHandler.java @@ -224,7 +224,8 @@ public static boolean extractAndCheckTurkjgastroenterolDocUrl(String pageHtml, S // The pageUrl is a VALID-URL, but whether we couldRetry or not, it depends on the error of the docUrl.. So if it is a 404, then we can never get the fulltext. On the contrary, if it is a 503, then in the future wy might get it. 
String wasValid = list.get(0); String couldRetry = list.get(1); - UrlUtils.logOutputData(urlId, sourceUrl, pageUrl, UrlUtils.unreachableDocOrDatasetUrlIndicator, "Discarded in 'PageCrawler.visit()' method, as there was a problem in checking the retrieved \"turkjgastroenterol\"-pdf-url.", pageDomain, true, "true", wasValid, "false", "false", couldRetry, null, "null"); + String errorMsg = "Discarded in 'PageCrawler.visit()' method, as there was a problem in checking the retrieved 'turkjgastroenterol'-pdf-url: " + list.get(2); + UrlUtils.logOutputData(urlId, sourceUrl, pageUrl, UrlUtils.unreachableDocOrDatasetUrlIndicator, errorMsg, pageDomain, true, "true", wasValid, "false", "false", couldRetry, null, "null"); return false; } diff --git a/src/main/java/eu/openaire/publications_retriever/util/url/LoaderAndChecker.java b/src/main/java/eu/openaire/publications_retriever/util/url/LoaderAndChecker.java index 0c77c45..918d8b2 100644 --- a/src/main/java/eu/openaire/publications_retriever/util/url/LoaderAndChecker.java +++ b/src/main/java/eu/openaire/publications_retriever/util/url/LoaderAndChecker.java @@ -4,6 +4,7 @@ import crawlercommons.filters.basic.BasicURLNormalizer; import eu.openaire.publications_retriever.PublicationsRetriever; import eu.openaire.publications_retriever.exceptions.ConnTimeoutException; +import eu.openaire.publications_retriever.exceptions.DomainBlockedException; import eu.openaire.publications_retriever.exceptions.DomainWithUnsupportedHEADmethodException; import eu.openaire.publications_retriever.util.file.FileUtils; import eu.openaire.publications_retriever.util.http.ConnSupportUtils; @@ -142,27 +143,14 @@ public static void loadAndCheckUrls() throws RuntimeException List list = getWasValidAndCouldRetry(e, urlToCheck); String wasUrlValid = list.get(0); String couldRetry = list.get(1); - UrlUtils.logOutputData("null", retrievedUrlToCheck, null, UrlUtils.unreachableDocOrDatasetUrlIndicator, "Discarded at loading time, due to connectivity problems.", 
null, true, "true", wasUrlValid, "false", "false", couldRetry, null, "null"); + String errorMsg = "Discarded at loading time, as " + list.get(2); + UrlUtils.logOutputData("null", retrievedUrlToCheck, null, UrlUtils.unreachableDocOrDatasetUrlIndicator, errorMsg, null, true, "true", wasUrlValid, "false", "false", couldRetry, null, "null"); return false; } return true; }); }// end for-loop - int numFailedTasks = invokeAllTasksAndWait(callableTasks); - if ( numFailedTasks == -1 ) { - FileUtils.writeResultsToFile(); // Writes to the output file - System.err.println("Invoking and/or executing the callableTasks failed with the exception written in the log files!"); - System.exit(99); - } else if ( numFailedTasks > 0 ) { - logger.warn(numFailedTasks + " tasks failed in batch_" + batchCount); - totalNumFailedTasks.incrementAndGet(); - } - - callableTasks.clear(); - logger.debug("The number of cookies is: " + cookieStore.getCookies().size()); - boolean cookiesDeleted = cookieStore.removeAll(); - logger.debug(cookiesDeleted ? "The cookies where removed!" : "No cookies where removed!"); - FileUtils.writeResultsToFile(); // Writes to the output file + executeTasksAndHandleResults(callableTasks, batchCount, cookieStore); }// end while-loop } @@ -298,7 +286,8 @@ else if ( neutralUrl != null ) List list = getWasValidAndCouldRetry(e, urlToCheck); String wasUrlValid = list.get(0); String couldRetry = list.get(1); - UrlUtils.logOutputData(retrievedId, urlToCheck, null, UrlUtils.unreachableDocOrDatasetUrlIndicator, "Discarded at loading time, due to connectivity problems.", null, true, "true", wasUrlValid, "false", "false", couldRetry, null, "null"); + String errorMsg = "Discarded at loading time, as " + list.get(2); + UrlUtils.logOutputData(retrievedId, urlToCheck, null, UrlUtils.unreachableDocOrDatasetUrlIndicator, errorMsg, null, true, "true", wasUrlValid, "false", "false", couldRetry, null, "null"); // This url had connectivity problems.. but the rest might not, go check them out. 
if ( !isSingleIdUrlPair ) { loggedUrlsOfCurrentId.add(urlToCheck); @@ -313,21 +302,7 @@ else if ( neutralUrl != null ) return wasSuccessful; }); }// end id-for-loop - int numFailedTasks = invokeAllTasksAndWait(callableTasks); - if ( numFailedTasks == -1 ) { - FileUtils.writeResultsToFile(); // Writes to the output file - System.err.println("Invoking and/or executing the callableTasks failed with the exception written in the log files!"); - System.exit(99); - } else if ( numFailedTasks > 0 ) { - logger.warn(numFailedTasks + " tasks failed in batch_" + batchCount); - totalNumFailedTasks.incrementAndGet(); - } - - callableTasks.clear(); - logger.debug("The number of cookies is: " + cookieStore.getCookies().size()); - boolean cookiesDeleted = cookieStore.removeAll(); - logger.debug(cookiesDeleted ? "The cookies where removed!" : "No cookies where removed!"); - FileUtils.writeResultsToFile(); // Writes to the output file + executeTasksAndHandleResults(callableTasks, batchCount, cookieStore); }// end loading-while-loop } @@ -401,27 +376,14 @@ public static void loadAndCheckEachIdUrlPairInEntries() throws RuntimeException List list = getWasValidAndCouldRetry(e, urlToCheck); String wasUrlValid = list.get(0); String couldRetry = list.get(1); - UrlUtils.logOutputData(retrievedId, urlToCheck, null, UrlUtils.unreachableDocOrDatasetUrlIndicator, "Discarded at loading time, due to connectivity problems.", null, true, "true", wasUrlValid, "false", "false", couldRetry, null, "null"); + String errorMsg = "Discarded at loading time, as " + list.get(2); + UrlUtils.logOutputData(retrievedId, urlToCheck, null, UrlUtils.unreachableDocOrDatasetUrlIndicator, errorMsg, null, true, "true", wasUrlValid, "false", "false", couldRetry, null, "null"); return false; } return true; }); }// end pairs-for-loop - int numFailedTasks = invokeAllTasksAndWait(callableTasks); - if ( numFailedTasks == -1 ) { - FileUtils.writeResultsToFile(); // Writes to the output file - System.err.println("Invoking 
and/or executing the callableTasks failed with the exception written in the log files!"); - System.exit(99); - } else if ( numFailedTasks > 0 ) { - logger.warn(numFailedTasks + " tasks failed in batch_" + batchCount); - totalNumFailedTasks.incrementAndGet(); - } - - callableTasks.clear(); - logger.debug("The number of cookies is: " + cookieStore.getCookies().size()); - boolean cookiesDeleted = cookieStore.removeAll(); - logger.debug(cookiesDeleted ? "The cookies where removed!" : "No cookies where removed!"); - FileUtils.writeResultsToFile(); // Writes to the output file + executeTasksAndHandleResults(callableTasks, batchCount, cookieStore); }// end loading-while-loop } @@ -494,29 +456,36 @@ public static void loadAndCheckEachIdUrlPair() throws RuntimeException List list = getWasValidAndCouldRetry(e, urlToCheck); String wasUrlValid = list.get(0); String couldRetry = list.get(1); - UrlUtils.logOutputData(retrievedId, urlToCheck, null, UrlUtils.unreachableDocOrDatasetUrlIndicator, "Discarded at loading time, due to connectivity problems.", null, true, "true", wasUrlValid, "false", "false", couldRetry, null, "null"); + String errorMsg = "Discarded at loading time, as " + list.get(2); + UrlUtils.logOutputData(retrievedId, urlToCheck, null, UrlUtils.unreachableDocOrDatasetUrlIndicator, errorMsg, null, true, "true", wasUrlValid, "false", "false", couldRetry, null, "null"); return false; } } return true; }); }// end for-id-loop - int numFailedTasks = invokeAllTasksAndWait(callableTasks); - if ( numFailedTasks == -1 ) { - FileUtils.writeResultsToFile(); // Writes to the output file - System.err.println("Invoking and/or executing the callableTasks failed with the exception written in the log files!"); - System.exit(99); - } else if ( numFailedTasks > 0 ) { - logger.warn(numFailedTasks + " tasks failed in batch_" + batchCount); - totalNumFailedTasks.incrementAndGet(); - } + executeTasksAndHandleResults(callableTasks, batchCount, cookieStore); + }// end loading-while-loop + } 
+ - callableTasks.clear(); - logger.debug("The number of cookies is: " + cookieStore.getCookies().size()); - boolean cookiesDeleted = cookieStore.removeAll(); - logger.debug(cookiesDeleted ? "The cookies where removed!" : "No cookies where removed!"); + public static void executeTasksAndHandleResults(List> callableTasks, int batchCount, CookieStore cookieStore) + { + int numFailedTasks = invokeAllTasksAndWait(callableTasks); + if ( numFailedTasks == -1 ) { FileUtils.writeResultsToFile(); // Writes to the output file - }// end loading-while-loop + System.err.println("Invoking and/or executing the callableTasks failed with the exception written in the log files!"); + System.exit(99); + } else if ( numFailedTasks > 0 ) { + logger.warn(numFailedTasks + " tasks failed in batch_" + batchCount); + totalNumFailedTasks.incrementAndGet(); + } + + callableTasks.clear(); + logger.debug("The number of cookies is: " + cookieStore.getCookies().size()); + boolean cookiesDeleted = cookieStore.removeAll(); + logger.debug(cookiesDeleted ? "The cookies where removed!" 
: "No cookies where removed!"); + FileUtils.writeResultsToFile(); // Writes to the output file } @@ -581,7 +550,8 @@ private static boolean checkRemainingUrls(String retrievedId, Set retrie List list = getWasValidAndCouldRetry(e, urlToCheck); String wasUrlValid = list.get(0); String couldRetry = list.get(1); - UrlUtils.logOutputData(retrievedId, urlToCheck, null, UrlUtils.unreachableDocOrDatasetUrlIndicator, "Discarded at loading time, in checkRemainingUrls(), due to connectivity problems.", null, true, "true", wasUrlValid, "false", "false", couldRetry, null, "null"); + String errorMsg = "Discarded at loading time, in checkRemainingUrls(), as " + list.get(2); + UrlUtils.logOutputData(retrievedId, urlToCheck, null, UrlUtils.unreachableDocOrDatasetUrlIndicator, errorMsg, null, true, "true", wasUrlValid, "false", "false", couldRetry, null, "null"); if ( !isSingleIdUrlPair ) loggedUrlsOfThisId.add(urlToCheck); // Try the next url.. @@ -730,30 +700,43 @@ public static void setCouldRetryRegex() public static Pattern COULD_RETRY_URLS = Pattern.compile("[^/]+://[^/]*(?:sciencedirect|elsevier).com[^/]*/.*"); // The urls having the aforementioned domains are likely to be specially-handled in future updates, so we want to keep their urls available for retrying. - public static List getWasValidAndCouldRetry(Exception e, String pageUrl) + public static List getWasValidAndCouldRetry(Exception e, String url) { - List list = new ArrayList<>(2); + List list = new ArrayList<>(3); String wasUrlValid = "true"; String couldRetry = "false"; + String errorMsg = null; if ( e instanceof RuntimeException ) { // This check also covers the: (e != null) check. 
String message = e.getMessage(); if ( message != null) { - if ( INVALID_URL_HTTP_STATUS.matcher(message).matches() ) + if ( INVALID_URL_HTTP_STATUS.matcher(message).matches() ) { wasUrlValid = "false"; - else if ( COULD_RETRY_HTTP_STATUS.matcher(message).matches() ) - couldRetry = "true"; // We could retry at a later time, since some errors might be temporal. - } - } else if ( e instanceof ConnTimeoutException - || e instanceof DomainWithUnsupportedHEADmethodException ) // This should never get caught here normally. + errorMsg = "the url is invalid and lead to http-client-error"; + } else if ( COULD_RETRY_HTTP_STATUS.matcher(message).matches() ) { + couldRetry = "true"; // We could retry at a later time, since some errors might be temporal. + errorMsg = "the url had a non-fatal http-error"; + } + } else + errorMsg = "there is an unspecified runtime error"; + } else if ( e instanceof ConnTimeoutException ) { couldRetry = "true"; - // else if it's a "DomainBlockedException", the default values apply - - if ( (pageUrl != null) && COULD_RETRY_URLS.matcher(pageUrl).matches() ) + errorMsg = "the url had a connection-timeout"; + } else if ( e instanceof DomainWithUnsupportedHEADmethodException ) { // This should never get caught here normally. + couldRetry = "true"; + errorMsg = "the url does not support HEAD method for checking most of the internal links"; + } else if ( e instanceof DomainBlockedException ) { + // the default values apply + errorMsg = "the url had its initial or redirected domain blocked"; + } else + errorMsg = "there is a serious unspecified error"; + + if ( (url != null) && COULD_RETRY_URLS.matcher(url).matches() ) couldRetry = "true"; list.add(0, wasUrlValid); list.add(1, couldRetry); + list.add(2, errorMsg); return list; }