From 58022be163ea584fdb815793e680209f1a32dbf4 Mon Sep 17 00:00:00 2001 From: LSmyrnaios Date: Wed, 29 Nov 2023 17:17:02 +0200 Subject: [PATCH] - Fix some parameters given to "UrlUtils.logOutputData()". - Fix some return-values. --- .../crawler/SpecialUrlsHandler.java | 1 + .../publications_retriever/util/file/FileUtils.java | 8 ++++---- .../util/url/LoaderAndChecker.java | 13 ++++++++++--- 3 files changed, 15 insertions(+), 7 deletions(-) diff --git a/src/main/java/eu/openaire/publications_retriever/crawler/SpecialUrlsHandler.java b/src/main/java/eu/openaire/publications_retriever/crawler/SpecialUrlsHandler.java index cdfa89d..d2ef23a 100644 --- a/src/main/java/eu/openaire/publications_retriever/crawler/SpecialUrlsHandler.java +++ b/src/main/java/eu/openaire/publications_retriever/crawler/SpecialUrlsHandler.java @@ -225,6 +225,7 @@ public static boolean extractAndCheckTurkjgastroenterolDocUrl(String pageHtml, S String wasValid = list.get(0); String couldRetry = list.get(1); UrlUtils.logOutputData(urlId, sourceUrl, pageUrl, UrlUtils.unreachableDocOrDatasetUrlIndicator, "Discarded in 'PageCrawler.visit()' method, as there was a problem in checking the retrieved \"turkjgastroenterol\"-pdf-url.", pageDomain, true, "true", wasValid, "false", "false", couldRetry, null, "null"); + return false; } return true; diff --git a/src/main/java/eu/openaire/publications_retriever/util/file/FileUtils.java b/src/main/java/eu/openaire/publications_retriever/util/file/FileUtils.java index 96a7260..1bf7921 100644 --- a/src/main/java/eu/openaire/publications_retriever/util/file/FileUtils.java +++ b/src/main/java/eu/openaire/publications_retriever/util/file/FileUtils.java @@ -142,7 +142,7 @@ public static void handleStoreDocFileDirectory() // If the directory doesn't exist, try to (re)create it. try { if ( !dir.exists() ) { - if ( !dir.mkdirs() ) { // Try to create the directory(-ies) if they don't exist. If they exist OR if sth went wrong, the result os the same: "false". + if ( !dir.mkdirs() ) { // Try to create the directory(-ies) if they don't exist. If they exist OR if sth went wrong, the result is the same: "false". String errorMessage; if ( PublicationsRetriever.docFilesStorageGivenByUser ) errorMessage = "Problem when creating the \"storeDocFilesDir\": \"" + FileUtils.storeDocFilesDir + "\"." @@ -274,7 +274,7 @@ public static HashMultimap getNextIdUrlPairBatchFromJson() if ( !idAndUrlMappedInput.put(inputIdUrlTuple.id, inputIdUrlTuple.url) ) { // We have a duplicate id-url pair in the input, log it here as we cannot pass it through the HashMultimap. We will handle the first found pair only. duplicateIdUrlEntries ++; - UrlUtils.logOutputData(inputIdUrlTuple.id, inputIdUrlTuple.url, null, UrlUtils.duplicateUrlIndicator, "Discarded in FileUtils.getNextIdUrlPairBatchFromJson(), as it is a duplicate.", null, false, "true", "N/A", "N/A", "N/A", "true", null, "null"); + UrlUtils.logOutputData(inputIdUrlTuple.id, inputIdUrlTuple.url, null, UrlUtils.duplicateUrlIndicator, "Discarded in FileUtils.getNextIdUrlPairBatchFromJson(), as it is a duplicate.", null, false, "false", "N/A", "N/A", "N/A", "true", null, "null"); } } @@ -303,7 +303,7 @@ public static IdUrlTuple getDecodedJson(String jsonLine) if ( urlStr.isEmpty() ) { if ( !idStr.isEmpty() ) // If we only have the id, then go and log it. - UrlUtils.logOutputData(idStr, urlStr, null, UrlUtils.unreachableDocOrDatasetUrlIndicator, "Discarded in FileUtils.jsonDecoder(), as the url was not found.", null, false, "true", "false", "false", "false", "false", null, "null"); + UrlUtils.logOutputData(idStr, urlStr, null, UrlUtils.unreachableDocOrDatasetUrlIndicator, "Discarded in FileUtils.jsonDecoder(), as the url was not found.", null, true, "true", "false", "false", "false", "false", null, "null"); return null; } @@ -764,7 +764,7 @@ public static Collection getNextUrlBatchTest() //logger.debug("Loaded from inputFile: " + retrievedLineStr); // DEBUG! if ( !urlGroup.add(retrievedLineStr) ) // We have a duplicate in the input.. log it here as we cannot pass it through the HashSet. It's possible that this as well as the original might be/give a docUrl. - UrlUtils.logOutputData(null, retrievedLineStr, null, UrlUtils.duplicateUrlIndicator, "Discarded in FileUtils.getNextUrlGroupTest(), as it is a duplicate.", null, false, "true", "N/A", "N/A", "N/A", "true", null, "null"); + UrlUtils.logOutputData(null, retrievedLineStr, null, UrlUtils.duplicateUrlIndicator, "Discarded in FileUtils.getNextUrlGroupTest(), as it is a duplicate.", null, false, "false", "N/A", "N/A", "N/A", "true", null, "null"); } return urlGroup; diff --git a/src/main/java/eu/openaire/publications_retriever/util/url/LoaderAndChecker.java b/src/main/java/eu/openaire/publications_retriever/util/url/LoaderAndChecker.java index bac0f63..0b69b22 100644 --- a/src/main/java/eu/openaire/publications_retriever/util/url/LoaderAndChecker.java +++ b/src/main/java/eu/openaire/publications_retriever/util/url/LoaderAndChecker.java @@ -141,6 +141,7 @@ public static void loadAndCheckUrls() throws RuntimeException String wasUrlValid = list.get(0); String couldRetry = list.get(1); UrlUtils.logOutputData("null", retrievedUrlToCheck, null, UrlUtils.unreachableDocOrDatasetUrlIndicator, "Discarded at loading time, due to connectivity problems.", null, true, "true", wasUrlValid, "false", "false", couldRetry, null, "null"); + return false; } return true; }); @@ -285,10 +286,12 @@ else if ( neutralUrl != null ) return false; // Exit this runnable to go to the next ID. } + boolean wasSuccessful = true; try { // Check if it's a docUrl, if not, it gets crawled. HttpConnUtils.connectAndCheckMimeType(retrievedId, sourceUrl, urlToCheck, urlToCheck, null, true, isPossibleDocOrDatasetUrl); if ( !isSingleIdUrlPair ) // Otherwise it's already logged. loggedUrlsOfCurrentId.add(urlToCheck); + // Here the runnable was successful in any case. } catch (Exception e) { List list = getWasValidAndCouldRetry(e, urlToCheck); String wasUrlValid = list.get(0); @@ -297,14 +300,15 @@ else if ( neutralUrl != null ) // This url had connectivity problems.. but the rest might not, go check them out. if ( !isSingleIdUrlPair ) { loggedUrlsOfCurrentId.add(urlToCheck); - checkRemainingUrls(retrievedId, retrievedUrlsOfCurrentId, loggedUrlsOfCurrentId, isSingleIdUrlPair); // Go check the other urls because they might not have a normalization problem. - } + wasSuccessful = checkRemainingUrls(retrievedId, retrievedUrlsOfCurrentId, loggedUrlsOfCurrentId, isSingleIdUrlPair); // Go check the other urls because they might not have a connection problem. + } else + wasSuccessful = false; } if ( !isSingleIdUrlPair ) // Don't forget to write the valid but not-to-be-connected urls to the outputFile. handleLogOfRemainingUrls(retrievedId, retrievedUrlsOfCurrentId, loggedUrlsOfCurrentId); - return true; + return wasSuccessful; }); }// end id-for-loop int numFailedTasks = invokeAllTasksAndWait(callableTasks); @@ -396,6 +400,7 @@ public static void loadAndCheckEachIdUrlPairInEntries() throws RuntimeException String wasUrlValid = list.get(0); String couldRetry = list.get(1); UrlUtils.logOutputData(retrievedId, urlToCheck, null, UrlUtils.unreachableDocOrDatasetUrlIndicator, "Discarded at loading time, due to connectivity problems.", null, true, "true", wasUrlValid, "false", "false", couldRetry, null, "null"); + return false; } return true; }); @@ -488,6 +493,7 @@ public static void loadAndCheckEachIdUrlPair() throws RuntimeException String wasUrlValid = list.get(0); String couldRetry = list.get(1); UrlUtils.logOutputData(retrievedId, urlToCheck, null, UrlUtils.unreachableDocOrDatasetUrlIndicator, "Discarded at loading time, due to connectivity problems.", null, true, "true", wasUrlValid, "false", "false", couldRetry, null, "null"); + return false; } } return true; @@ -576,6 +582,7 @@ private static boolean checkRemainingUrls(String retrievedId, Set retrie UrlUtils.logOutputData(retrievedId, urlToCheck, null, UrlUtils.unreachableDocOrDatasetUrlIndicator, "Discarded at loading time, in checkRemainingUrls(), due to connectivity problems.", null, true, "true", wasUrlValid, "false", "false", couldRetry, null, "null"); if ( !isSingleIdUrlPair ) loggedUrlsOfThisId.add(urlToCheck); + // Try the next url.. } } return false;