Commit 58022be

- Fix some parameters given to "UrlUtils.logOutputData()".
- Fix some return-values.
LSmyrnaios committed Nov 29, 2023
1 parent 4d45d73 commit 58022be
Showing 3 changed files with 15 additions and 7 deletions.
@@ -225,6 +225,7 @@ public static boolean extractAndCheckTurkjgastroenterolDocUrl(String pageHtml, S
         String wasValid = list.get(0);
         String couldRetry = list.get(1);
         UrlUtils.logOutputData(urlId, sourceUrl, pageUrl, UrlUtils.unreachableDocOrDatasetUrlIndicator, "Discarded in 'PageCrawler.visit()' method, as there was a problem in checking the retrieved \"turkjgastroenterol\"-pdf-url.", pageDomain, true, "true", wasValid, "false", "false", couldRetry, null, "null");
+        return false;
     }

     return true;
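
Half of this commit ("Fix some return-values.") targets the bug class visible in this hunk: a failure was logged via UrlUtils.logOutputData(), but without an explicit "return false;" control fell through to the method's final "return true;", so the caller still saw success. A minimal standalone sketch of the pattern, with hypothetical names rather than this project's API:

    public class ReturnValueFix {

        static boolean checkUrl(String url) {
            if ( url.isEmpty() ) {  // Stand-in for "a problem was detected and logged".
                System.err.println("Discarded url: \"" + url + "\"");
                return false;  // The line this commit adds in several places.
            }
            return true;
        }

        public static void main(String[] args) {
            System.out.println(checkUrl(""));  // false: the caller can now count the failure.
            System.out.println(checkUrl("https://example.org/paper.pdf"));  // true
        }
    }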
@@ -142,7 +142,7 @@ public static void handleStoreDocFileDirectory()
     // If the directory doesn't exist, try to (re)create it.
     try {
         if ( !dir.exists() ) {
-            if ( !dir.mkdirs() ) {  // Try to create the directory(-ies) if they don't exist. If they exist OR if sth went wrong, the result os the same: "false".
+            if ( !dir.mkdirs() ) {  // Try to create the directory(-ies) if they don't exist. If they exist OR if sth went wrong, the result is the same: "false".
                 String errorMessage;
                 if ( PublicationsRetriever.docFilesStorageGivenByUser )
                     errorMessage = "Problem when creating the \"storeDocFilesDir\": \"" + FileUtils.storeDocFilesDir + "\"."
@@ -274,7 +274,7 @@ public static HashMultimap<String, String> getNextIdUrlPairBatchFromJson()

         if ( !idAndUrlMappedInput.put(inputIdUrlTuple.id, inputIdUrlTuple.url) ) {  // We have a duplicate id-url pair in the input, log it here as we cannot pass it through the HashMultimap. We will handle the first found pair only.
             duplicateIdUrlEntries ++;
-            UrlUtils.logOutputData(inputIdUrlTuple.id, inputIdUrlTuple.url, null, UrlUtils.duplicateUrlIndicator, "Discarded in FileUtils.getNextIdUrlPairBatchFromJson(), as it is a duplicate.", null, false, "true", "N/A", "N/A", "N/A", "true", null, "null");
+            UrlUtils.logOutputData(inputIdUrlTuple.id, inputIdUrlTuple.url, null, UrlUtils.duplicateUrlIndicator, "Discarded in FileUtils.getNextIdUrlPairBatchFromJson(), as it is a duplicate.", null, false, "false", "N/A", "N/A", "N/A", "true", null, "null");
         }
     }
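
The duplicate check works because Guava's HashMultimap stores each distinct key-value pair at most once: put() returns false and leaves the map unchanged when the exact pair is already present, so insertion time is the only chance to log the duplicate. A small sketch (ids and urls are made up):

    import com.google.common.collect.HashMultimap;

    public class DuplicatePairDemo {
        public static void main(String[] args) {
            HashMultimap<String, String> idAndUrlMappedInput = HashMultimap.create();
            System.out.println(idAndUrlMappedInput.put("id1", "https://example.org/a"));  // true: new pair stored
            System.out.println(idAndUrlMappedInput.put("id1", "https://example.org/b"));  // true: same id, new url
            System.out.println(idAndUrlMappedInput.put("id1", "https://example.org/a"));  // false: exact duplicate
        }
    }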

@@ -303,7 +303,7 @@ public static IdUrlTuple getDecodedJson(String jsonLine)

     if ( urlStr.isEmpty() ) {
         if ( !idStr.isEmpty() )  // If we only have the id, then go and log it.
-            UrlUtils.logOutputData(idStr, urlStr, null, UrlUtils.unreachableDocOrDatasetUrlIndicator, "Discarded in FileUtils.jsonDecoder(), as the url was not found.", null, false, "true", "false", "false", "false", "false", null, "null");
+            UrlUtils.logOutputData(idStr, urlStr, null, UrlUtils.unreachableDocOrDatasetUrlIndicator, "Discarded in FileUtils.jsonDecoder(), as the url was not found.", null, true, "true", "false", "false", "false", "false", null, "null");
         return null;
     }

@@ -764,7 +764,7 @@ public static Collection<String> getNextUrlBatchTest()
             //logger.debug("Loaded from inputFile: " + retrievedLineStr);  // DEBUG!

             if ( !urlGroup.add(retrievedLineStr) )  // We have a duplicate in the input.. log it here as we cannot pass it through the HashSet. It's possible that this as well as the original might be/give a docUrl.
-                UrlUtils.logOutputData(null, retrievedLineStr, null, UrlUtils.duplicateUrlIndicator, "Discarded in FileUtils.getNextUrlGroupTest(), as it is a duplicate.", null, false, "true", "N/A", "N/A", "N/A", "true", null, "null");
+                UrlUtils.logOutputData(null, retrievedLineStr, null, UrlUtils.duplicateUrlIndicator, "Discarded in FileUtils.getNextUrlGroupTest(), as it is a duplicate.", null, false, "false", "N/A", "N/A", "N/A", "true", null, "null");
         }

         return urlGroup;
@@ -141,6 +141,7 @@ public static void loadAndCheckUrls() throws RuntimeException
             String wasUrlValid = list.get(0);
             String couldRetry = list.get(1);
             UrlUtils.logOutputData("null", retrievedUrlToCheck, null, UrlUtils.unreachableDocOrDatasetUrlIndicator, "Discarded at loading time, due to connectivity problems.", null, true, "true", wasUrlValid, "false", "false", couldRetry, null, "null");
+            return false;
         }
         return true;
     });
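
getWasValidAndCouldRetry() itself is not part of this diff; all the call sites show is that it inspects the thrown exception and returns a two-element list of string flags, with index 0 holding "wasValid" and index 1 holding "couldRetry". A deliberately hypothetical sketch of that contract (the heuristics below are assumptions, not the project's real logic):

    import java.io.IOException;
    import java.util.Arrays;
    import java.util.List;

    public class WasValidCouldRetrySketch {
        // Hypothetical stand-in, inferred only from the call sites in this diff.
        static List<String> getWasValidAndCouldRetry(Exception e, String url) {
            boolean wasValid = !(e instanceof IllegalArgumentException);  // Assumption: malformed urls count as invalid.
            boolean couldRetry = (e instanceof IOException);  // Assumption: I/O problems may be transient.
            return Arrays.asList(String.valueOf(wasValid), String.valueOf(couldRetry));
        }

        public static void main(String[] args) {
            List<String> list = getWasValidAndCouldRetry(new IOException("connection timeout"), "https://example.org");
            String wasUrlValid = list.get(0);  // Index 0: was the url itself valid?
            String couldRetry = list.get(1);   // Index 1: could a later retry succeed?
            System.out.println(wasUrlValid + " / " + couldRetry);  // true / true
        }
    }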
@@ -285,10 +286,12 @@ else if ( neutralUrl != null )
             return false;  // Exit this runnable to go to the next ID.
         }

+        boolean wasSuccessful = true;
         try {  // Check if it's a docUrl, if not, it gets crawled.
             HttpConnUtils.connectAndCheckMimeType(retrievedId, sourceUrl, urlToCheck, urlToCheck, null, true, isPossibleDocOrDatasetUrl);
             if ( !isSingleIdUrlPair )  // Otherwise it's already logged.
                 loggedUrlsOfCurrentId.add(urlToCheck);
+            // Here the runnable was successful in any case.
         } catch (Exception e) {
             List<String> list = getWasValidAndCouldRetry(e, urlToCheck);
             String wasUrlValid = list.get(0);
@@ -297,14 +300,15 @@ else if ( neutralUrl != null )
             // This url had connectivity problems.. but the rest might not, go check them out.
             if ( !isSingleIdUrlPair ) {
                 loggedUrlsOfCurrentId.add(urlToCheck);
-                checkRemainingUrls(retrievedId, retrievedUrlsOfCurrentId, loggedUrlsOfCurrentId, isSingleIdUrlPair);  // Go check the other urls because they might not have a normalization problem.
-            }
+                wasSuccessful = checkRemainingUrls(retrievedId, retrievedUrlsOfCurrentId, loggedUrlsOfCurrentId, isSingleIdUrlPair);  // Go check the other urls because they might not have a connection problem.
+            } else
+                wasSuccessful = false;
         }

         if ( !isSingleIdUrlPair )  // Don't forget to write the valid but not-to-be-connected urls to the outputFile.
             handleLogOfRemainingUrls(retrievedId, retrievedUrlsOfCurrentId, loggedUrlsOfCurrentId);

-        return true;
+        return wasSuccessful;
     });
}// end id-for-loop
int numFailedTasks = invokeAllTasksAndWait(callableTasks);
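
The wasSuccessful changes make the Callable's Boolean result trustworthy: previously the task returned true even after a connection failure, so a failure count derived from the task results would be too low. The project's invokeAllTasksAndWait() is not shown in this diff; assuming it works roughly like ExecutorService.invokeAll() plus a tally, the general pattern looks like this (tasks and urls are hypothetical):

    import java.util.ArrayList;
    import java.util.List;
    import java.util.concurrent.Callable;
    import java.util.concurrent.ExecutorService;
    import java.util.concurrent.Executors;
    import java.util.concurrent.Future;

    public class FailedTaskCounting {
        public static void main(String[] args) throws Exception {
            ExecutorService executor = Executors.newFixedThreadPool(4);
            List<Callable<Boolean>> callableTasks = new ArrayList<>();
            for ( String url : List.of("https://example.org/ok", "bad-url") )
                callableTasks.add(() -> url.startsWith("https://"));  // Stand-in for connect-and-check logic.

            int numFailedTasks = 0;
            for ( Future<Boolean> future : executor.invokeAll(callableTasks) )
                if ( !future.get() )  // "false" now reliably means the task failed.
                    numFailedTasks++;

            System.out.println("Failed tasks: " + numFailedTasks);  // Prints: Failed tasks: 1
            executor.shutdown();
        }
    }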
@@ -396,6 +400,7 @@ public static void loadAndCheckEachIdUrlPairInEntries() throws RuntimeException
             String wasUrlValid = list.get(0);
             String couldRetry = list.get(1);
             UrlUtils.logOutputData(retrievedId, urlToCheck, null, UrlUtils.unreachableDocOrDatasetUrlIndicator, "Discarded at loading time, due to connectivity problems.", null, true, "true", wasUrlValid, "false", "false", couldRetry, null, "null");
+            return false;
         }
         return true;
     });
@@ -488,6 +493,7 @@ public static void loadAndCheckEachIdUrlPair() throws RuntimeException
                 String wasUrlValid = list.get(0);
                 String couldRetry = list.get(1);
                 UrlUtils.logOutputData(retrievedId, urlToCheck, null, UrlUtils.unreachableDocOrDatasetUrlIndicator, "Discarded at loading time, due to connectivity problems.", null, true, "true", wasUrlValid, "false", "false", couldRetry, null, "null");
+                return false;
             }
         }
         return true;
@@ -576,6 +582,7 @@ private static boolean checkRemainingUrls(String retrievedId, Set<String> retrie
                 UrlUtils.logOutputData(retrievedId, urlToCheck, null, UrlUtils.unreachableDocOrDatasetUrlIndicator, "Discarded at loading time, in checkRemainingUrls(), due to connectivity problems.", null, true, "true", wasUrlValid, "false", "false", couldRetry, null, "null");
                 if ( !isSingleIdUrlPair )
                     loggedUrlsOfThisId.add(urlToCheck);
+                // Try the next url..
             }
         }
         return false;
