Skip to content

Commit

Permalink
- Fix duplicate output when a record was discarded during redirections, after matching to regex-rules.
Browse files Browse the repository at this point in the history

- Fix an issue where, in some error-cases, no value was assigned to the "pageUrl" field, thus resulting in some of the results missing this field altogether.
  • Loading branch information
LSmyrnaios committed Dec 15, 2024
1 parent 699c0c0 commit 76d2889
Show file tree
Hide file tree
Showing 4 changed files with 67 additions and 17 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -541,7 +541,7 @@ public static HttpURLConnection handleRedirects(String urlId, String sourceUrl,
String lowerCaseTargetUrl = targetUrl.toLowerCase();
if ( (calledForPageUrl && UrlTypeChecker.shouldNotAcceptPageUrl(urlId, sourceUrl, targetUrl, lowerCaseTargetUrl, calledForPageUrl)) // Redirecting a pageUrl.
|| (!calledForPageUrl && UrlTypeChecker.shouldNotAcceptInternalLink(targetUrl, lowerCaseTargetUrl)) ) // Redirecting an internalPageLink.
throw new RuntimeException("Url: \"" + initialUrl + "\" was prevented to redirect to the unwanted location: \"" + targetUrl + "\", after receiving an \"HTTP " + responseCode + "\" Redirect Code, in redirection-number: " + curRedirectsNum);
throw new RuntimeException("Url: \"" + initialUrl + "\" was prevented to redirect to the unwanted location: \"" + targetUrl + "\", after receiving an \"HTTP " + responseCode + "\" Redirect Code, in redirection-number: " + curRedirectsNum + (calledForPageUrl ? " | " + LoaderAndChecker.alreadyLoggedMessage : ""));
else if ( lowerCaseTargetUrl.contains("sharedsitesession") ) { // either "getSharedSiteSession" or "consumeSharedSiteSession".
logger.warn("Initial-url: \"" + initialUrl + "\" tried to cause a \"sharedSiteSession-redirectionPack\" by redirecting to \"" + targetUrl + "\"!");
List<String> blockedDomains = ConnSupportUtils.blockSharedSiteSessionDomains(targetUrl, currentUrl);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ public class LoaderAndChecker
+ "|[p]?cap|dmp|vcf|cbor|biosample|hic|warc|ig[e]?s|sla|dxf|pdb|[sc]df|cif|f(?:ast)?[qa]|apng|sra|vtp|gltf|[sm]tl|ply|abc|md|rtf|ttl|shp|shx|exr|cdf|glb|mtl|kmz|textFile)";
public static final Pattern DATASET_URL_FILTER = Pattern.compile(".+(?:dataset[s]?/.*|(?:\\.|format=)" + dataset_formats + "(?:\\?.+)?$)");

public static final String alreadyLoggedMessage = "__LOGGED__";

public static final BasicURLNormalizer basicURLNormalizer = BasicURLNormalizer.newBuilder().build();

Expand Down Expand Up @@ -123,7 +124,7 @@ public static void loadAndCheckUrls() throws RuntimeException
String urlToCheck = retrievedUrlToCheck;
if ( (urlToCheck = basicURLNormalizer.filter(retrievedUrlToCheck)) == null ) {
logger.warn("Could not normalize url: " + retrievedUrlToCheck);
UrlUtils.addOutputData("null", retrievedUrlToCheck, null, UrlUtils.unreachableDocOrDatasetUrlIndicator, "Discarded at loading time, due to normalization's problems.", null, true, "true", "false", "false", "false", "false", null, "null", "N/A");
UrlUtils.addOutputData("null", retrievedUrlToCheck, "N/A", UrlUtils.unreachableDocOrDatasetUrlIndicator, "Discarded at loading time, due to normalization's problems.", null, true, "true", "false", "false", "false", "false", null, "null", "N/A");
LoaderAndChecker.connProblematicUrls.incrementAndGet();
return false;
}
Expand All @@ -142,11 +143,18 @@ public static void loadAndCheckUrls() throws RuntimeException
try { // We sent the < null > into quotes to avoid causing NPEs in the thread-safe datastructures that do not support null input.
HttpConnUtils.connectAndCheckMimeType("null", retrievedUrlToCheck, urlToCheck, urlToCheck, null, true, isPossibleDocOrDatasetUrl);
} catch (Exception e) {

if ( e instanceof RuntimeException ) {
String msg = e.getMessage();
if ( (msg != null) && msg.contains(alreadyLoggedMessage) )
return false; // The error has already been logged in better detail.
}

List<String> list = getWasValidAndCouldRetry(e, urlToCheck);
String wasUrlValid = list.get(0);
String couldRetry = list.get(1);
String errorMsg = "Discarded at loading time, as " + list.get(2);
UrlUtils.addOutputData("null", retrievedUrlToCheck, null, UrlUtils.unreachableDocOrDatasetUrlIndicator, errorMsg, null, true, "true", wasUrlValid, "false", "false", couldRetry, null, "null", "N/A");
UrlUtils.addOutputData("null", retrievedUrlToCheck, "N/A", UrlUtils.unreachableDocOrDatasetUrlIndicator, errorMsg, null, true, "true", wasUrlValid, "false", "false", couldRetry, null, "null", "N/A");
return false;
}
return true;
Expand Down Expand Up @@ -266,7 +274,7 @@ else if ( neutralUrl != null )
String sourceUrl = urlToCheck; // Hold it here for the logging-messages.
if ( (urlToCheck = basicURLNormalizer.filter(sourceUrl)) == null ) {
logger.warn("Could not normalize url: " + sourceUrl);
UrlUtils.addOutputData(retrievedId, sourceUrl, null, UrlUtils.unreachableDocOrDatasetUrlIndicator, "Discarded at loading time, due to normalization's problems.", null, true, "true", "false", "false", "false", "false", null, "null", "N/A");
UrlUtils.addOutputData(retrievedId, sourceUrl, "N/A", UrlUtils.unreachableDocOrDatasetUrlIndicator, "Discarded at loading time, due to normalization's problems.", null, true, "true", "false", "false", "false", "false", null, "null", "N/A");
LoaderAndChecker.connProblematicUrls.incrementAndGet();

// If other urls exits, then go and check those.
Expand All @@ -285,11 +293,18 @@ else if ( neutralUrl != null )
loggedUrlsOfCurrentId.add(urlToCheck);
// Here the runnable was successful in any case.
} catch (Exception e) {

if ( e instanceof RuntimeException ) {
String msg = e.getMessage();
if ( (msg != null) && msg.contains(alreadyLoggedMessage) )
return false; // The error has already been logged in better detail.
}

List<String> list = getWasValidAndCouldRetry(e, urlToCheck);
String wasUrlValid = list.get(0);
String couldRetry = list.get(1);
String errorMsg = "Discarded at loading time, as " + list.get(2);
UrlUtils.addOutputData(retrievedId, urlToCheck, null, UrlUtils.unreachableDocOrDatasetUrlIndicator, errorMsg, null, true, "true", wasUrlValid, "false", "false", couldRetry, null, "null", "N/A");
UrlUtils.addOutputData(retrievedId, urlToCheck, "N/A", UrlUtils.unreachableDocOrDatasetUrlIndicator, errorMsg, null, true, "true", wasUrlValid, "false", "false", couldRetry, null, "null", "N/A");
// This url had connectivity problems.. but the rest might not, go check them out.
if ( !isSingleIdUrlPair ) {
loggedUrlsOfCurrentId.add(urlToCheck);
Expand Down Expand Up @@ -346,14 +361,14 @@ public static void loadAndCheckEachIdUrlPairInEntries() throws RuntimeException
String retrievedUrl = pair.getValue();

if ( (retrievedUrl = handleUrlChecks(retrievedId, retrievedUrl)) == null ) {
return false;
return false; // The logging happens inside.
} // The "retrievedUrl" might have changed (inside "handleUrlChecks()").

String urlToCheck = retrievedUrl;
String sourceUrl = urlToCheck; // Hold it here for the logging-messages.
if ( (urlToCheck = basicURLNormalizer.filter(sourceUrl)) == null ) {
logger.warn("Could not normalize url: " + sourceUrl);
UrlUtils.addOutputData(retrievedId, sourceUrl, null, UrlUtils.unreachableDocOrDatasetUrlIndicator, "Discarded at loading time, due to normalization's problems.", null, true, "true", "false", "false", "false", "false", null, "null", "N/A");
UrlUtils.addOutputData(retrievedId, sourceUrl, "N/A", UrlUtils.unreachableDocOrDatasetUrlIndicator, "Discarded at loading time, due to normalization's problems.", null, true, "true", "false", "false", "false", "false", null, "null", "N/A");
LoaderAndChecker.connProblematicUrls.incrementAndGet();
return false;
}
Expand All @@ -375,11 +390,18 @@ public static void loadAndCheckEachIdUrlPairInEntries() throws RuntimeException
try { // Check if it's a docUrl, if not, it gets crawled.
HttpConnUtils.connectAndCheckMimeType(retrievedId, sourceUrl, urlToCheck, urlToCheck, null, true, isPossibleDocOrDatasetUrl);
} catch (Exception e) {

if ( e instanceof RuntimeException ) {
String msg = e.getMessage();
if ( (msg != null) && msg.contains(alreadyLoggedMessage) )
return false; // The error has already been logged in better detail.
}

List<String> list = getWasValidAndCouldRetry(e, urlToCheck);
String wasUrlValid = list.get(0);
String couldRetry = list.get(1);
String errorMsg = "Discarded at loading time, as " + list.get(2);
UrlUtils.addOutputData(retrievedId, urlToCheck, null, UrlUtils.unreachableDocOrDatasetUrlIndicator, errorMsg, null, true, "true", wasUrlValid, "false", "false", couldRetry, null, "null", "N/A");
UrlUtils.addOutputData(retrievedId, urlToCheck, "N/A", UrlUtils.unreachableDocOrDatasetUrlIndicator, errorMsg, null, true, "true", wasUrlValid, "false", "false", couldRetry, null, "null", "N/A");
return false;
}
return true;
Expand Down Expand Up @@ -433,7 +455,7 @@ public static void loadAndCheckEachIdUrlPair() throws RuntimeException
String sourceUrl = urlToCheck; // Hold it here for the logging-messages.
if ( (urlToCheck = basicURLNormalizer.filter(sourceUrl)) == null ) {
logger.warn("Could not normalize url: " + sourceUrl);
UrlUtils.addOutputData(retrievedId, sourceUrl, null, UrlUtils.unreachableDocOrDatasetUrlIndicator, "Discarded at loading time, due to normalization's problems.", null, true, "true", "false", "false", "false", "false", null, "null", "N/A");
UrlUtils.addOutputData(retrievedId, sourceUrl, "N/A", UrlUtils.unreachableDocOrDatasetUrlIndicator, "Discarded at loading time, due to normalization's problems.", null, true, "true", "false", "false", "false", "false", null, "null", "N/A");
LoaderAndChecker.connProblematicUrls.incrementAndGet();
continue;
}
Expand All @@ -455,11 +477,18 @@ public static void loadAndCheckEachIdUrlPair() throws RuntimeException
try { // Check if it's a docUrl, if not, it gets crawled.
HttpConnUtils.connectAndCheckMimeType(retrievedId, sourceUrl, urlToCheck, urlToCheck, null, true, isPossibleDocOrDatasetUrl);
} catch (Exception e) {

if ( e instanceof RuntimeException ) {
String msg = e.getMessage();
if ( (msg != null) && msg.contains(alreadyLoggedMessage) )
return false; // The error has already been logged in better detail.
}

List<String> list = getWasValidAndCouldRetry(e, urlToCheck);
String wasUrlValid = list.get(0);
String couldRetry = list.get(1);
String errorMsg = "Discarded at loading time, as " + list.get(2);
UrlUtils.addOutputData(retrievedId, urlToCheck, null, UrlUtils.unreachableDocOrDatasetUrlIndicator, errorMsg, null, true, "true", wasUrlValid, "false", "false", couldRetry, null, "null", "N/A");
UrlUtils.addOutputData(retrievedId, urlToCheck, "N/A", UrlUtils.unreachableDocOrDatasetUrlIndicator, errorMsg, null, true, "true", wasUrlValid, "false", "false", couldRetry, null, "null", "N/A");
return false;
}
}
Expand Down Expand Up @@ -549,11 +578,18 @@ private static boolean checkRemainingUrls(String retrievedId, Set<String> retrie
loggedUrlsOfThisId.add(urlToCheck);
return true; // A url was checked and didn't have any problems, return and log the remaining urls.
} catch (Exception e) {

if ( e instanceof RuntimeException ) {
String msg = e.getMessage();
if ( (msg != null) && msg.contains(alreadyLoggedMessage) )
return false; // The error has already been logged in better detail.
}

List<String> list = getWasValidAndCouldRetry(e, urlToCheck);
String wasUrlValid = list.get(0);
String couldRetry = list.get(1);
String errorMsg = "Discarded at loading time, in checkRemainingUrls(), as " + list.get(2);
UrlUtils.addOutputData(retrievedId, urlToCheck, null, UrlUtils.unreachableDocOrDatasetUrlIndicator, errorMsg, null, true, "true", wasUrlValid, "false", "false", couldRetry, null, "null", "N/A");
UrlUtils.addOutputData(retrievedId, urlToCheck, "N/A", UrlUtils.unreachableDocOrDatasetUrlIndicator, errorMsg, null, true, "true", wasUrlValid, "false", "false", couldRetry, null, "null", "N/A");
if ( !isSingleIdUrlPair )
loggedUrlsOfThisId.add(urlToCheck);
// Try the next url..
Expand All @@ -575,23 +611,23 @@ public static String handleUrlChecks(String urlId, String retrievedUrl)
String urlDomain = UrlUtils.getDomainStr(retrievedUrl, null);
if ( urlDomain == null ) { // If the domain is not found, it means that a serious problem exists with this docPage, and we shouldn't crawl it.
// The reason is already logged.
UrlUtils.addOutputData(urlId, retrievedUrl, null, UrlUtils.unreachableDocOrDatasetUrlIndicator, "Discarded in 'LoaderAndChecker.handleUrlChecks()' method, after the occurrence of a domain-retrieval error.", null, true, "true", "false", "false", "false", "false", null, "null", "N/A");
UrlUtils.addOutputData(urlId, retrievedUrl, "N/A", UrlUtils.unreachableDocOrDatasetUrlIndicator, "Discarded in 'LoaderAndChecker.handleUrlChecks()' method, after the occurrence of a domain-retrieval error.", null, true, "true", "false", "false", "false", "false", null, "null", "N/A");
if ( !useIdUrlPairs )
connProblematicUrls.incrementAndGet();
return null;
}

if ( HttpConnUtils.blacklistedDomains.contains(urlDomain) ) { // Check if it has been blacklisted after running internal links' checks.
logger.debug("Avoid connecting to blacklisted domain: \"" + urlDomain + "\" with url: " + retrievedUrl);
UrlUtils.addOutputData(urlId, retrievedUrl, null, UrlUtils.unreachableDocOrDatasetUrlIndicator, "Discarded in 'LoaderAndChecker.handleUrlChecks()' method, as its domain was found blacklisted.", null, true, "true", "true", "false", "false", "false", null, "null", "N/A");
UrlUtils.addOutputData(urlId, retrievedUrl, "N/A", UrlUtils.unreachableDocOrDatasetUrlIndicator, "Discarded in 'LoaderAndChecker.handleUrlChecks()' method, as its domain was found blacklisted.", null, true, "true", "true", "false", "false", "false", null, "null", "N/A");
if ( !useIdUrlPairs )
connProblematicUrls.incrementAndGet();
return null;
}

if ( ConnSupportUtils.checkIfPathIs403BlackListed(retrievedUrl, urlDomain) ) { // The path-extraction is independent of the jsessionid-removal, so this gets executed before.
logger.debug("Preventing reaching 403ErrorCode with url: \"" + retrievedUrl + "\"!");
UrlUtils.addOutputData(urlId, retrievedUrl, null, UrlUtils.unreachableDocOrDatasetUrlIndicator, "Discarded in 'LoaderAndChecker.handleUrlChecks()' as it had a blacklisted urlPath.", null, true, "true", "true", "false", "false", "false", null, "null", "N/A");
UrlUtils.addOutputData(urlId, retrievedUrl, "N/A", UrlUtils.unreachableDocOrDatasetUrlIndicator, "Discarded in 'LoaderAndChecker.handleUrlChecks()' as it had a blacklisted urlPath.", null, true, "true", "true", "false", "false", "false", null, "null", "N/A");
if ( !useIdUrlPairs )
connProblematicUrls.incrementAndGet();
return null;
Expand All @@ -609,7 +645,7 @@ public static String handleUrlChecks(String urlId, String retrievedUrl)
// Check if it's a duplicate.
if ( UrlUtils.duplicateUrls.contains(retrievedUrl) ) {
logger.debug("Skipping non-DocOrDataset-url: \"" + retrievedUrl + "\", at loading, as it has already been checked.");
UrlUtils.addOutputData(urlId, retrievedUrl, null, UrlUtils.duplicateUrlIndicator, "Discarded in 'LoaderAndChecker.handleUrlChecks()', as it's a duplicate.", null, false, "true", "N/A", "N/A", "N/A", "true", null, "null", "N/A");
UrlUtils.addOutputData(urlId, retrievedUrl, "N/A", UrlUtils.duplicateUrlIndicator, "Discarded in 'LoaderAndChecker.handleUrlChecks()', as it's a duplicate.", null, false, "true", "N/A", "N/A", "N/A", "true", null, "null", "N/A");
if ( !useIdUrlPairs )
inputDuplicatesNum.incrementAndGet();
return null;
Expand Down Expand Up @@ -673,7 +709,7 @@ private static void handleLogOfRemainingUrls(String retrievedId, Set<String> ret
retrievedUrl = tempUrl; // Make sure we check the non-normalized version.

if ( !loggedUrlsOfThisId.contains(retrievedUrl) )
UrlUtils.addOutputData(retrievedId, retrievedUrl, null, UrlUtils.unreachableDocOrDatasetUrlIndicator,
UrlUtils.addOutputData(retrievedId, retrievedUrl, "N/A", UrlUtils.unreachableDocOrDatasetUrlIndicator,
"Skipped in LoaderAndChecker, as a better url was selected for id: " + retrievedId, null, true, "false", "N/A", "N/A", "N/A", "true", null, "null", "N/A");
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,20 @@ public void testCustomInputOutputWithDatasets()
}


@Disabled
@Test
public void testCustomInputOutputWithDocsAndDatasets()
{
String[] args = new String[4];
args[0] = "-retrieveDataType";
args[1] = "all"; // "document" OR "dataset" OR "all"
args[2] = "-inputFileFullPath";
//args[3] = "./testData/idUrlPairs/test.json";
args[3] = "./testData/idUrlPairs/ukrn_data_test.json";
//args[3] = "./testData/idUrlPairs/docsAndDatasets.json";
main(args);
}


@Disabled
@Test
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -835,7 +835,7 @@ public void checkUrlConnectivity()
HttpConnUtils.connectAndCheckMimeType(testID, urlToCheck, urlToCheck, urlToCheck, null, true, false); // Sent the < null > in quotes to avoid an NPE in the concurrent data-structures.
} catch (Exception e) {
// The problem was logged inside.
UrlUtils.addOutputData(testID, urlToCheck, null, UrlUtils.unreachableDocOrDatasetUrlIndicator, "Discarded at loading time, due to connectivity problems.", null, true, "true", "true", "false", "false", "false", null, "null", "N/A");
UrlUtils.addOutputData(testID, urlToCheck, "N/A", UrlUtils.unreachableDocOrDatasetUrlIndicator, "Discarded at loading time, due to connectivity problems.", null, true, "true", "true", "false", "false", "false", null, "null", "N/A");
}
}

Expand Down

0 comments on commit 76d2889

Please sign in to comment.