Skip to content

Commit

Permalink
- Avoid checking the MetaDocUrl when running in dataset-only retrieval mode.
Browse files Browse the repository at this point in the history

- Rename PageCrawler.handlePageWithNoDocUrls() to "handlePageWithNoDocOrDatasetUrls" and fix log-messages.
  • Loading branch information
LSmyrnaios committed Nov 17, 2024
1 parent 5c32b86 commit 889b478
Show file tree
Hide file tree
Showing 2 changed files with 13 additions and 10 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,9 @@ public static boolean checkAndHandleMetadata(String urlId, String sourceUrl, Str
}
}

if ( !LoaderAndChecker.retrieveDocuments )
return false; // There was no definitive handling, nor we want to check for metaDocUrl, go get the internal-links.

// Check if the docLink is provided in a metaTag and connect to it directly.
String metaDocUrl = null;
if ( (metaDocUrl = getMetaDocUrlFromHTML(pageHtml)) == null ) { // This is mostly the case when the page does not have a docUrl, although not always, so we continue crawling it.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -180,7 +180,7 @@ else if ( firstHTMLlineFromDetectedContentType != null ) {

if ( (++possibleDocOrDatasetUrlsCounter) > MAX_POSSIBLE_DOC_OR_DATASET_LINKS_TO_CONNECT ) {
logger.warn("The maximum limit (" + MAX_POSSIBLE_DOC_OR_DATASET_LINKS_TO_CONNECT + ") of possible doc or dataset links to be connected was reached for pageUrl: \"" + pageUrl + "\". The page was discarded.");
handlePageWithNoDocUrls(urlId, sourceUrl, pageUrl, pageDomain, true, false);
handlePageWithNoDocOrDatasetUrls(urlId, sourceUrl, pageUrl, pageDomain, true, false);
return;
}

Expand Down Expand Up @@ -227,7 +227,7 @@ else if ( firstHTMLlineFromDetectedContentType != null ) {
if ( should_check_remaining_links && !remainingLinks.isEmpty() )
checkRemainingInternalLinks(urlId, sourceUrl, pageUrl, pageDomain, remainingLinks);
else
handlePageWithNoDocUrls(urlId, sourceUrl, pageUrl, pageDomain, false, false);
handlePageWithNoDocOrDatasetUrls(urlId, sourceUrl, pageUrl, pageDomain, false, false);
}


Expand All @@ -240,18 +240,18 @@ else if ( firstHTMLlineFromDetectedContentType != null ) {
* @param hasWarningLogBeenShown
* @param isAlreadyLoggedToOutput
*/
private static void handlePageWithNoDocUrls(String urlId, String sourceUrl, String pageUrl, String pageDomain, boolean hasWarningLogBeenShown, boolean isAlreadyLoggedToOutput)
private static void handlePageWithNoDocOrDatasetUrls(String urlId, String sourceUrl, String pageUrl, String pageDomain, boolean hasWarningLogBeenShown, boolean isAlreadyLoggedToOutput)
{
// If we get here it means that this pageUrl is not a docUrl itself, nor it contains a docUrl..
if ( !hasWarningLogBeenShown )
logger.warn("Page: \"" + pageUrl + "\" does not contain a docUrl.");
logger.warn("Page: \"" + pageUrl + "\" does not contain a " + ArgsUtils.targetUrlType + ".");

UrlTypeChecker.pagesNotProvidingDocUrls.incrementAndGet();
if ( !isAlreadyLoggedToOutput ) // This check is used in error-cases, where we have already logged the Quadruple.
UrlUtils.addOutputData(urlId, sourceUrl, pageUrl, UrlUtils.unreachableDocOrDatasetUrlIndicator, "Logged in 'PageCrawler.visit()' method, as no " + ArgsUtils.targetUrlType + " was found inside.", null, true, "true", "true", "false", "false", "false", null, "null");

if ( ConnSupportUtils.countAndBlockDomainAfterTimes(HttpConnUtils.blacklistedDomains, PageCrawler.timesDomainNotGivingDocUrls, pageDomain, PageCrawler.timesToGiveNoDocUrlsBeforeBlocked, true) )
logger.warn("Domain: \"" + pageDomain + "\" was blocked after giving no docUrls more than " + PageCrawler.timesToGiveNoDocUrlsBeforeBlocked + " times.");
logger.warn("Domain: \"" + pageDomain + "\" was blocked after giving no " + ArgsUtils.targetUrlType + " more than " + PageCrawler.timesToGiveNoDocUrlsBeforeBlocked + " times.");
}


Expand All @@ -275,13 +275,13 @@ public static HashSet<String> retrieveInternalLinks(String urlId, String sourceU
return null;
} catch ( DocLinkFoundException dlfe) {
if ( !verifyDocLink(urlId, sourceUrl, pageUrl, pageContentType, dlfe) ) // url-logging is handled inside.
handlePageWithNoDocUrls(urlId, sourceUrl, pageUrl, pageDomain, false, true);
handlePageWithNoDocOrDatasetUrls(urlId, sourceUrl, pageUrl, pageDomain, false, true);
return null; // This DocLink is the only docLink we will ever go to get from this page. The sourceUrl is logged inside the called method.
// If this "DocLink" is a DocUrl, then returning "null" here, will trigger the 'PageCrawler.retrieveInternalLinks()' method to exit immediately (and normally).
} catch ( DocLinkInvalidException dlie ) {
//logger.warn("An invalid docLink < " + dlie.getMessage() + " > was found for pageUrl: \"" + pageUrl + "\". Search was stopped."); // DEBUG!
UrlUtils.addOutputData(urlId, sourceUrl, pageUrl, UrlUtils.unreachableDocOrDatasetUrlIndicator, "Discarded in 'PageCrawler.retrieveInternalLinks()' method, as there was an invalid docLink. Its contentType is: '" + pageContentType + "'", null, true, "true", "true", "false", "false", "false", null, "null");
handlePageWithNoDocUrls(urlId, sourceUrl, pageUrl, pageDomain, false, true);
handlePageWithNoDocOrDatasetUrls(urlId, sourceUrl, pageUrl, pageDomain, false, true);
return null;
} catch (DocLinkUnavailableException dlue) {
logger.warn("The docLink was not available inside pageUrl: " + pageUrl);
Expand Down Expand Up @@ -626,7 +626,7 @@ public static boolean checkRemainingInternalLinks(String urlId, String sourceUrl
if ( percentage < leastPercentageOfHitsFromRemainingLinks ) {
logger.warn("The percentage of found docUrls from the remaining links is too low ( " + percentage + "% ). Stop checking the remaining-internalLinks for any pageUrl..");
should_check_remaining_links = false;
handlePageWithNoDocUrls(urlId, sourceUrl, pageUrl, pageDomain, false, false);
handlePageWithNoDocOrDatasetUrls(urlId, sourceUrl, pageUrl, pageDomain, false, false);
return false;
}
}
Expand All @@ -649,7 +649,7 @@ public static boolean checkRemainingInternalLinks(String urlId, String sourceUrl

if ( (++remainingUrlsCounter) > MAX_REMAINING_INTERNAL_LINKS_TO_CONNECT ) { // The counter is incremented only on "aboutToConnect" links, so no need to pre-clean the "remainingLinks"-set.
logger.warn("The maximum limit (" + MAX_REMAINING_INTERNAL_LINKS_TO_CONNECT + ") of remaining links to be connected was reached for pageUrl: \"" + pageUrl + "\". The page was discarded.");
handlePageWithNoDocUrls(urlId, sourceUrl, pageUrl, pageDomain, true, false);
handlePageWithNoDocOrDatasetUrls(urlId, sourceUrl, pageUrl, pageDomain, true, false);
return false;
}

Expand Down Expand Up @@ -694,7 +694,7 @@ public static boolean checkRemainingInternalLinks(String urlId, String sourceUrl
}
}// end for-loop

handlePageWithNoDocUrls(urlId, sourceUrl, pageUrl, pageDomain, false, false);
handlePageWithNoDocOrDatasetUrls(urlId, sourceUrl, pageUrl, pageDomain, false, false);
return false;
}

Expand Down

0 comments on commit 889b478

Please sign in to comment.