Skip to content

Commit

Permalink
- Avoid checking the MetaDocUrl when running in dataset-only retrieval mode.
Browse files Browse the repository at this point in the history

- Rename PageCrawler.handlePageWithNoDocUrls() to "handlePageWithNoDocOrDatasetUrls" and fix log-messages.
  • Loading branch information
LSmyrnaios committed Nov 17, 2024
1 parent 5c32b86 commit 889b478
Show file tree
Hide file tree
Showing 2 changed files with 13 additions and 10 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,9 @@ public static boolean checkAndHandleMetadata(String urlId, String sourceUrl, Str
}
}

if ( !LoaderAndChecker.retrieveDocuments )
return false; // There was no definitive handling, nor we want to check for metaDocUrl, go get the internal-links.

// Check if the docLink is provided in a metaTag and connect to it directly.
String metaDocUrl = null;
if ( (metaDocUrl = getMetaDocUrlFromHTML(pageHtml)) == null ) { // This is mostly the case when the page does not have a docUrl, although not always, so we continue crawling it.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -180,7 +180,7 @@ else if ( firstHTMLlineFromDetectedContentType != null ) {

if ( (++possibleDocOrDatasetUrlsCounter) > MAX_POSSIBLE_DOC_OR_DATASET_LINKS_TO_CONNECT ) {
logger.warn("The maximum limit (" + MAX_POSSIBLE_DOC_OR_DATASET_LINKS_TO_CONNECT + ") of possible doc or dataset links to be connected was reached for pageUrl: \"" + pageUrl + "\". The page was discarded.");
handlePageWithNoDocUrls(urlId, sourceUrl, pageUrl, pageDomain, true, false);
handlePageWithNoDocOrDatasetUrls(urlId, sourceUrl, pageUrl, pageDomain, true, false);
return;
}

Expand Down Expand Up @@ -227,7 +227,7 @@ else if ( firstHTMLlineFromDetectedContentType != null ) {
if ( should_check_remaining_links && !remainingLinks.isEmpty() )
checkRemainingInternalLinks(urlId, sourceUrl, pageUrl, pageDomain, remainingLinks);
else
handlePageWithNoDocUrls(urlId, sourceUrl, pageUrl, pageDomain, false, false);
handlePageWithNoDocOrDatasetUrls(urlId, sourceUrl, pageUrl, pageDomain, false, false);
}


Expand All @@ -240,18 +240,18 @@ else if ( firstHTMLlineFromDetectedContentType != null ) {
* @param hasWarningLogBeenShown
* @param isAlreadyLoggedToOutput
*/
private static void handlePageWithNoDocUrls(String urlId, String sourceUrl, String pageUrl, String pageDomain, boolean hasWarningLogBeenShown, boolean isAlreadyLoggedToOutput)
private static void handlePageWithNoDocOrDatasetUrls(String urlId, String sourceUrl, String pageUrl, String pageDomain, boolean hasWarningLogBeenShown, boolean isAlreadyLoggedToOutput)
{
// If we get here it means that this pageUrl is not a docUrl itself, nor it contains a docUrl..
if ( !hasWarningLogBeenShown )
logger.warn("Page: \"" + pageUrl + "\" does not contain a docUrl.");
logger.warn("Page: \"" + pageUrl + "\" does not contain a " + ArgsUtils.targetUrlType + ".");

UrlTypeChecker.pagesNotProvidingDocUrls.incrementAndGet();
if ( !isAlreadyLoggedToOutput ) // This check is used in error-cases, where we have already logged the Quadruple.
UrlUtils.addOutputData(urlId, sourceUrl, pageUrl, UrlUtils.unreachableDocOrDatasetUrlIndicator, "Logged in 'PageCrawler.visit()' method, as no " + ArgsUtils.targetUrlType + " was found inside.", null, true, "true", "true", "false", "false", "false", null, "null");

if ( ConnSupportUtils.countAndBlockDomainAfterTimes(HttpConnUtils.blacklistedDomains, PageCrawler.timesDomainNotGivingDocUrls, pageDomain, PageCrawler.timesToGiveNoDocUrlsBeforeBlocked, true) )
logger.warn("Domain: \"" + pageDomain + "\" was blocked after giving no docUrls more than " + PageCrawler.timesToGiveNoDocUrlsBeforeBlocked + " times.");
logger.warn("Domain: \"" + pageDomain + "\" was blocked after giving no " + ArgsUtils.targetUrlType + " more than " + PageCrawler.timesToGiveNoDocUrlsBeforeBlocked + " times.");
}


Expand All @@ -275,13 +275,13 @@ public static HashSet<String> retrieveInternalLinks(String urlId, String sourceU
return null;
} catch ( DocLinkFoundException dlfe) {
if ( !verifyDocLink(urlId, sourceUrl, pageUrl, pageContentType, dlfe) ) // url-logging is handled inside.
handlePageWithNoDocUrls(urlId, sourceUrl, pageUrl, pageDomain, false, true);
handlePageWithNoDocOrDatasetUrls(urlId, sourceUrl, pageUrl, pageDomain, false, true);
return null; // This DocLink is the only docLink we will ever go to get from this page. The sourceUrl is logged inside the called method.
// If this "DocLink" is a DocUrl, then returning "null" here, will trigger the 'PageCrawler.retrieveInternalLinks()' method to exit immediately (and normally).
} catch ( DocLinkInvalidException dlie ) {
//logger.warn("An invalid docLink < " + dlie.getMessage() + " > was found for pageUrl: \"" + pageUrl + "\". Search was stopped."); // DEBUG!
UrlUtils.addOutputData(urlId, sourceUrl, pageUrl, UrlUtils.unreachableDocOrDatasetUrlIndicator, "Discarded in 'PageCrawler.retrieveInternalLinks()' method, as there was an invalid docLink. Its contentType is: '" + pageContentType + "'", null, true, "true", "true", "false", "false", "false", null, "null");
handlePageWithNoDocUrls(urlId, sourceUrl, pageUrl, pageDomain, false, true);
handlePageWithNoDocOrDatasetUrls(urlId, sourceUrl, pageUrl, pageDomain, false, true);
return null;
} catch (DocLinkUnavailableException dlue) {
logger.warn("The docLink was not available inside pageUrl: " + pageUrl);
Expand Down Expand Up @@ -626,7 +626,7 @@ public static boolean checkRemainingInternalLinks(String urlId, String sourceUrl
if ( percentage < leastPercentageOfHitsFromRemainingLinks ) {
logger.warn("The percentage of found docUrls from the remaining links is too low ( " + percentage + "% ). Stop checking the remaining-internalLinks for any pageUrl..");
should_check_remaining_links = false;
handlePageWithNoDocUrls(urlId, sourceUrl, pageUrl, pageDomain, false, false);
handlePageWithNoDocOrDatasetUrls(urlId, sourceUrl, pageUrl, pageDomain, false, false);
return false;
}
}
Expand All @@ -649,7 +649,7 @@ public static boolean checkRemainingInternalLinks(String urlId, String sourceUrl

if ( (++remainingUrlsCounter) > MAX_REMAINING_INTERNAL_LINKS_TO_CONNECT ) { // The counter is incremented only on "aboutToConnect" links, so no need to pre-clean the "remainingLinks"-set.
logger.warn("The maximum limit (" + MAX_REMAINING_INTERNAL_LINKS_TO_CONNECT + ") of remaining links to be connected was reached for pageUrl: \"" + pageUrl + "\". The page was discarded.");
handlePageWithNoDocUrls(urlId, sourceUrl, pageUrl, pageDomain, true, false);
handlePageWithNoDocOrDatasetUrls(urlId, sourceUrl, pageUrl, pageDomain, true, false);
return false;
}

Expand Down Expand Up @@ -694,7 +694,7 @@ public static boolean checkRemainingInternalLinks(String urlId, String sourceUrl
}
}// end for-loop

handlePageWithNoDocUrls(urlId, sourceUrl, pageUrl, pageDomain, false, false);
handlePageWithNoDocOrDatasetUrls(urlId, sourceUrl, pageUrl, pageDomain, false, false);
return false;
}

Expand Down

0 comments on commit 889b478

Please sign in to comment.