diff --git a/src/main/java/eu/openaire/publications_retriever/crawler/MetaDocUrlsHandler.java b/src/main/java/eu/openaire/publications_retriever/crawler/MetaDocUrlsHandler.java
index 7057352..d3706a8 100644
--- a/src/main/java/eu/openaire/publications_retriever/crawler/MetaDocUrlsHandler.java
+++ b/src/main/java/eu/openaire/publications_retriever/crawler/MetaDocUrlsHandler.java
@@ -17,6 +17,20 @@ public class MetaDocUrlsHandler {
 
     private static final Logger logger = LoggerFactory.getLogger(MetaDocUrlsHandler.class);
 
+
+    // Order-independent META_RESTRICTED_ACCESS_RIGHTS-regex.
+    // ]*[/]?>
+    private static final String metaAccessName = "name=\"DC.(?:Access)?Rights\"";
+    private static final String metaAccessContent = "content=\"([^\"]+)\"";
+    // It may be "restricted", "info:eu-repo/semantics/(openAccess|closedAccess|embargoedAccess|restrictedAccess)", "Open Access", etc.
+    public static final Pattern META_RESTRICTED_ACCESS_RIGHTS = Pattern.compile("<(?:(?i)meta)(?:[^<]*" + metaAccessName + "[^<]*" + metaAccessContent + "|[^<]*" + metaAccessContent + "[^<]*" + metaAccessName + ")[^>]*[/]?>", Pattern.CASE_INSENSITIVE);
+
+    public static final Pattern NO_ACCESS_RIGHTS = Pattern.compile(".*(?:(close[d]?|embargo(?:ed)?|restrict(?:ed)?|metadata" + PageCrawler.spaceOrDashes + "only|paid)(?:" + PageCrawler.spaceOrDashes + "access)?|(?:no[t]?|není)" + PageCrawler.spaceOrDashes + "(?:accessible|přístupná)"
+            + "|inaccessible|(?:acceso" + PageCrawler.spaceOrDashes + ")?cerrado).*");
+    // We want the capturing-group in order to know which case applies to each url and write it in the logs. The text given to this regex is made lowercase.
+    // "není přístupná" = "not accessible" in Czech
+
+
     // Order-independent META_DOC_URL-regex.
     // ]*[/]?>
     private static final String metaName = "name=\"(?:[^<]*(?:(?:citation|wkhealth)(?:_fulltext)?_)?pdf|eprints.document)_url\"";
@@ -42,6 +56,7 @@ else if ( !LoaderAndChecker.retrieveDocuments )
 
     public static Pattern LOCALHOST_DOMAIN_REPLACEMENT_PATTERN = Pattern.compile("://(?:localhost|127.0.0.1)(?:\\:[\\d]+)?");
 
+    public static AtomicInteger numOfProhibitedAccessPagesFound = new AtomicInteger(0);
     public static AtomicInteger numOfMetaDocUrlsFound = new AtomicInteger(0);
 
 
@@ -59,6 +74,34 @@
      */
     public static boolean checkIfAndHandleMetaDocUrl(String urlId, String sourceUrl, String pageUrl, String pageDomain, String pageHtml)
     {
+        // Before checking for the metaDocUrl, check whether this publication is restricted or not. It may have a metaDocUrl, but that url will just redirect back to the landing page.
+        // e.g.: https://le.uwpress.org/content/78/2/260
+
+        // Some websites use upper- or mixed-case meta tags, names, contents or even values.
+        // We cannot make the HTML lower-case, or we will make the metaDocUrls invalid.
+        // So the regexes have to be case-insensitive!
+
+        String metaAccessRights = null;
+        if ( (metaAccessRights = getMetaAccessRightsFromHTML(pageHtml)) == null ) {    // This is mostly the case when the page does not include any info about "access rights". It may or may not provide access to the docUrl.
+            if ( logger.isTraceEnabled() )
+                logger.trace("Could not retrieve the metaAccessRights for url \"" + pageUrl + "\", continue by checking the metaDocUrl..");
+        } else if ( logger.isTraceEnabled() )
+            logger.trace("metaAccessRights: " + metaAccessRights);    // DEBUG!
+
+        if ( metaAccessRights != null ) {
+            String lowercaseMetaAccessRights = metaAccessRights.toLowerCase();
+            Matcher noAccessRightsMatcher = NO_ACCESS_RIGHTS.matcher(lowercaseMetaAccessRights);
+            if ( noAccessRightsMatcher.matches() ) {
+                String noAccessCase = noAccessRightsMatcher.group(1);
+                if ( (noAccessCase == null) || noAccessCase.isEmpty() )
+                    noAccessCase = "prohibited";    // This is not an "official" status, but it describes the situation well and makes it clear that there was a minor problem when determining the exact reason.
+                logger.debug("The metaAccessRights were found to be \"" + noAccessCase + "\"! Do not check the metaDocUrl, nor crawl the page!");
+                UrlUtils.logOutputData(urlId, sourceUrl, null, UrlUtils.unreachableDocOrDatasetUrlIndicator, "Discarded in 'MetaDocUrlsHandler.checkIfAndHandleMetaDocUrl()' method, as its accessRights were '" + noAccessCase + "'.", null, true, "true", "true", "false", "false", "true", null, "null");
+                numOfProhibitedAccessPagesFound.incrementAndGet();
+                return true;    // This publication has "restricted" metaAccessRights, so it will not be handled in any other way. However, it may be re-checked in the future.
+            }
+        }
+
         // Check if the docLink is provided in a metaTag and connect to it directly.
         String metaDocUrl = null;
         if ( (metaDocUrl = getMetaDocUrlFromHTML(pageHtml)) == null ) {    // This is mostly the case when the page does not have a docUrl, although not always, so we continue crawling it.
@@ -158,6 +201,48 @@ public static boolean checkIfAndHandleMetaDocUrl(String urlId, String sourceUrl,
     }
 
 
+    /**
+     * Scan the HTML-code for the meta "access rights".
+     * They may be located either in the start or at the end of the HTML.
+     * An HTML-Code may be only a few VERY-LONG lines of code, instead of hundreds of "normal-sized" lines.
+     * */
+    public static String getMetaAccessRightsFromHTML(String pageHtml)
+    {
+        Matcher metaAccessRightsMatcher = META_RESTRICTED_ACCESS_RIGHTS.matcher(pageHtml);
+
+        // There may be multiple meta-tags about "access rights". Find them all and concatenate them to check them later.
+
+        final StringBuilder stringBuilder = new StringBuilder(500);
+
+        while ( metaAccessRightsMatcher.find() )
+        {
+            String currentMetaAccessRights = null;
+            //logger.debug("Matched metaAccessRights-line: " + metaAccessRightsMatcher.group(0));    // DEBUG!!
+            try {
+                currentMetaAccessRights = metaAccessRightsMatcher.group(1);
+            } catch ( Exception e ) { logger.error("", e); }
+            if ( currentMetaAccessRights == null ) {
+                try {
+                    currentMetaAccessRights = metaAccessRightsMatcher.group(2);    // Try the other group.
+                } catch ( Exception e ) { logger.error("", e); }
+            }
+            if ( (currentMetaAccessRights != null)
+                    && !currentMetaAccessRights.startsWith("http") && (currentMetaAccessRights.length() <= 200) )
+                stringBuilder.append(currentMetaAccessRights).append(" -- ");
+        }
+
+        if ( stringBuilder.length() == 0 )
+            return null;    // It was not found and so it was not handled. We don't log the sourceUrl, since it will be handled later.
+        else
+            return stringBuilder.toString();
+    }
+
+
+    /**
+     * Scan the HTML-code for the metaDocUrl.
+     * It may be located either in the start or at the end of the HTML.
+     * An HTML-Code may be only a few VERY-LONG lines of code, instead of hundreds of "normal-sized" lines.
+     * */
     public static String getMetaDocUrlFromHTML(String pageHtml)
     {
         Matcher metaDocUrlMatcher = META_DOC_URL.matcher(pageHtml);
diff --git a/src/main/java/eu/openaire/publications_retriever/crawler/PageCrawler.java b/src/main/java/eu/openaire/publications_retriever/crawler/PageCrawler.java
index 38e7b13..344b1f9 100644
--- a/src/main/java/eu/openaire/publications_retriever/crawler/PageCrawler.java
+++ b/src/main/java/eu/openaire/publications_retriever/crawler/PageCrawler.java
@@ -53,7 +53,7 @@ public class PageCrawler
     public static boolean should_check_remaining_links = true;    // The remaining links very rarely give docUrls.. so, for time-performance, we can disable them.
     private static final int MAX_REMAINING_INTERNAL_LINKS_TO_CONNECT = 10;    // The < 10 > is the optimal value, figured out after experimentation.
 
-    private static final String spaceOrDashes = "(?:\\s|%20|-|_)*";    // This includes the encoded space inside the url-string.
+    public static final String spaceOrDashes = "(?:\\s|%20|-|_)*";    // This includes the encoded space inside the url-string.
 
     public static final Pattern DOCUMENT_TEXT = Pattern.compile("pdf|full" + spaceOrDashes + "text|download|t[ée]l[ée]charger|descargar|texte" + spaceOrDashes + "intégral");
 
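
Reviewer note, illustration only (none of the classes below are part of the patch): the two new patterns in MetaDocUrlsHandler are designed to work as a pair. META_RESTRICTED_ACCESS_RIGHTS extracts the value of a "DC.Rights" / "DC.AccessRights" meta-tag regardless of attribute order, and NO_ACCESS_RIGHTS then decides whether that value signals a closed publication. The standalone sketch below copies the regex strings from the hunks above into a throwaway class; the class name and the sample meta-tag are invented for illustration.

    import java.util.regex.Matcher;
    import java.util.regex.Pattern;

    public class AccessRightsRegexSketch {

        // Copied from PageCrawler.spaceOrDashes (made public by this patch).
        static final String spaceOrDashes = "(?:\\s|%20|-|_)*";

        // Copied from the MetaDocUrlsHandler hunk above.
        static final String metaAccessName = "name=\"DC.(?:Access)?Rights\"";
        static final String metaAccessContent = "content=\"([^\"]+)\"";
        static final Pattern META_RESTRICTED_ACCESS_RIGHTS = Pattern.compile("<(?:(?i)meta)(?:[^<]*" + metaAccessName + "[^<]*" + metaAccessContent
                + "|[^<]*" + metaAccessContent + "[^<]*" + metaAccessName + ")[^>]*[/]?>", Pattern.CASE_INSENSITIVE);
        static final Pattern NO_ACCESS_RIGHTS = Pattern.compile(".*(?:(close[d]?|embargo(?:ed)?|restrict(?:ed)?|metadata" + spaceOrDashes + "only|paid)(?:" + spaceOrDashes + "access)?"
                + "|(?:no[t]?|není)" + spaceOrDashes + "(?:accessible|přístupná)|inaccessible|(?:acceso" + spaceOrDashes + ")?cerrado).*");

        public static void main(String[] args) {
            // Invented example of a restricted landing page's meta-tag.
            String pageHtml = "<meta name=\"DC.Rights\" content=\"info:eu-repo/semantics/closedAccess\" />";

            Matcher metaMatcher = META_RESTRICTED_ACCESS_RIGHTS.matcher(pageHtml);
            if ( metaMatcher.find() ) {
                String accessRights = metaMatcher.group(1);    // "info:eu-repo/semantics/closedAccess"; group(2) would hold it if "content" preceded "name".
                Matcher noAccessMatcher = NO_ACCESS_RIGHTS.matcher(accessRights.toLowerCase());
                if ( noAccessMatcher.matches() )
                    System.out.println("No-access case: " + noAccessMatcher.group(1));    // Prints "closed".
            }
        }
    }

When the restrictive keyword sits in one of the non-capturing alternatives (e.g. "not accessible" or "cerrado"), group(1) of NO_ACCESS_RIGHTS is null, which is why checkIfAndHandleMetaDocUrl() falls back to the "prohibited" label.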
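
The new getMetaAccessRightsFromHTML() does not stop at the first matching meta-tag: it concatenates every "access rights" value it finds, separated by " -- ", and the caller checks the combined string against NO_ACCESS_RIGHTS. Below is a simplified sketch of that loop; it reuses the pattern declared in the previous sketch, omits the patch's "http"-prefix and 200-character filters, and uses an invented HTML fragment.

    import java.util.regex.Matcher;

    public class MultipleAccessRightsSketch {

        public static void main(String[] args) {
            // Invented fragment with two "access rights" meta-tags, one of them in "content-first" attribute order.
            String pageHtml = "<meta content=\"Open Access\" name=\"DC.Rights\"/>\n"
                    + "<meta name=\"DC.AccessRights\" content=\"info:eu-repo/semantics/embargoedAccess\"/>";

            Matcher matcher = AccessRightsRegexSketch.META_RESTRICTED_ACCESS_RIGHTS.matcher(pageHtml);
            StringBuilder stringBuilder = new StringBuilder();
            while ( matcher.find() ) {
                String rights = (matcher.group(1) != null) ? matcher.group(1) : matcher.group(2);    // group(2) covers the "content" before "name" order.
                stringBuilder.append(rights).append(" -- ");    // Same " -- " separator as in the patch.
            }
            System.out.println(stringBuilder);    // Prints: Open Access -- info:eu-repo/semantics/embargoedAccess -- 
        }
    }

Since NO_ACCESS_RIGHTS only needs one restrictive keyword anywhere in the lower-cased concatenation, the page above would be flagged as "embargoed" even though one of its tags says "Open Access".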
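
Finally, the only PageCrawler change is widening the visibility of spaceOrDashes, so that NO_ACCESS_RIGHTS can reuse the same separator class ("metadata only", "metadata-only", "metadata_only" and "metadata%20only" all collapse into one alternative). A small sketch of what that separator buys, shown here with the existing DOCUMENT_TEXT pattern; the link-text samples are invented.

    import java.util.regex.Pattern;

    public class SpaceOrDashesSketch {

        // Copied from PageCrawler; "%20" is the URL-encoded space, so the pattern also works inside raw url-strings.
        static final String spaceOrDashes = "(?:\\s|%20|-|_)*";

        static final Pattern DOCUMENT_TEXT = Pattern.compile("pdf|full" + spaceOrDashes + "text|download|t[ée]l[ée]charger|descargar|texte" + spaceOrDashes + "intégral");

        public static void main(String[] args) {
            String[] samples = { "full text", "full-text", "full%20text", "fulltext", "Télécharger", "view online" };
            for ( String sample : samples )
                System.out.println(sample + " -> " + DOCUMENT_TEXT.matcher(sample.toLowerCase()).find());
            // The first five print "true"; "view online" prints "false".
        }
    }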