Handle the "restricted"-related full-text access defined inside the h…

…tml-meta-tags. In this case: - Avoid checking the page further (metaDocUrl, internal-links) - Mark the source-url as "re-try-able".
LSmyrnaios · Nov 29, 2023 · ab6978c · ab6978c
1 parent ff70907
commit ab6978c
Show file tree

Hide file tree

Showing 2 changed files with 86 additions and 1 deletion.
diff --git a/src/main/java/eu/openaire/publications_retriever/crawler/MetaDocUrlsHandler.java b/src/main/java/eu/openaire/publications_retriever/crawler/MetaDocUrlsHandler.java
@@ -17,6 +17,20 @@ public class MetaDocUrlsHandler {
 
     private static final Logger logger = LoggerFactory.getLogger(MetaDocUrlsHandler.class);
 
+
+    // Order-independent META_RESTRICTED_ACCESS_RIGHTS-regex.
+    // <meta(?:[^<]*name=\"DC.AccessRights\"[^<]*content=\"restricted\"|[^<]*content=\"restricted\"[^<]*name=\"DC.AccessRights\")[^>]*[/]?>
+    private static final String metaAccessName = "name=\"DC.(?:Access)?Rights\"";
+    private static final String metaAccessContent = "content=\"([^\"]+)\"";
+    // It may be "restricted", "info:eu-repo/semantics/(openAccess|closedAccess|embargoedAccess|restrictedAccess)", "Open Access", etc..
+    public static final Pattern META_RESTRICTED_ACCESS_RIGHTS = Pattern.compile("<(?:(?i)meta)(?:[^<]*" + metaAccessName + "[^<]*" + metaAccessContent + "|[^<]*" + metaAccessContent + "[^<]*" + metaAccessName + ")[^>]*[/]?>", Pattern.CASE_INSENSITIVE);
+
+    public static final Pattern NO_ACCESS_RIGHTS = Pattern.compile(".*(?:(close[d]?|embargo(?:ed)?|restrict(?:ed)?|metadata" + PageCrawler.spaceOrDashes + "only|paid)(?:" + PageCrawler.spaceOrDashes + "access)?|(?:no[t]?|není)" + PageCrawler.spaceOrDashes + "(?:accessible|přístupná)"
+            + "|inaccessible|(?:acceso" + PageCrawler.spaceOrDashes + ")?cerrado).*");
+    // We want the "capturing-group" to know which is the case for each url and write it in the logs. The text given to this regex is made lowercase.
+    // "není přístupná" = "not accessible" in Czech
+
+
     // Order-independent META_DOC_URL-regex.
     // <meta(?:[^<]*name=\"(?:[^<]*(?:citation|wkhealth)_pdf|eprints.document)_url\"[^<]*content=\"(http[^\"]+)\"|[^<]*content=\"(http[^\"]+)\"[^<]*name=\"(?:[^<]*(?:citation|wkhealth)_pdf|eprints.document)_url\")[^>]*[/]?>
     private static final String metaName = "name=\"(?:[^<]*(?:(?:citation|wkhealth)(?:_fulltext)?_)?pdf|eprints.document)_url\"";
@@ -42,6 +56,7 @@ else if ( !LoaderAndChecker.retrieveDocuments )
 
     public static Pattern LOCALHOST_DOMAIN_REPLACEMENT_PATTERN = Pattern.compile("://(?:localhost|127.0.0.1)(?:\\:[\\d]+)?");
 
+    public static AtomicInteger numOfProhibitedAccessPagesFound = new AtomicInteger(0);
     public static AtomicInteger numOfMetaDocUrlsFound = new AtomicInteger(0);
 
 
@@ -59,6 +74,34 @@ else if ( !LoaderAndChecker.retrieveDocuments )
      */
     public static boolean checkIfAndHandleMetaDocUrl(String urlId, String sourceUrl, String pageUrl, String pageDomain, String pageHtml)
     {
+        // Before checking for the MetaDocUrl, check whether this publication is restricted or not. It may have a metaDocUrl, but it will redirect to the landing page.
+        // e.g.: https://le.uwpress.org/content/78/2/260
+
+        // Some websites use upper of mixed-case meta tags, names or contents or even the values.
+        // We cannot make the HTML lower-case or we will make the metaDocUrls invalid.
+        // So the REGEXes have to be case-insensitive..!
+
+        String metaAccessRights = null;
+        if ( (metaAccessRights = getMetaAccessRightsFromHTML(pageHtml)) == null ) { // This is mostly the case when the page does not include any info about "access rights". It may or may not provide access to the docUrl.
+            if ( logger.isTraceEnabled() )
+                logger.trace("Could not retrieve the metaAccessRights for url \"" + pageUrl + "\", continue by checking the metaDocUrl..");
+        } else if ( logger.isTraceEnabled() )
+            logger.trace("metaAccessRights: " + metaAccessRights);  // DEBUG!
+
+        if ( metaAccessRights != null ) {
+            String lowercaseMetaAccessRights = metaAccessRights.toLowerCase();
+            Matcher noAccessRightsMatcher = NO_ACCESS_RIGHTS.matcher(lowercaseMetaAccessRights);
+            if ( noAccessRightsMatcher.matches() ) {
+                String noAccessCase = noAccessRightsMatcher.group(1);
+                if ( (noAccessCase == null) || noAccessCase.isEmpty() )
+                    noAccessCase = "prohibited";    // This is not an "official" status, but a good description of the situation and makes it clear that there was a minor problem when determining the exact reason.
+                logger.debug("The metaAccessRights were found to be \"" + noAccessCase + "\"! Do not check the metaDocUrl, nor crawl the page!");
+                UrlUtils.logOutputData(urlId, sourceUrl, null, UrlUtils.unreachableDocOrDatasetUrlIndicator, "Discarded in 'MetaDocUrlsHandler.checkIfAndHandleMetaDocUrl()' method, as its accessRight were '" + noAccessCase + "'.", null, true, "true", "true", "false", "false", "true", null, "null");
+                numOfProhibitedAccessPagesFound.incrementAndGet();
+                return true;   // This publication has "restricted" metaAccessRights, so it will not be handled in another way. Although, it may be rechecked in the future.
+            }
+        }
+
         // Check if the docLink is provided in a metaTag and connect to it directly.
         String metaDocUrl = null;
         if ( (metaDocUrl = getMetaDocUrlFromHTML(pageHtml)) == null ) { // This is mostly the case when the page does not have a docUrl, although not always, so we continue crawling it.
@@ -158,6 +201,48 @@ public static boolean checkIfAndHandleMetaDocUrl(String urlId, String sourceUrl,
     }
 
 
+    /**
+     * Scan the HTML-code for the metaDocUrl.
+     * It may be located either in the start or at the end of the HTML.
+     * An HTML-Code may be only a few VERY-LONG lines of code, instead of hundreds of "normal-sized" lines.
+     * */
+    public static String getMetaAccessRightsFromHTML(String pageHtml)
+    {
+        Matcher metaAccessRightsMatcher = META_RESTRICTED_ACCESS_RIGHTS.matcher(pageHtml);
+
+        // There may be multiple meta-tags about "access rights". Find them all and concatenate them to check them later.
+
+        final StringBuilder stringBuilder = new StringBuilder(500);
+
+        while ( metaAccessRightsMatcher.find() )
+        {
+            String currentMetaAccessRights = null;
+            //logger.debug("Matched metaAccessRights-line: " + metaAccessRightsMatcher.group(0));	// DEBUG!!
+            try {
+                currentMetaAccessRights = metaAccessRightsMatcher.group(1);
+            } catch ( Exception e ) { logger.error("", e); }
+            if ( currentMetaAccessRights == null ) {
+                try {
+                    currentMetaAccessRights = metaAccessRightsMatcher.group(2);    // Try the other group.
+                } catch ( Exception e ) { logger.error("", e); }
+            }
+            if ( (currentMetaAccessRights != null)
+                    && !currentMetaAccessRights.startsWith("http") && (currentMetaAccessRights.length() <= 200) )
+                stringBuilder.append(currentMetaAccessRights).append(" -- ");
+        }
+
+        if ( stringBuilder.length() == 0 )
+            return null;    // It was not found and so it was not handled. We don't log the sourceUrl, since it will be handled later.
+        else
+            return stringBuilder.toString();
+    }
+
+
+    /**
+     * Scan the HTML-code for the metaDocUrl.
+     * It may be located either in the start or at the end of the HTML.
+     * An HTML-Code may be only a few VERY-LONG lines of code, instead of hundreds of "normal-sized" lines.
+     * */
     public static String getMetaDocUrlFromHTML(String pageHtml)
     {
         Matcher metaDocUrlMatcher = META_DOC_URL.matcher(pageHtml);

diff --git a/src/main/java/eu/openaire/publications_retriever/crawler/PageCrawler.java b/src/main/java/eu/openaire/publications_retriever/crawler/PageCrawler.java
@@ -53,7 +53,7 @@ public class PageCrawler
 	public static boolean should_check_remaining_links = true;	// The remaining links very rarely give docUrls.. so, for time-performance, we can disable them.
 	private static final int MAX_REMAINING_INTERNAL_LINKS_TO_CONNECT = 10;	// The < 10 > is the optimal value, figured out after experimentation.
 
-	private static final String spaceOrDashes = "(?:\\s|%20|-|_)*";	// This includes the encoded space inside the url-string.
+	public static final String spaceOrDashes = "(?:\\s|%20|-|_)*";	// This includes the encoded space inside the url-string.
 
 	public static final Pattern DOCUMENT_TEXT = Pattern.compile("pdf|full" + spaceOrDashes + "text|download|t[ée]l[ée]charger|descargar|texte" + spaceOrDashes + "intégral");