From 2ab0927609eff70d657a907e72d38987b7ea583e Mon Sep 17 00:00:00 2001 From: LSmyrnaios Date: Wed, 24 Jan 2024 15:33:44 +0200 Subject: [PATCH] Add/modify rules in some url-filter regexes. --- .../publications_retriever/PublicationsRetriever.java | 2 +- .../publications_retriever/util/url/UrlTypeChecker.java | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/main/java/eu/openaire/publications_retriever/PublicationsRetriever.java b/src/main/java/eu/openaire/publications_retriever/PublicationsRetriever.java index 7e3afde..cc597d0 100644 --- a/src/main/java/eu/openaire/publications_retriever/PublicationsRetriever.java +++ b/src/main/java/eu/openaire/publications_retriever/PublicationsRetriever.java @@ -362,7 +362,7 @@ public static void showStatistics(Instant startTime) if ( LoaderAndChecker.useIdUrlPairs && (inputCheckedUrlNum < currentlyLoadedUrls) ) logger.info("Total num of urls (IDs) checked (& connected) from the input was: " + inputCheckedUrlNum - + ". The rest " + notConnectedIDs + " urls (about " + df.format(notConnectedIDs * 100.0 / LoaderAndChecker.numOfIDs) + "%) belonged to duplicate (" + FileUtils.duplicateIdUrlEntries +") or problematic (" + LoaderAndChecker.numOfIDsWithoutAcceptableSourceUrl + ") IDs."); + + ". The rest " + notConnectedIDs + " urls (about " + df.format(notConnectedIDs * 100.0 / LoaderAndChecker.numOfIDs) + "%) belonged to duplicate (" + FileUtils.duplicateIdUrlEntries +") and/or problematic (" + LoaderAndChecker.numOfIDsWithoutAcceptableSourceUrl + ") IDs."); else logger.info("Total num of urls (IDs) checked from the input was: " + inputCheckedUrlNum); diff --git a/src/main/java/eu/openaire/publications_retriever/util/url/UrlTypeChecker.java b/src/main/java/eu/openaire/publications_retriever/util/url/UrlTypeChecker.java index 4735825..9a7d2c4 100644 --- a/src/main/java/eu/openaire/publications_retriever/util/url/UrlTypeChecker.java +++ b/src/main/java/eu/openaire/publications_retriever/util/url/UrlTypeChecker.java @@ -28,7 +28,7 @@ public class UrlTypeChecker public static final Pattern URL_DIRECTORY_FILTER = Pattern.compile("[^/]+://.*/(?:(discover|profile|user|survey|index|media|theme|product|deposit|default|shop|view)/" + docOrDatasetNegativeLookAroundPattern // Avoid blocking these if the url is likely to give a file. - + "|(?:(?:ldap|password)-)?login|auth(?:entication)?\\.|ac[c]?ess(?!\\.)|sign[-]?(?:in|out|up)|session|(?:how-to-)?(:?join[^t]|subscr)|regist(?:er|ration)|submi(?:t|ssion)|(?:post|send|export|(?:wp-)?admin|home|form|career[s]?|company)/|watch|browse|import|bookmark|announcement|feedback|share[^d]|about|(?:[^/]+-)?faq|wiki|news|events|cart|support|(?:site|html)map|documentation|help|license|disclaimer|copyright|(?:site-)?polic(?:y|ies)|privacy|terms|law|principles" + + "|(?:(?:ldap|password)-)?login|auth(?:entication)?\\.|ac[c]?ess(?![./]+)|sign[-]?(?:in|out|up)|session|(?:how-to-)?(:?join[^t]|subscr)|regist(?:er|ration)|submi(?:t|ssion)|(?:post|send|export|(?:wp-)?admin|home|form|career[s]?|company)/|watch|browse|import|bookmark|announcement|feedback|share[^d]|about|(?:[^/]+-)?faq|wiki|news|events|cart|support|(?:site|html)map|documentation|help|license|disclaimer|copyright|(?:site-)?polic(?:y|ies)|privacy|terms|law|principles" + "|(?:my|your|create)?[-]?account|my(?:dspace|selection|cart)|(?:service|help)[-]?desk|settings|fund|aut[h]?or|journal/key|(?:journal-)?editor|author:|(?