From 50a59d5102c38fffdbc9f113b0cf23373e0f2562 Mon Sep 17 00:00:00 2001 From: LSmyrnaios Date: Tue, 5 Nov 2024 22:40:18 +0200 Subject: [PATCH] - Improve a rule in "PageCrawler.NON_VALID_DOCUMENT"-regex, to avoid losing some fulltexts. - Update dependencies. --- pom.xml | 18 ++---------------- .../crawler/PageCrawler.java | 2 +- 2 files changed, 3 insertions(+), 17 deletions(-) diff --git a/pom.xml b/pom.xml index 1d32158..e24caa0 100644 --- a/pom.xml +++ b/pom.xml @@ -62,7 +62,7 @@ org.apache.maven.plugins maven-surefire-plugin - 3.5.1 + 3.5.2 diff --git a/src/main/java/eu/openaire/publications_retriever/crawler/PageCrawler.java b/src/main/java/eu/openaire/publications_retriever/crawler/PageCrawler.java index 826bb6f..1a5fb2b 100644 --- a/src/main/java/eu/openaire/publications_retriever/crawler/PageCrawler.java +++ b/src/main/java/eu/openaire/publications_retriever/crawler/PageCrawler.java @@ -58,7 +58,7 @@ public class PageCrawler public static final Pattern DOCUMENT_TEXT = Pattern.compile("pdf|full" + spaceOrDashes + "text|download|t[ée]l[ée]charger|descargar|texte" + spaceOrDashes + "intégral"); // The following regex is used both in the text around the links and in the links themselves. Everything should be LOWERCASE, from the regex-rules to the link to be matched against them. - public static final Pattern NON_VALID_DOCUMENT = Pattern.compile(".*(?:[^e]manu[ae]l|(?:\\|\\|" + spaceOrDashes + ")?gu[ií](?:de|a)|directive[s]?|preview|leaflet|agreement(?!.*thesis" + spaceOrDashes + "(?:19|20)[\\d]{2}.*)|accessibility|journal" + spaceOrDashes + "catalog|disclose" + spaceOrDashes + "file|poli(?:c(?:y|ies)(?!.*paper)|tika(?:si)?)" // "policy" can be a lone word or a word after: repository|embargo|privacy|data protection|take down|supplement|access + public static final Pattern NON_VALID_DOCUMENT = Pattern.compile(".*(?:[^e]manu[ae]l|(?:\\|\\|" + spaceOrDashes + ")?gu[ií](?:de|a)|directive[s]?|(?