From 8b90e0c91fcb08b27fca29e95822fb5e9d7ecb6b Mon Sep 17 00:00:00 2001
From: LSmyrnaios
Date: Wed, 29 Nov 2023 13:33:23 +0200
Subject: [PATCH] Improve MetaDocUrl detection.

---
 .../publications_retriever/crawler/MetaDocUrlsHandler.java | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/main/java/eu/openaire/publications_retriever/crawler/MetaDocUrlsHandler.java b/src/main/java/eu/openaire/publications_retriever/crawler/MetaDocUrlsHandler.java
index e8f7e03..7057352 100644
--- a/src/main/java/eu/openaire/publications_retriever/crawler/MetaDocUrlsHandler.java
+++ b/src/main/java/eu/openaire/publications_retriever/crawler/MetaDocUrlsHandler.java
@@ -19,9 +19,9 @@ public class MetaDocUrlsHandler {
 
 	// Order-independent META_DOC_URL-regex.
 	// ]*[/]?>
-	private static final String metaName = "name=\"(?:[^<]*(?:citation|wkhealth)_pdf|eprints.document)_url\"";
+	private static final String metaName = "name=\"(?:[^<]*(?:(?:citation|wkhealth)(?:_fulltext)?_)?pdf|eprints.document)_url\"";
 	private static final String metaContent = "content=\"(http[^\"]+)\"";
-	public static final Pattern META_DOC_URL = Pattern.compile("]*[/]?>");
+	public static final Pattern META_DOC_URL = Pattern.compile("]*[/]?>", Pattern.CASE_INSENSITIVE);
 
 	public static Pattern COMMON_UNSUPPORTED_META_DOC_OR_DATASET_URL_EXTENSIONS; // Its pattern gets compiled at runtime, only one time, depending on the Datatype.
 	static {