From 7c24c356731d536bf1fe7ba0b6a55c3628c8c43f Mon Sep 17 00:00:00 2001 From: LSmyrnaios Date: Thu, 28 Nov 2024 18:54:11 +0200 Subject: [PATCH] - Improve some regexes. - Fix missing change. --- .../publications_retriever/PublicationsRetriever.java | 2 +- .../util/url/LoaderAndChecker.java | 2 +- .../publications_retriever/util/url/UrlTypeChecker.java | 9 +++++---- .../test/TestNonStandardInputOutput.java | 2 +- .../openaire/publications_retriever/test/UrlChecker.java | 4 +--- 5 files changed, 9 insertions(+), 10 deletions(-) diff --git a/src/main/java/eu/openaire/publications_retriever/PublicationsRetriever.java b/src/main/java/eu/openaire/publications_retriever/PublicationsRetriever.java index bd8ac25..3e49f31 100644 --- a/src/main/java/eu/openaire/publications_retriever/PublicationsRetriever.java +++ b/src/main/java/eu/openaire/publications_retriever/PublicationsRetriever.java @@ -70,7 +70,7 @@ public static void main( String[] args ) logger.info("Starting PublicationsRetriever.."); ConnSupportUtils.setKnownMimeTypes(); - UrlTypeChecker.setURLDirectoryFilterRegex(); + UrlTypeChecker.setRuntimeInitializedRegexes(); // Check if the user gave the input file in the commandLineArgument, if not, then check for other options. if ( ArgsUtils.inputStream == null ) { diff --git a/src/main/java/eu/openaire/publications_retriever/util/url/LoaderAndChecker.java b/src/main/java/eu/openaire/publications_retriever/util/url/LoaderAndChecker.java index e21fd98..ef1acb0 100644 --- a/src/main/java/eu/openaire/publications_retriever/util/url/LoaderAndChecker.java +++ b/src/main/java/eu/openaire/publications_retriever/util/url/LoaderAndChecker.java @@ -42,7 +42,7 @@ public class LoaderAndChecker private static final String dataset_formats = "xls[xbm]?|xlt[x]?|[ct]sv|tab|(?:(?:geo)?j|b)son|(?:x|k|g|nmr|sb|wiley|y[a]?)ml|xsd|o[dt]s|ddi|rdf|[g]?zip|zipx|[rt]ar|[7x]z|[t]?gz|[gb]z[\\d]*|smi[l]?|por|ascii|dta|sav|dat|txt|ti[f]{1,2}|tfw|dwg|nt|fits|feather" + "|svg|sas7b(?:dat|ve)|spss|sas|stata|(?:my|postgre)?sql(?:ite)?|bigquery|sh[px]|sb[xn]|prj|dbf|(?:m|acc)db|mif|mat|pcd|bt|n[sc]?[\\d]*|h[\\d]+|hdf[\\d]*|trs|opj|jcamp|fcs|fas(?:ta)?|keys|values|las|rdata|parquet|avro|sql|dcm|gr[i]?b]|rds" - + "|[p]?cap|dmp|vcf|cbor|biosample|hic|warc|ig[e]?s|sla|dxf|pdb|[sc]df|cif|f(?:ast)?[qa]|apng|sra|vtp|gltf|[sm]tl|ply|abc|md|rtf|ttl|shp|shx|exr|cdf|glb|mtl"; + + "|[p]?cap|dmp|vcf|cbor|biosample|hic|warc|ig[e]?s|sla|dxf|pdb|[sc]df|cif|f(?:ast)?[qa]|apng|sra|vtp|gltf|[sm]tl|ply|abc|md|rtf|ttl|shp|shx|exr|cdf|glb|mtl|kmz|textFile"; public static final Pattern DATASET_URL_FILTER = Pattern.compile(".+(?:dataset[s]?/.*|(?:\\.|format=)" + dataset_formats + "(?:\\?.+)?$)"); diff --git a/src/main/java/eu/openaire/publications_retriever/util/url/UrlTypeChecker.java b/src/main/java/eu/openaire/publications_retriever/util/url/UrlTypeChecker.java index d91136b..8403951 100644 --- a/src/main/java/eu/openaire/publications_retriever/util/url/UrlTypeChecker.java +++ b/src/main/java/eu/openaire/publications_retriever/util/url/UrlTypeChecker.java @@ -41,9 +41,7 @@ public class UrlTypeChecker // So, we make a new REGEX for these extensions, this time, without a potential argument in the end (e.g. ?id=XXX..), except for the potential "lang". public static final Pattern PLAIN_PAGE_EXTENSION_FILTER = Pattern.compile(".+(? urlList = new ArrayList<>(); @@ -793,7 +791,7 @@ public void checkUrlConnectivity() // Set some needed data. ConnSupportUtils.setKnownMimeTypes(); - UrlTypeChecker.setURLDirectoryFilterRegex(); + UrlTypeChecker.setRuntimeInitializedRegexes(); ArgsUtils.shouldDownloadDocFiles = true; ArgsUtils.fileNameType = ArgsUtils.fileNameTypeEnum.idName;