Commit f37a8e5
- Improve various regexes.
- Code polishing.
LSmyrnaios committed Dec 5, 2024
1 parent dd1d110 commit f37a8e5
Showing 4 changed files with 31 additions and 30 deletions.
@@ -44,12 +44,12 @@ public class MetadataHandler {
String regex = ".+\\.(?:";

if ( !LoaderAndChecker.retrieveDatasets )
regex += "zip|rar|"; // If no datasets retrieved, block these types.
regex += LoaderAndChecker.dataset_formats; // If no datasets retrieved, block these types.
else if ( !LoaderAndChecker.retrieveDocuments )
regex += "pdf|doc[x]?|"; // If no documents retrieved, block these types.
regex += "pdf|" + UrlTypeChecker.unsupportedDocFileTypes; // If no documents retrieved, block these types.
//else -> no more datatype-dependent additions

regex += "apk|jpg|png)(?:\\?.+)?$";
regex += "|apk|jpg|png)(?:\\?.+)?$";
logger.debug("COMMON_UNSUPPORTED_META_DOC_OR_DATASET_URL_EXTENSIONS -> REGEX: " + regex);
COMMON_UNSUPPORTED_META_DOC_OR_DATASET_URL_EXTENSIONS = Pattern.compile(regex);
}
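
Since this pattern is assembled at runtime from fragments defined in two other classes, a standalone sketch (not project code) can confirm that each flag combination still compiles; the fragment constants are copied from the diffs below, with dataset_formats shortened for brevity. The documents-blocked branch is worth checking in particular, since unsupportedDocFileTypes already carries a closing parenthesis before the "|apk|jpg|png)" suffix adds another.

import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;

public class RegexCompositionCheck {
	// Shortened stand-in for LoaderAndChecker.dataset_formats (wrapped in "(?:...)" after this commit).
	static final String datasetFormats = "(?:xls[xbm]?|[ct]sv|zip|rar)";
	// Copied from UrlTypeChecker below; it deliberately ends with a closing ')'.
	static final String unsupportedDocFileTypes = "(?:doc|ppt)[x]?|ps|epub|od[tp]|djvu|rtf)";

	public static void main(String[] args) {
		String[][] cases = {
			{ "datasets blocked",  ".+\\.(?:" + datasetFormats + "|apk|jpg|png)(?:\\?.+)?$" },
			{ "documents blocked", ".+\\.(?:" + "pdf|" + unsupportedDocFileTypes + "|apk|jpg|png)(?:\\?.+)?$" },
			{ "nothing blocked",   ".+\\.(?:" + "|apk|jpg|png)(?:\\?.+)?$" }
		};
		for ( String[] c : cases ) {
			try {
				Pattern.compile(c[1]);
				System.out.println(c[0] + " -> compiles: " + c[1]);
			} catch (PatternSyntaxException pse) {
				System.out.println(c[0] + " -> INVALID: " + pse.getMessage());
			}
		}
	}
}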
@@ -40,9 +40,9 @@ public class LoaderAndChecker
// "DOC_URL_FILTER" works for lowerCase Strings (we make sure they are in lowerCase before we check).
// Note that we still need to check if it's an alive link and if it's actually a docUrl (through its mimeType).

- private static final String dataset_formats = "xls[xbm]?|xlt[x]?|[ct]sv|tab|(?:(?:geo)?j|b)son|(?:x|k|g|nmr|sb|wiley|y[a]?)ml|xsd|o[dt]s|ddi|rdf|[g]?zip|zipx|[rt]ar|[7x]z|[t]?gz|[gb]z[\\d]*|smi[l]?|por|ascii|dta|sav|dat|txt|ti[f]{1,2}|tfw|dwg|nt|fits|feather"
+ public static final String dataset_formats = "(?:xls[xbm]?|xlt[x]?|[ct]sv|tab|(?:(?:geo)?j|b)son|(?:x|k|g|nmr|sb|wiley|y[a]?)ml|xsd|o[dt]s|ddi|rdf|[g]?zip|zipx|[rt]ar|[7x]z|[t]?gz|[gb]z[\\d]*|smi[l]?|por|ascii|dta|sav|dat|txt|ti[f]{1,2}|tfw|dwg|nt|fits|feather"
+ "|svg|sas7b(?:dat|ve)|spss|sas|stata|(?:my|postgre)?sql(?:ite)?|bigquery|sh[px]|sb[xn]|prj|dbf|(?:m|acc)db|mif|mat|pcd|bt|n[sc]?[\\d]*|h[\\d]+|hdf[\\d]*|trs|opj|jcamp|fcs|fas(?:ta)?|keys|values|las|rdata|parquet|avro|sql|dcm|gr[i]?b]|rds"
+ "|[p]?cap|dmp|vcf|cbor|biosample|hic|warc|ig[e]?s|sla|dxf|pdb|[sc]df|cif|f(?:ast)?[qa]|apng|sra|vtp|gltf|[sm]tl|ply|abc|md|rtf|ttl|shp|shx|exr|cdf|glb|mtl|kmz|textFile";
+ "|[p]?cap|dmp|vcf|cbor|biosample|hic|warc|ig[e]?s|sla|dxf|pdb|[sc]df|cif|f(?:ast)?[qa]|apng|sra|vtp|gltf|[sm]tl|ply|abc|md|rtf|ttl|shp|shx|exr|cdf|glb|mtl|kmz|textFile)";
public static final Pattern DATASET_URL_FILTER = Pattern.compile(".+(?:dataset[s]?/.*|(?:\\.|format=)" + dataset_formats + "(?:\\?.+)?$)");
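
A short usage sketch for the widened filter (sample URLs are made up; assumes the project classes are on the classpath):

// Sketch only; the sample URLs are hypothetical.
System.out.println(LoaderAndChecker.DATASET_URL_FILTER.matcher("https://example.org/files/results.parquet").matches()); // true: ".parquet" is in dataset_formats
System.out.println(LoaderAndChecker.DATASET_URL_FILTER.matcher("https://example.org/download?format=csv").matches());   // true: matched via the "format=" branch
System.out.println(LoaderAndChecker.DATASET_URL_FILTER.matcher("https://example.org/article/view/123").matches());      // false: no dataset directory or extension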


@@ -19,7 +19,7 @@ public class UrlTypeChecker
{
private static final Logger logger = LoggerFactory.getLogger(UrlTypeChecker.class);

- private static final String htOrPhpExtensionsPattern = "(?:[\\w]?ht(?:[\\w]{1,2})?|php[\\d]?)";
+ private static final String htOrPhpExtensionsPattern = "(?:[\\w]?ht(?:[\\w]{1,2})?|php[\\d]{0,2})";
private static final String mediaExtensionsPattern = "ico|gif|jpg|jpeg|png|wav|mp3|mp4|webm|mkv|mov";
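
The widened quantifier on the PHP branch means two-digit version suffixes now match as well; a minimal check:

java.util.regex.Pattern p = java.util.regex.Pattern.compile("(?:[\\w]?ht(?:[\\w]{1,2})?|php[\\d]{0,2})");
System.out.println(p.matcher("php").matches());   // true
System.out.println(p.matcher("php7").matches());  // true (already matched by the old "php[\\d]?")
System.out.println(p.matcher("php72").matches()); // true (newly matched by "php[\\d]{0,2}")
System.out.println(p.matcher("shtml").matches()); // true (the ht-branch: "s" + "ht" + "ml")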


@@ -28,9 +28,10 @@ public class UrlTypeChecker
private static final String docOrDatasetNegativeLookAroundPattern = "(?<!" + wordsPattern + docOrDatasetKeywords + wordsPattern + ")(?!.*" + docOrDatasetKeywords + ".*)";
// Note: Up to Java 8, we cannot use the "*" or "+" inside the lookbehind, so we use character-class with limits.
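// Illustration of the bounded-lookbehind workaround (not project code): the real
// "wordsPattern" and "docOrDatasetKeywords" are defined above this hunk, so the
// stand-ins here are hypothetical.
//   String wordsPattern = "[\\w./-]{0,100}";
//   String docOrDatasetKeywords = "(?:file|document|dataset)";
//   Pattern guard = Pattern.compile(".*/view/(?<!" + wordsPattern + docOrDatasetKeywords + wordsPattern + ")(?!.*" + docOrDatasetKeywords + ".*).*");
//   guard.matcher("https://example.org/view/123").matches();          // true  -> blocked
//   guard.matcher("https://example.org/document/view/123").matches(); // false -> spared by the lookbehind
//   guard.matcher("https://example.org/view/dataset/123").matches();  // false -> spared by the lookahead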

- public static Pattern URL_DIRECTORY_FILTER; // Set this regex during runtime to account for the user's preference in selecting to retrieve documents only (not datasets).
+ public static Pattern URL_DIRECTORY_FILTER = null; // Set this regex during runtime to account for the user's preference in selecting to retrieve documents only (not datasets).

- public static final Pattern CURRENTLY_UNSUPPORTED_DOC_EXTENSION_FILTER = Pattern.compile(".+\\.(?:(?:doc|ppt)[x]?|ps|epub|od[tp]|djvu|rtf)(?:\\?.+)?$"); // Doc-extensions which are currently unsupported. Some pageUrls give also .zip files, but that's another story.
+ public static final String unsupportedDocFileTypes = "(?:doc|ppt)[x]?|ps|epub|od[tp]|djvu|rtf)";
+ public static final Pattern CURRENTLY_UNSUPPORTED_DOC_EXTENSION_FILTER = Pattern.compile(".+\\.(?:" + unsupportedDocFileTypes + "(?:\\?.+)?$"); // Doc-extensions which are currently unsupported. Some pageUrls give also .zip files, but that's another story.

public static final Pattern URL_FILE_EXTENSION_FILTER = Pattern.compile(".+\\.(?:css|js(?:\\?y)?|" + mediaExtensionsPattern + "|pt|bib|nt|refer|enw|ris|mso|dtl|do|asc|c|cc" + docOrDatasetNegativeLookAroundPattern + "|cxx|cpp|java|py)(?:\\?.+)?$");
// In the above, don't include .php and relative extensions, since even this can be a docUrl. For example: https://www.dovepress.com/getfile.php?fileID=5337
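
Because the new unsupportedDocFileTypes constant deliberately ends with the ')' that closes the "(?:" opened inside Pattern.compile() above, the composed CURRENTLY_UNSUPPORTED_DOC_EXTENSION_FILTER stays balanced. A quick sketch (sample URLs are illustrative):

String composed = ".+\\.(?:" + "(?:doc|ppt)[x]?|ps|epub|od[tp]|djvu|rtf)" + "(?:\\?.+)?$";
java.util.regex.Pattern filter = java.util.regex.Pattern.compile(composed);
System.out.println(filter.matcher("https://example.org/paper.docx").matches()); // true  (currently-unsupported doc type)
System.out.println(filter.matcher("https://example.org/paper.pdf").matches());  // false (pdf is supported)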
@@ -115,22 +116,22 @@ public class UrlTypeChecker
* */
public static void setRuntimeInitializedRegexes() {
URL_DIRECTORY_FILTER =
Pattern.compile("[^/]+://.*/(?:(discover|profile|user|survey|index|media|theme|product|deposit|default|shop|view)/" + docOrDatasetNegativeLookAroundPattern // Avoid blocking these if the url is likely to give a file.
Pattern.compile("[^/]+://.*/(?:(?:(?:(?:discover|profile|user|survey|index|media|theme|product|deposit|default|shop|view)/" + docOrDatasetNegativeLookAroundPattern // Avoid blocking these if the url is likely to give a file.
+ "|(?:(?:ldap|password)-)?login|ac[c]?ess(?![./]+)|sign[-]?(?:in|out|up)|session|(?:how-to-)?(:?join[^t]|subscr)|regist(?:er|ration)|submi(?:t|ssion)|(?:post|send|export|(?:wp-)?admin|home|form|career[s]?|company)/|watch|browse|import|bookmark|announcement|feedback|share[^d]|about|(?:[^/]+-)?faq|wiki|news|events|cart|support|(?:site|html)map|documentation|help|license|disclaimer|copyright|(?:site-)?polic(?:y|ies)(?!.*paper)|privacy|terms|law|principles"
+ "|(?:my|your|create)?[-]?account|my(?:dspace|selection|cart)|(?:service|help)[-]?desk|settings|fund|aut[h]?or" + docOrDatasetNegativeLookAroundPattern + "|journal/key|(?:journal-)?editor|author:|(?<!ntrs.nasa.gov/(?:api/)?)citation|review|external|facets|statistics|application|selfarchive|permission|ethic(s)?/.*/view/|/view/" + docOrDatasetNegativeLookAroundPattern + "|conta[c]?t|wallet|contribute|donate|our[_-][\\w]+|template|logo|image|photo/|video|advertiser|most-popular|people|(?:the)?press|for-authors|customer-service[s]?|captcha|clipboard|dropdown|widget"
+ "|(?:forum|blog|column|row|js|css|rss|legal)/" // These are absolute directory names. TODO - Should I add the "|citation[s]?" rule ? BUT, The NASA-docUrls include it normally..
+ "|(?:forum|blog|column|row|js|[cr]ss|legal)/" // These are absolute directory names. TODO - Should I add the "|citation[s]?" rule ? BUT, The NASA-docUrls include it normally..
+ "|(?:(?:advanced[-]?)?search|search/advanced|search-results|(?:[e]?books|journals)(?:-catalog)?|issue|docs|oai|(?:abstracting-)?indexing|online[-]?early|honors|awards|meetings|calendar|diversity|scholarships|invo(?:ice|lved)|errata|classroom|publish(?:-with-us)?|upload|products|forgot|home|ethics|comics|podcast|trends|bestof|booksellers|recommendations|bibliographic|volume[s]?)[/]?$" // Url ends with these. Note that some of them are likely to be part of a docUrl, for ex. the "/trends/"-dir.
+ "|rights[-]?permissions|publication[-]?ethics|advertising|reset[-]?password|\\*/|communit(?:y|ies)"
+ "|restricted|noaccess|crawlprevention|error|(?:mis|ab)use|\\?denied|gateway|defaultwebpage|sorryserver|(?<!response_type=)cookie|(?:page-)?not[-]?found"
+ "|(?:404(?:_response)?|accessibility|invalid|catalog(?:ue|ar|o)?)\\." + htOrPhpExtensionsPattern
+ "|(?:(?:error)?404(?:_response)?|accessibility|invalid|catalog(?:ue|ar|o)?)\\." + htOrPhpExtensionsPattern

// Add pages with a specific blocking-reason, in "capturing-groups", in order to be able to get the matched-group-number and know the exact reason the block occurred.
+ "|(.*/view/" + docOrDatasetNegativeLookAroundPattern + ")" // 1. Avoid pages having their DocUrls in larger depth (internalPagesToDocUrls or PreviousOfDocUrls).
+ "|(.*sharedsitesession)" // 2. Avoid urls which contain either "getSharedSiteSession" or "consumeSharedSiteSession" as these cause an infinite loop.
+ "|(doi.org/https://doi.org/.*pangaea." + (!LoaderAndChecker.retrieveDatasets ? "|pangaea.)" : ")") // 3. Avoid "PANGAEA."-urls with problematic form and non docUrl internal links (yes WITH the "DOT", it's not a domain-name!).

// The following pattern is the reason we need to set this regex in runtime.
- + (!LoaderAndChecker.retrieveDatasets ? "|(?:bibtext|dc(?:terms)?|[^/]*(?:tei|endnote))$)" : ").*")
+ + (!LoaderAndChecker.retrieveDatasets ? ").*)|(?:bibtext|dc(?:terms)?|[^/]*(?:tei|endnote))$)" : ").*")
);

// We check the above rules, mostly as directories to avoid discarding publications' urls about these subjects. There's "acesso" (single "c") in Portuguese.. Also there's "autore" & "contatto" in Italian.
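
A minimal usage sketch (assuming the retrieve-flags are writable from the caller): after this commit URL_DIRECTORY_FILTER is explicitly null until the flags are known, so callers must invoke setRuntimeInitializedRegexes() before using it.

LoaderAndChecker.retrieveDatasets = false;      // assumption: the flag is settable here
UrlTypeChecker.setRuntimeInitializedRegexes();  // builds URL_DIRECTORY_FILTER for this configuration
boolean blocked = UrlTypeChecker.URL_DIRECTORY_FILTER
		.matcher("https://example.org/profile/settings".toLowerCase()).matches();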
@@ -34,7 +34,7 @@ public class UrlUtils
public static final Pattern ANCHOR_FILTER = Pattern.compile("(.+)(#(?!/).+)"); // Remove the anchor at the end of the url to avoid duplicate versions. (anchors might exist even in docUrls themselves)
// Note that we may have this: https://academic.microsoft.com/#/detail/2945595536 (these urls are dead now, but others like it , may exist)
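
A small sketch of the anchor-stripping behavior (the sample URL is made up):

java.util.regex.Matcher m = UrlUtils.ANCHOR_FILTER.matcher("https://example.org/paper.pdf#page=3");
if ( m.matches() )
	System.out.println(m.group(1)); // "https://example.org/paper.pdf" - the "#page=3" anchor is dropped
// "#/"-style fragments (as in the academic.microsoft.com example above) are kept, thanks to the "(?!/)" lookahead.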

- public static AtomicInteger sumOfDocUrlsFound = new AtomicInteger(0); // Change it back to simple int if finally in singleThread mode
+ public static AtomicInteger sumOfDocUrlsFound = new AtomicInteger(0);

public static final Set<String> duplicateUrls = Collections.newSetFromMap(new ConcurrentHashMap<String, Boolean>());

@@ -53,7 +53,7 @@ public class UrlUtils
* @param urlId (it may be null if no id was provided in the input)
* @param sourceUrl
* @param pageUrl
- * @param docUrl
+ * @param docOrDatasetUrl
* @param comment
* @param pageDomain (it may be null)
* @param isFirstCrossed
@@ -65,24 +65,24 @@
* @param fileSize
* @param fileHash
*/
- public static void addOutputData(String urlId, String sourceUrl, String pageUrl, String docUrl, String comment, String pageDomain,
+ public static void addOutputData(String urlId, String sourceUrl, String pageUrl, String docOrDatasetUrl, String comment, String pageDomain,
boolean isFirstCrossed, String wasUrlChecked, String wasUrlValid, String wasDocumentOrDatasetAccessible, String wasDirectLink, String couldRetry, Long fileSize, String fileHash)
{
- String finalDocUrl = docUrl;
+ String finalDocOrDatasetUrl = docOrDatasetUrl;

- if ( !finalDocUrl.equals(duplicateUrlIndicator) )
+ if ( !finalDocOrDatasetUrl.equals(duplicateUrlIndicator) )
{
- if ( !finalDocUrl.equals(unreachableDocOrDatasetUrlIndicator) )
+ if ( !finalDocOrDatasetUrl.equals(unreachableDocOrDatasetUrlIndicator) )
{
sumOfDocUrlsFound.incrementAndGet();

// Remove the "temporalId" from urls for "cleaner" output and "already found docUrl"-matching. These IDs will expire eventually anyway.
String lowerCaseUrl = finalDocUrl.toLowerCase();
// Remove the "temporalId" from urls for "cleaner" output and "already found docOrDatasetUrl"-matching. These IDs will expire eventually anyway.
String lowerCaseUrl = finalDocOrDatasetUrl.toLowerCase();
if ( lowerCaseUrl.contains("token") || lowerCaseUrl.contains("jsessionid") )
- finalDocUrl = UrlUtils.removeTemporalIdentifier(finalDocUrl); // We send the non-lowerCase-url as we may want to continue with that docUrl in case of an error.
+ finalDocOrDatasetUrl = UrlUtils.removeTemporalIdentifier(finalDocOrDatasetUrl); // We send the non-lowerCase-url as we may want to continue with that docOrDatasetUrl in case of an error.

- if ( isFirstCrossed ) // Add this id, only if this is a first-crossed docUrl.
- docOrDatasetUrlsWithIDs.put(finalDocUrl, new IdUrlTuple(urlId, sourceUrl)); // Add it here, in order to be able to recognize it and quick-log it later, but also to distinguish it from other duplicates.
+ if ( isFirstCrossed ) // Add this id, only if this is a first-crossed docOrDatasetUrl.
+ docOrDatasetUrlsWithIDs.put(finalDocOrDatasetUrl, new IdUrlTuple(urlId, sourceUrl)); // Add it here, in order to be able to recognize it and quick-log it later, but also to distinguish it from other duplicates.

if ( pageDomain == null )
pageDomain = UrlUtils.getDomainStr(pageUrl, null);
@@ -91,26 +91,26 @@ public static void addOutputData(String urlId, String sourceUrl, String pageUrl,
{
// Gather data for the MLA, if we decide to have it enabled.
if ( MachineLearning.useMLA )
- MachineLearning.gatherMLData(pageUrl, finalDocUrl, pageDomain);
+ MachineLearning.gatherMLData(pageUrl, finalDocOrDatasetUrl, pageDomain);

- // Add the domains of the pageUrl and the finalDocUrl to the successful domains as both lead in some way to a docUrl.
+ // Add the domains of the pageUrl and the finalDocOrDatasetUrl to the successful domains as both lead in some way to a docOrDatasetUrl.
// The data inside ConcurrentHashMap "domainsAndHits" is used to evaluate how good the domain is doing while is having some problems.
// If the "goods" surpass the "bads", then that domain will not get blocked, even if the "minimum-accepted-bad-cases" was exceeded.
ConnSupportUtils.countInsertAndGetTimes(domainsAndHits, pageDomain);

// Now if the "finalDocUrl" is different from the "pageUrl", get the domain of the "finalDocUrl" and if it's different, then add it to "domainsAndHits"-HashMap.
if ( !pageUrl.equals(finalDocUrl) ) {
String docUrlDomain = UrlUtils.getDomainStr(finalDocUrl, null);
// Now if the "finalDocOrDatasetUrl" is different from the "pageUrl", get the domain of the "finalDocOrDatasetUrl" and if it's different, then add it to "domainsAndHits"-HashMap.
if ( !pageUrl.equals(finalDocOrDatasetUrl) ) {
String docUrlDomain = UrlUtils.getDomainStr(finalDocOrDatasetUrl, null);
if ( (docUrlDomain != null) && !docUrlDomain.equals(pageDomain) )
ConnSupportUtils.countInsertAndGetTimes(domainsAndHits, docUrlDomain);
}
}
}
- else // Else if this url is not a docUrl and has not been processed before..
+ else // Else if this url is not a docOrDatasetUrl and has not been processed before..
duplicateUrls.add(sourceUrl); // Add it in duplicates BlackList, in order not to be accessed for 2nd time in the future. We don't add docUrls here, as we want them to be separate for checking purposes.
}

- FileUtils.dataForOutput.add(new DataForOutput(urlId, sourceUrl, pageUrl, finalDocUrl, wasUrlChecked, wasUrlValid, wasDocumentOrDatasetAccessible, wasDirectLink, couldRetry, fileHash, fileSize, comment)); // Log it to be written later in the outputFile.
+ FileUtils.dataForOutput.add(new DataForOutput(urlId, sourceUrl, pageUrl, finalDocOrDatasetUrl, wasUrlChecked, wasUrlValid, wasDocumentOrDatasetAccessible, wasDirectLink, couldRetry, fileHash, fileSize, comment)); // Log it to be written later in the outputFile.
}
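
A hedged call sketch with the renamed parameter; the literal values below are illustrative, not the project's actual indicator strings:

UrlUtils.addOutputData(
	"id-123",                                 // urlId (may be null)
	"https://example.org/record/123",         // sourceUrl
	"https://example.org/record/123/view",    // pageUrl
	"https://example.org/files/data.csv",     // docOrDatasetUrl
	"", null,                                 // comment; pageDomain (derived from pageUrl when null)
	true,                                     // isFirstCrossed
	"true", "true", "true", "false", "false", // wasUrlChecked, wasUrlValid, wasDocumentOrDatasetAccessible, wasDirectLink, couldRetry
	1024L, "illustrative-hash"                // fileSize, fileHash
);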


