Skip to content

Commit

Permalink
- Improve some regexes.
Browse files Browse the repository at this point in the history
- Fix missing change.
  • Loading branch information
LSmyrnaios committed Nov 28, 2024
1 parent 613e900 commit 7c24c35
Show file tree
Hide file tree
Showing 5 changed files with 9 additions and 10 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ public static void main( String[] args )

logger.info("Starting PublicationsRetriever..");
ConnSupportUtils.setKnownMimeTypes();
UrlTypeChecker.setURLDirectoryFilterRegex();
UrlTypeChecker.setRuntimeInitializedRegexes();

// Check if the user gave the input file in the commandLineArgument, if not, then check for other options.
if ( ArgsUtils.inputStream == null ) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ public class LoaderAndChecker

private static final String dataset_formats = "xls[xbm]?|xlt[x]?|[ct]sv|tab|(?:(?:geo)?j|b)son|(?:x|k|g|nmr|sb|wiley|y[a]?)ml|xsd|o[dt]s|ddi|rdf|[g]?zip|zipx|[rt]ar|[7x]z|[t]?gz|[gb]z[\\d]*|smi[l]?|por|ascii|dta|sav|dat|txt|ti[f]{1,2}|tfw|dwg|nt|fits|feather"
+ "|svg|sas7b(?:dat|ve)|spss|sas|stata|(?:my|postgre)?sql(?:ite)?|bigquery|sh[px]|sb[xn]|prj|dbf|(?:m|acc)db|mif|mat|pcd|bt|n[sc]?[\\d]*|h[\\d]+|hdf[\\d]*|trs|opj|jcamp|fcs|fas(?:ta)?|keys|values|las|rdata|parquet|avro|sql|dcm|gr[i]?b]|rds"
+ "|[p]?cap|dmp|vcf|cbor|biosample|hic|warc|ig[e]?s|sla|dxf|pdb|[sc]df|cif|f(?:ast)?[qa]|apng|sra|vtp|gltf|[sm]tl|ply|abc|md|rtf|ttl|shp|shx|exr|cdf|glb|mtl";
+ "|[p]?cap|dmp|vcf|cbor|biosample|hic|warc|ig[e]?s|sla|dxf|pdb|[sc]df|cif|f(?:ast)?[qa]|apng|sra|vtp|gltf|[sm]tl|ply|abc|md|rtf|ttl|shp|shx|exr|cdf|glb|mtl|kmz|textFile";
public static final Pattern DATASET_URL_FILTER = Pattern.compile(".+(?:dataset[s]?/.*|(?:\\.|format=)" + dataset_formats + "(?:\\?.+)?$)");


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,9 +41,7 @@ public class UrlTypeChecker
// So, we make a new REGEX for these extensions, this time, without a potential argument in the end (e.g. ?id=XXX..), except for the potential "lang".
public static final Pattern PLAIN_PAGE_EXTENSION_FILTER = Pattern.compile(".+(?<!" + docOrDatasetKeywords + ")\\.(?:" + htOrPhpExtensionsPattern+ "|[aj]sp[x]?|jsf|do|asc|cgi|cfm)(?:\\?(?!.*" + docOrDatasetKeywords + ").*)?$"); // We may have this page, which runs a script to return the pdf: "https://www.ijcseonline.org/pdf_paper_view.php?paper_id=4547&48-IJCSE-07375.pdf" or this pdf-internal-link: "https://meetingorganizer.copernicus.org/EGU2020/EGU2020-6296.html?pdf"

public static final Pattern INTERNAL_LINKS_FILE_FORMAT_FILTER = Pattern.compile(".+format=(?:xml|" + htOrPhpExtensionsPattern + "|rss|ris|bib).*"); // This exists as a url-parameter.

// TODO - SET THE ABOVE REGEX AT RUNTIME TO AVOID EXCLUDING XML WHEN THE USER WANTS DATASETS..
// Regex for urls which carry a "format=" url-parameter with one of a fixed set of values (e.g. "rss", "ris", "bib").
// It is not a compile-time constant: it stays null until "setRuntimeInitializedRegexes()" assigns it, since the "xml" value
// is included in the filter only when "LoaderAndChecker.retrieveDatasets" is false.
public static Pattern INTERNAL_LINKS_FILE_FORMAT_FILTER = null; // This includes a filter for url-parameters.

public static final Pattern SPECIFIC_DOMAIN_FILTER = Pattern.compile("[^/]+://[^/]*(?<=[/.])(?:(?<!drive.)google\\.|goo.gl|gstatic|facebook|fb.me|twitter|(?:meta|xing|baidu|t|x|vk).co|insta(?:gram|paper)|tiktok|youtube|vimeo|linkedin|ebay|bing|(?:amazon|[./]analytics)\\.|s.w.org|wikipedia|myspace|yahoo|mail|pinterest|reddit|tumblr"
+ "|www.ccdc.cam.ac.uk|figshare.com/collections/|datadryad.org/stash/dataset/"
Expand Down Expand Up @@ -114,7 +112,7 @@ public class UrlTypeChecker
/**
* This method depends on the initialization of the "LoaderAndChecker.retrieveDatasets" variable, given by the user as cmd-arg, or defined by a service which wraps this software.
* */
public static void setURLDirectoryFilterRegex() {
public static void setRuntimeInitializedRegexes() {
URL_DIRECTORY_FILTER =
Pattern.compile("[^/]+://.*/(?:(discover|profile|user|survey|index|media|theme|product|deposit|default|shop|view)/" + docOrDatasetNegativeLookAroundPattern // Avoid blocking these if the url is likely to give a file.
+ "|(?:(?:ldap|password)-)?login|ac[c]?ess(?![./]+)|sign[-]?(?:in|out|up)|session|(?:how-to-)?(:?join[^t]|subscr)|regist(?:er|ration)|submi(?:t|ssion)|(?:post|send|export|(?:wp-)?admin|home|form|career[s]?|company)/|watch|browse|import|bookmark|announcement|feedback|share[^d]|about|(?:[^/]+-)?faq|wiki|news|events|cart|support|(?:site|html)map|documentation|help|license|disclaimer|copyright|(?:site-)?polic(?:y|ies)(?!.*paper)|privacy|terms|law|principles"
Expand All @@ -137,6 +135,9 @@ public static void setURLDirectoryFilterRegex() {
// We check the above rules, mostly as directories to avoid discarding publications' urls about these subjects. There's "acesso" (single "c") in Portuguese.. Also there's "autore" & "contatto" in Italian.
if ( logger.isTraceEnabled() )
logger.trace("URL_DIRECTORY_FILTER:\n" + URL_DIRECTORY_FILTER);

INTERNAL_LINKS_FILE_FORMAT_FILTER =
Pattern.compile(".+format=(?:" + (!LoaderAndChecker.retrieveDatasets ? "xml|" : "") + htOrPhpExtensionsPattern + "|rss|ris|bib|citation_|events_kml).*");
}


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -199,7 +199,7 @@ public static void main( String[] args )

logger.info("Starting PublicationsRetriever..");
ConnSupportUtils.setKnownMimeTypes();
UrlTypeChecker.setURLDirectoryFilterRegex();
UrlTypeChecker.setRuntimeInitializedRegexes();

// Use testing input/output files.
setInputOutput();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,6 @@ public class UrlChecker {
@Test
public void checkUrlConnectivity()
{
FileUtils.shouldDownloadDocFiles = false; // Default is: "true".

// Here test individual urls.

ArrayList<String> urlList = new ArrayList<>();
Expand Down Expand Up @@ -793,7 +791,7 @@ public void checkUrlConnectivity()

// Set some needed data.
ConnSupportUtils.setKnownMimeTypes();
UrlTypeChecker.setURLDirectoryFilterRegex();
UrlTypeChecker.setRuntimeInitializedRegexes();

ArgsUtils.shouldDownloadDocFiles = true;
ArgsUtils.fileNameType = ArgsUtils.fileNameTypeEnum.idName;
Expand Down

0 comments on commit 7c24c35

Please sign in to comment.