Skip to content

Commit

Permalink
Avoid discarding some URLs that include the word "joint".
Browse files Browse the repository at this point in the history
  • Loading branch information
LSmyrnaios committed Jan 12, 2024
1 parent 10e5ce1 commit db41c5b
Show file tree
Hide file tree
Showing 2 changed files with 2 additions and 2 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ public class PageCrawler
// The following regex is applied both to the text around the links and to the links themselves. Everything must be LOWERCASE: both the regex rules and the link being matched against them.
public static final Pattern NON_VALID_DOCUMENT = Pattern.compile(".*(?:[^e]manu[ae]l|(?:\\|\\|" + spaceOrDashes + ")?gu[ií](?:de|a)|directive[s]?|preview|leaflet|agreement(?!.*thesis" + spaceOrDashes + "(?:19|20)[\\d]{2}.*)|accessibility|journal" + spaceOrDashes + "catalog|disclose" + spaceOrDashes + "file|poli(?:c(?:y|ies)|tika(?:si)?)" // "policy" can be a lone word or a word after: repository|embargo|privacy|data protection|take down|supplement|access
// We may have the "Emanuel" writer's name in the url-string. Also, we may have the "agreement"-keyword in a valid pub-url like: https://irep.ntu.ac.uk/id/eprint/40188/1/__Opel.ads.ntu.ac.uk_IRep-PGR%24_2020%20Theses%20and%20deposit%20agreement%20forms_BLSS_NBS_FARRIER-WILLIAMS%2C%20Elizabeth_EFW%20Thesis%202020.pdf
+ "|licen(?:se|cia)" + spaceOrDashes + "(?:of|de)" + spaceOrDashes + "us[eo]|(?:governance|safety)" + spaceOrDashes + "statement|normativa|(?:consumer|hazard|copyright)" + spaceOrDashes + "(?:information|(?:release" + spaceOrDashes + ")?form)|copyright|permission|(?:editorial|review)" + spaceOrDashes + "board|d[ée](?:p(?:ôt[s]?|oser|osit(?!ed))|butez)|cr[ée]er" + spaceOrDashes + "(?:votre|son)|orcid|subscription|instruction|code" + spaceOrDashes + "of" + spaceOrDashes + "conduct|[^_]request|join|compte|[^_]account"
+ "|licen(?:se|cia)" + spaceOrDashes + "(?:of|de)" + spaceOrDashes + "us[eo]|(?:governance|safety)" + spaceOrDashes + "statement|normativa|(?:consumer|hazard|copyright)" + spaceOrDashes + "(?:information|(?:release" + spaceOrDashes + ")?form)|copyright|permission|(?:editorial|review)" + spaceOrDashes + "board|d[ée](?:p(?:ôt[s]?|oser|osit(?!ed))|butez)|cr[ée]er" + spaceOrDashes + "(?:votre|son)|orcid|subscription|instruction|code" + spaceOrDashes + "of" + spaceOrDashes + "conduct|[^_]request|join[^t]|compte|[^_]account"
+ "|table" + spaceOrDashes + "of" + spaceOrDashes + "contents|(?:front|back|end)" + spaceOrDashes + "matter|information" + spaceOrDashes + "for" + spaceOrDashes + "authors|pdf(?:/a)?" + spaceOrDashes + "conversion|catalogue|factsheet|classifieds" // classifieds = job-ads
+ "|pdf-viewer|certificate" + spaceOrDashes + "of|conflict[s]?" + spaceOrDashes + "of" + spaceOrDashes + "interest|(?:recommendation|order)" + spaceOrDashes + "form|adverti[sz]e|mandatory" + spaceOrDashes + "open" + spaceOrDashes + "access|recommandations" + spaceOrDashes + "pour" + spaceOrDashes + "s'affilier|hal.*collections|terms|conditions|hakuohjeet|logigramme|export_liste_publi|yearbook|pubs_(?:brochure|overview)|thermal-letter|réutiliser" + spaceOrDashes + "des" + spaceOrDashes + "images" + spaceOrDashes + "dans" + spaceOrDashes + "des" + spaceOrDashes + "publications"
+ "|procedure|規程|運営規程" // 規程 == procedure, 運営規程 = Operating regulations (in japanese)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ public class UrlTypeChecker

public static final Pattern URL_DIRECTORY_FILTER =
Pattern.compile("[^/]+://.*/(?:(discover|profile|user|survey|index|media|theme|product|deposit|default|shop|view)/" + docOrDatasetNegativeLookAroundPattern // Avoid blocking these if the url is likely to give a file.
+ "|(?:(?:ldap|password)-)?login|auth(?:entication)?\\.|ac[c]?ess(?!\\.)|sign[-]?(?:in|out|up)|session|(?:how-to-)?(:?join|subscr)|regist(?:er|ration)|submi(?:t|ssion)|(?:post|send|export|(?:wp-)?admin|home|form|career[s]?|company)/|watch|browse|import|bookmark|announcement|feedback|share[^d]|about|(?:[^/]+-)?faq|wiki|news|events|cart|support|(?:site|html)map|documentation|help|license|disclaimer|copyright|(?:site-)?polic(?:y|ies)|privacy|terms|law|principles"
+ "|(?:(?:ldap|password)-)?login|auth(?:entication)?\\.|ac[c]?ess(?!\\.)|sign[-]?(?:in|out|up)|session|(?:how-to-)?(:?join[^t]|subscr)|regist(?:er|ration)|submi(?:t|ssion)|(?:post|send|export|(?:wp-)?admin|home|form|career[s]?|company)/|watch|browse|import|bookmark|announcement|feedback|share[^d]|about|(?:[^/]+-)?faq|wiki|news|events|cart|support|(?:site|html)map|documentation|help|license|disclaimer|copyright|(?:site-)?polic(?:y|ies)|privacy|terms|law|principles"
+ "|(?:my|your|create)?[-]?account|my(?:dspace|selection|cart)|(?:service|help)[-]?desk|settings|fund|aut[h]?or|journal/key|(?:journal-)?editor|author:|(?<!ntrs.nasa.gov/(?:api/)?)citation|review|external|facets|statistics|application|selfarchive|permission|ethic(s)?/.*/view/|conta[c]?t|wallet|contribute|donate|our[_-][\\w]+|template|logo|image|photo/|video|advertiser|most-popular|people|(?:the)?press|for-authors|customer-service[s]?|captcha|clipboard|dropdown|widget"
+ "|(?:forum|blog|column|row|js|css|rss|legal)/" // These are absolute directory names. TODO - Should I add the "|citation[s]?" rule ? The nasa-docUrls include it..
+ "|(?:(?:advanced[-]?)?search|search/advanced|search-results|(?:[e]?books|journals)(?:-catalog)?|issue|docs|oai|(?:abstracting-)?indexing|online[-]?early|honors|awards|meetings|calendar|diversity|scholarships|invo(?:ice|lved)|errata|classroom|publish(?:-with-us)?|upload|products|forgot|home|ethics|comics|podcast|trends|bestof|booksellers|recommendations|bibliographic|volume[s]?)[/]?$" // Url ends with these. Note that some of them are likely to be part of a docUrl, for ex. the "/trends/"-dir.
Expand Down

0 comments on commit db41c5b

Please sign in to comment.