Skip to content

Commit

Permalink
- Improve a rule in "PageCrawler.NON_VALID_DOCUMENT"-regex, to avoid …
Browse files Browse the repository at this point in the history
…losing some fulltexts.

- Update dependencies.
  • Loading branch information
LSmyrnaios committed Nov 5, 2024
1 parent 9830974 commit 50a59d5
Show file tree
Hide file tree
Showing 2 changed files with 3 additions and 17 deletions.
18 changes: 2 additions & 16 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-surefire-plugin</artifactId>
<version>3.5.1</version>
<version>3.5.2</version>
<configuration>
<!--<excludes>
<exclude>some test to exclude here</exclude>
Expand Down Expand Up @@ -165,21 +165,7 @@
<dependency>
<groupId>io.minio</groupId>
<artifactId>minio</artifactId>
<version>8.5.12</version>
<exclusions>
<exclusion>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
</exclusion>
<exclusion>
<groupId>org.apache.commons</groupId>
<artifactId>commons-compress</artifactId>
</exclusion>
<exclusion>
<groupId>commons-codec</groupId>
<artifactId>commons-codec</artifactId>
</exclusion>
</exclusions>
<version>8.5.13</version>
</dependency>

<!-- Use this dependency in case we move to Java 11 in the future (or at least use JDK-11 in a VM).. -->
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ public class PageCrawler
public static final Pattern DOCUMENT_TEXT = Pattern.compile("pdf|full" + spaceOrDashes + "text|download|t[ée]l[ée]charger|descargar|texte" + spaceOrDashes + "intégral");

// The following regex is used both in the text around the links and in the links themselves. Everything should be LOWERCASE, from the regex-rules to the link to be matched against them.
public static final Pattern NON_VALID_DOCUMENT = Pattern.compile(".*(?:[^e]manu[ae]l|(?:\\|\\|" + spaceOrDashes + ")?gu[ií](?:de|a)|directive[s]?|preview|leaflet|agreement(?!.*thesis" + spaceOrDashes + "(?:19|20)[\\d]{2}.*)|accessibility|journal" + spaceOrDashes + "catalog|disclose" + spaceOrDashes + "file|poli(?:c(?:y|ies)(?!.*paper)|tika(?:si)?)" // "policy" can be a lone word or a word after: repository|embargo|privacy|data protection|take down|supplement|access
public static final Pattern NON_VALID_DOCUMENT = Pattern.compile(".*(?:[^e]manu[ae]l|(?:\\|\\|" + spaceOrDashes + ")?gu[ií](?:de|a)|directive[s]?|(?<!readonly_)preview|leaflet|agreement(?!.*thesis" + spaceOrDashes + "(?:19|20)[\\d]{2}.*)|accessibility|journal" + spaceOrDashes + "catalog|disclose" + spaceOrDashes + "file|poli(?:c(?:y|ies)(?!.*paper)|tika(?:si)?)" // "policy" can be a lone word or a word after: repository|embargo|privacy|data protection|take down|supplement|access
// We may have the "Emanuel" writer's name in the url-string. Also, we may have the "agreement"-keyword in a valid pub-url like: https://irep.ntu.ac.uk/id/eprint/40188/1/__Opel.ads.ntu.ac.uk_IRep-PGR%24_2020%20Theses%20and%20deposit%20agreement%20forms_BLSS_NBS_FARRIER-WILLIAMS%2C%20Elizabeth_EFW%20Thesis%202020.pdf
+ "|licen(?:se|cia)" + spaceOrDashes + "(?:of|de)" + spaceOrDashes + "us[eo]|(?:governance|safety)" + spaceOrDashes + "statement|normativa|(?:consumer|hazard|copyright)" + spaceOrDashes + "(?:information|(?:release" + spaceOrDashes + ")?form)|copyright|permission|(?:editorial|review)" + spaceOrDashes + "board|d[ée](?:p(?:ôt[s]?|oser|osit(?!ed))|butez)|cr[ée]er" + spaceOrDashes + "(?:votre|son)|orcid|subscription|instruction|code" + spaceOrDashes + "of" + spaceOrDashes + "conduct|[^_]request|join[^t]|compte|[^_]account"
+ "|table" + spaceOrDashes + "of" + spaceOrDashes + "contents|(?:front|back|end)" + spaceOrDashes + "matter|information" + spaceOrDashes + "for" + spaceOrDashes + "authors|pdf(?:/a)?" + spaceOrDashes + "conversion|catalogue|factsheet|classifieds" // classifieds = job-ads
Expand Down

0 comments on commit 50a59d5

Please sign in to comment.