From 1bf0955ac15fe867dd6bf05027c3227ce54f2387 Mon Sep 17 00:00:00 2001 From: LSmyrnaios Date: Thu, 5 Sep 2024 22:18:17 +0300 Subject: [PATCH] - Avoid losing some full-texts. - Update the "runExample.sh" script to use the newest version of the app. - Update the "sample_output.json" file to include the "fileHash" and "fileSize" parameters. - Code polishing. --- example/sample_output/sample_output.json | 6 +++--- runExample.sh | 8 ++------ .../publications_retriever/crawler/PageCrawler.java | 2 +- .../util/http/ConnSupportUtils.java | 2 +- .../publications_retriever/util/url/UrlTypeChecker.java | 2 +- 5 files changed, 8 insertions(+), 12 deletions(-) diff --git a/example/sample_output/sample_output.json b/example/sample_output/sample_output.json index 42f87e1f..083b2706 100644 --- a/example/sample_output/sample_output.json +++ b/example/sample_output/sample_output.json @@ -1,3 +1,3 @@ -{"id":"50|dedup_wf_001::160ed2b773e79f31c3074f354ef1bb37","sourceUrl":"https://europepmc.org/articles/PMC3814013/","docOrDatasetUrl":"https://europepmc.org/backend/ptpmcrender.fcgi?accid=PMC3814013&blobtype=pdf","wasUrlChecked":"true","wasUrlValid":"true","wasDocumentOrDatasetAccessible":"true","wasDirectLink":"false","couldRetry":"true","comment":"/home/lampros/PublicationsRetriever/target/../example/sample_output/DocFiles/1.pdf"} -{"id":"artemis___fr::60eafea9b28a64cd218110abcf928d15","sourceUrl":"http://depozit.isae.fr/theses/2013/2013_Roche_Sebastien.pdf","docOrDatasetUrl":"https://depozit.isae.fr/theses/2013/2013_Roche_Sebastien.pdf","wasUrlChecked":"true","wasUrlValid":"true","wasDocumentOrDatasetAccessible":"true","wasDirectLink":"true","couldRetry":"true","comment":"/home/lampros/PublicationsRetriever/target/../example/sample_output/DocFiles/2.pdf"} -{"id":"dedup_wf_001::83872a151fd78b045e62275ca626ec94","sourceUrl":"https://zenodo.org/record/884160","docOrDatasetUrl":"https://zenodo.org/record/884160/files/Data_for_Policy_2017_paper_55.pdf","wasUrlChecked":"true","wasUrlValid":"true","wasDocumentOrDatasetAccessible":"true","wasDirectLink":"false","couldRetry":"true","comment":"/home/lampros/PublicationsRetriever/target/../example/sample_output/DocFiles/3.pdf"} +{"id":"50|dedup_wf_001::160ed2b773e79f31c3074f354ef1bb37","sourceUrl":"https://europepmc.org/articles/PMC3814013/","docOrDatasetUrl":"https://europepmc.org/backend/ptpmcrender.fcgi?accid=PMC3814013&blobtype=pdf","wasUrlChecked":"true","wasUrlValid":"true","wasDocumentOrDatasetAccessible":"true","wasDirectLink":"false","couldRetry":"true","fileHash":"f8cb381dcd913b270045103a86ea40b1","fileSize":"58791","comment":"/home/user/PublicationsRetriever/target/../example/sample_output/DocFiles/1.pdf"} +{"id":"artemis___fr::60eafea9b28a64cd218110abcf928d15","sourceUrl":"http://depozit.isae.fr/theses/2013/2013_Roche_Sebastien.pdf","docOrDatasetUrl":"https://depozit.isae.fr/theses/2013/2013_Roche_Sebastien.pdf","wasUrlChecked":"true","wasUrlValid":"true","wasDocumentOrDatasetAccessible":"true","wasDirectLink":"true","couldRetry":"true","fileHash":"f2785bba2296919108b5cce18c716c73","fileSize":"5578443","comment":"/home/user/PublicationsRetriever/target/../example/sample_output/DocFiles/2.pdf"} +{"id":"dedup_wf_001::83872a151fd78b045e62275ca626ec94","sourceUrl":"https://zenodo.org/record/884160","docOrDatasetUrl":"https://zenodo.org/records/884160/files/Data_for_Policy_2017_paper_55.pdf","wasUrlChecked":"true","wasUrlValid":"true","wasDocumentOrDatasetAccessible":"true","wasDirectLink":"false","couldRetry":"true","fileHash":"4e38a82fe1182e62b1c752b50f5ea59b","fileSize":"263917","comment":"/home/user/PublicationsRetriever/target/../example/sample_output/DocFiles/3.pdf"} diff --git a/runExample.sh b/runExample.sh index 48368325..6944b38e 100755 --- a/runExample.sh +++ b/runExample.sh @@ -9,11 +9,7 @@ rm -rf example/sample_output/* # Run the program. cd target || exit -command="java -jar publications_retriever-1.1-SNAPSHOT.jar -retrieveDataType all -downloadDocFiles -docFileNameType numberName -firstDocFileNum 1 -docFilesStorage ../example/sample_output/DocFiles < ../example/sample_input/sample_input.json > ../example/sample_output/sample_output.json" +command="java -jar publications_retriever-1.2-SNAPSHOT.jar -retrieveDataType all -downloadDocFiles -docFileNameType numberName -firstDocFileNum 1 -docFilesStorage ../example/sample_output/DocFiles < ../example/sample_input/sample_input.json > ../example/sample_output/sample_output.json" echo -e "\nRunning: $command\n" - -# Unfortunately, the plain "$command" does not work ,so we have to re-type the commend.. - -java -jar publications_retriever-1.1-SNAPSHOT.jar -retrieveDataType all -downloadDocFiles -docFileNameType numberName -firstDocFileNum 1 -docFilesStorage ../example/sample_output/DocFiles < ../example/sample_input/sample_input.json > ../example/sample_output/sample_output.json - +eval "$command" echo "Finished" diff --git a/src/main/java/eu/openaire/publications_retriever/crawler/PageCrawler.java b/src/main/java/eu/openaire/publications_retriever/crawler/PageCrawler.java index 4c7013af..0d7632d0 100644 --- a/src/main/java/eu/openaire/publications_retriever/crawler/PageCrawler.java +++ b/src/main/java/eu/openaire/publications_retriever/crawler/PageCrawler.java @@ -58,7 +58,7 @@ public class PageCrawler public static final Pattern DOCUMENT_TEXT = Pattern.compile("pdf|full" + spaceOrDashes + "text|download|t[ée]l[ée]charger|descargar|texte" + spaceOrDashes + "intégral"); // The following regex is used both in the text around the links and in the links themselves. Everything should be LOWERCASE, from the regex-rules to the link to be matched against them. - public static final Pattern NON_VALID_DOCUMENT = Pattern.compile(".*(?:[^e]manu[ae]l|(?:\\|\\|" + spaceOrDashes + ")?gu[ií](?:de|a)|directive[s]?|preview|leaflet|agreement(?!.*thesis" + spaceOrDashes + "(?:19|20)[\\d]{2}.*)|accessibility|journal" + spaceOrDashes + "catalog|disclose" + spaceOrDashes + "file|poli(?:c(?:y|ies)|tika(?:si)?)" // "policy" can be a lone word or a word after: repository|embargo|privacy|data protection|take down|supplement|access + public static final Pattern NON_VALID_DOCUMENT = Pattern.compile(".*(?:[^e]manu[ae]l|(?:\\|\\|" + spaceOrDashes + ")?gu[ií](?:de|a)|directive[s]?|preview|leaflet|agreement(?!.*thesis" + spaceOrDashes + "(?:19|20)[\\d]{2}.*)|accessibility|journal" + spaceOrDashes + "catalog|disclose" + spaceOrDashes + "file|poli(?:c(?:y|ies)(?!.*paper)|tika(?:si)?)" // "policy" can be a lone word or a word after: repository|embargo|privacy|data protection|take down|supplement|access // We may have the "Emanuel" writer's name in the url-string. Also, we may have the "agreement"-keyword in a valid pub-url like: https://irep.ntu.ac.uk/id/eprint/40188/1/__Opel.ads.ntu.ac.uk_IRep-PGR%24_2020%20Theses%20and%20deposit%20agreement%20forms_BLSS_NBS_FARRIER-WILLIAMS%2C%20Elizabeth_EFW%20Thesis%202020.pdf + "|licen(?:se|cia)" + spaceOrDashes + "(?:of|de)" + spaceOrDashes + "us[eo]|(?:governance|safety)" + spaceOrDashes + "statement|normativa|(?:consumer|hazard|copyright)" + spaceOrDashes + "(?:information|(?:release" + spaceOrDashes + ")?form)|copyright|permission|(?:editorial|review)" + spaceOrDashes + "board|d[ée](?:p(?:ôt[s]?|oser|osit(?!ed))|butez)|cr[ée]er" + spaceOrDashes + "(?:votre|son)|orcid|subscription|instruction|code" + spaceOrDashes + "of" + spaceOrDashes + "conduct|[^_]request|join[^t]|compte|[^_]account" + "|table" + spaceOrDashes + "of" + spaceOrDashes + "contents|(?:front|back|end)" + spaceOrDashes + "matter|information" + spaceOrDashes + "for" + spaceOrDashes + "authors|pdf(?:/a)?" + spaceOrDashes + "conversion|catalogue|factsheet|classifieds" // classifieds = job-ads diff --git a/src/main/java/eu/openaire/publications_retriever/util/http/ConnSupportUtils.java b/src/main/java/eu/openaire/publications_retriever/util/http/ConnSupportUtils.java index 8285e3da..99caa80d 100644 --- a/src/main/java/eu/openaire/publications_retriever/util/http/ConnSupportUtils.java +++ b/src/main/java/eu/openaire/publications_retriever/util/http/ConnSupportUtils.java @@ -88,7 +88,7 @@ public class ConnSupportUtils public static final ConcurrentHashMap domainsWithConnectionData = new ConcurrentHashMap<>(); - public static String userAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:127.0) Gecko/20100101 Firefox/127.0"; // This should not be "final", another program, using this software as a library, should be able to set its own "UserAgent". + public static String userAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:130.0) Gecko/20100101 Firefox/130.0"; // This should not be "final", another program, using this software as a library, should be able to set its own "UserAgent". public static String acceptLanguage = "en-US,en;q=0.5"; diff --git a/src/main/java/eu/openaire/publications_retriever/util/url/UrlTypeChecker.java b/src/main/java/eu/openaire/publications_retriever/util/url/UrlTypeChecker.java index ee15726e..1aafed24 100644 --- a/src/main/java/eu/openaire/publications_retriever/util/url/UrlTypeChecker.java +++ b/src/main/java/eu/openaire/publications_retriever/util/url/UrlTypeChecker.java @@ -28,7 +28,7 @@ public class UrlTypeChecker public static final Pattern URL_DIRECTORY_FILTER = Pattern.compile("[^/]+://.*/(?:(discover|profile|user|survey|index|media|theme|product|deposit|default|shop|view)/" + docOrDatasetNegativeLookAroundPattern // Avoid blocking these if the url is likely to give a file. - + "|(?:(?:ldap|password)-)?login|ac[c]?ess(?![./]+)|sign[-]?(?:in|out|up)|session|(?:how-to-)?(:?join[^t]|subscr)|regist(?:er|ration)|submi(?:t|ssion)|(?:post|send|export|(?:wp-)?admin|home|form|career[s]?|company)/|watch|browse|import|bookmark|announcement|feedback|share[^d]|about|(?:[^/]+-)?faq|wiki|news|events|cart|support|(?:site|html)map|documentation|help|license|disclaimer|copyright|(?:site-)?polic(?:y|ies)|privacy|terms|law|principles" + + "|(?:(?:ldap|password)-)?login|ac[c]?ess(?![./]+)|sign[-]?(?:in|out|up)|session|(?:how-to-)?(:?join[^t]|subscr)|regist(?:er|ration)|submi(?:t|ssion)|(?:post|send|export|(?:wp-)?admin|home|form|career[s]?|company)/|watch|browse|import|bookmark|announcement|feedback|share[^d]|about|(?:[^/]+-)?faq|wiki|news|events|cart|support|(?:site|html)map|documentation|help|license|disclaimer|copyright|(?:site-)?polic(?:y|ies)(?!.*paper)|privacy|terms|law|principles" + "|(?:my|your|create)?[-]?account|my(?:dspace|selection|cart)|(?:service|help)[-]?desk|settings|fund|aut[h]?or" + docOrDatasetNegativeLookAroundPattern + "|journal/key|(?:journal-)?editor|author:|(?