Skip to content

Commit

Permalink
- Avoid losing some full-texts.
Browse files Browse the repository at this point in the history
- Update the "runExample.sh" script to use the newest version of the app.
- Update the "sample_output.json" file to include the "fileHash" and "fileSize" parameters.
- Code polishing.
  • Loading branch information
LSmyrnaios committed Sep 5, 2024
1 parent bc1b1ff commit 1bf0955
Show file tree
Hide file tree
Showing 5 changed files with 8 additions and 12 deletions.
6 changes: 3 additions & 3 deletions example/sample_output/sample_output.json
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
{"id":"50|dedup_wf_001::160ed2b773e79f31c3074f354ef1bb37","sourceUrl":"https://europepmc.org/articles/PMC3814013/","docOrDatasetUrl":"https://europepmc.org/backend/ptpmcrender.fcgi?accid=PMC3814013&blobtype=pdf","wasUrlChecked":"true","wasUrlValid":"true","wasDocumentOrDatasetAccessible":"true","wasDirectLink":"false","couldRetry":"true","comment":"/home/lampros/PublicationsRetriever/target/../example/sample_output/DocFiles/1.pdf"}
{"id":"artemis___fr::60eafea9b28a64cd218110abcf928d15","sourceUrl":"http://depozit.isae.fr/theses/2013/2013_Roche_Sebastien.pdf","docOrDatasetUrl":"https://depozit.isae.fr/theses/2013/2013_Roche_Sebastien.pdf","wasUrlChecked":"true","wasUrlValid":"true","wasDocumentOrDatasetAccessible":"true","wasDirectLink":"true","couldRetry":"true","comment":"/home/lampros/PublicationsRetriever/target/../example/sample_output/DocFiles/2.pdf"}
{"id":"dedup_wf_001::83872a151fd78b045e62275ca626ec94","sourceUrl":"https://zenodo.org/record/884160","docOrDatasetUrl":"https://zenodo.org/record/884160/files/Data_for_Policy_2017_paper_55.pdf","wasUrlChecked":"true","wasUrlValid":"true","wasDocumentOrDatasetAccessible":"true","wasDirectLink":"false","couldRetry":"true","comment":"/home/lampros/PublicationsRetriever/target/../example/sample_output/DocFiles/3.pdf"}
{"id":"50|dedup_wf_001::160ed2b773e79f31c3074f354ef1bb37","sourceUrl":"https://europepmc.org/articles/PMC3814013/","docOrDatasetUrl":"https://europepmc.org/backend/ptpmcrender.fcgi?accid=PMC3814013&blobtype=pdf","wasUrlChecked":"true","wasUrlValid":"true","wasDocumentOrDatasetAccessible":"true","wasDirectLink":"false","couldRetry":"true","fileHash":"f8cb381dcd913b270045103a86ea40b1","fileSize":"58791","comment":"/home/user/PublicationsRetriever/target/../example/sample_output/DocFiles/1.pdf"}
{"id":"artemis___fr::60eafea9b28a64cd218110abcf928d15","sourceUrl":"http://depozit.isae.fr/theses/2013/2013_Roche_Sebastien.pdf","docOrDatasetUrl":"https://depozit.isae.fr/theses/2013/2013_Roche_Sebastien.pdf","wasUrlChecked":"true","wasUrlValid":"true","wasDocumentOrDatasetAccessible":"true","wasDirectLink":"true","couldRetry":"true","fileHash":"f2785bba2296919108b5cce18c716c73","fileSize":"5578443","comment":"/home/user/PublicationsRetriever/target/../example/sample_output/DocFiles/2.pdf"}
{"id":"dedup_wf_001::83872a151fd78b045e62275ca626ec94","sourceUrl":"https://zenodo.org/record/884160","docOrDatasetUrl":"https://zenodo.org/records/884160/files/Data_for_Policy_2017_paper_55.pdf","wasUrlChecked":"true","wasUrlValid":"true","wasDocumentOrDatasetAccessible":"true","wasDirectLink":"false","couldRetry":"true","fileHash":"4e38a82fe1182e62b1c752b50f5ea59b","fileSize":"263917","comment":"/home/user/PublicationsRetriever/target/../example/sample_output/DocFiles/3.pdf"}
8 changes: 2 additions & 6 deletions runExample.sh
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,7 @@ rm -rf example/sample_output/*
# Run the program.
cd target || exit

command="java -jar publications_retriever-1.1-SNAPSHOT.jar -retrieveDataType all -downloadDocFiles -docFileNameType numberName -firstDocFileNum 1 -docFilesStorage ../example/sample_output/DocFiles < ../example/sample_input/sample_input.json > ../example/sample_output/sample_output.json"
command="java -jar publications_retriever-1.2-SNAPSHOT.jar -retrieveDataType all -downloadDocFiles -docFileNameType numberName -firstDocFileNum 1 -docFilesStorage ../example/sample_output/DocFiles < ../example/sample_input/sample_input.json > ../example/sample_output/sample_output.json"
echo -e "\nRunning: $command\n"

# Unfortunately, the plain "$command" does not work, so we have to re-type the command.

java -jar publications_retriever-1.1-SNAPSHOT.jar -retrieveDataType all -downloadDocFiles -docFileNameType numberName -firstDocFileNum 1 -docFilesStorage ../example/sample_output/DocFiles < ../example/sample_input/sample_input.json > ../example/sample_output/sample_output.json

eval "$command"
echo "Finished"
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ public class PageCrawler
public static final Pattern DOCUMENT_TEXT = Pattern.compile("pdf|full" + spaceOrDashes + "text|download|t[ée]l[ée]charger|descargar|texte" + spaceOrDashes + "intégral");

// The following regex is used both in the text around the links and in the links themselves. Everything should be LOWERCASE, from the regex-rules to the link to be matched against them.
public static final Pattern NON_VALID_DOCUMENT = Pattern.compile(".*(?:[^e]manu[ae]l|(?:\\|\\|" + spaceOrDashes + ")?gu[ií](?:de|a)|directive[s]?|preview|leaflet|agreement(?!.*thesis" + spaceOrDashes + "(?:19|20)[\\d]{2}.*)|accessibility|journal" + spaceOrDashes + "catalog|disclose" + spaceOrDashes + "file|poli(?:c(?:y|ies)|tika(?:si)?)" // "policy" can be a lone word or a word after: repository|embargo|privacy|data protection|take down|supplement|access
public static final Pattern NON_VALID_DOCUMENT = Pattern.compile(".*(?:[^e]manu[ae]l|(?:\\|\\|" + spaceOrDashes + ")?gu[ií](?:de|a)|directive[s]?|preview|leaflet|agreement(?!.*thesis" + spaceOrDashes + "(?:19|20)[\\d]{2}.*)|accessibility|journal" + spaceOrDashes + "catalog|disclose" + spaceOrDashes + "file|poli(?:c(?:y|ies)(?!.*paper)|tika(?:si)?)" // "policy" can be a lone word or a word after: repository|embargo|privacy|data protection|take down|supplement|access
// We may have the "Emanuel" writer's name in the url-string. Also, we may have the "agreement"-keyword in a valid pub-url like: https://irep.ntu.ac.uk/id/eprint/40188/1/__Opel.ads.ntu.ac.uk_IRep-PGR%24_2020%20Theses%20and%20deposit%20agreement%20forms_BLSS_NBS_FARRIER-WILLIAMS%2C%20Elizabeth_EFW%20Thesis%202020.pdf
+ "|licen(?:se|cia)" + spaceOrDashes + "(?:of|de)" + spaceOrDashes + "us[eo]|(?:governance|safety)" + spaceOrDashes + "statement|normativa|(?:consumer|hazard|copyright)" + spaceOrDashes + "(?:information|(?:release" + spaceOrDashes + ")?form)|copyright|permission|(?:editorial|review)" + spaceOrDashes + "board|d[ée](?:p(?:ôt[s]?|oser|osit(?!ed))|butez)|cr[ée]er" + spaceOrDashes + "(?:votre|son)|orcid|subscription|instruction|code" + spaceOrDashes + "of" + spaceOrDashes + "conduct|[^_]request|join[^t]|compte|[^_]account"
+ "|table" + spaceOrDashes + "of" + spaceOrDashes + "contents|(?:front|back|end)" + spaceOrDashes + "matter|information" + spaceOrDashes + "for" + spaceOrDashes + "authors|pdf(?:/a)?" + spaceOrDashes + "conversion|catalogue|factsheet|classifieds" // classifieds = job-ads
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ public class ConnSupportUtils

public static final ConcurrentHashMap<String, DomainConnectionData> domainsWithConnectionData = new ConcurrentHashMap<>();

public static String userAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:127.0) Gecko/20100101 Firefox/127.0"; // This should not be "final", another program, using this software as a library, should be able to set its own "UserAgent".
public static String userAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:130.0) Gecko/20100101 Firefox/130.0"; // This should not be "final", another program, using this software as a library, should be able to set its own "UserAgent".
public static String acceptLanguage = "en-US,en;q=0.5";


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ public class UrlTypeChecker

public static final Pattern URL_DIRECTORY_FILTER =
Pattern.compile("[^/]+://.*/(?:(discover|profile|user|survey|index|media|theme|product|deposit|default|shop|view)/" + docOrDatasetNegativeLookAroundPattern // Avoid blocking these if the url is likely to give a file.
+ "|(?:(?:ldap|password)-)?login|ac[c]?ess(?![./]+)|sign[-]?(?:in|out|up)|session|(?:how-to-)?(:?join[^t]|subscr)|regist(?:er|ration)|submi(?:t|ssion)|(?:post|send|export|(?:wp-)?admin|home|form|career[s]?|company)/|watch|browse|import|bookmark|announcement|feedback|share[^d]|about|(?:[^/]+-)?faq|wiki|news|events|cart|support|(?:site|html)map|documentation|help|license|disclaimer|copyright|(?:site-)?polic(?:y|ies)|privacy|terms|law|principles"
+ "|(?:(?:ldap|password)-)?login|ac[c]?ess(?![./]+)|sign[-]?(?:in|out|up)|session|(?:how-to-)?(:?join[^t]|subscr)|regist(?:er|ration)|submi(?:t|ssion)|(?:post|send|export|(?:wp-)?admin|home|form|career[s]?|company)/|watch|browse|import|bookmark|announcement|feedback|share[^d]|about|(?:[^/]+-)?faq|wiki|news|events|cart|support|(?:site|html)map|documentation|help|license|disclaimer|copyright|(?:site-)?polic(?:y|ies)(?!.*paper)|privacy|terms|law|principles"
+ "|(?:my|your|create)?[-]?account|my(?:dspace|selection|cart)|(?:service|help)[-]?desk|settings|fund|aut[h]?or" + docOrDatasetNegativeLookAroundPattern + "|journal/key|(?:journal-)?editor|author:|(?<!ntrs.nasa.gov/(?:api/)?)citation|review|external|facets|statistics|application|selfarchive|permission|ethic(s)?/.*/view/|conta[c]?t|wallet|contribute|donate|our[_-][\\w]+|template|logo|image|photo/|video|advertiser|most-popular|people|(?:the)?press|for-authors|customer-service[s]?|captcha|clipboard|dropdown|widget"
+ "|(?:forum|blog|column|row|js|css|rss|legal)/" // These are absolute directory names. TODO - Should I add the "|citation[s]?" rule ? The nasa-docUrls include it..
+ "|(?:(?:advanced[-]?)?search|search/advanced|search-results|(?:[e]?books|journals)(?:-catalog)?|issue|docs|oai|(?:abstracting-)?indexing|online[-]?early|honors|awards|meetings|calendar|diversity|scholarships|invo(?:ice|lved)|errata|classroom|publish(?:-with-us)?|upload|products|forgot|home|ethics|comics|podcast|trends|bestof|booksellers|recommendations|bibliographic|volume[s]?)[/]?$" // Url ends with these. Note that some of them are likely to be part of a docUrl, for ex. the "/trends/"-dir.
Expand Down

0 comments on commit 1bf0955

Please sign in to comment.