Skip to content

Commit

Permalink
- Add the "pageUrl" field in the output.
Browse files Browse the repository at this point in the history
- Fix a missing change.
- Rename "DataToBeLogged"-class to "DataForOutput".
- Update software's version in "example.sh".
- Set a test-storage-dir in "UrlChecker.checkUrlConnectivity()".
  • Loading branch information
LSmyrnaios committed Nov 11, 2024
1 parent 71a1d2f commit bec6087
Show file tree
Hide file tree
Showing 13 changed files with 39 additions and 26 deletions.
2 changes: 1 addition & 1 deletion example/sample_input/sample_input.json
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
{"id":"artemis___fr::60eafea9b28a64cd218110abcf928d15","url":"http://depozit.isae.fr/theses/2013/2013_Roche_Sebastien.pdf"}
{"id":"dedup_wf_001::83872a151fd78b045e62275ca626ec94","url":"https://zenodo.org/record/884160"}
{"id":"50|dedup_wf_001::160ed2b773e79f31c3074f354ef1bb37","url":"https://europepmc.org/articles/PMC3814013/"}
{"id":"od______2661::4e4a2b01449ecdb83f826ab93443aa17","url":"http://doi.org/10.1007/s10853-008-3039-6"}
Binary file modified example/sample_output/DocFiles/1.pdf
Binary file not shown.
Binary file modified example/sample_output/DocFiles/2.pdf
Binary file not shown.
Binary file modified example/sample_output/DocFiles/3.pdf
Binary file not shown.
6 changes: 3 additions & 3 deletions example/sample_output/sample_output.json
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
{"id":"50|dedup_wf_001::160ed2b773e79f31c3074f354ef1bb37","sourceUrl":"https://europepmc.org/articles/PMC3814013/","docOrDatasetUrl":"https://europepmc.org/backend/ptpmcrender.fcgi?accid=PMC3814013&blobtype=pdf","wasUrlChecked":"true","wasUrlValid":"true","wasDocumentOrDatasetAccessible":"true","wasDirectLink":"false","couldRetry":"true","fileHash":"f8cb381dcd913b270045103a86ea40b1","fileSize":"58791","comment":"/home/user/PublicationsRetriever/target/../example/sample_output/DocFiles/1.pdf"}
{"id":"artemis___fr::60eafea9b28a64cd218110abcf928d15","sourceUrl":"http://depozit.isae.fr/theses/2013/2013_Roche_Sebastien.pdf","docOrDatasetUrl":"https://depozit.isae.fr/theses/2013/2013_Roche_Sebastien.pdf","wasUrlChecked":"true","wasUrlValid":"true","wasDocumentOrDatasetAccessible":"true","wasDirectLink":"true","couldRetry":"true","fileHash":"f2785bba2296919108b5cce18c716c73","fileSize":"5578443","comment":"/home/user/PublicationsRetriever/target/../example/sample_output/DocFiles/2.pdf"}
{"id":"dedup_wf_001::83872a151fd78b045e62275ca626ec94","sourceUrl":"https://zenodo.org/record/884160","docOrDatasetUrl":"https://zenodo.org/records/884160/files/Data_for_Policy_2017_paper_55.pdf","wasUrlChecked":"true","wasUrlValid":"true","wasDocumentOrDatasetAccessible":"true","wasDirectLink":"false","couldRetry":"true","fileHash":"4e38a82fe1182e62b1c752b50f5ea59b","fileSize":"263917","comment":"/home/user/PublicationsRetriever/target/../example/sample_output/DocFiles/3.pdf"}
{"id":"dedup_wf_001::83872a151fd78b045e62275ca626ec94","sourceUrl":"https://zenodo.org/record/884160","pageUrl":"https://zenodo.org/records/884160","docOrDatasetUrl":"https://zenodo.org/records/884160/files/Data_for_Policy_2017_paper_55.pdf","wasUrlChecked":"true","wasUrlValid":"true","wasDocumentOrDatasetAccessible":"true","wasDirectLink":"false","couldRetry":"true","fileHash":"4e38a82fe1182e62b1c752b50f5ea59b","fileSize":"263917","comment":"/home/labros/MEGAsync/UOA-JOB/OpenAIRE/JOB-DownloadPDFs/docUrlsRetriever/PublicationsRetriever/target/../example/sample_output/DocFiles/2.pdf"}
{"id":"artemis___fr::60eafea9b28a64cd218110abcf928d15","sourceUrl":"http://depozit.isae.fr/theses/2013/2013_Roche_Sebastien.pdf","pageUrl":"http://depozit.isae.fr/theses/2013/2013_Roche_Sebastien.pdf","docOrDatasetUrl":"https://depozit.isae.fr/theses/2013/2013_Roche_Sebastien.pdf","wasUrlChecked":"true","wasUrlValid":"true","wasDocumentOrDatasetAccessible":"true","wasDirectLink":"true","couldRetry":"true","fileHash":"f2785bba2296919108b5cce18c716c73","fileSize":"5578443","comment":"/home/labros/MEGAsync/UOA-JOB/OpenAIRE/JOB-DownloadPDFs/docUrlsRetriever/PublicationsRetriever/target/../example/sample_output/DocFiles/1.pdf"}
{"id":"od______2661::4e4a2b01449ecdb83f826ab93443aa17","sourceUrl":"http://doi.org/10.1007/s10853-008-3039-6","pageUrl":"https://link.springer.com/article/10.1007/s10853-008-3039-6","docOrDatasetUrl":"https://link.springer.com/content/pdf/10.1007/s10853-008-3039-6.pdf","wasUrlChecked":"true","wasUrlValid":"true","wasDocumentOrDatasetAccessible":"true","wasDirectLink":"false","couldRetry":"true","fileHash":"5b75d58655e2830112ff16434d75cfa7","fileSize":"1083351","comment":"/home/labros/MEGAsync/UOA-JOB/OpenAIRE/JOB-DownloadPDFs/docUrlsRetriever/PublicationsRetriever/target/../example/sample_output/DocFiles/3.pdf"}
2 changes: 1 addition & 1 deletion runExample.sh
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ rm -rf example/sample_output/*
# Run the program.
cd target || exit

command="java -jar publications_retriever-1.2-SNAPSHOT.jar -retrieveDataType all -downloadDocFiles -docFileNameType numberName -firstDocFileNum 1 -docFilesStorage ../example/sample_output/DocFiles < ../example/sample_input/sample_input.json > ../example/sample_output/sample_output.json"
command="java -jar publications_retriever-1.3-SNAPSHOT.jar -retrieveDataType all -downloadDocFiles -docFileNameType numberName -firstDocFileNum 1 -docFilesStorage ../example/sample_output/DocFiles < ../example/sample_input/sample_input.json > ../example/sample_output/sample_output.json"
echo -e "\nRunning: $command\n"
eval "$command"
echo "Finished"
Original file line number Diff line number Diff line change
Expand Up @@ -606,7 +606,7 @@ public static boolean verifyDocLink(String urlId, String sourceUrl, String pageU
return true;
} catch (Exception e) { // After connecting to the possibleDocLink.
logger.warn("The DocLink < " + docLink + " > was not reached!"); // The specific error has already been written inside the called method.
UrlUtils.logOutputData(urlId, sourceUrl, pageUrl, UrlUtils.unreachableDocOrDatasetUrlIndicator, "Discarded in 'PageCrawler.visit()' method, as the retrieved DocLink: < " + docLink + " > had connectivity problems.", null, true, "true", "true", "false", "false", "false", null, "null");
UrlUtils.addOutputData(urlId, sourceUrl, pageUrl, UrlUtils.unreachableDocOrDatasetUrlIndicator, "Discarded in 'PageCrawler.visit()' method, as the retrieved DocLink: < " + docLink + " > had connectivity problems.", null, true, "true", "true", "false", "false", "false", null, "null");
return false;
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import eu.openaire.publications_retriever.exceptions.DocFileNotRetrievedException;
import eu.openaire.publications_retriever.util.args.ArgsUtils;
import eu.openaire.publications_retriever.util.http.ConnSupportUtils;
import eu.openaire.publications_retriever.util.url.DataToBeLogged;
import eu.openaire.publications_retriever.util.url.DataForOutput;
import eu.openaire.publications_retriever.util.url.LoaderAndChecker;
import eu.openaire.publications_retriever.util.url.UrlUtils;
import org.apache.commons.io.FileDeleteStrategy;
Expand Down Expand Up @@ -58,7 +58,7 @@ public class FileUtils
public static int unretrievableInputLines = 0; // For better statistics in the end.


public static final List<DataToBeLogged> dataForOutput = Collections.synchronizedList(new ArrayList<>(jsonBatchSize));
public static final List<DataForOutput> dataForOutput = Collections.synchronizedList(new ArrayList<>(jsonBatchSize));

public static final HashMap<String, Integer> numbersOfDuplicateDocFileNames = new HashMap<>(); // Holds docFileNames with their duplicatesNum.
// If we use the above without external synchronization, then the "ConcurrentHashMap" should be used instead.
Expand Down Expand Up @@ -321,7 +321,7 @@ public static void writeResultsToFile()
if ( stringToBeWritten == null )
stringToBeWritten = new StringBuilder(jsonBatchSize * 900); // 900: the usual-maximum-expected-length for an <id-sourceUrl-docUrl-comment> quadruple.

for ( DataToBeLogged data : FileUtils.dataForOutput )
for ( DataForOutput data : FileUtils.dataForOutput )
{
stringToBeWritten.append(data.toJsonString()).append(endOfLine);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,8 @@ public S3ObjectStore()
// It's not safe, nor helpful to show the credentials in the logs.

minioClient = MinioClient.builder().endpoint(endpoint).credentials(accessKey, secretKey).region(region).build();
// Default timeouts are: conn: 30secs, read: 60secs, write: 30secs.
minioClient.setTimeout(TimeUnit.MINUTES.toMillis(1), TimeUnit.MINUTES.toMillis(5), TimeUnit.MINUTES.toMillis(5));

boolean bucketExists = false;
try {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,26 +15,28 @@
* This class is responsible for storing the record <urlId, sourceUrl, pageUrl, docOrDatasetUrl, wasUrlChecked, wasUrlValid, wasDocumentOrDatasetAccessible, wasDirectLink, errorCause / comment>, so that it can be written to the outputFile.
* @author Lampros Smyrnaios
*/
public class DataToBeLogged
public class DataForOutput
{
private String urlId;
private String sourceUrl;
private String pageUrl;
private String docOrDatasetUrl;
private String wasUrlChecked, wasUrlValid, wasDocumentOrDatasetAccessible, wasDirectLink, couldRetry;
private String hash;
private Long size;
private String comment; // This will be an emptyString, unless there is an error causing the docUrl to be unreachable.

private static final Logger logger = LoggerFactory.getLogger(DataToBeLogged.class);
private static final Logger logger = LoggerFactory.getLogger(DataForOutput.class);

public DataToBeLogged(String urlId, String sourceUrl, String docOrDatasetUrl, String wasUrlChecked, String wasUrlValid, String wasDocumentOrDatasetAccessible, String wasDirectLink, String couldRetry, String hash, Long size, String comment)
public DataForOutput(String urlId, String sourceUrl, String pageUrl, String docOrDatasetUrl, String wasUrlChecked, String wasUrlValid, String wasDocumentOrDatasetAccessible, String wasDirectLink, String couldRetry, String hash, Long size, String comment)
{
if ( urlId == null )
urlId = "unretrievable";

this.urlId = urlId;
this.sourceUrl = escapeSourceUrl(sourceUrl); // The input may have non-expected '\"', '\\' or even '\\\"' which will be unescaped by JsonObject, and we have to re-escape them in the output.
this.docOrDatasetUrl = docOrDatasetUrl;
this.sourceUrl = escapeUrl(sourceUrl); // The input may have non-expected '\"', '\\' or even '\\\"' which will be unescaped by JsonObject, and we have to re-escape them in the output.
this.pageUrl = escapeUrl(pageUrl);
this.docOrDatasetUrl = docOrDatasetUrl;
this.wasUrlChecked = wasUrlChecked;
this.wasUrlValid = wasUrlValid;
this.wasDocumentOrDatasetAccessible = wasDocumentOrDatasetAccessible;
Expand All @@ -47,26 +49,26 @@ public DataToBeLogged(String urlId, String sourceUrl, String docOrDatasetUrl, St


/**
* This method, escapes the <backSlashes> and the <doubleQuotes> from the sourceUrl.
* This method escapes the <backSlashes> and the <doubleQuotes> in the sourceUrl and the pageUrl (in case it is the same).
* When we read from jsonObjects, the string returns unescaped.
* Now, there are libraries for escaping and unescaping chars, like "org.apache.commons.text.StringEscapeUtils".
* But they can't handle the case where you want this: \" to be this: \\\" as they thing you are already satisfied what what you have.
* But they can't handle the case where you want this: \" to be this: \\\" as they think you are already satisfied with what you have.
* That might be true in general, just not when you want to have a valid-json-output.
* @param sourceUrl
* @param url
* @return
*/
public static String escapeSourceUrl(String sourceUrl)
public static String escapeUrl(String url)
{
/*
Here we might even have these in the input <\\\"> which will be read by jsonObject as <\"> and we will have to re-make them <\\\"> in order to have a valid-json-output.
http://www.scopus.com/record/display.url?eid=2-s2.0-82955208478&origin=resultslist&sort=plf-f&src=s&st1=aZZONI+r&nlo=&nlr=&nls=&sid=YfPXTZ5QQuqvNMHCo-geSvN%3a60&sot=b&sdt=cl&cluster=scoauthid%2c%227004337609%22%2ct%2bscosubtype%2c%22ar%22%2ct%2bscosubjabbr%2c%22MEDI%22%2ct%2c%22MULT%22%2ct&sl=21&s=AUTHOR-NAME%28aZZONI+r%29&relpos=0&relpos=0&searchTerm=AUTHOR-NAME(aZZONI r) AND ( LIMIT-TO(AU-ID,\\\"Azzoni, Roberto\\\" 7004337609) ) AND ( LIMIT-TO(DOCTYPE,\\\"ar\\\" ) ) AND ( LIMIT-TO(SUBJAREA,\\\"MEDI\\\" ) OR LIMIT-TO(SUBJAREA,\\\"MULT\\\" ) )
*/

// Escape backSlash.
sourceUrl = StringUtils.replace(sourceUrl, "\\", "\\\\", -1); // http://koara.lib.keio.ac.jp/xoonips/modules/xoonips/detail.php?koara_id=pdf\AN00150430-00000039--001
url = StringUtils.replace(url, "\\", "\\\\", -1); // http://koara.lib.keio.ac.jp/xoonips/modules/xoonips/detail.php?koara_id=pdf\AN00150430-00000039--001

// Escape doubleQuotes and return.
return StringUtils.replace(sourceUrl, "\"", "\\\"", -1); // https://jual.nipissingu.ca/wp-content/uploads/sites/25/2016/03/v10202.pdf" rel="
return StringUtils.replace(url, "\"", "\\\"", -1); // https://jual.nipissingu.ca/wp-content/uploads/sites/25/2016/03/v10202.pdf" rel="
}


Expand All @@ -92,6 +94,7 @@ public String toJsonString()
jsonObject.put("id", this.urlId);
}
jsonObject.put("sourceUrl", this.sourceUrl);
jsonObject.put("pageUrl", this.pageUrl);
jsonObject.put(ArgsUtils.targetUrlType, this.docOrDatasetUrl);
jsonObject.put("wasUrlChecked", this.wasUrlChecked);
jsonObject.put("wasUrlValid", this.wasUrlValid);
Expand Down Expand Up @@ -126,6 +129,14 @@ public void setSourceUrl(String sourceUrl) {
this.sourceUrl = sourceUrl;
}

public String getPageUrl() {
return pageUrl;
}

public void setPageUrl(String pageUrl) {
this.pageUrl = pageUrl;
}

public String getDocOrDatasetUrl() {
return docOrDatasetUrl;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -415,8 +415,8 @@ public static void loadAndCheckEachIdUrlPair() throws RuntimeException

logger.info("Batch counter: " + (++batchCount) + ((ArgsUtils.inputFileFullPath != null) ? (" | progress: " + PublicationsRetriever.df.format(((batchCount-1) * FileUtils.jsonBatchSize) * 100.0 / FileUtils.numOfLines) + "%") : "") + " | every batch contains at most " + FileUtils.jsonBatchSize + " id-url pairs.");

for ( String retrievedId : loadedIdUrlPairs.keySet() ) {

for ( String retrievedId : loadedIdUrlPairs.keySet() )
{
Set<String> retrievedUrlsOfCurrentId = loadedIdUrlPairs.get(retrievedId);
numOfIDs += retrievedUrlsOfCurrentId.size();

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ public static void addOutputData(String urlId, String sourceUrl, String pageUrl,
// If the "goods" surpass the "bads", then that domain will not get blocked, even if the "minimum-accepted-bad-cases" was exceeded.
ConnSupportUtils.countInsertAndGetTimes(domainsAndHits, pageDomain);

// Now if the "finalDocUrl" is different than the "pageUrl", get the domain of the "finalDocUrl" and if it's different, then add it to "domainsAndHits"-HashMap.
// Now if the "finalDocUrl" is different from the "pageUrl", get the domain of the "finalDocUrl" and if it's different, then add it to "domainsAndHits"-HashMap.
if ( !pageUrl.equals(finalDocUrl) ) {
String docUrlDomain = UrlUtils.getDomainStr(finalDocUrl, null);
if ( (docUrlDomain != null) && !docUrlDomain.equals(pageDomain) )
Expand All @@ -110,7 +110,7 @@ public static void addOutputData(String urlId, String sourceUrl, String pageUrl,
duplicateUrls.add(sourceUrl); // Add it in duplicates BlackList, in order not to be accessed for 2nd time in the future. We don't add docUrls here, as we want them to be separate for checking purposes.
}

FileUtils.dataForOutput.add(new DataToBeLogged(urlId, sourceUrl, finalDocUrl, wasUrlChecked, wasUrlValid, wasDocumentOrDatasetAccessible, wasDirectLink, couldRetry, fileHash, fileSize, comment)); // Log it to be written later in the outputFile.
FileUtils.dataForOutput.add(new DataForOutput(urlId, sourceUrl, pageUrl, finalDocUrl, wasUrlChecked, wasUrlValid, wasDocumentOrDatasetAccessible, wasDirectLink, couldRetry, fileHash, fileSize, comment)); // Log it to be written later in the outputFile.
}


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -785,12 +785,12 @@ public void checkUrlConnectivity()
for ( String url: urlList )
logger.info(url);


//LoaderAndChecker.retrieveDatasets = false;
LoaderAndChecker.retrieveDatasets = false;
FileUtils.shouldDownloadDocFiles = true;
FileUtils.docFileNameType = FileUtils.DocFileNameType.idName;
if ( FileUtils.shouldDownloadDocFiles ) {
FileUtils.shouldDeleteOlderDocFiles = true;
FileUtils.storeDocFilesDir = FileUtils.workingDir + "testDocFiles" + File.separator;
FileUtils.handleStoreDocFileDirectory();
}

Expand Down

0 comments on commit bec6087

Please sign in to comment.