Skip to content

Commit

Permalink
- Add support for more PDF and dataset mimetypes and extensions.
Browse files Browse the repository at this point in the history
- Add JUnit-test for datasets-list and perform necessary changes in tests.
  • Loading branch information
LSmyrnaios committed Nov 17, 2024
1 parent 889b478 commit 8925fa0
Show file tree
Hide file tree
Showing 7 changed files with 149 additions and 28 deletions.
6 changes: 3 additions & 3 deletions example/sample_output/sample_output.json
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
{"id":"dedup_wf_001::83872a151fd78b045e62275ca626ec94","sourceUrl":"https://zenodo.org/record/884160","pageUrl":"https://zenodo.org/records/884160","docOrDatasetUrl":"https://zenodo.org/records/884160/files/Data_for_Policy_2017_paper_55.pdf","wasUrlChecked":"true","wasUrlValid":"true","wasDocumentOrDatasetAccessible":"true","wasDirectLink":"false","couldRetry":"true","fileHash":"4e38a82fe1182e62b1c752b50f5ea59b","fileSize":"263917","comment":"/home/labros/MEGAsync/UOA-JOB/OpenAIRE/JOB-DownloadPDFs/docUrlsRetriever/PublicationsRetriever/target/../example/sample_output/DocFiles/2.pdf"}
{"id":"artemis___fr::60eafea9b28a64cd218110abcf928d15","sourceUrl":"http://depozit.isae.fr/theses/2013/2013_Roche_Sebastien.pdf","pageUrl":"http://depozit.isae.fr/theses/2013/2013_Roche_Sebastien.pdf","docOrDatasetUrl":"https://depozit.isae.fr/theses/2013/2013_Roche_Sebastien.pdf","wasUrlChecked":"true","wasUrlValid":"true","wasDocumentOrDatasetAccessible":"true","wasDirectLink":"true","couldRetry":"true","fileHash":"f2785bba2296919108b5cce18c716c73","fileSize":"5578443","comment":"/home/labros/MEGAsync/UOA-JOB/OpenAIRE/JOB-DownloadPDFs/docUrlsRetriever/PublicationsRetriever/target/../example/sample_output/DocFiles/1.pdf"}
{"id":"od______2661::4e4a2b01449ecdb83f826ab93443aa17","sourceUrl":"http://doi.org/10.1007/s10853-008-3039-6","pageUrl":"https://link.springer.com/article/10.1007/s10853-008-3039-6","docOrDatasetUrl":"https://link.springer.com/content/pdf/10.1007/s10853-008-3039-6.pdf","wasUrlChecked":"true","wasUrlValid":"true","wasDocumentOrDatasetAccessible":"true","wasDirectLink":"false","couldRetry":"true","fileHash":"5b75d58655e2830112ff16434d75cfa7","fileSize":"1083351","comment":"/home/labros/MEGAsync/UOA-JOB/OpenAIRE/JOB-DownloadPDFs/docUrlsRetriever/PublicationsRetriever/target/../example/sample_output/DocFiles/3.pdf"}
{"id":"dedup_wf_001::83872a151fd78b045e62275ca626ec94","sourceUrl":"https://zenodo.org/record/884160","pageUrl":"https://zenodo.org/records/884160","docOrDatasetUrl":"https://zenodo.org/records/884160/files/Data_for_Policy_2017_paper_55.pdf","wasUrlChecked":"true","wasUrlValid":"true","wasDocumentOrDatasetAccessible":"true","wasDirectLink":"false","couldRetry":"true","fileHash":"4e38a82fe1182e62b1c752b50f5ea59b","fileSize":"263917","comment":"/home/user/PublicationsRetriever/target/../example/sample_output/DocFiles/2.pdf"}
{"id":"artemis___fr::60eafea9b28a64cd218110abcf928d15","sourceUrl":"http://depozit.isae.fr/theses/2013/2013_Roche_Sebastien.pdf","pageUrl":"http://depozit.isae.fr/theses/2013/2013_Roche_Sebastien.pdf","docOrDatasetUrl":"https://depozit.isae.fr/theses/2013/2013_Roche_Sebastien.pdf","wasUrlChecked":"true","wasUrlValid":"true","wasDocumentOrDatasetAccessible":"true","wasDirectLink":"true","couldRetry":"true","fileHash":"f2785bba2296919108b5cce18c716c73","fileSize":"5578443","comment":"/home/user/PublicationsRetriever/target/../example/sample_output/DocFiles/1.pdf"}
{"id":"od______2661::4e4a2b01449ecdb83f826ab93443aa17","sourceUrl":"http://doi.org/10.1007/s10853-008-3039-6","pageUrl":"https://link.springer.com/article/10.1007/s10853-008-3039-6","docOrDatasetUrl":"https://link.springer.com/content/pdf/10.1007/s10853-008-3039-6.pdf","wasUrlChecked":"true","wasUrlValid":"true","wasDocumentOrDatasetAccessible":"true","wasDirectLink":"false","couldRetry":"true","fileHash":"5b75d58655e2830112ff16434d75cfa7","fileSize":"1083351","comment":"/home/user/PublicationsRetriever/target/../example/sample_output/DocFiles/3.pdf"}
Original file line number Diff line number Diff line change
Expand Up @@ -132,33 +132,55 @@ public static void setKnownDocMimeTypes()
knownDocMimeTypes.add("application/pdf");
knownDocMimeTypes.add("application/x-pdf");
knownDocMimeTypes.add("image/pdf");
knownDocMimeTypes.add("application/acrobat");
knownDocMimeTypes.add("application/vnd.adobe.pdf");
knownDocMimeTypes.add("application/vnd.pdf");
knownDocMimeTypes.add("application/vnd.ms-pdf");
knownDocMimeTypes.add("application/x-pdf-stream");
// TODO - Add support for other document formats, like "ps", "doc", "docx", ...
// Then create a file to keep all mimetypes and load them in memory, just like we do for the datasets below.
}


/**
 * Matches a whole line that holds a "type/subtype" mime-type, optionally followed by whitespace and a "//"-comment which runs up to the end of the line.
 * Group 1 captures the mime-type. Since it is matched with "[^/]+", it may also contain whitespace (including trailing whitespace before the comment), so callers should trim it.
 */
private static final Pattern FILTER_COMMENT_FROM_MIMETYPE = Pattern.compile("([^/]+/[^/]+)(?:[\\s]*//.*)?");


/**
 * Loads the known dataset mime-types from the "dataset-mimetypes.txt" classpath-resource and adds them to "knownDatasetMimeTypes".
 * <p>
 * Each line of the resource is expected to hold one mime-type, optionally followed by a "//"-comment.
 * Blank lines and lines consisting only of a "//"-comment are skipped. Any other line which does not match
 * the "FILTER_COMMENT_FROM_MIMETYPE"-regex is logged as an error and ignored.
 * <p>
 * The mime-types are no longer hard-coded in this method; the resource-file is the single source for them.
 * <p>
 * The program exits with code 77 if the resource is not found and with code 78 if it cannot be read.
 */
public static void setKnownDatasetMimeTypes()
{
	logger.debug("Setting up the official dataset mime-types.");
	String resourcePath = "dataset-mimetypes.txt";
	try (InputStream inputStream = ConnSupportUtils.class.getClassLoader().getResourceAsStream(resourcePath))
	{
		if ( inputStream == null ) {
			String errorMsg = "File not found in resources: " + resourcePath;
			logger.error(errorMsg);
			System.err.println(errorMsg);
			System.exit(77);
			return;	// Never continue with a null stream, even if the exit is somehow prevented.
		}

		try (BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream, StandardCharsets.UTF_8), FileUtils.mb)) {
			String line;
			while ( (line = reader.readLine()) != null ) {
				String trimmedLine = line.trim();
				if ( trimmedLine.isEmpty() || trimmedLine.startsWith("//") )
					continue;	// Blank and comment-only lines carry no mime-type; they are not errors.

				Matcher matcher = FILTER_COMMENT_FROM_MIMETYPE.matcher(line);
				if ( !matcher.matches() ) {
					logger.error("Failed to match the line using the \"FILTER_COMMENT_FROM_MIMETYPE\"-regex: " + line);
					continue;
				}

				String mimeType = matcher.group(1);
				if ( (mimeType != null) && !mimeType.isEmpty() )
					knownDatasetMimeTypes.add(mimeType.trim());	// The captured group may end with the whitespace preceding the comment.
				else
					logger.error("Failed to extract the mimetype from line: " + line);
			}

			if ( logger.isTraceEnabled() )
				logger.trace(knownDatasetMimeTypes.toString());
		}
	} catch (IOException ioe) {
		String errorMsg = "Could not read file:" + resourcePath;
		logger.error(errorMsg, ioe);
		System.err.println(errorMsg);
		System.exit(78);
	}
}


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,8 +40,9 @@ public class LoaderAndChecker
// "DOC_URL_FILTER" works for lowerCase Strings (we make sure they are in lowerCase before we check).
// Note that we still need to check if it's an alive link and if it's actually a docUrl (through its mimeType).

// Alternation of the (lower-case) file-extensions and "format="-values which indicate a dataset-file.
// Since "|" has the lowest precedence in a regex, this alternation must always be wrapped in a group when it is embedded in a larger pattern.
// Alternatives which were already covered by others ("sql", "shp", "shx", "cdf", "mtl") are listed only once.
private static final String dataset_formats = "xls[xbm]?|xlt[x]?|[ct]sv|tab|(?:(?:geo)?j|b)son|(?:x|k|g|nmr|sb|wiley|y[a]?)ml|xsd|o[dt]s|ddi|rdf|[g]?zip|zipx|[rt]ar|[7x]z|[t]?gz|[gb]z[\\d]*|smi[l]?|por|ascii|dta|sav|dat|txt|ti[f]{1,2}|tfw|dwg|nt|fits|feather"
		+ "|svg|sas7b(?:dat|ve)|spss|sas|stata|(?:my|postgre)?sql(?:ite)?|bigquery|sh[px]|sb[xn]|prj|dbf|(?:m|acc)db|mif|mat|pcd|bt|n[sc]?[\\d]*|h[\\d]+|hdf[\\d]*|trs|opj|jcamp|fcs|fas(?:ta)?|keys|values|las|rdata|parquet|avro|dcm|gr[i]?b|rds"
		+ "|[p]?cap|dmp|vcf|cbor|biosample|hic|warc|ig[e]?s|sla|dxf|pdb|[sc]df|cif|f(?:ast)?[qa]|apng|sra|vtp|gltf|[sm]tl|ply|abc|md|rtf|ttl|exr|glb";

/**
 * Matches (lower-case) urls which look like dataset-urls. A url matches when it either:
 * <ul>
 *   <li>contains a "dataset/" or "datasets/" path-segment, or</li>
 *   <li>ends with ".&lt;format&gt;" or "format=&lt;format&gt;", optionally followed by a "?..." query-string, where &lt;format&gt; is one of the "dataset_formats".</li>
 * </ul>
 * The formats are wrapped in their own group, so that the "." / "format=" prefix and the optional query-string suffix apply to every format and not only to the first and the last one.
 */
public static final Pattern DATASET_URL_FILTER = Pattern.compile(".+(?:dataset[s]?/.*|(?:\\.|format=)(?:" + dataset_formats + ")(?:\\?.+)?$)");


Expand Down
76 changes: 76 additions & 0 deletions src/main/resources/dataset-mimetypes.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
application/vnd.ms-excel
application/vnd.ms-excel.sheet.binary.macroenabled.12
application/vnd.openxmlformats-officedocument.spreadsheetml.sheet
application/vnd.oasis.opendocument.spreadsheet
application/vnd.las
text/csv
text/tab-separated-values
text/plain // Some csv and txt datasets
application/json
application/bson
application/geo+json
application/xml // There is also the "text/xml", but that is not a binary-dataset-file.
application/rdf+xml
text/turtle
application/n-triples
application/xml (schema) // The space with the "parenthesis" is part of the mimetype.
application/smil+xml
application/smil
application/yaml
text/yaml
application/x-rdata
text/rdf+n3
application/zip
application/gzip
application/rar
application/vnd.rar
application/x-tar
application/x-7z-compressed
application/x-sas-data // ".sas7bdat" file
application/x-netcdf // nc3, nc4, ns
application/netcdf // nc3, nc4, ns
application/x-parquet
application/x-avro
application/x-sql
application/sql
image/tiff
application/fits
application/x-esri-shapefile
application/vnd.google-earth.kml+xml
application/gml+xml
application/x-hdf5
application/x-spss-sav
application/x-stata-dta
application/x-sas7bdat
application/x-feather
application/x-matlab-data
application/dicom
application/grib
application/vnd.tcpdump.pcap
text/x-vcard
application/vcf
application/cbor
image/x-exr
application/x-biosample
application/x-hic
application/warc
application/iges
application/sla
application/dxf
chemical/x-pdb
chemical/x-mdl-sdfile
chemical/x-cif
text/x-fastaq
text/x-fasta
image/apng
application/nmrml+xml
application/sbml+xml
application/vnd.wileyml+xml
application/x-sra
application/x-cdf
application/x-vtp
model/gltf+json
model/stl
application/x-ply
application/x-mtl
application/abc
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import eu.openaire.publications_retriever.exceptions.DocLinkFoundException;
import eu.openaire.publications_retriever.exceptions.DocLinkInvalidException;
import eu.openaire.publications_retriever.util.http.ConnSupportUtils;
import eu.openaire.publications_retriever.util.url.LoaderAndChecker;
import eu.openaire.publications_retriever.util.url.UrlTypeChecker;
import eu.openaire.publications_retriever.util.url.UrlUtils;
import org.junit.jupiter.api.BeforeAll;
Expand Down Expand Up @@ -33,7 +34,10 @@ public class LinkExtraction {

@BeforeAll
static void setExampleHtml() {
LoaderAndChecker.retrieveDocuments = true;
LoaderAndChecker.retrieveDatasets = true;
ConnSupportUtils.setKnownMimeTypes();
UrlTypeChecker.setURLDirectoryFilterRegex();
exampleHtml = "<head><head>" +
"<body>" +
"<p>Select a link from below!</p>" +
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,20 @@ public void testCustomInputOutputWithoutDownloading()
}


/**
 * Runs the program's "main" with a custom input-file, asking it to retrieve only datasets.
 * This test is marked as disabled.
 */
@Disabled
@Test
public void testCustomInputOutputWithDatasets()
{
	// The value of "-retrieveDataType" can be: "document" OR "dataset" OR "all"
	String[] args = {
			"-retrieveDataType", "dataset",
			"-inputFileFullPath", "./testData/idUrlPairs/datasets_100.json"
	};
	main(args);
}



@Disabled
@Test
public void testCustomInputOutputWithoutDownloadingWithInputFile()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,8 @@ public void checkUrlConnectivity()

ArrayList<String> urlList = new ArrayList<>();

ConnSupportUtils.setKnownMimeTypes();

// TODO - After that test runs, I should be able to observe any such conflicts and try to resolve them, resulting in more DocUrls and faster processing times!

//urlList.add("http://repositorio.ipen.br:8080/xmlui/bitstream/handle/123456789/11176/09808.pdf?sequence=1&isAllowed=y");
//urlList.add("https://ris.utwente.nl/ws/portalfiles/portal/5118887");
Expand Down Expand Up @@ -792,6 +793,8 @@ public void checkUrlConnectivity()
if ( FileUtils.shouldDownloadDocFiles ) {
FileUtils.shouldDeleteOlderDocFiles = true;
FileUtils.storeDocFilesDir = FileUtils.workingDir + "testDocFiles" + File.separator;
ConnSupportUtils.setKnownMimeTypes();
UrlTypeChecker.setURLDirectoryFilterRegex();
FileUtils.handleStoreDocFileDirectory();
}

Expand Down Expand Up @@ -827,6 +830,7 @@ public void checkUrlConnectivity()
try {
HttpConnUtils.connectAndCheckMimeType(testID, urlToCheck, urlToCheck, urlToCheck, null, true, false); // Sent the < null > in quotes to avoid an NPE in the concurrent data-structures.
} catch (Exception e) {
// The problem was logged inside.
UrlUtils.addOutputData(testID, urlToCheck, null, UrlUtils.unreachableDocOrDatasetUrlIndicator, "Discarded at loading time, due to connectivity problems.", null, true, "true", "true", "false", "false", "false", null, "null");
}
}
Expand Down

0 comments on commit 8925fa0

Please sign in to comment.