Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

RO-Crate metadata mime type detection #10016

Merged
merged 9 commits into from
May 7, 2024
10 changes: 10 additions & 0 deletions doc/release-notes/10015-RO-Crate-metadata-file.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
MIME types can now be detected from a complete filename (including its extension), and RO-Crate metadata files are now recognized.

From now on, filenames with extensions can be added to the `MimeTypeDetectionByFileName.properties` file. Filenames added there take precedence over recognizing files by extension alone. For example, two new filenames have been added to that file:
```
ro-crate-metadata.json=application/ld+json; profile="http://www.w3.org/ns/json-ld#flattened http://www.w3.org/ns/json-ld#compacted https://w3id.org/ro/crate"
ro-crate-metadata.jsonld=application/ld+json; profile="http://www.w3.org/ns/json-ld#flattened http://www.w3.org/ns/json-ld#compacted https://w3id.org/ro/crate"
```

Therefore, files named `ro-crate-metadata.json` will from now on be detected as RO-Crate metadata files instead of as generic `JSON` files.
For more information on the RO-Crate specifications, see https://www.researchobject.org/ro-crate
74 changes: 49 additions & 25 deletions src/main/java/edu/harvard/iq/dataverse/util/FileUtil.java
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,7 @@
import java.util.HashMap;
import java.util.List;
import java.util.Optional;
import java.util.ResourceBundle;
import java.util.UUID;
import java.util.logging.Level;
import java.util.logging.Logger;
Expand Down Expand Up @@ -176,6 +177,7 @@ public class FileUtil implements java.io.Serializable {
public static final String MIME_TYPE_NETCDF = "application/netcdf";
public static final String MIME_TYPE_XNETCDF = "application/x-netcdf";
public static final String MIME_TYPE_HDF5 = "application/x-hdf5";
public static final String MIME_TYPE_RO_CRATE = "application/ld+json; profile=\"http://www.w3.org/ns/json-ld#flattened http://www.w3.org/ns/json-ld#compacted https://w3id.org/ro/crate\"";

// File type "thumbnail classes" tags:

Expand Down Expand Up @@ -272,6 +274,11 @@ public static String getUserFriendlyFileType(DataFile dataFile) {
if (fileType.equalsIgnoreCase(ShapefileHandler.SHAPEFILE_FILE_TYPE)){
return ShapefileHandler.SHAPEFILE_FILE_TYPE_FRIENDLY_NAME;
}
try {
return BundleUtil.getStringFromPropertyFile(fileType,"MimeTypeDisplay" );
} catch (MissingResourceException e) {
//NOOP: we will try again after trimming ";"
}
if (fileType.contains(";")) {
fileType = fileType.substring(0, fileType.indexOf(";"));
}
Expand All @@ -286,6 +293,11 @@ public static String getUserFriendlyFileType(DataFile dataFile) {
}

public static String getIndexableFacetFileType(DataFile dataFile) {
try {
return BundleUtil.getStringFromDefaultPropertyFile(dataFile.getContentType(),"MimeTypeFacets" );
} catch (MissingResourceException e) {
//NOOP: we will try again after trimming ";"
}
String fileType = getFileType(dataFile);
try {
return BundleUtil.getStringFromDefaultPropertyFile(fileType,"MimeTypeFacets" );
Expand Down Expand Up @@ -415,7 +427,10 @@ public static String retestIngestableFileType(File file, String fileType) {
}

public static String determineFileType(File f, String fileName) throws IOException{
String fileType = null;
String fileType = lookupFileTypeByFileName(fileName);
if (fileType != null) {
return fileType;
}
String fileExtension = getFileExtension(fileName);


Expand Down Expand Up @@ -474,17 +489,17 @@ public static String determineFileType(File f, String fileName) throws IOExcepti
if (fileType != null && fileType.startsWith("text/plain") && STATISTICAL_FILE_EXTENSION.containsKey(fileExtension)) {
fileType = STATISTICAL_FILE_EXTENSION.get(fileExtension);
} else {
fileType = determineFileTypeByNameAndExtension(fileName);
fileType = lookupFileTypeByExtension(fileName);
}

logger.fine("mime type recognized by extension: "+fileType);
}
} else {
logger.fine("fileExtension is null");
String fileTypeByName = lookupFileTypeFromPropertiesFile(fileName);
if(!StringUtil.isEmpty(fileTypeByName)) {
logger.fine(String.format("mime type: %s recognized by filename: %s", fileTypeByName, fileName));
fileType = fileTypeByName;
final String fileTypeByExtension = lookupFileTypeByExtensionFromPropertiesFile(fileName);
if(!StringUtil.isEmpty(fileTypeByExtension)) {
logger.fine(String.format("mime type: %s recognized by extension: %s", fileTypeByExtension, fileName));
fileType = fileTypeByExtension;
}
}

Expand Down Expand Up @@ -529,33 +544,41 @@ public static String determineFileType(File f, String fileName) throws IOExcepti
return fileType;
}

public static String determineFileTypeByNameAndExtension(String fileName) {
String mimetypesFileTypeMapResult = MIME_TYPE_MAP.getContentType(fileName);
/**
 * Determines a file type from the file name alone, without reading file content.
 * The lookup keyed on the complete file name is tried first; the extension-based
 * lookup is consulted only when the first one returns {@code null}.
 *
 * @param fileName the file name to resolve
 * @return the type found by the file-name lookup, otherwise whatever the
 *         extension-based lookup returns
 */
public static String determineFileTypeByNameAndExtension(final String fileName) {
    final String typeByFullName = lookupFileTypeByFileName(fileName);
    // A full-name match takes precedence over any extension match.
    return typeByFullName != null ? typeByFullName : lookupFileTypeByExtension(fileName);
}

private static String lookupFileTypeByExtension(final String fileName) {
final String mimetypesFileTypeMapResult = MIME_TYPE_MAP.getContentType(fileName);
logger.fine("MimetypesFileTypeMap type by extension, for " + fileName + ": " + mimetypesFileTypeMapResult);
if (mimetypesFileTypeMapResult != null) {
if ("application/octet-stream".equals(mimetypesFileTypeMapResult)) {
return lookupFileTypeFromPropertiesFile(fileName);
} else {
return mimetypesFileTypeMapResult;
}
} else {
if (mimetypesFileTypeMapResult == null) {
return null;
}
if ("application/octet-stream".equals(mimetypesFileTypeMapResult)) {
return lookupFileTypeByExtensionFromPropertiesFile(fileName);
}
return mimetypesFileTypeMapResult;
}

public static String lookupFileTypeFromPropertiesFile(String fileName) {
String fileKey = FilenameUtils.getExtension(fileName);
String propertyFileName = "MimeTypeDetectionByFileExtension";
if(fileKey == null || fileKey.isEmpty()) {
fileKey = fileName;
propertyFileName = "MimeTypeDetectionByFileName";
/**
 * Looks up a file type using the complete file name as the key in the
 * "MimeTypeDetectionByFileName" property file.
 *
 * @param fileName the complete file name used as the lookup key
 * @return the result of the property-file lookup, unchanged
 */
private static String lookupFileTypeByFileName(final String fileName) {
    final String bundleName = "MimeTypeDetectionByFileName";
    return lookupFileTypeFromPropertiesFile(bundleName, fileName);
}

}
String propertyFileNameOnDisk = propertyFileName + ".properties";
/**
 * Looks up a file type using the file name's extension (as extracted by
 * {@code FilenameUtils.getExtension}) as the key in the
 * "MimeTypeDetectionByFileExtension" property file.
 *
 * @param fileName the file name whose extension is used as the lookup key
 * @return the result of the property-file lookup, unchanged
 */
private static String lookupFileTypeByExtensionFromPropertiesFile(final String fileName) {
    return lookupFileTypeFromPropertiesFile(
            "MimeTypeDetectionByFileExtension",
            FilenameUtils.getExtension(fileName));
}

private static String lookupFileTypeFromPropertiesFile(final String propertyFileName, final String fileKey) {
final String propertyFileNameOnDisk = propertyFileName + ".properties";
try {
logger.fine("checking " + propertyFileNameOnDisk + " for file key " + fileKey);
return BundleUtil.getStringFromPropertyFile(fileKey, propertyFileName);
} catch (MissingResourceException ex) {
} catch (final MissingResourceException ex) {
logger.info(fileKey + " is a filename/extension Dataverse doesn't know about. Consider adding it to the " + propertyFileNameOnDisk + " file.");
return null;
}
Expand Down Expand Up @@ -810,7 +833,8 @@ public static boolean useRecognizedType(String suppliedContentType, String recog
|| canIngestAsTabular(recognizedType) || recognizedType.equals("application/fits-gzipped")
|| recognizedType.equalsIgnoreCase(ShapefileHandler.SHAPEFILE_FILE_TYPE)
|| recognizedType.equalsIgnoreCase(BagItFileHandler.FILE_TYPE)
|| recognizedType.equals(MIME_TYPE_ZIP)) {
|| recognizedType.equals(MIME_TYPE_ZIP)
|| recognizedType.equals(MIME_TYPE_RO_CRATE)) {
return true;
}
return false;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,5 @@ Makefile=text/x-makefile
Snakemake=text/x-snakemake
Dockerfile=application/x-docker-file
Vagrantfile=application/x-vagrant-file
ro-crate-metadata.json=application/ld+json; profile="http://www.w3.org/ns/json-ld#flattened http://www.w3.org/ns/json-ld#compacted https://w3id.org/ro/crate"
ro-crate-metadata.jsonld=application/ld+json; profile="http://www.w3.org/ns/json-ld#flattened http://www.w3.org/ns/json-ld#compacted https://w3id.org/ro/crate"
1 change: 1 addition & 0 deletions src/main/java/propertyFiles/MimeTypeDisplay.properties
Original file line number Diff line number Diff line change
Expand Up @@ -222,5 +222,6 @@ text/xml-graphml=GraphML Network Data
application/octet-stream=Unknown
application/x-docker-file=Docker Image File
application/x-vagrant-file=Vagrant Image File
application/ld+json;\u0020profile\u003d\u0022http\u003a//www.w3.org/ns/json-ld#flattened\u0020http\u003a//www.w3.org/ns/json-ld#compacted\u0020https\u003a//w3id.org/ro/crate\u0022=RO-Crate metadata
# Dataverse-specific
application/vnd.dataverse.file-package=Dataverse Package
1 change: 1 addition & 0 deletions src/main/java/propertyFiles/MimeTypeFacets.properties
Original file line number Diff line number Diff line change
Expand Up @@ -224,5 +224,6 @@ video/webm=Video
text/xml-graphml=Network Data
# Other
application/octet-stream=Unknown
application/ld+json;\u0020profile\u003d\u0022http\u003a//www.w3.org/ns/json-ld#flattened\u0020http\u003a//www.w3.org/ns/json-ld#compacted\u0020https\u003a//w3id.org/ro/crate\u0022=Metadata
# Dataverse-specific
application/vnd.dataverse.file-package=Data
25 changes: 25 additions & 0 deletions src/test/java/edu/harvard/iq/dataverse/util/FileUtilTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -409,4 +409,29 @@ public void testGZipFile() throws IOException {
assertEquals("application/fits-gzipped", contentType);
}

/**
 * Checks the RO-Crate content type end to end: its friendly name, its facet
 * name, and its detection for a file named "ro-crate-metadata.json". Also checks
 * that a known content type carrying an extra ";"-separated parameter still
 * resolves to the friendly and facet names of the bare type.
 */
@Test
public void testDetermineFileTypeROCrate() {
    final String roCrateMimeType = "application/ld+json; profile=\"http://www.w3.org/ns/json-ld#flattened http://www.w3.org/ns/json-ld#compacted https://w3id.org/ro/crate\"";
    final DataFile roCrateDataFile = new DataFile(roCrateMimeType);

    // The complete content type, profile parameter included, has its own display and facet names.
    assertEquals(roCrateMimeType, roCrateDataFile.getContentType());
    assertEquals("RO-Crate metadata", FileUtil.getUserFriendlyFileType(roCrateDataFile));
    assertEquals("Metadata", FileUtil.getIndexableFacetFileType(roCrateDataFile));

    // determineFileType must report the RO-Crate type for the sample metadata file.
    final File metadataFile = new File("src/test/resources/fileutil/ro-crate-metadata.json");
    try {
        final String detectedType = FileUtil.determineFileType(metadataFile, "ro-crate-metadata.json");
        assertEquals(roCrateMimeType, detectedType);
    } catch (IOException ex) {
        fail(ex);
    }

    // A type with an unrecognized parameter suffix is expected to resolve as the type before the ";".
    final String dockerMimeTypeWithProfile = "application/x-docker-file; profile=\"http://www.w3.org/ns/json-ld#flattened http://www.w3.org/ns/json-ld#compacted https://w3id.org/ro/crate\"";
    final DataFile dockerFileWithProfile = new DataFile(dockerMimeTypeWithProfile);

    assertEquals(dockerMimeTypeWithProfile, dockerFileWithProfile.getContentType());
    assertEquals("Docker Image File", FileUtil.getUserFriendlyFileType(dockerFileWithProfile));
    assertEquals("Code", FileUtil.getIndexableFacetFileType(dockerFileWithProfile));
}

}
Loading