Skip to content

Commit

Permalink
Merge pull request #52 from DiSSCo/feature/dwca-extension-performance…
Browse files Browse the repository at this point in the history
…-update

Feature/dwca extension performance update
  • Loading branch information
samleeflang authored Dec 13, 2023
2 parents 463b7f4 + 8062831 commit 8303e15
Show file tree
Hide file tree
Showing 2 changed files with 29 additions and 12 deletions.
36 changes: 25 additions & 11 deletions src/main/java/eu/dissco/core/translator/service/DwcaService.java
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Optional;
Expand Down Expand Up @@ -65,6 +66,7 @@ public class DwcaService extends WebClientService {

private static final String DWC_ASSOCIATED_MEDIA = "dwc:associatedMedia";
private static final String GBIF_MULTIMEDIA = "gbif:Multimedia";
private static final String EML_LICENSE = "eml:license";
private static final String AC_MULTIMEDIA = "http://rs.tdwg.org/ac/terms/Multimedia";
private static final String EXTENSIONS = "extensions";

Expand Down Expand Up @@ -112,7 +114,7 @@ public void retrieveData() {
}
}

public void getSpecimenData(List<String> ids, Archive archive) throws JsonProcessingException {
public void getSpecimenData(Set<String> ids, Archive archive) throws JsonProcessingException {
var batches = prepareChunks(ids, 10000);
var optionalEmlData = addDatasetMeta(archive.getMetadataLocationFile());
for (var batch : batches) {
Expand Down Expand Up @@ -204,7 +206,7 @@ private void retrieveEmlData(XMLEventReader xmlEventReader, HashMap<String, Stri
if (isStartElement(element, "intellectualRights")) {
var license = retrieveLicense(xmlEventReader);
if (license != null) {
emlData.put("eml:license", license);
emlData.put(EML_LICENSE, license);
}
}
}
Expand Down Expand Up @@ -236,22 +238,33 @@ private List<DigitalMediaObjectEvent> processMedia(String recordId, JsonNode ful
if (extensions != null) {
if (extensions.get(AC_MULTIMEDIA) != null) {
var imageArray = extensions.get(AC_MULTIMEDIA);
addDatasetMetadata(imageArray, fullDigitalSpecimen);
if (imageArray.isArray() && !imageArray.isEmpty()) {
return extractMultiMedia(recordId, imageArray, organisationId);
}
} else if (extensions.get(GBIF_MULTIMEDIA) != null) {
var imageArray = extensions.get(GBIF_MULTIMEDIA);
addDatasetMetadata(imageArray, fullDigitalSpecimen);
if (imageArray.isArray() && !imageArray.isEmpty()) {
return extractMultiMedia(recordId, imageArray, organisationId);
}
}
} else if (fullDigitalSpecimen.get(DWC_ASSOCIATED_MEDIA) != null) {
return publishAssociatedMedia(recordId,
fullDigitalSpecimen.get(DWC_ASSOCIATED_MEDIA).asText(), organisationId);
fullDigitalSpecimen.get(DWC_ASSOCIATED_MEDIA).asText(), organisationId,
fullDigitalSpecimen.get(EML_LICENSE));
}
return List.of();
}

/**
 * Copies the dataset-level license ({@code eml:license}) of the digital specimen onto each
 * media record, so the license travels with the media when it is processed on its own.
 *
 * <p>The caller invokes this before it verifies that {@code imageArray} is an array. Iterating
 * a non-array node yields its child values, which are not necessarily JSON objects, so
 * every element is type-checked instead of being cast blindly.
 *
 * @param imageArray          node holding the media records; elements that are not JSON objects
 *                            are left untouched
 * @param fullDigitalSpecimen specimen record from which the {@code eml:license} value is read
 */
private void addDatasetMetadata(JsonNode imageArray, JsonNode fullDigitalSpecimen) {
  // The license is identical for every media record, so look it up once.
  // NOTE(review): when the specimen has no eml:license entry this is null, and Jackson's
  // ObjectNode.set appears to store an explicit JSON null in that case - confirm this is intended.
  var license = fullDigitalSpecimen.get(EML_LICENSE);
  for (JsonNode jsonNode : imageArray) {
    // Only JSON objects can carry the extra field; skipping other node types avoids a
    // ClassCastException on scalar or nested-array elements.
    if (jsonNode instanceof ObjectNode) {
      ((ObjectNode) jsonNode).set(EML_LICENSE, license);
    }
  }
}

private List<DigitalMediaObjectEvent> extractMultiMedia(String recordId, JsonNode imageArray,
String organisationId) {
var digitalMediaObjectEvents = new ArrayList<DigitalMediaObjectEvent>();
Expand Down Expand Up @@ -279,8 +292,8 @@ private List<DigitalMediaObjectEvent> extractMultiMedia(String recordId, JsonNod
}

private List<DigitalMediaObjectEvent> publishAssociatedMedia(String recordId,
String associatedMedia,
String organisationId) throws OrganisationException {
String associatedMedia, String organisationId, JsonNode licenseNode)
throws OrganisationException {
log.debug("Digital Specimen: {}, has associatedMedia {}", recordId,
associatedMedia);
String[] mediaUrls = associatedMedia.split("\\|");
Expand All @@ -291,7 +304,8 @@ private List<DigitalMediaObjectEvent> publishAssociatedMedia(String recordId,
fdoProperties.getDigitalMediaObjectType(),
recordId,
digitalSpecimenDirector.assembleDigitalMediaObjects(true,
mapper.createObjectNode().put("ac:accessUri", mediaUrl),
mapper.createObjectNode().put("ac:accessUri", mediaUrl)
.set(EML_LICENSE, licenseNode),
organisationId),
null));
digitalMediaObjects.add(digitalMediaObject);
Expand Down Expand Up @@ -325,14 +339,14 @@ private JsonNode cleanupRedundantFields(JsonNode fullRecord) {
return originalData;
}

private Collection<List<String>> prepareChunks(List<String> inputList, int chunkSize) {
/**
 * Splits the given ids into consecutive batches of at most {@code chunkSize} elements.
 *
 * <p>Batches are filled in the iteration order of {@code inputList}; only the last batch may
 * hold fewer than {@code chunkSize} elements. An empty input yields an empty collection.
 *
 * @param inputList ids to split into batches
 * @param chunkSize maximum number of ids per batch, must be positive
 * @return the batches, in the order in which they were filled
 * @throws IllegalArgumentException if {@code chunkSize} is zero or negative
 */
private Collection<List<String>> prepareChunks(Set<String> inputList, int chunkSize) {
  if (chunkSize <= 0) {
    throw new IllegalArgumentException("chunkSize must be positive, was: " + chunkSize);
  }
  // A plain loop avoids the stateful, side-effecting lambda (shared counter) in a stream.
  var chunks = new ArrayList<List<String>>();
  List<String> current = null;
  for (var id : inputList) {
    if (current == null || current.size() == chunkSize) {
      current = new ArrayList<>();
      chunks.add(current);
    }
    current.add(id);
  }
  return chunks;
}


private List<String> postArchiveToDatabase(Archive archive) throws DisscoRepositoryException {
private Set<String> postArchiveToDatabase(Archive archive) throws DisscoRepositoryException {
var tableNames = generateTableNames(archive);
createTempTables(tableNames);
log.info("Created tables: {}", tableNames);
Expand Down Expand Up @@ -374,9 +388,9 @@ private void createTempTables(List<String> tableNames) {
}
}

private ArrayList<String> postCore(ArchiveFile core) throws DisscoRepositoryException {
private Set<String> postCore(ArchiveFile core) throws DisscoRepositoryException {
var dbRecords = new ArrayList<Pair<String, JsonNode>>();
var idList = new ArrayList<String>();
var idList = new HashSet<String>();
for (var rec : core) {
var basisOfRecord = rec.value(DwcTerm.basisOfRecord);
if (basisOfRecord != null && allowedBasisOfRecord.contains(basisOfRecord.toUpperCase())) {
Expand Down Expand Up @@ -406,7 +420,7 @@ private void postToDatabase(ArchiveFile archiveFile,
dbRecords.clear();
}

private void postExtensions(Set<ArchiveFile> extensions, List<String> idsList)
private void postExtensions(Set<ArchiveFile> extensions, Set<String> idsList)
throws DisscoRepositoryException {
var dbRecords = new ArrayList<Pair<String, JsonNode>>();
for (var extension : extensions) {
Expand Down
5 changes: 4 additions & 1 deletion src/main/java/eu/dissco/core/translator/terms/License.java
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,10 @@
public class License extends Term {

public static final String TERM = DCTERMS_PREFIX + "license";
private final List<String> dwcaTerms = List.of(TERM, "dc:license", "eml:license");

// Fall back to the rights terms (dcterms:rights, dc:rights) if no license term is present
private final List<String> dwcaTerms = List.of(TERM, "dc:license", "eml:license",
"dcterms:rights", "dc:rights");
private final List<String> abcdUnitTerms = List.of(
"abcd:iprstatements/licenses/license/0/uri",
"abcd:iprstatements/licenses/license/0/text",
Expand Down

0 comments on commit 8303e15

Please sign in to comment.