Skip to content

Commit

Permalink
Merge pull request #42 from DiSSCo/feature/mids-0-check
Browse files Browse the repository at this point in the history
enforce MIDS 0 minimum, plus some fixes
  • Loading branch information
samleeflang authored Oct 17, 2023
2 parents 3bc10bc + 36b04d2 commit 8355b14
Show file tree
Hide file tree
Showing 14 changed files with 178 additions and 96 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -177,6 +177,11 @@ private void processUnit(DataSet dataset, Unit unit)
if (isAcceptedBasisOfRecord(unit)) {
try {
var attributes = digitalSpecimenDirector.assembleDigitalSpecimenTerm(unitAttributes, false);
if (attributes.getOdsNormalisedPhysicalSpecimenId() == null
|| attributes.getDwcInstitutionId() == null) {
throw new DiSSCoDataException(
"Record does not comply to MIDS level 0 (id and organisation), ignoring record");
}
var digitalSpecimen = new DigitalSpecimenWrapper(
attributes.getOdsNormalisedPhysicalSpecimenId(),
fdoProperties.getDigitalSpecimenType(),
Expand Down
89 changes: 40 additions & 49 deletions src/main/java/eu/dissco/core/translator/service/DwcaService.java
Original file line number Diff line number Diff line change
Expand Up @@ -136,18 +136,15 @@ private void processDigitalSpecimen(Collection<ObjectNode> fullRecords)
throws JsonProcessingException {
for (var fullRecord : fullRecords) {
if (fullRecord != null) {
var recordId = fullRecord.get(DwcaTerm.ID.prefixedName()).asText();
if (!recordNeedsToBeIgnored(fullRecord, recordId)) {
try {
var digitalObjects = createDigitalObjects(fullRecord);
log.debug("Digital Specimen: {}", digitalObjects);
var translatorEvent = new DigitalSpecimenEvent(enrichmentServices(false),
digitalObjects.getLeft(), digitalObjects.getRight());
kafkaService.sendMessage("digital-specimen",
mapper.writeValueAsString(translatorEvent));
} catch (DiSSCoDataException e) {
log.error("Encountered data issue with record: {}", fullRecord, e);
}
try {
var digitalObjects = createDigitalObjects(fullRecord);
log.debug("Digital Specimen: {}", digitalObjects);
var translatorEvent = new DigitalSpecimenEvent(enrichmentServices(false),
digitalObjects.getLeft(), digitalObjects.getRight());
kafkaService.sendMessage("digital-specimen",
mapper.writeValueAsString(translatorEvent));
} catch (DiSSCoDataException e) {
log.error("Encountered data issue with record: {}", fullRecord, e);
}
}
}
Expand Down Expand Up @@ -214,6 +211,10 @@ private List<DigitalMediaObjectEvent> publishAssociatedMedia(String recordId,
private Pair<DigitalSpecimenWrapper, List<DigitalMediaObjectEvent>> createDigitalObjects(
JsonNode fullRecord) throws DiSSCoDataException {
var ds = digitalSpecimenDirector.assembleDigitalSpecimenTerm(fullRecord, true);
if (ds.getOdsNormalisedPhysicalSpecimenId() == null || ds.getDwcInstitutionId() == null) {
throw new DiSSCoDataException(
"Record does not comply to MIDS level 0 (id and organisation), ignoring record");
}
return Pair.of(new DigitalSpecimenWrapper(
ds.getOdsNormalisedPhysicalSpecimenId(),
fdoProperties.getDigitalSpecimenType(),
Expand All @@ -233,29 +234,6 @@ private JsonNode cleanupRedundantFields(JsonNode fullRecord) {
return originalData;
}

/**
 * Decides whether a record has to be skipped based on its basisOfRecord field.
 *
 * <p>A record is ignored when the basisOfRecord field is absent, when it is not a textual node,
 * or when its upper-cased value is not contained in {@code allowedBasisOfRecord}. Every rejection
 * is logged at WARN level together with the record id.
 *
 * @param fullRecord the record to inspect
 * @param recordId   identifier of the record, only used in the log messages
 * @return {@code true} when the record should be ignored, {@code false} when it may be processed
 */
private boolean recordNeedsToBeIgnored(JsonNode fullRecord, String recordId) {
  var basisOfRecord = fullRecord.get(DwcTerm.basisOfRecord.prefixedName());
  if (basisOfRecord == null) {
    log.warn("Record with id: {} is missing the basis of Record, Record will be ignored",
        recordId);
    return true;
  }
  if (!basisOfRecord.isTextual()) {
    log.warn("Record with id: {} has basis of Record which is not a text field: {}",
        recordId, basisOfRecord);
    return true;
  }
  // Upper-case with a fixed locale: the default-locale variant maps 'i' to a dotted capital
  // under e.g. a Turkish JVM locale, which would make valid values miss the allow-list.
  var value = basisOfRecord.asText().toUpperCase(java.util.Locale.ROOT);
  if (allowedBasisOfRecord.contains(value)) {
    return false;
  }
  log.warn("Record with id: {} has basisOfRecord: {} which is not a physical specimen",
      recordId, basisOfRecord);
  return true;
}

private Collection<List<String>> prepareChunks(List<String> inputList, int chunkSize) {
AtomicInteger counter = new AtomicInteger();
return inputList.stream()
Expand All @@ -268,7 +246,7 @@ private List<String> postArchiveToDatabase(Archive archive) {
createTempTables(tableNames);
log.info("Created tables: {}", tableNames);
var idList = postCore(archive.getCore());
postExtensions(archive.getExtensions());
postExtensions(archive.getExtensions(), idList);
return idList;

}
Expand Down Expand Up @@ -305,16 +283,24 @@ private ArrayList<String> postCore(ArchiveFile core) {
var dbRecords = new ArrayList<Pair<String, JsonNode>>();
var idList = new ArrayList<String>();
for (var rec : core) {
idList.add(rec.id());
var json = convertToJson(core, rec);
json.set(EXTENSIONS, mapper.createObjectNode());
dbRecords.add(Pair.of(rec.id(), json));
if (dbRecords.size() % 10000 == 0) {
postToDatabase(core, dbRecords);
var basisOfRecord = rec.value(DwcTerm.basisOfRecord);
if (basisOfRecord != null && allowedBasisOfRecord.contains(basisOfRecord.toUpperCase())) {
idList.add(rec.id());
var json = convertToJson(core, rec);
json.set(EXTENSIONS, mapper.createObjectNode());
dbRecords.add(Pair.of(rec.id(), json));
if (dbRecords.size() % 10000 == 0) {
postToDatabase(core, dbRecords);
}
} else {
log.debug("Record with id: {} has basisOfRecord: {} which is not an accepted basisOfRecord",
rec.id(), basisOfRecord);
}
}
postToDatabase(core, dbRecords);
log.info("Finished posting core archive to database");
if (!dbRecords.isEmpty()) {
postToDatabase(core, dbRecords);
}
log.info("Finished posting core archive to database, total records: {}", idList.size());
return idList;
}

Expand All @@ -326,16 +312,21 @@ private void postToDatabase(ArchiveFile archiveFile,
}


private void postExtensions(Set<ArchiveFile> extensions) {
private void postExtensions(Set<ArchiveFile> extensions, List<String> idsList) {
var dbRecords = new ArrayList<Pair<String, JsonNode>>();
for (var extension : extensions) {
log.info("Processing records of extension: {}", extension.getRowType().toString());
for (var rec : extension) {
dbRecords.add(Pair.of(rec.id(), convertToJson(extension, rec)));
if (dbRecords.size() % 10000 == 0) {
postToDatabase(extension, dbRecords);
if (idsList.contains(rec.id())) {
dbRecords.add(Pair.of(rec.id(), convertToJson(extension, rec)));
if (dbRecords.size() % 10000 == 0) {
postToDatabase(extension, dbRecords);
}
}
}
postToDatabase(extension, dbRecords);
if (!dbRecords.isEmpty()) {
postToDatabase(extension, dbRecords);
}
}
log.info("Finished posting extensions archive to database");
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -316,8 +316,7 @@ protected Identifications createIdentification(JsonNode data, boolean dwc) {
.withDwcVernacularName(termMapper.retrieveTerm(new VernacularName(), data, dwc));
return new Identifications()
.withDwcIdentificationID(termMapper.retrieveTerm(new IdentificationId(), data, dwc))
.withDwcIdentificationVerificationStatus(Boolean.valueOf(
termMapper.retrieveTerm(new IdentificationVerificationStatus(), data, dwc)))
.withDwcIdentificationVerificationStatus(parseToBoolean(new IdentificationVerificationStatus(), data, dwc))
.withDwcTypeStatus(termMapper.retrieveTerm(new TypeStatus(), data, dwc))
.withDwcDateIdentified(termMapper.retrieveTerm(new DateIdentified(), data, dwc))
.withDwcIdentifiedBy(termMapper.retrieveTerm(new IdentifiedBy(), data, dwc))
Expand Down
6 changes: 5 additions & 1 deletion src/main/java/eu/dissco/core/translator/terms/Term.java
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package eu.dissco.core.translator.terms;


import eu.dissco.core.translator.schema.Identifications;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import java.util.List;
Expand Down Expand Up @@ -113,9 +114,12 @@ protected String searchABCDSplitTerms(JsonNode unit, List<String> searchTerms,
return null;
}

protected eu.dissco.core.translator.schema.Identifications retrieveAcceptedIdentification(
protected Identifications retrieveAcceptedIdentification(
eu.dissco.core.translator.schema.DigitalSpecimen ds) {
if (ds.getDwcIdentification() != null && !ds.getDwcIdentification().isEmpty()) {
if (ds.getDwcIdentification().size() == 1) {
return ds.getDwcIdentification().get(0);
}
for (var identification : ds.getDwcIdentification()) {
if (Boolean.TRUE.equals(identification.getDwcIdentificationVerificationStatus())) {
return identification;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,14 @@ public class IdentificationVerificationStatus extends Term {
@Override
public String retrieveFromDWCA(JsonNode unit) {
var result = super.searchJsonForTerm(unit, dwcaTerms);
if (result.equals("1")) {
return Boolean.TRUE.toString();
if (result != null){
if (result.equals("1")) {
return Boolean.TRUE.toString();
} else {
return Boolean.FALSE.toString();
}
} else {
return Boolean.FALSE.toString();
return null;
}
}

Expand Down
12 changes: 4 additions & 8 deletions src/main/resources/json-schema/citations.json
Original file line number Diff line number Diff line change
Expand Up @@ -5,23 +5,19 @@
"properties": {
"dcterms:type": {
"type": "string",
"description": "Unclear yet",
"$comment": "Unknown what this field should be"
"description": "https://purl.org/dc/terms/type"
},
"dcterms:date": {
"type": "string",
"description": "Unclear yet",
"$comment": "Unknown what this field should be"
"description": "https://purl.org/dc/terms/date"
},
"dcterms:title": {
"type": "string",
"description": "Unclear yet",
"$comment": "Unknown what this field should be"
"description": "https://purl.org/dc/terms/title"
},
"dcterms:creator": {
"type": "string",
"description": "Unclear yet",
"$comment": "Unknown what this field should be"
"description": "https://purl.org/dc/elements/1.1/creator"
},
"???:citationPageNumber": {
"type": "string",
Expand Down
18 changes: 16 additions & 2 deletions src/test/java/eu/dissco/core/translator/TestUtils.java
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.Map;
import java.util.stream.Stream;
import org.junit.jupiter.params.provider.Arguments;
import org.springframework.core.io.ClassPathResource;

public class TestUtils {
Expand All @@ -13,11 +15,14 @@ public class TestUtils {
public static String SOURCE_SYSTEM_ID = "20.5000.1025/GW0-TYL-YRU";
public static String ENDPOINT = "https://data.rbge.org.uk/service/dwca/data/darwin_core_living.zip";

public static String INSTITUTION_ID = "https://ror.org/02y22ws83";
public static String NORMALISED_PHYSICAL_SPECIMEN_ID = "http://coldb.mnhn.fr/catalognumber/mnhn/ec/ec10867";

public static String MOCK_DATE = "29-09-2023";
public static Map<String, String> DEFAULT_MAPPING = Map.of(
"ods:physicalSpecimenIdType", "cetaf",
"ods:type", "ZoologyVertebrateSpecimen",
"ods:organisationId", "https://ror.org/02y22ws83"
"ods:organisationId", INSTITUTION_ID
);
public static Map<String, String> TERM_MAPPING = Map.of(
"ods:physicalSpecimenId", "dwc:occurrenceID",
Expand Down Expand Up @@ -61,6 +66,15 @@ public static String loadResourceFile(String fileName) throws IOException {

public static DigitalSpecimen givenDigitalSpecimen() {
return new DigitalSpecimen()
.withOdsNormalisedPhysicalSpecimenId("http://coldb.mnhn.fr/catalognumber/mnhn/ec/ec10867");
.withOdsNormalisedPhysicalSpecimenId(NORMALISED_PHYSICAL_SPECIMEN_ID)
.withDwcInstitutionId(INSTITUTION_ID);
}

/**
 * Supplies test arguments consisting of two digital specimens that are each only partially
 * filled: the first carries just the normalised physical specimen id, the second just the
 * institution id.
 *
 * @return a stream with one {@link Arguments} entry per partially filled specimen
 */
public static Stream<Arguments> provideInvalidDigitalSpecimen() {
  var specimenWithoutInstitution = new DigitalSpecimen()
      .withOdsNormalisedPhysicalSpecimenId(NORMALISED_PHYSICAL_SPECIMEN_ID);
  var specimenWithoutPhysicalId = new DigitalSpecimen()
      .withDwcInstitutionId(INSTITUTION_ID);
  return Stream.of(
      Arguments.of(specimenWithoutInstitution),
      Arguments.of(specimenWithoutPhysicalId));
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,13 @@ class DwcaRepositoryTest extends BaseRepositoryIT {

private DwcaRepository repository;

/**
 * Builds a two-field JSON record for the repository tests: a fixed
 * {@code someRandomInformation} entry plus a {@code someCorruptedInformation} entry holding the
 * supplied value.
 *
 * @param corruptedValue text stored under the {@code someCorruptedInformation} key
 * @return the assembled object node
 */
private static JsonNode givenRecord(String corruptedValue) {
  return MAPPER.createObjectNode()
      .put("someRandomInformation", "someRandomInformation")
      .put("someCorruptedInformation", corruptedValue);
}

@BeforeEach
void setup() {
repository = new DwcaRepository(MAPPER, context);
Expand Down Expand Up @@ -47,7 +54,8 @@ void getCoreRecords() {
void getCorruptCoreRecords() {
// Given
var tableName = "XXX-XXX-XXX_Core";
var records = List.of(Pair.of(UUID.randomUUID().toString(), givenRecord("\u0000 someCorruptedInformation")));
var records = List.of(
Pair.of(UUID.randomUUID().toString(), givenRecord("\u0000 someCorruptedInformation")));
repository.createTable(tableName);
repository.postRecords(tableName, records);

Expand All @@ -57,7 +65,8 @@ void getCorruptCoreRecords() {
repository.deleteTable(tableName);

// Then
assertThat(results).isEqualTo(Map.of(records.get(0).getLeft(), givenRecord(" someCorruptedInformation")));
assertThat(results).isEqualTo(
Map.of(records.get(0).getLeft(), givenRecord(" someCorruptedInformation")));
}

@Test
Expand Down Expand Up @@ -104,11 +113,4 @@ private List<Pair<String, JsonNode>> givenCoreRecords() {
return records;
}

/**
 * Builds a two-field JSON record for the repository tests: a fixed
 * {@code someRandomInformation} entry plus a {@code someCorruptedInformation} entry holding the
 * supplied value.
 *
 * @param corruptedValue text stored under the {@code someCorruptedInformation} key
 * @return the assembled object node
 */
private static JsonNode givenRecord(String corruptedValue) {
  return MAPPER.createObjectNode()
      .put("someRandomInformation", "someRandomInformation")
      .put("someCorruptedInformation", corruptedValue);
}

}
Loading

0 comments on commit 8355b14

Please sign in to comment.