From 03debc16cfc68e42a5bd4aad3076ed2dbadad94c Mon Sep 17 00:00:00 2001 From: Leonid Andreev Date: Mon, 9 Sep 2024 10:49:53 -0400 Subject: [PATCH 1/9] Some work-in-progress modifications of the Harvested imports and indexing process. #10734 --- .../api/imports/ImportServiceBean.java | 175 ++++++++++-------- .../impl/AbstractCreateDatasetCommand.java | 18 +- .../iq/dataverse/search/IndexServiceBean.java | 6 +- 3 files changed, 118 insertions(+), 81 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/api/imports/ImportServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/api/imports/ImportServiceBean.java index d2bba56f884..42d1a14bf1d 100644 --- a/src/main/java/edu/harvard/iq/dataverse/api/imports/ImportServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/api/imports/ImportServiceBean.java @@ -20,6 +20,7 @@ import edu.harvard.iq.dataverse.DataverseContact; import edu.harvard.iq.dataverse.DataverseServiceBean; import edu.harvard.iq.dataverse.EjbDataverseEngine; +import edu.harvard.iq.dataverse.GlobalId; import edu.harvard.iq.dataverse.MetadataBlockServiceBean; import edu.harvard.iq.dataverse.api.dto.DatasetDTO; import edu.harvard.iq.dataverse.api.imports.ImportUtil.ImportType; @@ -268,29 +269,97 @@ public Dataset doImportHarvestedDataset(DataverseRequest dataverseRequest, Harve } JsonObject obj = JsonUtil.getJsonObject(json); - //and call parse Json to read it into a dataset + + String protocol = obj.getString("protocol", null); + String authority = obj.getString("authority", null); + String identifier = obj.getString("identifier",null); + + GlobalId globalId; + + // A Global ID is required: + // (meaning, we will fail with an exception if the imports above have + // not managed to find an acceptable global identifier in the harvested + // metadata) + + try { + globalId = PidUtil.parseAsGlobalID(protocol, authority, identifier); + } catch (IllegalArgumentException iax) { + throw new ImportException("The harvested metadata record with the OAI server identifier " + harvestIdentifier + " does not contain a global identifier this Dataverse can parse, skipping."); + } + + if (globalId == null) { + throw new ImportException("The harvested metadata record with the OAI server identifier " + harvestIdentifier + " does not contain a global identifier this Dataverse recognizes, skipping."); + } + + String globalIdString = globalId.asString(); + + if (StringUtils.isEmpty(globalIdString)) { + // @todo this check shouldn't be necessary, now that there's a null check above + throw new ImportException("The harvested metadata record with the OAI server identifier " + harvestIdentifier + " does not contain a global identifier this Dataverse recognizes, skipping."); + } + + DatasetVersion harvestedVersion; + + Dataset existingDataset = datasetService.findByGlobalId(globalIdString); + + try { + Dataset harvestedDataset = null; + JsonParser parser = new JsonParser(datasetfieldService, metadataBlockService, settingsService, licenseService, datasetTypeService, harvestingClient); parser.setLenient(true); - Dataset ds = parser.parseDataset(obj); - // For ImportType.NEW, if the metadata contains a global identifier, and it's not a protocol - // we support, it should be rejected. - // (TODO: ! - add some way of keeping track of supported protocols!) 
- //if (ds.getGlobalId() != null && !ds.getProtocol().equals(settingsService.getValueForKey(SettingsServiceBean.Key.Protocol, ""))) { - // throw new ImportException("Could not register id " + ds.getGlobalId() + ", protocol not supported"); - //} - ds.setOwner(owner); - ds.getLatestVersion().setDatasetFields(ds.getLatestVersion().initDatasetFields()); + if (existingDataset != null) { + // If this dataset already exists IN ANOTHER COLLECTION + // we are just going to skip it! + if (existingDataset.getOwner() != null && !owner.getId().equals(existingDataset.getOwner().getId())) { + throw new ImportException("The dataset with the global id " + globalIdString + " already exists, in the dataverse " + existingDataset.getOwner().getAlias() + ", skipping."); + } + // And if we already have a dataset with this same id, in this same + // dataverse, but it is a LOCAL dataset (can happen!), we're going to + // skip it also: + if (!existingDataset.isHarvested()) { + throw new ImportException("A LOCAL dataset with the global id " + globalIdString + " already exists in this dataverse; skipping."); + } + // For harvested datasets, there should always only be one version. + // We will replace the current version with the imported version. + // @todo or should we just destroy any extra versions quietly? + if (existingDataset.getVersions().size() != 1) { + throw new ImportException("Error importing Harvested Dataset, existing dataset has " + existingDataset.getVersions().size() + " versions"); + } + + // ... do the magic - parse the version json, do the switcheroo ... + + harvestedDataset = existingDataset; // @todo!! + + harvestedVersion = parser.parseDatasetVersion(obj.getJsonObject("datasetVersion")); + + } else { + // Creating a new dataset from scratch: + + harvestedDataset = parser.parseDataset(obj); + harvestedDataset.setOwner(owner); + + harvestedDataset.setHarvestedFrom(harvestingClient); + harvestedDataset.setHarvestIdentifier(harvestIdentifier); + + harvestedVersion = harvestedDataset.getVersions().get(0); + } + + // Either a full new import, or an update of an existing harvested + // Dataset, perform some cleanup on the new version imported from the + // parsed metadata: + + harvestedVersion.setDatasetFields(harvestedVersion.initDatasetFields()); - if (ds.getVersions().get(0).getReleaseTime() == null) { - ds.getVersions().get(0).setReleaseTime(oaiDateStamp); + if (harvestedVersion.getReleaseTime() == null) { + harvestedVersion.setReleaseTime(oaiDateStamp); } // Check data against required contraints - List> violations = ds.getVersions().get(0).validateRequired(); + List> violations = harvestedVersion.validateRequired(); if (!violations.isEmpty()) { - // For migration and harvest, add NA for missing required values + // When harvesting, we add NA for missing required values: for (ConstraintViolation v : violations) { DatasetField f = v.getRootBean(); f.setSingleValue(DatasetField.NA_VALUE); @@ -298,86 +367,42 @@ public Dataset doImportHarvestedDataset(DataverseRequest dataverseRequest, Harve } // Check data against validation constraints - // If we are migrating and "scrub migration data" is true we attempt to fix invalid data - // if the fix fails stop processing of this file by throwing exception - Set invalidViolations = ds.getVersions().get(0).validate(); - ValidatorFactory factory = Validation.buildDefaultValidatorFactory(); - Validator validator = factory.getValidator(); + // Similarly to how we handle missing required values (above), we + // replace invalid values with NA when harvesting. 
+ + Set invalidViolations = harvestedVersion.validate(); if (!invalidViolations.isEmpty()) { for (ConstraintViolation v : invalidViolations) { DatasetFieldValue f = v.getRootBean(); - boolean fixed = false; - boolean converted = false; - // TODO: Is this scrubbing something we want to continue doing? - if (settingsService.isTrueForKey(SettingsServiceBean.Key.ScrubMigrationData, false)) { - fixed = processMigrationValidationError(f, cleanupLog, metadataFile.getName()); - converted = true; - if (fixed) { - Set> scrubbedViolations = validator.validate(f); - if (!scrubbedViolations.isEmpty()) { - fixed = false; - } - } - } - if (!fixed) { - String msg = "Data modified - File: " + metadataFile.getName() + "; Field: " + f.getDatasetField().getDatasetFieldType().getDisplayName() + "; " - + "Invalid value: '" + f.getValue() + "'" + " Converted Value:'" + DatasetField.NA_VALUE + "'"; - cleanupLog.println(msg); - f.setValue(DatasetField.NA_VALUE); - } + String msg = "Invalid metadata: " + metadataFile.getName() + "; Field: " + f.getDatasetField().getDatasetFieldType().getDisplayName() + "; " + + "Invalid value: '" + f.getValue() + "'" + ", replaced with '" + DatasetField.NA_VALUE + "'"; + cleanupLog.println(msg); + f.setValue(DatasetField.NA_VALUE); } } - // A Global ID is required, in order for us to be able to harvest and import - // this dataset: - if (StringUtils.isEmpty(ds.getGlobalId().asString())) { - throw new ImportException("The harvested metadata record with the OAI server identifier "+harvestIdentifier+" does not contain a global unique identifier that we could recognize, skipping."); - } - - ds.setHarvestedFrom(harvestingClient); - ds.setHarvestIdentifier(harvestIdentifier); - - Dataset existingDs = datasetService.findByGlobalId(ds.getGlobalId().asString()); - - if (existingDs != null) { - // If this dataset already exists IN ANOTHER DATAVERSE - // we are just going to skip it! - if (existingDs.getOwner() != null && !owner.getId().equals(existingDs.getOwner().getId())) { - throw new ImportException("The dataset with the global id "+ds.getGlobalId().asString()+" already exists, in the dataverse "+existingDs.getOwner().getAlias()+", skipping."); - } - // And if we already have a dataset with this same id, in this same - // dataverse, but it is LOCAL dataset (can happen!), we're going to - // skip it also: - if (!existingDs.isHarvested()) { - throw new ImportException("A LOCAL dataset with the global id "+ds.getGlobalId().asString()+" already exists in this dataverse; skipping."); - } - // For harvested datasets, there should always only be one version. - // We will replace the current version with the imported version. - if (existingDs.getVersions().size() != 1) { - throw new ImportException("Error importing Harvested Dataset, existing dataset has " + existingDs.getVersions().size() + " versions"); - } + if (existingDataset != null) { // Purge all the SOLR documents associated with this client from the // index server: - indexService.deleteHarvestedDocuments(existingDs); + indexService.deleteHarvestedDocuments(existingDataset); // files from harvested datasets are removed unceremoniously, // directly in the database. no need to bother calling the // DeleteFileCommand on them. - for (DataFile harvestedFile : existingDs.getFiles()) { + for (DataFile harvestedFile : existingDataset.getFiles()) { DataFile merged = em.merge(harvestedFile); em.remove(merged); harvestedFile = null; } - // TODO: - // Verify what happens with the indexed files in SOLR? 
- // are they going to be overwritten by the reindexing of the dataset? - existingDs.setFiles(null); - Dataset merged = em.merge(existingDs); + existingDataset.setFiles(null); + Dataset merged = em.merge(existingDataset); // harvested datasets don't have physical files - so no need to worry about that. engineSvc.submit(new DestroyDatasetCommand(merged, dataverseRequest)); + // @todo! + // UpdateHarvestedDatasetCommand() ? + } else { + importedDataset = engineSvc.submit(new CreateHarvestedDatasetCommand(harvestedDataset, dataverseRequest)); } - - importedDataset = engineSvc.submit(new CreateHarvestedDatasetCommand(ds, dataverseRequest)); } catch (JsonParseException | ImportException | CommandException ex) { logger.fine("Failed to import harvested dataset: " + ex.getClass() + ": " + ex.getMessage()); diff --git a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/AbstractCreateDatasetCommand.java b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/AbstractCreateDatasetCommand.java index 7b7c5fd0e93..0567b737a58 100644 --- a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/AbstractCreateDatasetCommand.java +++ b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/AbstractCreateDatasetCommand.java @@ -13,8 +13,10 @@ import edu.harvard.iq.dataverse.engine.command.exception.CommandException; import edu.harvard.iq.dataverse.pidproviders.PidProvider; import static edu.harvard.iq.dataverse.util.StringUtil.isEmpty; +import java.io.IOException; import java.util.Objects; import java.util.logging.Logger; +import org.apache.solr.client.solrj.SolrServerException; /**; * An abstract base class for commands that creates {@link Dataset}s. @@ -146,9 +148,19 @@ public Dataset execute(CommandContext ctxt) throws CommandException { //Use for code that requires database ids postDBFlush(theDataset, ctxt); - - ctxt.index().asyncIndexDataset(theDataset, true); - + + if (harvested) { + try { + ctxt.index().indexDataset(theDataset, true); + } catch (SolrServerException | IOException solrEx) { + logger.warning("Failed to index harvested dataset. " + solrEx.getMessage()); + } + } else { + // The asynchronous version does not throw any exceptions, + // logging them internally instead. + ctxt.index().asyncIndexDataset(theDataset, true); + } + return theDataset; } diff --git a/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java index fd769846490..a659976769a 100644 --- a/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java @@ -418,7 +418,7 @@ synchronized private static Dataset getNextToIndex(Long id, Dataset d) { public void asyncIndexDataset(Dataset dataset, boolean doNormalSolrDocCleanUp) { try { acquirePermitFromSemaphore(); - doAyncIndexDataset(dataset, doNormalSolrDocCleanUp); + doAsyncIndexDataset(dataset, doNormalSolrDocCleanUp); } catch (InterruptedException e) { String failureLogText = "Indexing failed: interrupted. 
You can kickoff a re-index of this dataset with: \r\n curl http://localhost:8080/api/admin/index/datasets/" + dataset.getId().toString(); failureLogText += "\r\n" + e.getLocalizedMessage(); @@ -428,7 +428,7 @@ public void asyncIndexDataset(Dataset dataset, boolean doNormalSolrDocCleanUp) { } } - private void doAyncIndexDataset(Dataset dataset, boolean doNormalSolrDocCleanUp) { + private void doAsyncIndexDataset(Dataset dataset, boolean doNormalSolrDocCleanUp) { Long id = dataset.getId(); Dataset next = getNextToIndex(id, dataset); // if there is an ongoing index job for this dataset, next is null (ongoing index job will reindex the newest version after current indexing finishes) while (next != null) { @@ -449,7 +449,7 @@ public void asyncIndexDatasetList(List datasets, boolean doNormalSolrDo for(Dataset dataset : datasets) { try { acquirePermitFromSemaphore(); - doAyncIndexDataset(dataset, true); + doAsyncIndexDataset(dataset, true); } catch (InterruptedException e) { String failureLogText = "Indexing failed: interrupted. You can kickoff a re-index of this dataset with: \r\n curl http://localhost:8080/api/admin/index/datasets/" + dataset.getId().toString(); failureLogText += "\r\n" + e.getLocalizedMessage(); From e62fc55a1831999d6024d5a12c20cf8c81f172a4 Mon Sep 17 00:00:00 2001 From: Leonid Andreev Date: Tue, 10 Sep 2024 11:05:31 -0400 Subject: [PATCH 2/9] intermediate/unfinished state - work in progress #10734 --- .../api/imports/ImportServiceBean.java | 198 +++++++++++++++--- .../iq/dataverse/search/IndexServiceBean.java | 5 + 2 files changed, 171 insertions(+), 32 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/api/imports/ImportServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/api/imports/ImportServiceBean.java index 42d1a14bf1d..a3147b68463 100644 --- a/src/main/java/edu/harvard/iq/dataverse/api/imports/ImportServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/api/imports/ImportServiceBean.java @@ -20,6 +20,7 @@ import edu.harvard.iq.dataverse.DataverseContact; import edu.harvard.iq.dataverse.DataverseServiceBean; import edu.harvard.iq.dataverse.EjbDataverseEngine; +import edu.harvard.iq.dataverse.FileMetadata; import edu.harvard.iq.dataverse.GlobalId; import edu.harvard.iq.dataverse.MetadataBlockServiceBean; import edu.harvard.iq.dataverse.api.dto.DatasetDTO; @@ -41,6 +42,7 @@ import edu.harvard.iq.dataverse.util.json.JsonUtil; import edu.harvard.iq.dataverse.license.LicenseServiceBean; import edu.harvard.iq.dataverse.pidproviders.PidUtil; +import static edu.harvard.iq.dataverse.search.IndexServiceBean.solrDocIdentifierFile; import java.io.File; import java.io.FileOutputStream; @@ -71,6 +73,8 @@ import jakarta.validation.Validation; import jakarta.validation.Validator; import jakarta.validation.ValidatorFactory; +import java.util.HashMap; +import java.util.Map; import javax.xml.stream.XMLStreamException; import org.apache.commons.lang3.StringUtils; @@ -209,7 +213,7 @@ public JsonObjectBuilder handleFile(DataverseRequest dataverseRequest, Dataverse @TransactionAttribute(TransactionAttributeType.REQUIRES_NEW) public Dataset doImportHarvestedDataset(DataverseRequest dataverseRequest, HarvestingClient harvestingClient, String harvestIdentifier, String metadataFormat, File metadataFile, Date oaiDateStamp, PrintWriter cleanupLog) throws ImportException, IOException { if (harvestingClient == null || harvestingClient.getDataverse() == null) { - throw new ImportException("importHarvestedDataset called wiht a null harvestingClient, or an invalid 
harvestingClient."); + throw new ImportException("importHarvestedDataset called with a null harvestingClient, or an invalid harvestingClient."); } Dataverse owner = harvestingClient.getDataverse(); Dataset importedDataset = null; @@ -309,7 +313,19 @@ public Dataset doImportHarvestedDataset(DataverseRequest dataverseRequest, Harve JsonParser parser = new JsonParser(datasetfieldService, metadataBlockService, settingsService, licenseService, datasetTypeService, harvestingClient); parser.setLenient(true); - if (existingDataset != null) { + if (existingDataset == null) { + // Creating a new dataset from scratch: + + harvestedDataset = parser.parseDataset(obj); + harvestedDataset.setOwner(owner); + + harvestedDataset.setHarvestedFrom(harvestingClient); + harvestedDataset.setHarvestIdentifier(harvestIdentifier); + + harvestedVersion = harvestedDataset.getVersions().get(0); + } else { + // We already have a dataset with this id in the database. + // If this dataset already exists IN ANOTHER COLLECTION // we are just going to skip it! if (existingDataset.getOwner() != null && !owner.getId().equals(existingDataset.getOwner().getId())) { @@ -327,23 +343,8 @@ public Dataset doImportHarvestedDataset(DataverseRequest dataverseRequest, Harve if (existingDataset.getVersions().size() != 1) { throw new ImportException("Error importing Harvested Dataset, existing dataset has " + existingDataset.getVersions().size() + " versions"); } - - // ... do the magic - parse the version json, do the switcheroo ... - - harvestedDataset = existingDataset; // @todo!! - + harvestedVersion = parser.parseDatasetVersion(obj.getJsonObject("datasetVersion")); - - } else { - // Creating a new dataset from scratch: - - harvestedDataset = parser.parseDataset(obj); - harvestedDataset.setOwner(owner); - - harvestedDataset.setHarvestedFrom(harvestingClient); - harvestedDataset.setHarvestIdentifier(harvestIdentifier); - - harvestedVersion = harvestedDataset.getVersions().get(0); } // Either a full new import, or an update of an existing harvested @@ -370,6 +371,19 @@ public Dataset doImportHarvestedDataset(DataverseRequest dataverseRequest, Harve // Similarly to how we handle missing required values (above), we // replace invalid values with NA when harvesting. + boolean sanitized = validateDatasetVersion(harvestedVersion, true, cleanupLog); + + // Note: this sanitizing approach, of replacing invalid values with + // "NA" does not work with certain fields. For example, using it to + // populate a GeoBox coordinate value will result in an invalid + // field. So we will attempt to validate the santized version again, + // this time around, it will throw an exception if still invalid, so + // we'll stop before proceeding any further. + + if (sanitized) { + + } + Set invalidViolations = harvestedVersion.validate(); if (!invalidViolations.isEmpty()) { for (ConstraintViolation v : invalidViolations) { @@ -379,27 +393,104 @@ public Dataset doImportHarvestedDataset(DataverseRequest dataverseRequest, Harve + "Invalid value: '" + f.getValue() + "'" + ", replaced with '" + DatasetField.NA_VALUE + "'"; cleanupLog.println(msg); f.setValue(DatasetField.NA_VALUE); + // Note: "NA" does not work with certain fields. For example, + // using it to populate a GeoBox coordinate value is going + // to result in an invalid field. @todo? - see below } } + // @todo? - re-validate the version before we do anything else? 
+ // something along the lines of + // if (this.validate) {validateOrDie(newVersion, false);} + // DatasetFieldUtil.tidyUpFields(newVersion.getDatasetFields(), true); + if (existingDataset != null) { - // Purge all the SOLR documents associated with this client from the - // index server: - indexService.deleteHarvestedDocuments(existingDataset); - // files from harvested datasets are removed unceremoniously, - // directly in the database. no need to bother calling the - // DeleteFileCommand on them. - for (DataFile harvestedFile : existingDataset.getFiles()) { - DataFile merged = em.merge(harvestedFile); - em.remove(merged); - harvestedFile = null; + // @todo + // ... do the magic - parse the version json, do the switcheroo ... + DatasetVersion existingVersion = existingDataset.getVersions().get(0); + + Map existingFilesIndex = new HashMap<>(); + + for (int i = 0; i < existingDataset.getFiles().size(); i++) { + String storageIdentifier = existingDataset.getFiles().get(i).getStorageIdentifier(); + if (storageIdentifier != null) { + existingFilesIndex.put(storageIdentifier, i); + } + } + + for (FileMetadata newFileMetadata : harvestedVersion.getFileMetadatas()) { + // is it safe to assume that each new FileMetadata will be + // pointing to a non-null DataFile + String location = newFileMetadata.getDataFile().getStorageIdentifier(); + if (location != null && existingFilesIndex.containsKey(location)) { + newFileMetadata.getDataFile().setFileMetadatas(new ArrayList<>()); + + int fileIndex = existingFilesIndex.get(location); + newFileMetadata.setDataFile(existingDataset.getFiles().get(fileIndex)); + existingDataset.getFiles().get(fileIndex).getFileMetadatas().add(newFileMetadata); + existingFilesIndex.remove(location); + } + } + + List solrIdsOfDocumentsToDelete = new ArrayList<>(); + + // Go through the existing files and delete the ones that are + // no longer present in the version that we have just harvesed: + for (FileMetadata oldFileMetadata : existingDataset.getVersions().get(0).getFileMetadatas()) { + DataFile oldDataFile = oldFileMetadata.getDataFile(); + solrIdsOfDocumentsToDelete.add(solrDocIdentifierFile + oldDataFile.getId()); + existingDataset.getFiles().remove(oldDataFile); + // Files from harvested datasets are removed unceremoniously, + // directly in the database. No need to bother calling the + // DeleteFileCommand on them. + em.remove(em.merge(oldDataFile)); + em.remove(em.merge(oldFileMetadata)); + oldDataFile = null; + oldFileMetadata = null; + } + + // purge all the SOLR documents associated with the files + // we have just deleted: + if (!solrIdsOfDocumentsToDelete.isEmpty()) { + indexService.deleteHarvestedDocuments(solrIdsOfDocumentsToDelete); + } + + // ... And now delete the existing version itself: + + existingDataset.setVersions(new ArrayList<>()); + em.remove(em.merge(existingVersion)); + + // Now attach the newly-harvested version to the dataset: + existingDataset.getVersions().add(harvestedVersion); + harvestedVersion.setDataset(existingDataset); + + // ... 
There's one more thing to do - go through the new files, + // that are not in the database yet, and make sure they are + // attached to this existing dataset: + + for (FileMetadata newFileMetadata : harvestedVersion.getFileMetadatas()) { + if (newFileMetadata.getDataFile().getId() == null) { + existingDataset.getFiles().add(newFileMetadata.getDataFile()); + newFileMetadata.getDataFile().setOwner(existingDataset); + } } - existingDataset.setFiles(null); - Dataset merged = em.merge(existingDataset); + + em.persist(harvestedVersion); + + ///@todo remove for (DataFile harvestedFile : existingDataset.getFiles()) { + ///@todo remove DataFile merged = em.merge(harvestedFile); + ///@todo remove em.remove(merged); + ///@todo remove harvestedFile = null; + ///@todo remove } + ///@todo remove existingDataset.setFiles(null); + ///@todo remove Dataset merged = em.merge(existingDataset); // harvested datasets don't have physical files - so no need to worry about that. - engineSvc.submit(new DestroyDatasetCommand(merged, dataverseRequest)); + ///@todo remove engineSvc.submit(new DestroyDatasetCommand(merged, dataverseRequest)); // @todo! - // UpdateHarvestedDatasetCommand() ? + // UpdateHarvestedDatasetCommand() ? (later on) + importedDataset = em.merge(existingDataset); + //@todo reindex + } else { importedDataset = engineSvc.submit(new CreateHarvestedDatasetCommand(harvestedDataset, dataverseRequest)); } @@ -421,6 +512,49 @@ public Dataset doImportHarvestedDataset(DataverseRequest dataverseRequest, Harve } return importedDataset; } + /** + * Shortcut method for validating AND attempting to sanitize a DatasetVersion + * @param version + * @param cleanupLog - any invalid values and their replacements are logged there + * @return true if any invalid values were encountered and sanitized + * @throws ImportException (although it should never happen in this mode) + */ + private boolean validateAndSanitizeVersionMetadata(DatasetVersion version, PrintWriter cleanupLog) throws ImportException { + return validateVersionMetadata(version, true, cleanupLog); + } + + private void validateVersionMetadata(DatasetVersion version, PrintWriter log) throws ImportException { + validateVersionMetadata(version, false, log); + } + + private boolean validateVersionMetadata(DatasetVersion version, boolean sanitize, PrintWriter cleanupLog) throws ImportException { + boolean fixed = false; + Set invalidViolations = version.validate(); + if (!invalidViolations.isEmpty()) { + for (ConstraintViolation v : invalidViolations) { + DatasetFieldValue f = v.getRootBean(); + + String msg = "Invalid metadata field: " + f.getDatasetField().getDatasetFieldType().getDisplayName() + "; " + + "Invalid value: '" + f.getValue() + "'"; + if (sanitize) { + msg += ", replaced with '" + DatasetField.NA_VALUE + "'"; + f.setValue(DatasetField.NA_VALUE); + fixed = true; + } + cleanupLog.println(msg); + + // Note: "NA" does not work with certain fields. For example, + // using it to populate a GeoBox coordinate value is going + // to result in an invalid field. So we'll need to validate the + // version again after the first, sanitizing pass and see if it + // helped or not. 
+ } + if (!sanitize) { + throw new ImportException("Version was still failing validation after the first attempt to sanitize the invalid values."); + } + } + return fixed; + } public JsonObject ddiToJson(String xmlToParse) throws ImportException, XMLStreamException { DatasetDTO dsDTO = importDDIService.doImport(ImportType.IMPORT, xmlToParse); diff --git a/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java index a659976769a..834ea5f1ed9 100644 --- a/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java @@ -2398,6 +2398,11 @@ public void deleteHarvestedDocuments(Dataset harvestedDataset) { solrIdsOfDocumentsToDelete.add(solrDocIdentifierFile + datafile.getId()); } + deleteHarvestedDocuments(solrIdsOfDocumentsToDelete); + } + + public void deleteHarvestedDocuments(List solrIdsOfDocumentsToDelete) { + logger.fine("attempting to delete the following documents from the index: " + StringUtils.join(solrIdsOfDocumentsToDelete, ",")); IndexResponse resultOfAttemptToDeleteDocuments = solrIndexService.deleteMultipleSolrIds(solrIdsOfDocumentsToDelete); logger.fine("result of attempt to delete harvested documents: " + resultOfAttemptToDeleteDocuments + "\n"); From 3fab4c8c823afbca7b28e74031aa9fa2ceb65bf0 Mon Sep 17 00:00:00 2001 From: Leonid Andreev Date: Tue, 10 Sep 2024 13:54:36 -0400 Subject: [PATCH 3/9] incremental. #10734 --- .../api/imports/ImportServiceBean.java | 130 ++++++++------ .../impl/UpdateHarvestedDatasetCommand.java | 159 ++++++++++++++++++ 2 files changed, 238 insertions(+), 51 deletions(-) create mode 100644 src/main/java/edu/harvard/iq/dataverse/engine/command/impl/UpdateHarvestedDatasetCommand.java diff --git a/src/main/java/edu/harvard/iq/dataverse/api/imports/ImportServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/api/imports/ImportServiceBean.java index a3147b68463..b3c65ceefcc 100644 --- a/src/main/java/edu/harvard/iq/dataverse/api/imports/ImportServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/api/imports/ImportServiceBean.java @@ -33,6 +33,7 @@ import edu.harvard.iq.dataverse.engine.command.impl.CreateHarvestedDatasetCommand; import edu.harvard.iq.dataverse.engine.command.impl.CreateNewDatasetCommand; import edu.harvard.iq.dataverse.engine.command.impl.DestroyDatasetCommand; +import edu.harvard.iq.dataverse.engine.command.impl.UpdateHarvestedDatasetCommand; import edu.harvard.iq.dataverse.harvest.client.HarvestingClient; import edu.harvard.iq.dataverse.search.IndexServiceBean; import edu.harvard.iq.dataverse.settings.SettingsServiceBean; @@ -43,6 +44,7 @@ import edu.harvard.iq.dataverse.license.LicenseServiceBean; import edu.harvard.iq.dataverse.pidproviders.PidUtil; import static edu.harvard.iq.dataverse.search.IndexServiceBean.solrDocIdentifierFile; +import edu.harvard.iq.dataverse.util.DatasetFieldUtil; import java.io.File; import java.io.FileOutputStream; @@ -366,25 +368,29 @@ public Dataset doImportHarvestedDataset(DataverseRequest dataverseRequest, Harve f.setSingleValue(DatasetField.NA_VALUE); } } - + + // @todo? - is this the right place to call tidyUpFields()? + // usually it is called within the body of the create/update commands. + DatasetFieldUtil.tidyUpFields(harvestedVersion.getDatasetFields(), true); + // Check data against validation constraints // Similarly to how we handle missing required values (above), we // replace invalid values with NA when harvesting. 
- boolean sanitized = validateDatasetVersion(harvestedVersion, true, cleanupLog); + boolean sanitized = validateAndSanitizeVersionMetadata(harvestedVersion, cleanupLog); // Note: this sanitizing approach, of replacing invalid values with // "NA" does not work with certain fields. For example, using it to // populate a GeoBox coordinate value will result in an invalid - // field. So we will attempt to validate the santized version again, - // this time around, it will throw an exception if still invalid, so - // we'll stop before proceeding any further. + // field. So we will attempt to re-validate the santized version. + // This time around, it will throw an exception if still invalid, so + // that we'll stop before proceeding any further: if (sanitized) { - + validateVersionMetadata(harvestedVersion, cleanupLog); } - Set invalidViolations = harvestedVersion.validate(); + /*Set invalidViolations = harvestedVersion.validate(); if (!invalidViolations.isEmpty()) { for (ConstraintViolation v : invalidViolations) { DatasetFieldValue f = v.getRootBean(); @@ -397,7 +403,7 @@ public Dataset doImportHarvestedDataset(DataverseRequest dataverseRequest, Harve // using it to populate a GeoBox coordinate value is going // to result in an invalid field. @todo? - see below } - } + }*/ // @todo? - re-validate the version before we do anything else? // something along the lines of @@ -407,6 +413,7 @@ public Dataset doImportHarvestedDataset(DataverseRequest dataverseRequest, Harve if (existingDataset != null) { // @todo // ... do the magic - parse the version json, do the switcheroo ... + /* DatasetVersion existingVersion = existingDataset.getVersions().get(0); Map existingFilesIndex = new HashMap<>(); @@ -490,6 +497,9 @@ public Dataset doImportHarvestedDataset(DataverseRequest dataverseRequest, Harve // UpdateHarvestedDatasetCommand() ? 
(later on) importedDataset = em.merge(existingDataset); //@todo reindex + */ + + importedDataset = engineSvc.submit(new UpdateHarvestedDatasetCommand(existingDataset, harvestedVersion, dataverseRequest)); } else { importedDataset = engineSvc.submit(new CreateHarvestedDatasetCommand(harvestedDataset, dataverseRequest)); @@ -512,49 +522,6 @@ public Dataset doImportHarvestedDataset(DataverseRequest dataverseRequest, Harve } return importedDataset; } - /** - * Shortcut method for validating AND attempting to sanitize a DatasetVersion - * @param version - * @param cleanupLog - any invalid values and their replacements are logged there - * @return true if any invalid values were encountered and sanitized - * @throws ImportException (although it should never happen in this mode) - */ - private boolean validateAndSanitizeVersionMetadata(DatasetVersion version, PrintWriter cleanupLog) throws ImportException { - return validateVersionMetadata(version, true, cleanupLog); - } - - private void validateVersionMetadata(DatasetVersion version, PrintWriter log) throws ImportException { - validateVersionMetadata(version, false, log); - } - - private boolean validateVersionMetadata(DatasetVersion version, boolean sanitize, PrintWriter cleanupLog) throws ImportException { - boolean fixed = false; - Set invalidViolations = version.validate(); - if (!invalidViolations.isEmpty()) { - for (ConstraintViolation v : invalidViolations) { - DatasetFieldValue f = v.getRootBean(); - - String msg = "Invalid metadata field: " + f.getDatasetField().getDatasetFieldType().getDisplayName() + "; " - + "Invalid value: '" + f.getValue() + "'"; - if (sanitize) { - msg += ", replaced with '" + DatasetField.NA_VALUE + "'"; - f.setValue(DatasetField.NA_VALUE); - fixed = true; - } - cleanupLog.println(msg); - - // Note: "NA" does not work with certain fields. For example, - // using it to populate a GeoBox coordinate value is going - // to result in an invalid field. So we'll need to validate the - // version again after the first, sanitizing pass and see if it - // helped or not. - } - if (!sanitize) { - throw new ImportException("Version was still failing validation after the first attempt to sanitize the invalid values."); - } - } - return fixed; - } public JsonObject ddiToJson(String xmlToParse) throws ImportException, XMLStreamException { DatasetDTO dsDTO = importDDIService.doImport(ImportType.IMPORT, xmlToParse); @@ -855,6 +822,67 @@ private String convertInvalidDateString(String inString){ return null; } + /** + * A shortcut method for validating AND attempting to sanitize a DatasetVersion + * @param version + * @param cleanupLog - any invalid values and their replacements are logged there + * @return true if any invalid values were encountered and sanitized + * @throws ImportException (although it should never happen in this mode) + */ + private boolean validateAndSanitizeVersionMetadata(DatasetVersion version, PrintWriter cleanupLog) throws ImportException { + return validateVersionMetadata(version, true, cleanupLog); + } + + /** + * A shortcut method for validating a DatasetVersion; will throw an exception + * if invalid, without attempting to sanitize the invalid values. 
+ * @param version + * @param log - will log the invalid fields encountered there + * @throws ImportException + */ + private void validateVersionMetadata(DatasetVersion version, PrintWriter log) throws ImportException { + validateVersionMetadata(version, false, log); + } + + /** + * Validate the metadata fields of a newly-created version, and depending on + * the "sanitize" flag supplied, may or may not attempt to sanitize the supplied + * values by replacing them with "NA"s. + * @param version + * @param sanitize - boolean indicating whether to attempt to fix invalid values + * @param cleanupLog - to log any invalid values encountered will be logged + * @return - true if any invalid values have been replaced + * @throws ImportException + */ + private boolean validateVersionMetadata(DatasetVersion version, boolean sanitize, PrintWriter cleanupLog) throws ImportException { + boolean fixed = false; + Set invalidViolations = version.validate(); + if (!invalidViolations.isEmpty()) { + for (ConstraintViolation v : invalidViolations) { + DatasetFieldValue f = v.getRootBean(); + + String msg = "Invalid metadata field: " + f.getDatasetField().getDatasetFieldType().getDisplayName() + "; " + + "Invalid value: '" + f.getValue() + "'"; + if (sanitize) { + msg += ", replaced with '" + DatasetField.NA_VALUE + "'"; + f.setValue(DatasetField.NA_VALUE); + fixed = true; + } + cleanupLog.println(msg); + + // Note: "NA" does not work with certain fields. For example, + // using it to populate a GeoBox coordinate value is going + // to result in an invalid field. So we'll need to validate the + // version again after the first, sanitizing pass and see if it + // helped or not. + } + if (!sanitize) { + throw new ImportException("Version was still failing validation after the first attempt to sanitize the invalid values."); + } + } + return fixed; + } + private static class MyCustomFormatter extends Formatter { diff --git a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/UpdateHarvestedDatasetCommand.java b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/UpdateHarvestedDatasetCommand.java new file mode 100644 index 00000000000..d28950e4d9d --- /dev/null +++ b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/UpdateHarvestedDatasetCommand.java @@ -0,0 +1,159 @@ +package edu.harvard.iq.dataverse.engine.command.impl; + +import edu.harvard.iq.dataverse.authorization.Permission; +import edu.harvard.iq.dataverse.engine.command.CommandContext; +import edu.harvard.iq.dataverse.engine.command.DataverseRequest; +import edu.harvard.iq.dataverse.engine.command.RequiredPermissions; +import edu.harvard.iq.dataverse.engine.command.exception.CommandException; +import edu.harvard.iq.dataverse.engine.command.exception.IllegalCommandException; +import edu.harvard.iq.dataverse.Dataset; +import edu.harvard.iq.dataverse.DatasetVersion; +import edu.harvard.iq.dataverse.DataFile; +import edu.harvard.iq.dataverse.FileMetadata; +import static edu.harvard.iq.dataverse.search.IndexServiceBean.solrDocIdentifierFile; +import java.io.IOException; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.logging.Level; +import java.util.logging.Logger; +import org.apache.solr.client.solrj.SolrServerException; + +/** + * + * @author landreev + * + * Much simplified version of UpdateDatasetVersionCommand, + * but with some extra twists. 
+ */ +@RequiredPermissions(Permission.EditDataset) +public class UpdateHarvestedDatasetCommand extends AbstractDatasetCommand { + + private static final Logger logger = Logger.getLogger(UpdateHarvestedDatasetCommand.class.getCanonicalName()); + private final DatasetVersion newHarvestedVersion; + final private boolean validateLenient = true; + + public UpdateHarvestedDatasetCommand(Dataset theDataset, DatasetVersion newHarvestedVersion, DataverseRequest aRequest) { + super(aRequest, theDataset); + this.newHarvestedVersion = newHarvestedVersion; + } + + public boolean isValidateLenient() { + return validateLenient; + } + + @Override + public Dataset execute(CommandContext ctxt) throws CommandException { + + // ... do the magic - parse the version json, do the switcheroo ... + Dataset existingDataset = getDataset(); + + if (existingDataset == null + || existingDataset.getId() == null + || !existingDataset.isHarvested() + || existingDataset.getVersions().size() != 1) { + throw new IllegalCommandException("The command can only be called on an existing harvested dataset with only 1 version", this); + } + DatasetVersion existingVersion = existingDataset.getVersions().get(0); + + if (newHarvestedVersion == null || newHarvestedVersion.getId() != null) { + throw new IllegalCommandException("The command can only be called with a newly-harvested, not yet saved DatasetVersion supplied", this); + } + + Map existingFilesIndex = new HashMap<>(); + + for (int i = 0; i < existingDataset.getFiles().size(); i++) { + String storageIdentifier = existingDataset.getFiles().get(i).getStorageIdentifier(); + if (storageIdentifier != null) { + existingFilesIndex.put(storageIdentifier, i); + } + } + + for (FileMetadata newFileMetadata : newHarvestedVersion.getFileMetadatas()) { + // is it safe to assume that each new FileMetadata will be + // pointing to a non-null DataFile here? + String location = newFileMetadata.getDataFile().getStorageIdentifier(); + if (location != null && existingFilesIndex.containsKey(location)) { + newFileMetadata.getDataFile().setFileMetadatas(new ArrayList<>()); + + int fileIndex = existingFilesIndex.get(location); + newFileMetadata.setDataFile(existingDataset.getFiles().get(fileIndex)); + existingDataset.getFiles().get(fileIndex).getFileMetadatas().add(newFileMetadata); + existingFilesIndex.remove(location); + } + } + // @todo check that the newly-harvested DataFiles have the same checksums + // and mime types etc. These values are supposed to be immutable, normally, + // but who knows - they may have fixed something invalid on the other end + // @todo check if there's anything special that needs to be done for things + // like file categories + + List solrIdsOfDocumentsToDelete = new ArrayList<>(); + + // Go through the existing files and delete the ones that are + // no longer present in the version that we have just harvesed: + for (FileMetadata oldFileMetadata : existingDataset.getVersions().get(0).getFileMetadatas()) { + DataFile oldDataFile = oldFileMetadata.getDataFile(); + solrIdsOfDocumentsToDelete.add(solrDocIdentifierFile + oldDataFile.getId()); + existingDataset.getFiles().remove(oldDataFile); + // Files from harvested datasets are removed unceremoniously, + // directly in the database. No need to bother calling the + // DeleteFileCommand on them. 
+ ctxt.em().remove(ctxt.em().merge(oldDataFile)); + ctxt.em().remove(ctxt.em().merge(oldFileMetadata)); + oldDataFile = null; + oldFileMetadata = null; + } + + // purge all the SOLR documents associated with the files + // we have just deleted: + if (!solrIdsOfDocumentsToDelete.isEmpty()) { + ctxt.index().deleteHarvestedDocuments(solrIdsOfDocumentsToDelete); + } + + // ... And now delete the existing version itself: + existingDataset.setVersions(new ArrayList<>()); + ctxt.em().remove(ctxt.em().merge(existingVersion)); + + // Now attach the newly-harvested version to the dataset: + existingDataset.getVersions().add(newHarvestedVersion); + newHarvestedVersion.setDataset(existingDataset); + + // ... There's one more thing to do - go through the new files, + // that are not in the database yet, and make sure they are + // attached to this existing dataset: + for (FileMetadata newFileMetadata : newHarvestedVersion.getFileMetadatas()) { + if (newFileMetadata.getDataFile().getId() == null) { + existingDataset.getFiles().add(newFileMetadata.getDataFile()); + newFileMetadata.getDataFile().setOwner(existingDataset); + } + } + + ctxt.em().persist(newHarvestedVersion); + + Dataset savedDataset = ctxt.em().merge(existingDataset); + ctxt.em().flush(); + + //@todo reindex + + return savedDataset; + } + + @Override + public boolean onSuccess(CommandContext ctxt, Object r) { + boolean retVal = true; + Dataset d = (Dataset) r; + + try { + // Note that we index harvested datasets synchronously: + ctxt.index().indexDataset(d, true); + } catch (SolrServerException|IOException solrServerEx) { + logger.log(Level.WARNING, "Exception while trying to index the updated Harvested dataset " + d.getGlobalId().asString(), solrServerEx.getMessage()); + retVal = false; + } + + return retVal; + } +} From bd08b698859db5c5662f8a742504832023a632d7 Mon Sep 17 00:00:00 2001 From: Leonid Andreev Date: Wed, 11 Sep 2024 14:37:34 -0400 Subject: [PATCH 4/9] Largely finalized versions of the refactored harvesting classes. 
#10734 --- .../api/imports/ImportServiceBean.java | 142 ++++-------------- .../impl/UpdateHarvestedDatasetCommand.java | 82 +++++++--- 2 files changed, 90 insertions(+), 134 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/api/imports/ImportServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/api/imports/ImportServiceBean.java index b3c65ceefcc..8c7ff5acf6d 100644 --- a/src/main/java/edu/harvard/iq/dataverse/api/imports/ImportServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/api/imports/ImportServiceBean.java @@ -300,7 +300,7 @@ public Dataset doImportHarvestedDataset(DataverseRequest dataverseRequest, Harve String globalIdString = globalId.asString(); if (StringUtils.isEmpty(globalIdString)) { - // @todo this check shouldn't be necessary, now that there's a null check above + // @todo this check may not be necessary, now that there's a null check above throw new ImportException("The harvested metadata record with the OAI server identifier " + harvestIdentifier + " does not contain a global identifier this Dataverse recognizes, skipping."); } @@ -347,6 +347,11 @@ public Dataset doImportHarvestedDataset(DataverseRequest dataverseRequest, Harve } harvestedVersion = parser.parseDatasetVersion(obj.getJsonObject("datasetVersion")); + Dataset tempDataset = createTemporaryHarvestedDataset(harvestedVersion); + // Temporarily attach the temporary dataset to the parent Collection: + // (this will be needed for the purposes of looking up the configured + // metadatablocks and such) + tempDataset.setOwner(owner); } // Either a full new import, or an update of an existing harvested @@ -362,15 +367,16 @@ public Dataset doImportHarvestedDataset(DataverseRequest dataverseRequest, Harve // Check data against required contraints List> violations = harvestedVersion.validateRequired(); if (!violations.isEmpty()) { - // When harvesting, we add NA for missing required values: + // ... and fill the missing required values with "NA"s: for (ConstraintViolation v : violations) { DatasetField f = v.getRootBean(); f.setSingleValue(DatasetField.NA_VALUE); } } - // @todo? - is this the right place to call tidyUpFields()? - // usually it is called within the body of the create/update commands. + // is this the right place to call tidyUpFields()? + // usually it is called within the body of the create/update commands + // later on. DatasetFieldUtil.tidyUpFields(harvestedVersion.getDatasetFields(), true); // Check data against validation constraints @@ -379,7 +385,7 @@ public Dataset doImportHarvestedDataset(DataverseRequest dataverseRequest, Harve boolean sanitized = validateAndSanitizeVersionMetadata(harvestedVersion, cleanupLog); - // Note: this sanitizing approach, of replacing invalid values with + // Note: our sanitizing approach, of replacing invalid values with // "NA" does not work with certain fields. For example, using it to // populate a GeoBox coordinate value will result in an invalid // field. So we will attempt to re-validate the santized version. 
@@ -390,117 +396,8 @@ public Dataset doImportHarvestedDataset(DataverseRequest dataverseRequest, Harve validateVersionMetadata(harvestedVersion, cleanupLog); } - /*Set invalidViolations = harvestedVersion.validate(); - if (!invalidViolations.isEmpty()) { - for (ConstraintViolation v : invalidViolations) { - DatasetFieldValue f = v.getRootBean(); - - String msg = "Invalid metadata: " + metadataFile.getName() + "; Field: " + f.getDatasetField().getDatasetFieldType().getDisplayName() + "; " - + "Invalid value: '" + f.getValue() + "'" + ", replaced with '" + DatasetField.NA_VALUE + "'"; - cleanupLog.println(msg); - f.setValue(DatasetField.NA_VALUE); - // Note: "NA" does not work with certain fields. For example, - // using it to populate a GeoBox coordinate value is going - // to result in an invalid field. @todo? - see below - } - }*/ - - // @todo? - re-validate the version before we do anything else? - // something along the lines of - // if (this.validate) {validateOrDie(newVersion, false);} - // DatasetFieldUtil.tidyUpFields(newVersion.getDatasetFields(), true); - if (existingDataset != null) { - // @todo - // ... do the magic - parse the version json, do the switcheroo ... - /* - DatasetVersion existingVersion = existingDataset.getVersions().get(0); - - Map existingFilesIndex = new HashMap<>(); - - for (int i = 0; i < existingDataset.getFiles().size(); i++) { - String storageIdentifier = existingDataset.getFiles().get(i).getStorageIdentifier(); - if (storageIdentifier != null) { - existingFilesIndex.put(storageIdentifier, i); - } - } - - for (FileMetadata newFileMetadata : harvestedVersion.getFileMetadatas()) { - // is it safe to assume that each new FileMetadata will be - // pointing to a non-null DataFile - String location = newFileMetadata.getDataFile().getStorageIdentifier(); - if (location != null && existingFilesIndex.containsKey(location)) { - newFileMetadata.getDataFile().setFileMetadatas(new ArrayList<>()); - - int fileIndex = existingFilesIndex.get(location); - newFileMetadata.setDataFile(existingDataset.getFiles().get(fileIndex)); - existingDataset.getFiles().get(fileIndex).getFileMetadatas().add(newFileMetadata); - existingFilesIndex.remove(location); - } - } - - List solrIdsOfDocumentsToDelete = new ArrayList<>(); - - // Go through the existing files and delete the ones that are - // no longer present in the version that we have just harvesed: - for (FileMetadata oldFileMetadata : existingDataset.getVersions().get(0).getFileMetadatas()) { - DataFile oldDataFile = oldFileMetadata.getDataFile(); - solrIdsOfDocumentsToDelete.add(solrDocIdentifierFile + oldDataFile.getId()); - existingDataset.getFiles().remove(oldDataFile); - // Files from harvested datasets are removed unceremoniously, - // directly in the database. No need to bother calling the - // DeleteFileCommand on them. - em.remove(em.merge(oldDataFile)); - em.remove(em.merge(oldFileMetadata)); - oldDataFile = null; - oldFileMetadata = null; - } - - // purge all the SOLR documents associated with the files - // we have just deleted: - if (!solrIdsOfDocumentsToDelete.isEmpty()) { - indexService.deleteHarvestedDocuments(solrIdsOfDocumentsToDelete); - } - - // ... And now delete the existing version itself: - - existingDataset.setVersions(new ArrayList<>()); - em.remove(em.merge(existingVersion)); - - // Now attach the newly-harvested version to the dataset: - existingDataset.getVersions().add(harvestedVersion); - harvestedVersion.setDataset(existingDataset); - - // ... 
There's one more thing to do - go through the new files, - // that are not in the database yet, and make sure they are - // attached to this existing dataset: - - for (FileMetadata newFileMetadata : harvestedVersion.getFileMetadatas()) { - if (newFileMetadata.getDataFile().getId() == null) { - existingDataset.getFiles().add(newFileMetadata.getDataFile()); - newFileMetadata.getDataFile().setOwner(existingDataset); - } - } - - em.persist(harvestedVersion); - - ///@todo remove for (DataFile harvestedFile : existingDataset.getFiles()) { - ///@todo remove DataFile merged = em.merge(harvestedFile); - ///@todo remove em.remove(merged); - ///@todo remove harvestedFile = null; - ///@todo remove } - ///@todo remove existingDataset.setFiles(null); - ///@todo remove Dataset merged = em.merge(existingDataset); - // harvested datasets don't have physical files - so no need to worry about that. - ///@todo remove engineSvc.submit(new DestroyDatasetCommand(merged, dataverseRequest)); - // @todo! - // UpdateHarvestedDatasetCommand() ? (later on) - importedDataset = em.merge(existingDataset); - //@todo reindex - */ - importedDataset = engineSvc.submit(new UpdateHarvestedDatasetCommand(existingDataset, harvestedVersion, dataverseRequest)); - } else { importedDataset = engineSvc.submit(new CreateHarvestedDatasetCommand(harvestedDataset, dataverseRequest)); } @@ -883,6 +780,23 @@ private boolean validateVersionMetadata(DatasetVersion version, boolean sanitize return fixed; } + /** + * Helper method that creates a throwaway Harvested Dataset to temporarily + * attach the newly-harvested version to. We need this when, instead of + * importing a brand-new harvested dataset from scratch, we are planning to + * attempt to update an already existing dataset harvested from the same + * archival location. + * @param harvestedVersion - a newly created Version imported from harvested metadata + * @return - a temporary dataset to which the new version has been attached + */ + private Dataset createTemporaryHarvestedDataset(DatasetVersion harvestedVersion) { + Dataset tempDataset = new Dataset(); + harvestedVersion.setDataset(tempDataset); + tempDataset.setVersions(new ArrayList<>(1)); + tempDataset.getVersions().add(harvestedVersion); + + return tempDataset; + } private static class MyCustomFormatter extends Formatter { diff --git a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/UpdateHarvestedDatasetCommand.java b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/UpdateHarvestedDatasetCommand.java index d28950e4d9d..9d7461b9b30 100644 --- a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/UpdateHarvestedDatasetCommand.java +++ b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/UpdateHarvestedDatasetCommand.java @@ -26,7 +26,10 @@ * @author landreev * * Much simplified version of UpdateDatasetVersionCommand, - * but with some extra twists. + * but with some extra twists. The goal is to avoid creating new Dataset and + * DataFile objects, and to instead preserve the database ids of the re-harvested + * datasets and files, whenever possible. This in turn allows us to avoid deleting + * and rebuilding from scratch the Solr documents for these objects. */ @RequiredPermissions(Permission.EditDataset) public class UpdateHarvestedDatasetCommand extends AbstractDatasetCommand { @@ -47,7 +50,6 @@ public boolean isValidateLenient() { @Override public Dataset execute(CommandContext ctxt) throws CommandException { - // ... do the magic - parse the version json, do the switcheroo ... 
Dataset existingDataset = getDataset(); if (existingDataset == null @@ -62,15 +64,30 @@ public Dataset execute(CommandContext ctxt) throws CommandException { throw new IllegalCommandException("The command can only be called with a newly-harvested, not yet saved DatasetVersion supplied", this); } + newHarvestedVersion.setCreateTime(getTimestamp()); + newHarvestedVersion.setLastUpdateTime(getTimestamp()); + + Map existingFilesIndex = new HashMap<>(); + /* + Create a map of the files that are currently part of this existing + harvested dataset. We assume that a harvested file can be uniquely + defined by its storageidentifier. Which, in the case of a datafile + harvested from another Dataverse should be its data access api url. + */ for (int i = 0; i < existingDataset.getFiles().size(); i++) { String storageIdentifier = existingDataset.getFiles().get(i).getStorageIdentifier(); if (storageIdentifier != null) { existingFilesIndex.put(storageIdentifier, i); } } - + + /* + Go through the files in the newly-harvested version and check if any of + them are the files (potentially new/updated versions thereof) of the files + we already have, harvested previously from the same archive location. + */ for (FileMetadata newFileMetadata : newHarvestedVersion.getFileMetadatas()) { // is it safe to assume that each new FileMetadata will be // pointing to a non-null DataFile here? @@ -79,14 +96,32 @@ public Dataset execute(CommandContext ctxt) throws CommandException { newFileMetadata.getDataFile().setFileMetadatas(new ArrayList<>()); int fileIndex = existingFilesIndex.get(location); + + // Make sure to update the existing DataFiles that we are going + // to keep in case their newly-harvested versions have different + // checksums, mime types etc. These values are supposed to be + // immutable, normally - but who knows, errors happen, the source + // Dataverse may have had to fix these in their database to + // correct a data integrity issue (for ex.): + existingDataset.getFiles().get(fileIndex).setContentType(newFileMetadata.getDataFile().getContentType()); + existingDataset.getFiles().get(fileIndex).setFilesize(newFileMetadata.getDataFile().getFilesize()); + existingDataset.getFiles().get(fileIndex).setChecksumType(newFileMetadata.getDataFile().getChecksumType()); + existingDataset.getFiles().get(fileIndex).setChecksumValue(newFileMetadata.getDataFile().getChecksumValue()); + + // Point the newly-harvested filemetadata to the existing datafile: newFileMetadata.setDataFile(existingDataset.getFiles().get(fileIndex)); + + // Make sure this new FileMetadata is the only one attached to this existing file: + existingDataset.getFiles().get(fileIndex).setFileMetadatas(new ArrayList<>(1)); existingDataset.getFiles().get(fileIndex).getFileMetadatas().add(newFileMetadata); + // (we don't want any cascade relationships left between this existing + // dataset and its version, since we are going to attemp to delete it). + + // Drop the file from the index map: existingFilesIndex.remove(location); } } - // @todo check that the newly-harvested DataFiles have the same checksums - // and mime types etc. 
These values are supposed to be immutable, normally, - // but who knows - they may have fixed something invalid on the other end + // @todo check if there's anything special that needs to be done for things // like file categories @@ -96,15 +131,20 @@ public Dataset execute(CommandContext ctxt) throws CommandException { // no longer present in the version that we have just harvesed: for (FileMetadata oldFileMetadata : existingDataset.getVersions().get(0).getFileMetadatas()) { DataFile oldDataFile = oldFileMetadata.getDataFile(); - solrIdsOfDocumentsToDelete.add(solrDocIdentifierFile + oldDataFile.getId()); - existingDataset.getFiles().remove(oldDataFile); - // Files from harvested datasets are removed unceremoniously, - // directly in the database. No need to bother calling the - // DeleteFileCommand on them. - ctxt.em().remove(ctxt.em().merge(oldDataFile)); - ctxt.em().remove(ctxt.em().merge(oldFileMetadata)); - oldDataFile = null; - oldFileMetadata = null; + String location = oldDataFile.getStorageIdentifier(); + // Is it still in the existing files map? - that means it is no longer + // present in the newly-harvested version + if (location != null && existingFilesIndex.containsKey(location)) { + solrIdsOfDocumentsToDelete.add(solrDocIdentifierFile + oldDataFile.getId()); + existingDataset.getFiles().remove(oldDataFile); + // Files from harvested datasets are removed unceremoniously, + // directly in the database. No need to bother calling the + // DeleteFileCommand on them. We'll just need to remember to purge + // them from Solr as well (right below) + ctxt.em().remove(ctxt.em().merge(oldDataFile)); + // (no need to explicitly remove the oldFileMetadata; it will be + // removed with the entire old version is deleted) + } } // purge all the SOLR documents associated with the files @@ -115,7 +155,10 @@ public Dataset execute(CommandContext ctxt) throws CommandException { // ... And now delete the existing version itself: existingDataset.setVersions(new ArrayList<>()); - ctxt.em().remove(ctxt.em().merge(existingVersion)); + existingVersion.setDataset(null); + + existingVersion = ctxt.em().merge(existingVersion); + ctxt.em().remove(existingVersion); // Now attach the newly-harvested version to the dataset: existingDataset.getVersions().add(newHarvestedVersion); @@ -123,7 +166,8 @@ public Dataset execute(CommandContext ctxt) throws CommandException { // ... 
There's one more thing to do - go through the new files, // that are not in the database yet, and make sure they are - // attached to this existing dataset: + // attached to this existing dataset, instead of the dummy temp + // dataset into which they were originally imported: for (FileMetadata newFileMetadata : newHarvestedVersion.getFileMetadatas()) { if (newFileMetadata.getDataFile().getId() == null) { existingDataset.getFiles().add(newFileMetadata.getDataFile()); @@ -135,9 +179,7 @@ public Dataset execute(CommandContext ctxt) throws CommandException { Dataset savedDataset = ctxt.em().merge(existingDataset); ctxt.em().flush(); - - //@todo reindex - + return savedDataset; } From e02a5845ca02e86ec042411f32513c3c35536f5a Mon Sep 17 00:00:00 2001 From: Leonid Andreev Date: Wed, 11 Sep 2024 15:24:33 -0400 Subject: [PATCH 5/9] a typo + some unused imports #10734 --- .../harvard/iq/dataverse/api/imports/ImportServiceBean.java | 5 ----- .../engine/command/impl/UpdateHarvestedDatasetCommand.java | 2 +- 2 files changed, 1 insertion(+), 6 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/api/imports/ImportServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/api/imports/ImportServiceBean.java index 8c7ff5acf6d..acfb696b71d 100644 --- a/src/main/java/edu/harvard/iq/dataverse/api/imports/ImportServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/api/imports/ImportServiceBean.java @@ -7,7 +7,6 @@ import com.google.gson.Gson; import com.google.gson.GsonBuilder; -import edu.harvard.iq.dataverse.DataFile; import edu.harvard.iq.dataverse.Dataset; import edu.harvard.iq.dataverse.DatasetField; import edu.harvard.iq.dataverse.DatasetFieldConstant; @@ -20,7 +19,6 @@ import edu.harvard.iq.dataverse.DataverseContact; import edu.harvard.iq.dataverse.DataverseServiceBean; import edu.harvard.iq.dataverse.EjbDataverseEngine; -import edu.harvard.iq.dataverse.FileMetadata; import edu.harvard.iq.dataverse.GlobalId; import edu.harvard.iq.dataverse.MetadataBlockServiceBean; import edu.harvard.iq.dataverse.api.dto.DatasetDTO; @@ -43,7 +41,6 @@ import edu.harvard.iq.dataverse.util.json.JsonUtil; import edu.harvard.iq.dataverse.license.LicenseServiceBean; import edu.harvard.iq.dataverse.pidproviders.PidUtil; -import static edu.harvard.iq.dataverse.search.IndexServiceBean.solrDocIdentifierFile; import edu.harvard.iq.dataverse.util.DatasetFieldUtil; import java.io.File; @@ -75,8 +72,6 @@ import jakarta.validation.Validation; import jakarta.validation.Validator; import jakarta.validation.ValidatorFactory; -import java.util.HashMap; -import java.util.Map; import javax.xml.stream.XMLStreamException; import org.apache.commons.lang3.StringUtils; diff --git a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/UpdateHarvestedDatasetCommand.java b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/UpdateHarvestedDatasetCommand.java index 9d7461b9b30..c93983befa5 100644 --- a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/UpdateHarvestedDatasetCommand.java +++ b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/UpdateHarvestedDatasetCommand.java @@ -115,7 +115,7 @@ them are the files (potentially new/updated versions thereof) of the files existingDataset.getFiles().get(fileIndex).setFileMetadatas(new ArrayList<>(1)); existingDataset.getFiles().get(fileIndex).getFileMetadatas().add(newFileMetadata); // (we don't want any cascade relationships left between this existing - // dataset and its version, since we are going to attemp to delete it). 
+ // dataset and this version, since we are going to attemp to delete it). // Drop the file from the index map: existingFilesIndex.remove(location); From 30266ddcafc96127430e1a22dcee722920ae76cd Mon Sep 17 00:00:00 2001 From: Leonid Andreev Date: Wed, 11 Sep 2024 15:28:30 -0400 Subject: [PATCH 6/9] comment language #10734 --- .../engine/command/impl/UpdateHarvestedDatasetCommand.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/UpdateHarvestedDatasetCommand.java b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/UpdateHarvestedDatasetCommand.java index c93983befa5..2380d7309ee 100644 --- a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/UpdateHarvestedDatasetCommand.java +++ b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/UpdateHarvestedDatasetCommand.java @@ -74,7 +74,7 @@ public Dataset execute(CommandContext ctxt) throws CommandException { Create a map of the files that are currently part of this existing harvested dataset. We assume that a harvested file can be uniquely defined by its storageidentifier. Which, in the case of a datafile - harvested from another Dataverse should be its data access api url. + harvested from another Dataverse should be its data access api url. */ for (int i = 0; i < existingDataset.getFiles().size(); i++) { String storageIdentifier = existingDataset.getFiles().get(i).getStorageIdentifier(); @@ -85,8 +85,8 @@ public Dataset execute(CommandContext ctxt) throws CommandException { /* Go through the files in the newly-harvested version and check if any of - them are the files (potentially new/updated versions thereof) of the files - we already have, harvested previously from the same archive location. + them are (potentially new/updated) versions of files that we already + have, harvested previously from the same archive location. 
*/ for (FileMetadata newFileMetadata : newHarvestedVersion.getFileMetadatas()) { // is it safe to assume that each new FileMetadata will be From 9291554e7e83faa74cbee9da955f4aa87cd070b3 Mon Sep 17 00:00:00 2001 From: Leonid Andreev Date: Wed, 11 Sep 2024 17:18:10 -0400 Subject: [PATCH 7/9] minor/cosmetic #10734 --- .../api/imports/ImportServiceBean.java | 10 +++++----- .../impl/UpdateHarvestedDatasetCommand.java | 17 +++++++++-------- 2 files changed, 14 insertions(+), 13 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/api/imports/ImportServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/api/imports/ImportServiceBean.java index acfb696b71d..78bf9af99fa 100644 --- a/src/main/java/edu/harvard/iq/dataverse/api/imports/ImportServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/api/imports/ImportServiceBean.java @@ -328,9 +328,9 @@ public Dataset doImportHarvestedDataset(DataverseRequest dataverseRequest, Harve if (existingDataset.getOwner() != null && !owner.getId().equals(existingDataset.getOwner().getId())) { throw new ImportException("The dataset with the global id " + globalIdString + " already exists, in the dataverse " + existingDataset.getOwner().getAlias() + ", skipping."); } - // And if we already have a dataset with this same id, in this same - // dataverse, but it is a LOCAL dataset (can happen!), we're going to - // skip it also: + // And if we already have a dataset with this same global id at + // this Dataverse instance, but it is a LOCAL dataset (can happen!), + // we're going to skip it also: if (!existingDataset.isHarvested()) { throw new ImportException("A LOCAL dataset with the global id " + globalIdString + " already exists in this dataverse; skipping."); } @@ -359,7 +359,7 @@ public Dataset doImportHarvestedDataset(DataverseRequest dataverseRequest, Harve harvestedVersion.setReleaseTime(oaiDateStamp); } - // Check data against required contraints + // Check data against required constraints List> violations = harvestedVersion.validateRequired(); if (!violations.isEmpty()) { // ... 
and fill the missing required values with "NA"s: @@ -457,7 +457,7 @@ public JsonObjectBuilder doImport(DataverseRequest dataverseRequest, Dataverse o ds.setOwner(owner); ds.getLatestVersion().setDatasetFields(ds.getLatestVersion().initDatasetFields()); - // Check data against required contraints + // Check data against required constraints List> violations = ds.getVersions().get(0).validateRequired(); if (!violations.isEmpty()) { if ( importType.equals(ImportType.HARVEST) ) { diff --git a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/UpdateHarvestedDatasetCommand.java b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/UpdateHarvestedDatasetCommand.java index 2380d7309ee..09563686299 100644 --- a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/UpdateHarvestedDatasetCommand.java +++ b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/UpdateHarvestedDatasetCommand.java @@ -11,6 +11,7 @@ import edu.harvard.iq.dataverse.DataFile; import edu.harvard.iq.dataverse.FileMetadata; import static edu.harvard.iq.dataverse.search.IndexServiceBean.solrDocIdentifierFile; +import edu.harvard.iq.dataverse.util.StringUtil; import java.io.IOException; import java.util.ArrayList; @@ -78,7 +79,7 @@ public Dataset execute(CommandContext ctxt) throws CommandException { */ for (int i = 0; i < existingDataset.getFiles().size(); i++) { String storageIdentifier = existingDataset.getFiles().get(i).getStorageIdentifier(); - if (storageIdentifier != null) { + if (!StringUtil.isEmpty(storageIdentifier)) { existingFilesIndex.put(storageIdentifier, i); } } @@ -91,11 +92,11 @@ them are (potentially new/updated) versions of files that we already for (FileMetadata newFileMetadata : newHarvestedVersion.getFileMetadatas()) { // is it safe to assume that each new FileMetadata will be // pointing to a non-null DataFile here? - String location = newFileMetadata.getDataFile().getStorageIdentifier(); - if (location != null && existingFilesIndex.containsKey(location)) { + String storageIdentifier = newFileMetadata.getDataFile().getStorageIdentifier(); + if (!StringUtil.isEmpty(storageIdentifier) && existingFilesIndex.containsKey(storageIdentifier)) { newFileMetadata.getDataFile().setFileMetadatas(new ArrayList<>()); - int fileIndex = existingFilesIndex.get(location); + int fileIndex = existingFilesIndex.get(storageIdentifier); // Make sure to update the existing DataFiles that we are going // to keep in case their newly-harvested versions have different @@ -118,11 +119,11 @@ them are (potentially new/updated) versions of files that we already // dataset and this version, since we are going to attemp to delete it). // Drop the file from the index map: - existingFilesIndex.remove(location); + existingFilesIndex.remove(storageIdentifier); } } - // @todo check if there's anything special that needs to be done for things + // @todo? check if there's anything special that needs to be done for things // like file categories List solrIdsOfDocumentsToDelete = new ArrayList<>(); @@ -131,10 +132,10 @@ them are (potentially new/updated) versions of files that we already // no longer present in the version that we have just harvesed: for (FileMetadata oldFileMetadata : existingDataset.getVersions().get(0).getFileMetadatas()) { DataFile oldDataFile = oldFileMetadata.getDataFile(); - String location = oldDataFile.getStorageIdentifier(); + String storageIdentifier = oldDataFile.getStorageIdentifier(); // Is it still in the existing files map? 
- that means it is no longer // present in the newly-harvested version - if (location != null && existingFilesIndex.containsKey(location)) { + if (StringUtil.isEmpty(storageIdentifier) || existingFilesIndex.containsKey(storageIdentifier)) { solrIdsOfDocumentsToDelete.add(solrDocIdentifierFile + oldDataFile.getId()); existingDataset.getFiles().remove(oldDataFile); // Files from harvested datasets are removed unceremoniously, From 7568cd05c4a82331b5539491a76c547cb00a4e93 Mon Sep 17 00:00:00 2001 From: Leonid Andreev Date: Wed, 18 Sep 2024 22:44:03 -0400 Subject: [PATCH 8/9] Some cleanup/streamlining #10734 --- .../api/imports/ImportServiceBean.java | 42 ++++++++----------- 1 file changed, 17 insertions(+), 25 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/api/imports/ImportServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/api/imports/ImportServiceBean.java index 78bf9af99fa..73d47b18b9a 100644 --- a/src/main/java/edu/harvard/iq/dataverse/api/imports/ImportServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/api/imports/ImportServiceBean.java @@ -303,9 +303,8 @@ public Dataset doImportHarvestedDataset(DataverseRequest dataverseRequest, Harve Dataset existingDataset = datasetService.findByGlobalId(globalIdString); - try { - Dataset harvestedDataset = null; + Dataset harvestedDataset; JsonParser parser = new JsonParser(datasetfieldService, metadataBlockService, settingsService, licenseService, datasetTypeService, harvestingClient); parser.setLenient(true); @@ -314,7 +313,6 @@ public Dataset doImportHarvestedDataset(DataverseRequest dataverseRequest, Harve // Creating a new dataset from scratch: harvestedDataset = parser.parseDataset(obj); - harvestedDataset.setOwner(owner); harvestedDataset.setHarvestedFrom(harvestingClient); harvestedDataset.setHarvestIdentifier(harvestIdentifier); @@ -322,6 +320,7 @@ public Dataset doImportHarvestedDataset(DataverseRequest dataverseRequest, Harve harvestedVersion = harvestedDataset.getVersions().get(0); } else { // We already have a dataset with this id in the database. + // Let's check a few things before we go any further with it: // If this dataset already exists IN ANOTHER COLLECTION // we are just going to skip it! @@ -335,20 +334,22 @@ public Dataset doImportHarvestedDataset(DataverseRequest dataverseRequest, Harve throw new ImportException("A LOCAL dataset with the global id " + globalIdString + " already exists in this dataverse; skipping."); } // For harvested datasets, there should always only be one version. - // We will replace the current version with the imported version. - // @todo or should we just destroy any extra versions quietly? if (existingDataset.getVersions().size() != 1) { throw new ImportException("Error importing Harvested Dataset, existing dataset has " + existingDataset.getVersions().size() + " versions"); } - + + // We will attempt to import the new version, and replace the + // current, already existing version with it. harvestedVersion = parser.parseDatasetVersion(obj.getJsonObject("datasetVersion")); - Dataset tempDataset = createTemporaryHarvestedDataset(harvestedVersion); - // Temporarily attach the temporary dataset to the parent Collection: - // (this will be needed for the purposes of looking up the configured - // metadatablocks and such) - tempDataset.setOwner(owner); + + // For the purposes of validation, the version needs to be attached + // to a non-null dataset. 
We will reate a throwaway temporary + // dataset for this: + harvestedDataset = createTemporaryHarvestedDataset(harvestedVersion); } + harvestedDataset.setOwner(owner); + // Either a full new import, or an update of an existing harvested // Dataset, perform some cleanup on the new version imported from the // parsed metadata: @@ -359,28 +360,19 @@ public Dataset doImportHarvestedDataset(DataverseRequest dataverseRequest, Harve harvestedVersion.setReleaseTime(oaiDateStamp); } - // Check data against required constraints - List> violations = harvestedVersion.validateRequired(); - if (!violations.isEmpty()) { - // ... and fill the missing required values with "NA"s: - for (ConstraintViolation v : violations) { - DatasetField f = v.getRootBean(); - f.setSingleValue(DatasetField.NA_VALUE); - } - } - // is this the right place to call tidyUpFields()? // usually it is called within the body of the create/update commands // later on. DatasetFieldUtil.tidyUpFields(harvestedVersion.getDatasetFields(), true); - // Check data against validation constraints - // Similarly to how we handle missing required values (above), we - // replace invalid values with NA when harvesting. + // Check data against validation constraints. + // Make an attempt to sanitize any invalid fields encountered - + // missing required fields or invalid values, by filling the values + // with NAs. boolean sanitized = validateAndSanitizeVersionMetadata(harvestedVersion, cleanupLog); - // Note: our sanitizing approach, of replacing invalid values with + // Note: this sanitizing approach, of replacing invalid values with // "NA" does not work with certain fields. For example, using it to // populate a GeoBox coordinate value will result in an invalid // field. So we will attempt to re-validate the santized version. From 0a83bc374ab9c4cccdf7d8e7b9fbe74ce4948848 Mon Sep 17 00:00:00 2001 From: qqmyers Date: Mon, 23 Sep 2024 11:22:08 -0400 Subject: [PATCH 9/9] typo in comment --- .../edu/harvard/iq/dataverse/api/imports/ImportServiceBean.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/api/imports/ImportServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/api/imports/ImportServiceBean.java index 73d47b18b9a..66f48bfb872 100644 --- a/src/main/java/edu/harvard/iq/dataverse/api/imports/ImportServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/api/imports/ImportServiceBean.java @@ -343,7 +343,7 @@ public Dataset doImportHarvestedDataset(DataverseRequest dataverseRequest, Harve harvestedVersion = parser.parseDatasetVersion(obj.getJsonObject("datasetVersion")); // For the purposes of validation, the version needs to be attached - // to a non-null dataset. We will reate a throwaway temporary + // to a non-null dataset. We will create a throwaway temporary // dataset for this: harvestedDataset = createTemporaryHarvestedDataset(harvestedVersion); }
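
Editorial note: the patches above replace the old destroy-and-recreate flow with an in-place update keyed on each file's storageIdentifier, so that the database ids of re-harvested DataFiles (and therefore their Solr documents) survive a re-harvest. Below is a minimal, self-contained sketch of that reconciliation idea under stated assumptions: HarvestedFile, reconcile() and the example URLs are hypothetical stand-ins, not the real Dataverse entities; the actual logic lives in UpdateHarvestedDatasetCommand.execute().

    import java.util.ArrayList;
    import java.util.HashMap;
    import java.util.List;
    import java.util.Map;

    public class HarvestReconciliationSketch {

        // Hypothetical stand-in for DataFile/FileMetadata, keyed by the same
        // storageIdentifier the command relies on (for a file harvested from
        // another Dataverse that is effectively its remote data access URL):
        record HarvestedFile(Long dbId, String storageIdentifier, String checksum) {}

        // Reuse an existing row whenever the re-harvested version still contains a
        // file with the same storage identifier; collect everything else for
        // deletion (and, in the real command, for purging from Solr).
        static List<HarvestedFile> reconcile(List<HarvestedFile> existing,
                                             List<HarvestedFile> reharvested,
                                             List<HarvestedFile> toDelete) {
            Map<String, HarvestedFile> existingByStorageId = new HashMap<>();
            for (HarvestedFile f : existing) {
                if (f.storageIdentifier() != null && !f.storageIdentifier().isEmpty()) {
                    existingByStorageId.put(f.storageIdentifier(), f);
                }
            }

            List<HarvestedFile> result = new ArrayList<>();
            for (HarvestedFile newFile : reharvested) {
                HarvestedFile match = existingByStorageId.remove(newFile.storageIdentifier());
                if (match != null) {
                    // Keep the existing database id, but adopt the newly-harvested
                    // checksum (mirrors how the command copies content type, size, etc.):
                    result.add(new HarvestedFile(match.dbId(), newFile.storageIdentifier(), newFile.checksum()));
                } else {
                    result.add(newFile); // genuinely new file, no database id yet
                }
            }

            // Whatever is still in the map was not re-harvested and should be removed:
            toDelete.addAll(existingByStorageId.values());
            return result;
        }

        public static void main(String[] args) {
            List<HarvestedFile> existing = List.of(
                    new HarvestedFile(101L, "https://remote.example/api/access/datafile/1", "aaa"),
                    new HarvestedFile(102L, "https://remote.example/api/access/datafile/2", "bbb"));
            List<HarvestedFile> reharvested = List.of(
                    new HarvestedFile(null, "https://remote.example/api/access/datafile/2", "bbb-updated"),
                    new HarvestedFile(null, "https://remote.example/api/access/datafile/3", "ccc"));

            List<HarvestedFile> toDelete = new ArrayList<>();
            System.out.println(reconcile(existing, reharvested, toDelete)); // keeps id 102
            System.out.println(toDelete);                                   // contains id 101
        }
    }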
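
A second editorial sketch: patch 8 folds the required-field NA substitution into a single validate-and-sanitize pass (validateAndSanitizeVersionMetadata) and notes that the NA value itself can be invalid for structured fields such as a GeoBox coordinate, which is why the sanitized version is validated a second time. The code below only illustrates that pattern; the Field type, the validity predicate, and the "westLongitude" field name are toy stand-ins for Dataverse's DatasetField machinery and DatasetField.NA_VALUE.

    import java.util.List;
    import java.util.function.Predicate;

    public class SanitizeThenRevalidateSketch {

        // Stand-in for DatasetField.NA_VALUE:
        static final String NA = "NA";

        record Field(String name, String value) {}

        // Replace anything the validator rejects with NA, mimicking the
        // harvest-time sanitizing described in the patch:
        static List<Field> sanitize(List<Field> fields, Predicate<Field> isValid) {
            return fields.stream()
                    .map(f -> isValid.test(f) ? f : new Field(f.name(), NA))
                    .toList();
        }

        public static void main(String[] args) {
            // Toy validator: values must be non-blank, and a coordinate-like field
            // must parse as a number - so NA is *still* invalid for it (the GeoBox case):
            Predicate<Field> isValid = f -> {
                if (f.value() == null || f.value().isBlank()) return false;
                if (f.name().equals("westLongitude")) {
                    try { Double.parseDouble(f.value()); } catch (NumberFormatException e) { return false; }
                }
                return true;
            };

            List<Field> sanitized = sanitize(
                    List.of(new Field("title", "My Dataset"),
                            new Field("author", " "),
                            new Field("westLongitude", "not-a-number")),
                    isValid);

            // The second validation pass is what catches fields where NA does not help;
            // in the real import code such a failure makes the record non-importable:
            boolean stillValid = sanitized.stream().allMatch(isValid);
            System.out.println(sanitized);
            System.out.println("valid after sanitizing: " + stillValid); // false, because of westLongitude
        }
    }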