From 03debc16cfc68e42a5bd4aad3076ed2dbadad94c Mon Sep 17 00:00:00 2001 From: Leonid Andreev Date: Mon, 9 Sep 2024 10:49:53 -0400 Subject: [PATCH 1/9] Some work-in-progress modifications of the Harvested imports and indexing process. #10734 --- .../api/imports/ImportServiceBean.java | 175 ++++++++++-------- .../impl/AbstractCreateDatasetCommand.java | 18 +- .../iq/dataverse/search/IndexServiceBean.java | 6 +- 3 files changed, 118 insertions(+), 81 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/api/imports/ImportServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/api/imports/ImportServiceBean.java index d2bba56f884..42d1a14bf1d 100644 --- a/src/main/java/edu/harvard/iq/dataverse/api/imports/ImportServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/api/imports/ImportServiceBean.java @@ -20,6 +20,7 @@ import edu.harvard.iq.dataverse.DataverseContact; import edu.harvard.iq.dataverse.DataverseServiceBean; import edu.harvard.iq.dataverse.EjbDataverseEngine; +import edu.harvard.iq.dataverse.GlobalId; import edu.harvard.iq.dataverse.MetadataBlockServiceBean; import edu.harvard.iq.dataverse.api.dto.DatasetDTO; import edu.harvard.iq.dataverse.api.imports.ImportUtil.ImportType; @@ -268,29 +269,97 @@ public Dataset doImportHarvestedDataset(DataverseRequest dataverseRequest, Harve } JsonObject obj = JsonUtil.getJsonObject(json); - //and call parse Json to read it into a dataset + + String protocol = obj.getString("protocol", null); + String authority = obj.getString("authority", null); + String identifier = obj.getString("identifier",null); + + GlobalId globalId; + + // A Global ID is required: + // (meaning, we will fail with an exception if the imports above have + // not managed to find an acceptable global identifier in the harvested + // metadata) + + try { + globalId = PidUtil.parseAsGlobalID(protocol, authority, identifier); + } catch (IllegalArgumentException iax) { + throw new ImportException("The harvested metadata record with the OAI server identifier " + harvestIdentifier + " does not contain a global identifier this Dataverse can parse, skipping."); + } + + if (globalId == null) { + throw new ImportException("The harvested metadata record with the OAI server identifier " + harvestIdentifier + " does not contain a global identifier this Dataverse recognizes, skipping."); + } + + String globalIdString = globalId.asString(); + + if (StringUtils.isEmpty(globalIdString)) { + // @todo this check shouldn't be necessary, now that there's a null check above + throw new ImportException("The harvested metadata record with the OAI server identifier " + harvestIdentifier + " does not contain a global identifier this Dataverse recognizes, skipping."); + } + + DatasetVersion harvestedVersion; + + Dataset existingDataset = datasetService.findByGlobalId(globalIdString); + + try { + Dataset harvestedDataset = null; + JsonParser parser = new JsonParser(datasetfieldService, metadataBlockService, settingsService, licenseService, datasetTypeService, harvestingClient); parser.setLenient(true); - Dataset ds = parser.parseDataset(obj); - // For ImportType.NEW, if the metadata contains a global identifier, and it's not a protocol - // we support, it should be rejected. - // (TODO: ! - add some way of keeping track of supported protocols!) 
- //if (ds.getGlobalId() != null && !ds.getProtocol().equals(settingsService.getValueForKey(SettingsServiceBean.Key.Protocol, ""))) { - // throw new ImportException("Could not register id " + ds.getGlobalId() + ", protocol not supported"); - //} - ds.setOwner(owner); - ds.getLatestVersion().setDatasetFields(ds.getLatestVersion().initDatasetFields()); + if (existingDataset != null) { + // If this dataset already exists IN ANOTHER COLLECTION + // we are just going to skip it! + if (existingDataset.getOwner() != null && !owner.getId().equals(existingDataset.getOwner().getId())) { + throw new ImportException("The dataset with the global id " + globalIdString + " already exists, in the dataverse " + existingDataset.getOwner().getAlias() + ", skipping."); + } + // And if we already have a dataset with this same id, in this same + // dataverse, but it is a LOCAL dataset (can happen!), we're going to + // skip it also: + if (!existingDataset.isHarvested()) { + throw new ImportException("A LOCAL dataset with the global id " + globalIdString + " already exists in this dataverse; skipping."); + } + // For harvested datasets, there should always only be one version. + // We will replace the current version with the imported version. + // @todo or should we just destroy any extra versions quietly? + if (existingDataset.getVersions().size() != 1) { + throw new ImportException("Error importing Harvested Dataset, existing dataset has " + existingDataset.getVersions().size() + " versions"); + } + + // ... do the magic - parse the version json, do the switcheroo ... + + harvestedDataset = existingDataset; // @todo!! + + harvestedVersion = parser.parseDatasetVersion(obj.getJsonObject("datasetVersion")); + + } else { + // Creating a new dataset from scratch: + + harvestedDataset = parser.parseDataset(obj); + harvestedDataset.setOwner(owner); + + harvestedDataset.setHarvestedFrom(harvestingClient); + harvestedDataset.setHarvestIdentifier(harvestIdentifier); + + harvestedVersion = harvestedDataset.getVersions().get(0); + } + + // Either a full new import, or an update of an existing harvested + // Dataset, perform some cleanup on the new version imported from the + // parsed metadata: + + harvestedVersion.setDatasetFields(harvestedVersion.initDatasetFields()); - if (ds.getVersions().get(0).getReleaseTime() == null) { - ds.getVersions().get(0).setReleaseTime(oaiDateStamp); + if (harvestedVersion.getReleaseTime() == null) { + harvestedVersion.setReleaseTime(oaiDateStamp); } // Check data against required contraints - List> violations = ds.getVersions().get(0).validateRequired(); + List> violations = harvestedVersion.validateRequired(); if (!violations.isEmpty()) { - // For migration and harvest, add NA for missing required values + // When harvesting, we add NA for missing required values: for (ConstraintViolation v : violations) { DatasetField f = v.getRootBean(); f.setSingleValue(DatasetField.NA_VALUE); @@ -298,86 +367,42 @@ public Dataset doImportHarvestedDataset(DataverseRequest dataverseRequest, Harve } // Check data against validation constraints - // If we are migrating and "scrub migration data" is true we attempt to fix invalid data - // if the fix fails stop processing of this file by throwing exception - Set invalidViolations = ds.getVersions().get(0).validate(); - ValidatorFactory factory = Validation.buildDefaultValidatorFactory(); - Validator validator = factory.getValidator(); + // Similarly to how we handle missing required values (above), we + // replace invalid values with NA when harvesting. 
+ + Set invalidViolations = harvestedVersion.validate(); if (!invalidViolations.isEmpty()) { for (ConstraintViolation v : invalidViolations) { DatasetFieldValue f = v.getRootBean(); - boolean fixed = false; - boolean converted = false; - // TODO: Is this scrubbing something we want to continue doing? - if (settingsService.isTrueForKey(SettingsServiceBean.Key.ScrubMigrationData, false)) { - fixed = processMigrationValidationError(f, cleanupLog, metadataFile.getName()); - converted = true; - if (fixed) { - Set> scrubbedViolations = validator.validate(f); - if (!scrubbedViolations.isEmpty()) { - fixed = false; - } - } - } - if (!fixed) { - String msg = "Data modified - File: " + metadataFile.getName() + "; Field: " + f.getDatasetField().getDatasetFieldType().getDisplayName() + "; " - + "Invalid value: '" + f.getValue() + "'" + " Converted Value:'" + DatasetField.NA_VALUE + "'"; - cleanupLog.println(msg); - f.setValue(DatasetField.NA_VALUE); - } + String msg = "Invalid metadata: " + metadataFile.getName() + "; Field: " + f.getDatasetField().getDatasetFieldType().getDisplayName() + "; " + + "Invalid value: '" + f.getValue() + "'" + ", replaced with '" + DatasetField.NA_VALUE + "'"; + cleanupLog.println(msg); + f.setValue(DatasetField.NA_VALUE); } } - // A Global ID is required, in order for us to be able to harvest and import - // this dataset: - if (StringUtils.isEmpty(ds.getGlobalId().asString())) { - throw new ImportException("The harvested metadata record with the OAI server identifier "+harvestIdentifier+" does not contain a global unique identifier that we could recognize, skipping."); - } - - ds.setHarvestedFrom(harvestingClient); - ds.setHarvestIdentifier(harvestIdentifier); - - Dataset existingDs = datasetService.findByGlobalId(ds.getGlobalId().asString()); - - if (existingDs != null) { - // If this dataset already exists IN ANOTHER DATAVERSE - // we are just going to skip it! - if (existingDs.getOwner() != null && !owner.getId().equals(existingDs.getOwner().getId())) { - throw new ImportException("The dataset with the global id "+ds.getGlobalId().asString()+" already exists, in the dataverse "+existingDs.getOwner().getAlias()+", skipping."); - } - // And if we already have a dataset with this same id, in this same - // dataverse, but it is LOCAL dataset (can happen!), we're going to - // skip it also: - if (!existingDs.isHarvested()) { - throw new ImportException("A LOCAL dataset with the global id "+ds.getGlobalId().asString()+" already exists in this dataverse; skipping."); - } - // For harvested datasets, there should always only be one version. - // We will replace the current version with the imported version. - if (existingDs.getVersions().size() != 1) { - throw new ImportException("Error importing Harvested Dataset, existing dataset has " + existingDs.getVersions().size() + " versions"); - } + if (existingDataset != null) { // Purge all the SOLR documents associated with this client from the // index server: - indexService.deleteHarvestedDocuments(existingDs); + indexService.deleteHarvestedDocuments(existingDataset); // files from harvested datasets are removed unceremoniously, // directly in the database. no need to bother calling the // DeleteFileCommand on them. - for (DataFile harvestedFile : existingDs.getFiles()) { + for (DataFile harvestedFile : existingDataset.getFiles()) { DataFile merged = em.merge(harvestedFile); em.remove(merged); harvestedFile = null; } - // TODO: - // Verify what happens with the indexed files in SOLR? 
- // are they going to be overwritten by the reindexing of the dataset? - existingDs.setFiles(null); - Dataset merged = em.merge(existingDs); + existingDataset.setFiles(null); + Dataset merged = em.merge(existingDataset); // harvested datasets don't have physical files - so no need to worry about that. engineSvc.submit(new DestroyDatasetCommand(merged, dataverseRequest)); + // @todo! + // UpdateHarvestedDatasetCommand() ? + } else { + importedDataset = engineSvc.submit(new CreateHarvestedDatasetCommand(harvestedDataset, dataverseRequest)); } - - importedDataset = engineSvc.submit(new CreateHarvestedDatasetCommand(ds, dataverseRequest)); } catch (JsonParseException | ImportException | CommandException ex) { logger.fine("Failed to import harvested dataset: " + ex.getClass() + ": " + ex.getMessage()); diff --git a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/AbstractCreateDatasetCommand.java b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/AbstractCreateDatasetCommand.java index 7b7c5fd0e93..0567b737a58 100644 --- a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/AbstractCreateDatasetCommand.java +++ b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/AbstractCreateDatasetCommand.java @@ -13,8 +13,10 @@ import edu.harvard.iq.dataverse.engine.command.exception.CommandException; import edu.harvard.iq.dataverse.pidproviders.PidProvider; import static edu.harvard.iq.dataverse.util.StringUtil.isEmpty; +import java.io.IOException; import java.util.Objects; import java.util.logging.Logger; +import org.apache.solr.client.solrj.SolrServerException; /**; * An abstract base class for commands that creates {@link Dataset}s. @@ -146,9 +148,19 @@ public Dataset execute(CommandContext ctxt) throws CommandException { //Use for code that requires database ids postDBFlush(theDataset, ctxt); - - ctxt.index().asyncIndexDataset(theDataset, true); - + + if (harvested) { + try { + ctxt.index().indexDataset(theDataset, true); + } catch (SolrServerException | IOException solrEx) { + logger.warning("Failed to index harvested dataset. " + solrEx.getMessage()); + } + } else { + // The asynchronous version does not throw any exceptions, + // logging them internally instead. + ctxt.index().asyncIndexDataset(theDataset, true); + } + return theDataset; } diff --git a/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java index fd769846490..a659976769a 100644 --- a/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java @@ -418,7 +418,7 @@ synchronized private static Dataset getNextToIndex(Long id, Dataset d) { public void asyncIndexDataset(Dataset dataset, boolean doNormalSolrDocCleanUp) { try { acquirePermitFromSemaphore(); - doAyncIndexDataset(dataset, doNormalSolrDocCleanUp); + doAsyncIndexDataset(dataset, doNormalSolrDocCleanUp); } catch (InterruptedException e) { String failureLogText = "Indexing failed: interrupted. 
You can kickoff a re-index of this dataset with: \r\n curl http://localhost:8080/api/admin/index/datasets/" + dataset.getId().toString(); failureLogText += "\r\n" + e.getLocalizedMessage(); @@ -428,7 +428,7 @@ public void asyncIndexDataset(Dataset dataset, boolean doNormalSolrDocCleanUp) { } } - private void doAyncIndexDataset(Dataset dataset, boolean doNormalSolrDocCleanUp) { + private void doAsyncIndexDataset(Dataset dataset, boolean doNormalSolrDocCleanUp) { Long id = dataset.getId(); Dataset next = getNextToIndex(id, dataset); // if there is an ongoing index job for this dataset, next is null (ongoing index job will reindex the newest version after current indexing finishes) while (next != null) { @@ -449,7 +449,7 @@ public void asyncIndexDatasetList(List datasets, boolean doNormalSolrDo for(Dataset dataset : datasets) { try { acquirePermitFromSemaphore(); - doAyncIndexDataset(dataset, true); + doAsyncIndexDataset(dataset, true); } catch (InterruptedException e) { String failureLogText = "Indexing failed: interrupted. You can kickoff a re-index of this dataset with: \r\n curl http://localhost:8080/api/admin/index/datasets/" + dataset.getId().toString(); failureLogText += "\r\n" + e.getLocalizedMessage(); From e62fc55a1831999d6024d5a12c20cf8c81f172a4 Mon Sep 17 00:00:00 2001 From: Leonid Andreev Date: Tue, 10 Sep 2024 11:05:31 -0400 Subject: [PATCH 2/9] intermediate/unfinished state - work in progress #10734 --- .../api/imports/ImportServiceBean.java | 198 +++++++++++++++--- .../iq/dataverse/search/IndexServiceBean.java | 5 + 2 files changed, 171 insertions(+), 32 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/api/imports/ImportServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/api/imports/ImportServiceBean.java index 42d1a14bf1d..a3147b68463 100644 --- a/src/main/java/edu/harvard/iq/dataverse/api/imports/ImportServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/api/imports/ImportServiceBean.java @@ -20,6 +20,7 @@ import edu.harvard.iq.dataverse.DataverseContact; import edu.harvard.iq.dataverse.DataverseServiceBean; import edu.harvard.iq.dataverse.EjbDataverseEngine; +import edu.harvard.iq.dataverse.FileMetadata; import edu.harvard.iq.dataverse.GlobalId; import edu.harvard.iq.dataverse.MetadataBlockServiceBean; import edu.harvard.iq.dataverse.api.dto.DatasetDTO; @@ -41,6 +42,7 @@ import edu.harvard.iq.dataverse.util.json.JsonUtil; import edu.harvard.iq.dataverse.license.LicenseServiceBean; import edu.harvard.iq.dataverse.pidproviders.PidUtil; +import static edu.harvard.iq.dataverse.search.IndexServiceBean.solrDocIdentifierFile; import java.io.File; import java.io.FileOutputStream; @@ -71,6 +73,8 @@ import jakarta.validation.Validation; import jakarta.validation.Validator; import jakarta.validation.ValidatorFactory; +import java.util.HashMap; +import java.util.Map; import javax.xml.stream.XMLStreamException; import org.apache.commons.lang3.StringUtils; @@ -209,7 +213,7 @@ public JsonObjectBuilder handleFile(DataverseRequest dataverseRequest, Dataverse @TransactionAttribute(TransactionAttributeType.REQUIRES_NEW) public Dataset doImportHarvestedDataset(DataverseRequest dataverseRequest, HarvestingClient harvestingClient, String harvestIdentifier, String metadataFormat, File metadataFile, Date oaiDateStamp, PrintWriter cleanupLog) throws ImportException, IOException { if (harvestingClient == null || harvestingClient.getDataverse() == null) { - throw new ImportException("importHarvestedDataset called wiht a null harvestingClient, or an invalid 
harvestingClient."); + throw new ImportException("importHarvestedDataset called with a null harvestingClient, or an invalid harvestingClient."); } Dataverse owner = harvestingClient.getDataverse(); Dataset importedDataset = null; @@ -309,7 +313,19 @@ public Dataset doImportHarvestedDataset(DataverseRequest dataverseRequest, Harve JsonParser parser = new JsonParser(datasetfieldService, metadataBlockService, settingsService, licenseService, datasetTypeService, harvestingClient); parser.setLenient(true); - if (existingDataset != null) { + if (existingDataset == null) { + // Creating a new dataset from scratch: + + harvestedDataset = parser.parseDataset(obj); + harvestedDataset.setOwner(owner); + + harvestedDataset.setHarvestedFrom(harvestingClient); + harvestedDataset.setHarvestIdentifier(harvestIdentifier); + + harvestedVersion = harvestedDataset.getVersions().get(0); + } else { + // We already have a dataset with this id in the database. + // If this dataset already exists IN ANOTHER COLLECTION // we are just going to skip it! if (existingDataset.getOwner() != null && !owner.getId().equals(existingDataset.getOwner().getId())) { @@ -327,23 +343,8 @@ public Dataset doImportHarvestedDataset(DataverseRequest dataverseRequest, Harve if (existingDataset.getVersions().size() != 1) { throw new ImportException("Error importing Harvested Dataset, existing dataset has " + existingDataset.getVersions().size() + " versions"); } - - // ... do the magic - parse the version json, do the switcheroo ... - - harvestedDataset = existingDataset; // @todo!! - + harvestedVersion = parser.parseDatasetVersion(obj.getJsonObject("datasetVersion")); - - } else { - // Creating a new dataset from scratch: - - harvestedDataset = parser.parseDataset(obj); - harvestedDataset.setOwner(owner); - - harvestedDataset.setHarvestedFrom(harvestingClient); - harvestedDataset.setHarvestIdentifier(harvestIdentifier); - - harvestedVersion = harvestedDataset.getVersions().get(0); } // Either a full new import, or an update of an existing harvested @@ -370,6 +371,19 @@ public Dataset doImportHarvestedDataset(DataverseRequest dataverseRequest, Harve // Similarly to how we handle missing required values (above), we // replace invalid values with NA when harvesting. + boolean sanitized = validateDatasetVersion(harvestedVersion, true, cleanupLog); + + // Note: this sanitizing approach, of replacing invalid values with + // "NA" does not work with certain fields. For example, using it to + // populate a GeoBox coordinate value will result in an invalid + // field. So we will attempt to validate the santized version again, + // this time around, it will throw an exception if still invalid, so + // we'll stop before proceeding any further. + + if (sanitized) { + + } + Set invalidViolations = harvestedVersion.validate(); if (!invalidViolations.isEmpty()) { for (ConstraintViolation v : invalidViolations) { @@ -379,27 +393,104 @@ public Dataset doImportHarvestedDataset(DataverseRequest dataverseRequest, Harve + "Invalid value: '" + f.getValue() + "'" + ", replaced with '" + DatasetField.NA_VALUE + "'"; cleanupLog.println(msg); f.setValue(DatasetField.NA_VALUE); + // Note: "NA" does not work with certain fields. For example, + // using it to populate a GeoBox coordinate value is going + // to result in an invalid field. @todo? - see below } } + // @todo? - re-validate the version before we do anything else? 
+ // something along the lines of + // if (this.validate) {validateOrDie(newVersion, false);} + // DatasetFieldUtil.tidyUpFields(newVersion.getDatasetFields(), true); + if (existingDataset != null) { - // Purge all the SOLR documents associated with this client from the - // index server: - indexService.deleteHarvestedDocuments(existingDataset); - // files from harvested datasets are removed unceremoniously, - // directly in the database. no need to bother calling the - // DeleteFileCommand on them. - for (DataFile harvestedFile : existingDataset.getFiles()) { - DataFile merged = em.merge(harvestedFile); - em.remove(merged); - harvestedFile = null; + // @todo + // ... do the magic - parse the version json, do the switcheroo ... + DatasetVersion existingVersion = existingDataset.getVersions().get(0); + + Map existingFilesIndex = new HashMap<>(); + + for (int i = 0; i < existingDataset.getFiles().size(); i++) { + String storageIdentifier = existingDataset.getFiles().get(i).getStorageIdentifier(); + if (storageIdentifier != null) { + existingFilesIndex.put(storageIdentifier, i); + } + } + + for (FileMetadata newFileMetadata : harvestedVersion.getFileMetadatas()) { + // is it safe to assume that each new FileMetadata will be + // pointing to a non-null DataFile + String location = newFileMetadata.getDataFile().getStorageIdentifier(); + if (location != null && existingFilesIndex.containsKey(location)) { + newFileMetadata.getDataFile().setFileMetadatas(new ArrayList<>()); + + int fileIndex = existingFilesIndex.get(location); + newFileMetadata.setDataFile(existingDataset.getFiles().get(fileIndex)); + existingDataset.getFiles().get(fileIndex).getFileMetadatas().add(newFileMetadata); + existingFilesIndex.remove(location); + } + } + + List solrIdsOfDocumentsToDelete = new ArrayList<>(); + + // Go through the existing files and delete the ones that are + // no longer present in the version that we have just harvesed: + for (FileMetadata oldFileMetadata : existingDataset.getVersions().get(0).getFileMetadatas()) { + DataFile oldDataFile = oldFileMetadata.getDataFile(); + solrIdsOfDocumentsToDelete.add(solrDocIdentifierFile + oldDataFile.getId()); + existingDataset.getFiles().remove(oldDataFile); + // Files from harvested datasets are removed unceremoniously, + // directly in the database. No need to bother calling the + // DeleteFileCommand on them. + em.remove(em.merge(oldDataFile)); + em.remove(em.merge(oldFileMetadata)); + oldDataFile = null; + oldFileMetadata = null; + } + + // purge all the SOLR documents associated with the files + // we have just deleted: + if (!solrIdsOfDocumentsToDelete.isEmpty()) { + indexService.deleteHarvestedDocuments(solrIdsOfDocumentsToDelete); + } + + // ... And now delete the existing version itself: + + existingDataset.setVersions(new ArrayList<>()); + em.remove(em.merge(existingVersion)); + + // Now attach the newly-harvested version to the dataset: + existingDataset.getVersions().add(harvestedVersion); + harvestedVersion.setDataset(existingDataset); + + // ... 
There's one more thing to do - go through the new files, + // that are not in the database yet, and make sure they are + // attached to this existing dataset: + + for (FileMetadata newFileMetadata : harvestedVersion.getFileMetadatas()) { + if (newFileMetadata.getDataFile().getId() == null) { + existingDataset.getFiles().add(newFileMetadata.getDataFile()); + newFileMetadata.getDataFile().setOwner(existingDataset); + } } - existingDataset.setFiles(null); - Dataset merged = em.merge(existingDataset); + + em.persist(harvestedVersion); + + ///@todo remove for (DataFile harvestedFile : existingDataset.getFiles()) { + ///@todo remove DataFile merged = em.merge(harvestedFile); + ///@todo remove em.remove(merged); + ///@todo remove harvestedFile = null; + ///@todo remove } + ///@todo remove existingDataset.setFiles(null); + ///@todo remove Dataset merged = em.merge(existingDataset); // harvested datasets don't have physical files - so no need to worry about that. - engineSvc.submit(new DestroyDatasetCommand(merged, dataverseRequest)); + ///@todo remove engineSvc.submit(new DestroyDatasetCommand(merged, dataverseRequest)); // @todo! - // UpdateHarvestedDatasetCommand() ? + // UpdateHarvestedDatasetCommand() ? (later on) + importedDataset = em.merge(existingDataset); + //@todo reindex + } else { importedDataset = engineSvc.submit(new CreateHarvestedDatasetCommand(harvestedDataset, dataverseRequest)); } @@ -421,6 +512,49 @@ public Dataset doImportHarvestedDataset(DataverseRequest dataverseRequest, Harve } return importedDataset; } + /** + * Shortcut method for validating AND attempting to sanitize a DatasetVersion + * @param version + * @param cleanupLog - any invalid values and their replacements are logged there + * @return true if any invalid values were encountered and sanitized + * @throws ImportException (although it should never happen in this mode) + */ + private boolean validateAndSanitizeVersionMetadata(DatasetVersion version, PrintWriter cleanupLog) throws ImportException { + return validateVersionMetadata(version, true, cleanupLog); + } + + private void validateVersionMetadata(DatasetVersion version, PrintWriter log) throws ImportException { + validateVersionMetadata(version, false, log); + } + + private boolean validateVersionMetadata(DatasetVersion version, boolean sanitize, PrintWriter cleanupLog) throws ImportException { + boolean fixed = false; + Set invalidViolations = version.validate(); + if (!invalidViolations.isEmpty()) { + for (ConstraintViolation v : invalidViolations) { + DatasetFieldValue f = v.getRootBean(); + + String msg = "Invalid metadata field: " + f.getDatasetField().getDatasetFieldType().getDisplayName() + "; " + + "Invalid value: '" + f.getValue() + "'"; + if (sanitize) { + msg += ", replaced with '" + DatasetField.NA_VALUE + "'"; + f.setValue(DatasetField.NA_VALUE); + fixed = true; + } + cleanupLog.println(msg); + + // Note: "NA" does not work with certain fields. For example, + // using it to populate a GeoBox coordinate value is going + // to result in an invalid field. So we'll need to validate the + // version again after the first, sanitizing pass and see if it + // helped or not. 
+ } + if (!sanitize) { + throw new ImportException("Version was still failing validation after the first attempt to sanitize the invalid values."); + } + } + return fixed; + } public JsonObject ddiToJson(String xmlToParse) throws ImportException, XMLStreamException { DatasetDTO dsDTO = importDDIService.doImport(ImportType.IMPORT, xmlToParse); diff --git a/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java index a659976769a..834ea5f1ed9 100644 --- a/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java @@ -2398,6 +2398,11 @@ public void deleteHarvestedDocuments(Dataset harvestedDataset) { solrIdsOfDocumentsToDelete.add(solrDocIdentifierFile + datafile.getId()); } + deleteHarvestedDocuments(solrIdsOfDocumentsToDelete); + } + + public void deleteHarvestedDocuments(List solrIdsOfDocumentsToDelete) { + logger.fine("attempting to delete the following documents from the index: " + StringUtils.join(solrIdsOfDocumentsToDelete, ",")); IndexResponse resultOfAttemptToDeleteDocuments = solrIndexService.deleteMultipleSolrIds(solrIdsOfDocumentsToDelete); logger.fine("result of attempt to delete harvested documents: " + resultOfAttemptToDeleteDocuments + "\n"); From 3fab4c8c823afbca7b28e74031aa9fa2ceb65bf0 Mon Sep 17 00:00:00 2001 From: Leonid Andreev Date: Tue, 10 Sep 2024 13:54:36 -0400 Subject: [PATCH 3/9] incremental. #10734 --- .../api/imports/ImportServiceBean.java | 130 ++++++++------ .../impl/UpdateHarvestedDatasetCommand.java | 159 ++++++++++++++++++ 2 files changed, 238 insertions(+), 51 deletions(-) create mode 100644 src/main/java/edu/harvard/iq/dataverse/engine/command/impl/UpdateHarvestedDatasetCommand.java diff --git a/src/main/java/edu/harvard/iq/dataverse/api/imports/ImportServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/api/imports/ImportServiceBean.java index a3147b68463..b3c65ceefcc 100644 --- a/src/main/java/edu/harvard/iq/dataverse/api/imports/ImportServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/api/imports/ImportServiceBean.java @@ -33,6 +33,7 @@ import edu.harvard.iq.dataverse.engine.command.impl.CreateHarvestedDatasetCommand; import edu.harvard.iq.dataverse.engine.command.impl.CreateNewDatasetCommand; import edu.harvard.iq.dataverse.engine.command.impl.DestroyDatasetCommand; +import edu.harvard.iq.dataverse.engine.command.impl.UpdateHarvestedDatasetCommand; import edu.harvard.iq.dataverse.harvest.client.HarvestingClient; import edu.harvard.iq.dataverse.search.IndexServiceBean; import edu.harvard.iq.dataverse.settings.SettingsServiceBean; @@ -43,6 +44,7 @@ import edu.harvard.iq.dataverse.license.LicenseServiceBean; import edu.harvard.iq.dataverse.pidproviders.PidUtil; import static edu.harvard.iq.dataverse.search.IndexServiceBean.solrDocIdentifierFile; +import edu.harvard.iq.dataverse.util.DatasetFieldUtil; import java.io.File; import java.io.FileOutputStream; @@ -366,25 +368,29 @@ public Dataset doImportHarvestedDataset(DataverseRequest dataverseRequest, Harve f.setSingleValue(DatasetField.NA_VALUE); } } - + + // @todo? - is this the right place to call tidyUpFields()? + // usually it is called within the body of the create/update commands. + DatasetFieldUtil.tidyUpFields(harvestedVersion.getDatasetFields(), true); + // Check data against validation constraints // Similarly to how we handle missing required values (above), we // replace invalid values with NA when harvesting. 
- boolean sanitized = validateDatasetVersion(harvestedVersion, true, cleanupLog); + boolean sanitized = validateAndSanitizeVersionMetadata(harvestedVersion, cleanupLog); // Note: this sanitizing approach, of replacing invalid values with // "NA" does not work with certain fields. For example, using it to // populate a GeoBox coordinate value will result in an invalid - // field. So we will attempt to validate the santized version again, - // this time around, it will throw an exception if still invalid, so - // we'll stop before proceeding any further. + // field. So we will attempt to re-validate the santized version. + // This time around, it will throw an exception if still invalid, so + // that we'll stop before proceeding any further: if (sanitized) { - + validateVersionMetadata(harvestedVersion, cleanupLog); } - Set invalidViolations = harvestedVersion.validate(); + /*Set invalidViolations = harvestedVersion.validate(); if (!invalidViolations.isEmpty()) { for (ConstraintViolation v : invalidViolations) { DatasetFieldValue f = v.getRootBean(); @@ -397,7 +403,7 @@ public Dataset doImportHarvestedDataset(DataverseRequest dataverseRequest, Harve // using it to populate a GeoBox coordinate value is going // to result in an invalid field. @todo? - see below } - } + }*/ // @todo? - re-validate the version before we do anything else? // something along the lines of @@ -407,6 +413,7 @@ public Dataset doImportHarvestedDataset(DataverseRequest dataverseRequest, Harve if (existingDataset != null) { // @todo // ... do the magic - parse the version json, do the switcheroo ... + /* DatasetVersion existingVersion = existingDataset.getVersions().get(0); Map existingFilesIndex = new HashMap<>(); @@ -490,6 +497,9 @@ public Dataset doImportHarvestedDataset(DataverseRequest dataverseRequest, Harve // UpdateHarvestedDatasetCommand() ? 
(later on) importedDataset = em.merge(existingDataset); //@todo reindex + */ + + importedDataset = engineSvc.submit(new UpdateHarvestedDatasetCommand(existingDataset, harvestedVersion, dataverseRequest)); } else { importedDataset = engineSvc.submit(new CreateHarvestedDatasetCommand(harvestedDataset, dataverseRequest)); @@ -512,49 +522,6 @@ public Dataset doImportHarvestedDataset(DataverseRequest dataverseRequest, Harve } return importedDataset; } - /** - * Shortcut method for validating AND attempting to sanitize a DatasetVersion - * @param version - * @param cleanupLog - any invalid values and their replacements are logged there - * @return true if any invalid values were encountered and sanitized - * @throws ImportException (although it should never happen in this mode) - */ - private boolean validateAndSanitizeVersionMetadata(DatasetVersion version, PrintWriter cleanupLog) throws ImportException { - return validateVersionMetadata(version, true, cleanupLog); - } - - private void validateVersionMetadata(DatasetVersion version, PrintWriter log) throws ImportException { - validateVersionMetadata(version, false, log); - } - - private boolean validateVersionMetadata(DatasetVersion version, boolean sanitize, PrintWriter cleanupLog) throws ImportException { - boolean fixed = false; - Set invalidViolations = version.validate(); - if (!invalidViolations.isEmpty()) { - for (ConstraintViolation v : invalidViolations) { - DatasetFieldValue f = v.getRootBean(); - - String msg = "Invalid metadata field: " + f.getDatasetField().getDatasetFieldType().getDisplayName() + "; " - + "Invalid value: '" + f.getValue() + "'"; - if (sanitize) { - msg += ", replaced with '" + DatasetField.NA_VALUE + "'"; - f.setValue(DatasetField.NA_VALUE); - fixed = true; - } - cleanupLog.println(msg); - - // Note: "NA" does not work with certain fields. For example, - // using it to populate a GeoBox coordinate value is going - // to result in an invalid field. So we'll need to validate the - // version again after the first, sanitizing pass and see if it - // helped or not. - } - if (!sanitize) { - throw new ImportException("Version was still failing validation after the first attempt to sanitize the invalid values."); - } - } - return fixed; - } public JsonObject ddiToJson(String xmlToParse) throws ImportException, XMLStreamException { DatasetDTO dsDTO = importDDIService.doImport(ImportType.IMPORT, xmlToParse); @@ -855,6 +822,67 @@ private String convertInvalidDateString(String inString){ return null; } + /** + * A shortcut method for validating AND attempting to sanitize a DatasetVersion + * @param version + * @param cleanupLog - any invalid values and their replacements are logged there + * @return true if any invalid values were encountered and sanitized + * @throws ImportException (although it should never happen in this mode) + */ + private boolean validateAndSanitizeVersionMetadata(DatasetVersion version, PrintWriter cleanupLog) throws ImportException { + return validateVersionMetadata(version, true, cleanupLog); + } + + /** + * A shortcut method for validating a DatasetVersion; will throw an exception + * if invalid, without attempting to sanitize the invalid values. 
+ * @param version + * @param log - will log the invalid fields encountered there + * @throws ImportException + */ + private void validateVersionMetadata(DatasetVersion version, PrintWriter log) throws ImportException { + validateVersionMetadata(version, false, log); + } + + /** + * Validate the metadata fields of a newly-created version, and depending on + * the "sanitize" flag supplied, may or may not attempt to sanitize the supplied + * values by replacing them with "NA"s. + * @param version + * @param sanitize - boolean indicating whether to attempt to fix invalid values + * @param cleanupLog - to log any invalid values encountered will be logged + * @return - true if any invalid values have been replaced + * @throws ImportException + */ + private boolean validateVersionMetadata(DatasetVersion version, boolean sanitize, PrintWriter cleanupLog) throws ImportException { + boolean fixed = false; + Set invalidViolations = version.validate(); + if (!invalidViolations.isEmpty()) { + for (ConstraintViolation v : invalidViolations) { + DatasetFieldValue f = v.getRootBean(); + + String msg = "Invalid metadata field: " + f.getDatasetField().getDatasetFieldType().getDisplayName() + "; " + + "Invalid value: '" + f.getValue() + "'"; + if (sanitize) { + msg += ", replaced with '" + DatasetField.NA_VALUE + "'"; + f.setValue(DatasetField.NA_VALUE); + fixed = true; + } + cleanupLog.println(msg); + + // Note: "NA" does not work with certain fields. For example, + // using it to populate a GeoBox coordinate value is going + // to result in an invalid field. So we'll need to validate the + // version again after the first, sanitizing pass and see if it + // helped or not. + } + if (!sanitize) { + throw new ImportException("Version was still failing validation after the first attempt to sanitize the invalid values."); + } + } + return fixed; + } + private static class MyCustomFormatter extends Formatter { diff --git a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/UpdateHarvestedDatasetCommand.java b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/UpdateHarvestedDatasetCommand.java new file mode 100644 index 00000000000..d28950e4d9d --- /dev/null +++ b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/UpdateHarvestedDatasetCommand.java @@ -0,0 +1,159 @@ +package edu.harvard.iq.dataverse.engine.command.impl; + +import edu.harvard.iq.dataverse.authorization.Permission; +import edu.harvard.iq.dataverse.engine.command.CommandContext; +import edu.harvard.iq.dataverse.engine.command.DataverseRequest; +import edu.harvard.iq.dataverse.engine.command.RequiredPermissions; +import edu.harvard.iq.dataverse.engine.command.exception.CommandException; +import edu.harvard.iq.dataverse.engine.command.exception.IllegalCommandException; +import edu.harvard.iq.dataverse.Dataset; +import edu.harvard.iq.dataverse.DatasetVersion; +import edu.harvard.iq.dataverse.DataFile; +import edu.harvard.iq.dataverse.FileMetadata; +import static edu.harvard.iq.dataverse.search.IndexServiceBean.solrDocIdentifierFile; +import java.io.IOException; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.logging.Level; +import java.util.logging.Logger; +import org.apache.solr.client.solrj.SolrServerException; + +/** + * + * @author landreev + * + * Much simplified version of UpdateDatasetVersionCommand, + * but with some extra twists. 
+ */ +@RequiredPermissions(Permission.EditDataset) +public class UpdateHarvestedDatasetCommand extends AbstractDatasetCommand { + + private static final Logger logger = Logger.getLogger(UpdateHarvestedDatasetCommand.class.getCanonicalName()); + private final DatasetVersion newHarvestedVersion; + final private boolean validateLenient = true; + + public UpdateHarvestedDatasetCommand(Dataset theDataset, DatasetVersion newHarvestedVersion, DataverseRequest aRequest) { + super(aRequest, theDataset); + this.newHarvestedVersion = newHarvestedVersion; + } + + public boolean isValidateLenient() { + return validateLenient; + } + + @Override + public Dataset execute(CommandContext ctxt) throws CommandException { + + // ... do the magic - parse the version json, do the switcheroo ... + Dataset existingDataset = getDataset(); + + if (existingDataset == null + || existingDataset.getId() == null + || !existingDataset.isHarvested() + || existingDataset.getVersions().size() != 1) { + throw new IllegalCommandException("The command can only be called on an existing harvested dataset with only 1 version", this); + } + DatasetVersion existingVersion = existingDataset.getVersions().get(0); + + if (newHarvestedVersion == null || newHarvestedVersion.getId() != null) { + throw new IllegalCommandException("The command can only be called with a newly-harvested, not yet saved DatasetVersion supplied", this); + } + + Map existingFilesIndex = new HashMap<>(); + + for (int i = 0; i < existingDataset.getFiles().size(); i++) { + String storageIdentifier = existingDataset.getFiles().get(i).getStorageIdentifier(); + if (storageIdentifier != null) { + existingFilesIndex.put(storageIdentifier, i); + } + } + + for (FileMetadata newFileMetadata : newHarvestedVersion.getFileMetadatas()) { + // is it safe to assume that each new FileMetadata will be + // pointing to a non-null DataFile here? + String location = newFileMetadata.getDataFile().getStorageIdentifier(); + if (location != null && existingFilesIndex.containsKey(location)) { + newFileMetadata.getDataFile().setFileMetadatas(new ArrayList<>()); + + int fileIndex = existingFilesIndex.get(location); + newFileMetadata.setDataFile(existingDataset.getFiles().get(fileIndex)); + existingDataset.getFiles().get(fileIndex).getFileMetadatas().add(newFileMetadata); + existingFilesIndex.remove(location); + } + } + // @todo check that the newly-harvested DataFiles have the same checksums + // and mime types etc. These values are supposed to be immutable, normally, + // but who knows - they may have fixed something invalid on the other end + // @todo check if there's anything special that needs to be done for things + // like file categories + + List solrIdsOfDocumentsToDelete = new ArrayList<>(); + + // Go through the existing files and delete the ones that are + // no longer present in the version that we have just harvesed: + for (FileMetadata oldFileMetadata : existingDataset.getVersions().get(0).getFileMetadatas()) { + DataFile oldDataFile = oldFileMetadata.getDataFile(); + solrIdsOfDocumentsToDelete.add(solrDocIdentifierFile + oldDataFile.getId()); + existingDataset.getFiles().remove(oldDataFile); + // Files from harvested datasets are removed unceremoniously, + // directly in the database. No need to bother calling the + // DeleteFileCommand on them. 
+ ctxt.em().remove(ctxt.em().merge(oldDataFile)); + ctxt.em().remove(ctxt.em().merge(oldFileMetadata)); + oldDataFile = null; + oldFileMetadata = null; + } + + // purge all the SOLR documents associated with the files + // we have just deleted: + if (!solrIdsOfDocumentsToDelete.isEmpty()) { + ctxt.index().deleteHarvestedDocuments(solrIdsOfDocumentsToDelete); + } + + // ... And now delete the existing version itself: + existingDataset.setVersions(new ArrayList<>()); + ctxt.em().remove(ctxt.em().merge(existingVersion)); + + // Now attach the newly-harvested version to the dataset: + existingDataset.getVersions().add(newHarvestedVersion); + newHarvestedVersion.setDataset(existingDataset); + + // ... There's one more thing to do - go through the new files, + // that are not in the database yet, and make sure they are + // attached to this existing dataset: + for (FileMetadata newFileMetadata : newHarvestedVersion.getFileMetadatas()) { + if (newFileMetadata.getDataFile().getId() == null) { + existingDataset.getFiles().add(newFileMetadata.getDataFile()); + newFileMetadata.getDataFile().setOwner(existingDataset); + } + } + + ctxt.em().persist(newHarvestedVersion); + + Dataset savedDataset = ctxt.em().merge(existingDataset); + ctxt.em().flush(); + + //@todo reindex + + return savedDataset; + } + + @Override + public boolean onSuccess(CommandContext ctxt, Object r) { + boolean retVal = true; + Dataset d = (Dataset) r; + + try { + // Note that we index harvested datasets synchronously: + ctxt.index().indexDataset(d, true); + } catch (SolrServerException|IOException solrServerEx) { + logger.log(Level.WARNING, "Exception while trying to index the updated Harvested dataset " + d.getGlobalId().asString(), solrServerEx.getMessage()); + retVal = false; + } + + return retVal; + } +} From bd08b698859db5c5662f8a742504832023a632d7 Mon Sep 17 00:00:00 2001 From: Leonid Andreev Date: Wed, 11 Sep 2024 14:37:34 -0400 Subject: [PATCH 4/9] Largely finalized versions of the refactored harvesting classes. 
#10734 --- .../api/imports/ImportServiceBean.java | 142 ++++-------------- .../impl/UpdateHarvestedDatasetCommand.java | 82 +++++++--- 2 files changed, 90 insertions(+), 134 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/api/imports/ImportServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/api/imports/ImportServiceBean.java index b3c65ceefcc..8c7ff5acf6d 100644 --- a/src/main/java/edu/harvard/iq/dataverse/api/imports/ImportServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/api/imports/ImportServiceBean.java @@ -300,7 +300,7 @@ public Dataset doImportHarvestedDataset(DataverseRequest dataverseRequest, Harve String globalIdString = globalId.asString(); if (StringUtils.isEmpty(globalIdString)) { - // @todo this check shouldn't be necessary, now that there's a null check above + // @todo this check may not be necessary, now that there's a null check above throw new ImportException("The harvested metadata record with the OAI server identifier " + harvestIdentifier + " does not contain a global identifier this Dataverse recognizes, skipping."); } @@ -347,6 +347,11 @@ public Dataset doImportHarvestedDataset(DataverseRequest dataverseRequest, Harve } harvestedVersion = parser.parseDatasetVersion(obj.getJsonObject("datasetVersion")); + Dataset tempDataset = createTemporaryHarvestedDataset(harvestedVersion); + // Temporarily attach the temporary dataset to the parent Collection: + // (this will be needed for the purposes of looking up the configured + // metadatablocks and such) + tempDataset.setOwner(owner); } // Either a full new import, or an update of an existing harvested @@ -362,15 +367,16 @@ public Dataset doImportHarvestedDataset(DataverseRequest dataverseRequest, Harve // Check data against required contraints List> violations = harvestedVersion.validateRequired(); if (!violations.isEmpty()) { - // When harvesting, we add NA for missing required values: + // ... and fill the missing required values with "NA"s: for (ConstraintViolation v : violations) { DatasetField f = v.getRootBean(); f.setSingleValue(DatasetField.NA_VALUE); } } - // @todo? - is this the right place to call tidyUpFields()? - // usually it is called within the body of the create/update commands. + // is this the right place to call tidyUpFields()? + // usually it is called within the body of the create/update commands + // later on. DatasetFieldUtil.tidyUpFields(harvestedVersion.getDatasetFields(), true); // Check data against validation constraints @@ -379,7 +385,7 @@ public Dataset doImportHarvestedDataset(DataverseRequest dataverseRequest, Harve boolean sanitized = validateAndSanitizeVersionMetadata(harvestedVersion, cleanupLog); - // Note: this sanitizing approach, of replacing invalid values with + // Note: our sanitizing approach, of replacing invalid values with // "NA" does not work with certain fields. For example, using it to // populate a GeoBox coordinate value will result in an invalid // field. So we will attempt to re-validate the santized version. 
@@ -390,117 +396,8 @@ public Dataset doImportHarvestedDataset(DataverseRequest dataverseRequest, Harve validateVersionMetadata(harvestedVersion, cleanupLog); } - /*Set invalidViolations = harvestedVersion.validate(); - if (!invalidViolations.isEmpty()) { - for (ConstraintViolation v : invalidViolations) { - DatasetFieldValue f = v.getRootBean(); - - String msg = "Invalid metadata: " + metadataFile.getName() + "; Field: " + f.getDatasetField().getDatasetFieldType().getDisplayName() + "; " - + "Invalid value: '" + f.getValue() + "'" + ", replaced with '" + DatasetField.NA_VALUE + "'"; - cleanupLog.println(msg); - f.setValue(DatasetField.NA_VALUE); - // Note: "NA" does not work with certain fields. For example, - // using it to populate a GeoBox coordinate value is going - // to result in an invalid field. @todo? - see below - } - }*/ - - // @todo? - re-validate the version before we do anything else? - // something along the lines of - // if (this.validate) {validateOrDie(newVersion, false);} - // DatasetFieldUtil.tidyUpFields(newVersion.getDatasetFields(), true); - if (existingDataset != null) { - // @todo - // ... do the magic - parse the version json, do the switcheroo ... - /* - DatasetVersion existingVersion = existingDataset.getVersions().get(0); - - Map existingFilesIndex = new HashMap<>(); - - for (int i = 0; i < existingDataset.getFiles().size(); i++) { - String storageIdentifier = existingDataset.getFiles().get(i).getStorageIdentifier(); - if (storageIdentifier != null) { - existingFilesIndex.put(storageIdentifier, i); - } - } - - for (FileMetadata newFileMetadata : harvestedVersion.getFileMetadatas()) { - // is it safe to assume that each new FileMetadata will be - // pointing to a non-null DataFile - String location = newFileMetadata.getDataFile().getStorageIdentifier(); - if (location != null && existingFilesIndex.containsKey(location)) { - newFileMetadata.getDataFile().setFileMetadatas(new ArrayList<>()); - - int fileIndex = existingFilesIndex.get(location); - newFileMetadata.setDataFile(existingDataset.getFiles().get(fileIndex)); - existingDataset.getFiles().get(fileIndex).getFileMetadatas().add(newFileMetadata); - existingFilesIndex.remove(location); - } - } - - List solrIdsOfDocumentsToDelete = new ArrayList<>(); - - // Go through the existing files and delete the ones that are - // no longer present in the version that we have just harvesed: - for (FileMetadata oldFileMetadata : existingDataset.getVersions().get(0).getFileMetadatas()) { - DataFile oldDataFile = oldFileMetadata.getDataFile(); - solrIdsOfDocumentsToDelete.add(solrDocIdentifierFile + oldDataFile.getId()); - existingDataset.getFiles().remove(oldDataFile); - // Files from harvested datasets are removed unceremoniously, - // directly in the database. No need to bother calling the - // DeleteFileCommand on them. - em.remove(em.merge(oldDataFile)); - em.remove(em.merge(oldFileMetadata)); - oldDataFile = null; - oldFileMetadata = null; - } - - // purge all the SOLR documents associated with the files - // we have just deleted: - if (!solrIdsOfDocumentsToDelete.isEmpty()) { - indexService.deleteHarvestedDocuments(solrIdsOfDocumentsToDelete); - } - - // ... And now delete the existing version itself: - - existingDataset.setVersions(new ArrayList<>()); - em.remove(em.merge(existingVersion)); - - // Now attach the newly-harvested version to the dataset: - existingDataset.getVersions().add(harvestedVersion); - harvestedVersion.setDataset(existingDataset); - - // ... 
There's one more thing to do - go through the new files, - // that are not in the database yet, and make sure they are - // attached to this existing dataset: - - for (FileMetadata newFileMetadata : harvestedVersion.getFileMetadatas()) { - if (newFileMetadata.getDataFile().getId() == null) { - existingDataset.getFiles().add(newFileMetadata.getDataFile()); - newFileMetadata.getDataFile().setOwner(existingDataset); - } - } - - em.persist(harvestedVersion); - - ///@todo remove for (DataFile harvestedFile : existingDataset.getFiles()) { - ///@todo remove DataFile merged = em.merge(harvestedFile); - ///@todo remove em.remove(merged); - ///@todo remove harvestedFile = null; - ///@todo remove } - ///@todo remove existingDataset.setFiles(null); - ///@todo remove Dataset merged = em.merge(existingDataset); - // harvested datasets don't have physical files - so no need to worry about that. - ///@todo remove engineSvc.submit(new DestroyDatasetCommand(merged, dataverseRequest)); - // @todo! - // UpdateHarvestedDatasetCommand() ? (later on) - importedDataset = em.merge(existingDataset); - //@todo reindex - */ - importedDataset = engineSvc.submit(new UpdateHarvestedDatasetCommand(existingDataset, harvestedVersion, dataverseRequest)); - } else { importedDataset = engineSvc.submit(new CreateHarvestedDatasetCommand(harvestedDataset, dataverseRequest)); } @@ -883,6 +780,23 @@ private boolean validateVersionMetadata(DatasetVersion version, boolean sanitize return fixed; } + /** + * Helper method that creates a throwaway Harvested Dataset to temporarily + * attach the newly-harvested version to. We need this when, instead of + * importing a brand-new harvested dataset from scratch, we are planning to + * attempt to update an already existing dataset harvested from the same + * archival location. + * @param harvestedVersion - a newly created Version imported from harvested metadata + * @return - a temporary dataset to which the new version has been attached + */ + private Dataset createTemporaryHarvestedDataset(DatasetVersion harvestedVersion) { + Dataset tempDataset = new Dataset(); + harvestedVersion.setDataset(tempDataset); + tempDataset.setVersions(new ArrayList<>(1)); + tempDataset.getVersions().add(harvestedVersion); + + return tempDataset; + } private static class MyCustomFormatter extends Formatter { diff --git a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/UpdateHarvestedDatasetCommand.java b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/UpdateHarvestedDatasetCommand.java index d28950e4d9d..9d7461b9b30 100644 --- a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/UpdateHarvestedDatasetCommand.java +++ b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/UpdateHarvestedDatasetCommand.java @@ -26,7 +26,10 @@ * @author landreev * * Much simplified version of UpdateDatasetVersionCommand, - * but with some extra twists. + * but with some extra twists. The goal is to avoid creating new Dataset and + * DataFile objects, and to instead preserve the database ids of the re-harvested + * datasets and files, whenever possible. This in turn allows us to avoid deleting + * and rebuilding from scratch the Solr documents for these objects. */ @RequiredPermissions(Permission.EditDataset) public class UpdateHarvestedDatasetCommand extends AbstractDatasetCommand { @@ -47,7 +50,6 @@ public boolean isValidateLenient() { @Override public Dataset execute(CommandContext ctxt) throws CommandException { - // ... do the magic - parse the version json, do the switcheroo ... 
Dataset existingDataset = getDataset(); if (existingDataset == null @@ -62,15 +64,30 @@ public Dataset execute(CommandContext ctxt) throws CommandException { throw new IllegalCommandException("The command can only be called with a newly-harvested, not yet saved DatasetVersion supplied", this); } + newHarvestedVersion.setCreateTime(getTimestamp()); + newHarvestedVersion.setLastUpdateTime(getTimestamp()); + + Map existingFilesIndex = new HashMap<>(); + /* + Create a map of the files that are currently part of this existing + harvested dataset. We assume that a harvested file can be uniquely + defined by its storageidentifier. Which, in the case of a datafile + harvested from another Dataverse should be its data access api url. + */ for (int i = 0; i < existingDataset.getFiles().size(); i++) { String storageIdentifier = existingDataset.getFiles().get(i).getStorageIdentifier(); if (storageIdentifier != null) { existingFilesIndex.put(storageIdentifier, i); } } - + + /* + Go through the files in the newly-harvested version and check if any of + them are the files (potentially new/updated versions thereof) of the files + we already have, harvested previously from the same archive location. + */ for (FileMetadata newFileMetadata : newHarvestedVersion.getFileMetadatas()) { // is it safe to assume that each new FileMetadata will be // pointing to a non-null DataFile here? @@ -79,14 +96,32 @@ public Dataset execute(CommandContext ctxt) throws CommandException { newFileMetadata.getDataFile().setFileMetadatas(new ArrayList<>()); int fileIndex = existingFilesIndex.get(location); + + // Make sure to update the existing DataFiles that we are going + // to keep in case their newly-harvested versions have different + // checksums, mime types etc. These values are supposed to be + // immutable, normally - but who knows, errors happen, the source + // Dataverse may have had to fix these in their database to + // correct a data integrity issue (for ex.): + existingDataset.getFiles().get(fileIndex).setContentType(newFileMetadata.getDataFile().getContentType()); + existingDataset.getFiles().get(fileIndex).setFilesize(newFileMetadata.getDataFile().getFilesize()); + existingDataset.getFiles().get(fileIndex).setChecksumType(newFileMetadata.getDataFile().getChecksumType()); + existingDataset.getFiles().get(fileIndex).setChecksumValue(newFileMetadata.getDataFile().getChecksumValue()); + + // Point the newly-harvested filemetadata to the existing datafile: newFileMetadata.setDataFile(existingDataset.getFiles().get(fileIndex)); + + // Make sure this new FileMetadata is the only one attached to this existing file: + existingDataset.getFiles().get(fileIndex).setFileMetadatas(new ArrayList<>(1)); existingDataset.getFiles().get(fileIndex).getFileMetadatas().add(newFileMetadata); + // (we don't want any cascade relationships left between this existing + // dataset and its version, since we are going to attemp to delete it). + + // Drop the file from the index map: existingFilesIndex.remove(location); } } - // @todo check that the newly-harvested DataFiles have the same checksums - // and mime types etc. 
These values are supposed to be immutable, normally, - // but who knows - they may have fixed something invalid on the other end + // @todo check if there's anything special that needs to be done for things // like file categories @@ -96,15 +131,20 @@ public Dataset execute(CommandContext ctxt) throws CommandException { // no longer present in the version that we have just harvesed: for (FileMetadata oldFileMetadata : existingDataset.getVersions().get(0).getFileMetadatas()) { DataFile oldDataFile = oldFileMetadata.getDataFile(); - solrIdsOfDocumentsToDelete.add(solrDocIdentifierFile + oldDataFile.getId()); - existingDataset.getFiles().remove(oldDataFile); - // Files from harvested datasets are removed unceremoniously, - // directly in the database. No need to bother calling the - // DeleteFileCommand on them. - ctxt.em().remove(ctxt.em().merge(oldDataFile)); - ctxt.em().remove(ctxt.em().merge(oldFileMetadata)); - oldDataFile = null; - oldFileMetadata = null; + String location = oldDataFile.getStorageIdentifier(); + // Is it still in the existing files map? - that means it is no longer + // present in the newly-harvested version + if (location != null && existingFilesIndex.containsKey(location)) { + solrIdsOfDocumentsToDelete.add(solrDocIdentifierFile + oldDataFile.getId()); + existingDataset.getFiles().remove(oldDataFile); + // Files from harvested datasets are removed unceremoniously, + // directly in the database. No need to bother calling the + // DeleteFileCommand on them. We'll just need to remember to purge + // them from Solr as well (right below) + ctxt.em().remove(ctxt.em().merge(oldDataFile)); + // (no need to explicitly remove the oldFileMetadata; it will be + // removed with the entire old version is deleted) + } } // purge all the SOLR documents associated with the files @@ -115,7 +155,10 @@ public Dataset execute(CommandContext ctxt) throws CommandException { // ... And now delete the existing version itself: existingDataset.setVersions(new ArrayList<>()); - ctxt.em().remove(ctxt.em().merge(existingVersion)); + existingVersion.setDataset(null); + + existingVersion = ctxt.em().merge(existingVersion); + ctxt.em().remove(existingVersion); // Now attach the newly-harvested version to the dataset: existingDataset.getVersions().add(newHarvestedVersion); @@ -123,7 +166,8 @@ public Dataset execute(CommandContext ctxt) throws CommandException { // ... 
There's one more thing to do - go through the new files, // that are not in the database yet, and make sure they are - // attached to this existing dataset: + // attached to this existing dataset, instead of the dummy temp + // dataset into which they were originally imported: for (FileMetadata newFileMetadata : newHarvestedVersion.getFileMetadatas()) { if (newFileMetadata.getDataFile().getId() == null) { existingDataset.getFiles().add(newFileMetadata.getDataFile()); @@ -135,9 +179,7 @@ public Dataset execute(CommandContext ctxt) throws CommandException { Dataset savedDataset = ctxt.em().merge(existingDataset); ctxt.em().flush(); - - //@todo reindex - + return savedDataset; } From e02a5845ca02e86ec042411f32513c3c35536f5a Mon Sep 17 00:00:00 2001 From: Leonid Andreev Date: Wed, 11 Sep 2024 15:24:33 -0400 Subject: [PATCH 5/9] a typo + some unused imports #10734 --- .../harvard/iq/dataverse/api/imports/ImportServiceBean.java | 5 ----- .../engine/command/impl/UpdateHarvestedDatasetCommand.java | 2 +- 2 files changed, 1 insertion(+), 6 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/api/imports/ImportServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/api/imports/ImportServiceBean.java index 8c7ff5acf6d..acfb696b71d 100644 --- a/src/main/java/edu/harvard/iq/dataverse/api/imports/ImportServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/api/imports/ImportServiceBean.java @@ -7,7 +7,6 @@ import com.google.gson.Gson; import com.google.gson.GsonBuilder; -import edu.harvard.iq.dataverse.DataFile; import edu.harvard.iq.dataverse.Dataset; import edu.harvard.iq.dataverse.DatasetField; import edu.harvard.iq.dataverse.DatasetFieldConstant; @@ -20,7 +19,6 @@ import edu.harvard.iq.dataverse.DataverseContact; import edu.harvard.iq.dataverse.DataverseServiceBean; import edu.harvard.iq.dataverse.EjbDataverseEngine; -import edu.harvard.iq.dataverse.FileMetadata; import edu.harvard.iq.dataverse.GlobalId; import edu.harvard.iq.dataverse.MetadataBlockServiceBean; import edu.harvard.iq.dataverse.api.dto.DatasetDTO; @@ -43,7 +41,6 @@ import edu.harvard.iq.dataverse.util.json.JsonUtil; import edu.harvard.iq.dataverse.license.LicenseServiceBean; import edu.harvard.iq.dataverse.pidproviders.PidUtil; -import static edu.harvard.iq.dataverse.search.IndexServiceBean.solrDocIdentifierFile; import edu.harvard.iq.dataverse.util.DatasetFieldUtil; import java.io.File; @@ -75,8 +72,6 @@ import jakarta.validation.Validation; import jakarta.validation.Validator; import jakarta.validation.ValidatorFactory; -import java.util.HashMap; -import java.util.Map; import javax.xml.stream.XMLStreamException; import org.apache.commons.lang3.StringUtils; diff --git a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/UpdateHarvestedDatasetCommand.java b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/UpdateHarvestedDatasetCommand.java index 9d7461b9b30..c93983befa5 100644 --- a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/UpdateHarvestedDatasetCommand.java +++ b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/UpdateHarvestedDatasetCommand.java @@ -115,7 +115,7 @@ them are the files (potentially new/updated versions thereof) of the files existingDataset.getFiles().get(fileIndex).setFileMetadatas(new ArrayList<>(1)); existingDataset.getFiles().get(fileIndex).getFileMetadatas().add(newFileMetadata); // (we don't want any cascade relationships left between this existing - // dataset and its version, since we are going to attemp to delete it). 
+ // dataset and this version, since we are going to attemp to delete it). // Drop the file from the index map: existingFilesIndex.remove(location); From 30266ddcafc96127430e1a22dcee722920ae76cd Mon Sep 17 00:00:00 2001 From: Leonid Andreev Date: Wed, 11 Sep 2024 15:28:30 -0400 Subject: [PATCH 6/9] comment language #10734 --- .../engine/command/impl/UpdateHarvestedDatasetCommand.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/UpdateHarvestedDatasetCommand.java b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/UpdateHarvestedDatasetCommand.java index c93983befa5..2380d7309ee 100644 --- a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/UpdateHarvestedDatasetCommand.java +++ b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/UpdateHarvestedDatasetCommand.java @@ -74,7 +74,7 @@ public Dataset execute(CommandContext ctxt) throws CommandException { Create a map of the files that are currently part of this existing harvested dataset. We assume that a harvested file can be uniquely defined by its storageidentifier. Which, in the case of a datafile - harvested from another Dataverse should be its data access api url. + harvested from another Dataverse should be its data access api url. */ for (int i = 0; i < existingDataset.getFiles().size(); i++) { String storageIdentifier = existingDataset.getFiles().get(i).getStorageIdentifier(); @@ -85,8 +85,8 @@ public Dataset execute(CommandContext ctxt) throws CommandException { /* Go through the files in the newly-harvested version and check if any of - them are the files (potentially new/updated versions thereof) of the files - we already have, harvested previously from the same archive location. + them are (potentially new/updated) versions of files that we already + have, harvested previously from the same archive location. 
*/ for (FileMetadata newFileMetadata : newHarvestedVersion.getFileMetadatas()) { // is it safe to assume that each new FileMetadata will be From 9291554e7e83faa74cbee9da955f4aa87cd070b3 Mon Sep 17 00:00:00 2001 From: Leonid Andreev Date: Wed, 11 Sep 2024 17:18:10 -0400 Subject: [PATCH 7/9] minor/cosmetic #10734 --- .../api/imports/ImportServiceBean.java | 10 +++++----- .../impl/UpdateHarvestedDatasetCommand.java | 17 +++++++++-------- 2 files changed, 14 insertions(+), 13 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/api/imports/ImportServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/api/imports/ImportServiceBean.java index acfb696b71d..78bf9af99fa 100644 --- a/src/main/java/edu/harvard/iq/dataverse/api/imports/ImportServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/api/imports/ImportServiceBean.java @@ -328,9 +328,9 @@ public Dataset doImportHarvestedDataset(DataverseRequest dataverseRequest, Harve if (existingDataset.getOwner() != null && !owner.getId().equals(existingDataset.getOwner().getId())) { throw new ImportException("The dataset with the global id " + globalIdString + " already exists, in the dataverse " + existingDataset.getOwner().getAlias() + ", skipping."); } - // And if we already have a dataset with this same id, in this same - // dataverse, but it is a LOCAL dataset (can happen!), we're going to - // skip it also: + // And if we already have a dataset with this same global id at + // this Dataverse instance, but it is a LOCAL dataset (can happen!), + // we're going to skip it also: if (!existingDataset.isHarvested()) { throw new ImportException("A LOCAL dataset with the global id " + globalIdString + " already exists in this dataverse; skipping."); } @@ -359,7 +359,7 @@ public Dataset doImportHarvestedDataset(DataverseRequest dataverseRequest, Harve harvestedVersion.setReleaseTime(oaiDateStamp); } - // Check data against required contraints + // Check data against required constraints List> violations = harvestedVersion.validateRequired(); if (!violations.isEmpty()) { // ... 
and fill the missing required values with "NA"s: @@ -457,7 +457,7 @@ public JsonObjectBuilder doImport(DataverseRequest dataverseRequest, Dataverse o ds.setOwner(owner); ds.getLatestVersion().setDatasetFields(ds.getLatestVersion().initDatasetFields()); - // Check data against required contraints + // Check data against required constraints List> violations = ds.getVersions().get(0).validateRequired(); if (!violations.isEmpty()) { if ( importType.equals(ImportType.HARVEST) ) { diff --git a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/UpdateHarvestedDatasetCommand.java b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/UpdateHarvestedDatasetCommand.java index 2380d7309ee..09563686299 100644 --- a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/UpdateHarvestedDatasetCommand.java +++ b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/UpdateHarvestedDatasetCommand.java @@ -11,6 +11,7 @@ import edu.harvard.iq.dataverse.DataFile; import edu.harvard.iq.dataverse.FileMetadata; import static edu.harvard.iq.dataverse.search.IndexServiceBean.solrDocIdentifierFile; +import edu.harvard.iq.dataverse.util.StringUtil; import java.io.IOException; import java.util.ArrayList; @@ -78,7 +79,7 @@ public Dataset execute(CommandContext ctxt) throws CommandException { */ for (int i = 0; i < existingDataset.getFiles().size(); i++) { String storageIdentifier = existingDataset.getFiles().get(i).getStorageIdentifier(); - if (storageIdentifier != null) { + if (!StringUtil.isEmpty(storageIdentifier)) { existingFilesIndex.put(storageIdentifier, i); } } @@ -91,11 +92,11 @@ them are (potentially new/updated) versions of files that we already for (FileMetadata newFileMetadata : newHarvestedVersion.getFileMetadatas()) { // is it safe to assume that each new FileMetadata will be // pointing to a non-null DataFile here? - String location = newFileMetadata.getDataFile().getStorageIdentifier(); - if (location != null && existingFilesIndex.containsKey(location)) { + String storageIdentifier = newFileMetadata.getDataFile().getStorageIdentifier(); + if (!StringUtil.isEmpty(storageIdentifier) && existingFilesIndex.containsKey(storageIdentifier)) { newFileMetadata.getDataFile().setFileMetadatas(new ArrayList<>()); - int fileIndex = existingFilesIndex.get(location); + int fileIndex = existingFilesIndex.get(storageIdentifier); // Make sure to update the existing DataFiles that we are going // to keep in case their newly-harvested versions have different @@ -118,11 +119,11 @@ them are (potentially new/updated) versions of files that we already // dataset and this version, since we are going to attemp to delete it). // Drop the file from the index map: - existingFilesIndex.remove(location); + existingFilesIndex.remove(storageIdentifier); } } - // @todo check if there's anything special that needs to be done for things + // @todo? check if there's anything special that needs to be done for things // like file categories List solrIdsOfDocumentsToDelete = new ArrayList<>(); @@ -131,10 +132,10 @@ them are (potentially new/updated) versions of files that we already // no longer present in the version that we have just harvesed: for (FileMetadata oldFileMetadata : existingDataset.getVersions().get(0).getFileMetadatas()) { DataFile oldDataFile = oldFileMetadata.getDataFile(); - String location = oldDataFile.getStorageIdentifier(); + String storageIdentifier = oldDataFile.getStorageIdentifier(); // Is it still in the existing files map? 
- that means it is no longer // present in the newly-harvested version - if (location != null && existingFilesIndex.containsKey(location)) { + if (StringUtil.isEmpty(storageIdentifier) || existingFilesIndex.containsKey(storageIdentifier)) { solrIdsOfDocumentsToDelete.add(solrDocIdentifierFile + oldDataFile.getId()); existingDataset.getFiles().remove(oldDataFile); // Files from harvested datasets are removed unceremoniously, From 7568cd05c4a82331b5539491a76c547cb00a4e93 Mon Sep 17 00:00:00 2001 From: Leonid Andreev Date: Wed, 18 Sep 2024 22:44:03 -0400 Subject: [PATCH 8/9] Some cleanup/streamlining #10734 --- .../api/imports/ImportServiceBean.java | 42 ++++++++----------- 1 file changed, 17 insertions(+), 25 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/api/imports/ImportServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/api/imports/ImportServiceBean.java index 78bf9af99fa..73d47b18b9a 100644 --- a/src/main/java/edu/harvard/iq/dataverse/api/imports/ImportServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/api/imports/ImportServiceBean.java @@ -303,9 +303,8 @@ public Dataset doImportHarvestedDataset(DataverseRequest dataverseRequest, Harve Dataset existingDataset = datasetService.findByGlobalId(globalIdString); - try { - Dataset harvestedDataset = null; + Dataset harvestedDataset; JsonParser parser = new JsonParser(datasetfieldService, metadataBlockService, settingsService, licenseService, datasetTypeService, harvestingClient); parser.setLenient(true); @@ -314,7 +313,6 @@ public Dataset doImportHarvestedDataset(DataverseRequest dataverseRequest, Harve // Creating a new dataset from scratch: harvestedDataset = parser.parseDataset(obj); - harvestedDataset.setOwner(owner); harvestedDataset.setHarvestedFrom(harvestingClient); harvestedDataset.setHarvestIdentifier(harvestIdentifier); @@ -322,6 +320,7 @@ public Dataset doImportHarvestedDataset(DataverseRequest dataverseRequest, Harve harvestedVersion = harvestedDataset.getVersions().get(0); } else { // We already have a dataset with this id in the database. + // Let's check a few things before we go any further with it: // If this dataset already exists IN ANOTHER COLLECTION // we are just going to skip it! @@ -335,20 +334,22 @@ public Dataset doImportHarvestedDataset(DataverseRequest dataverseRequest, Harve throw new ImportException("A LOCAL dataset with the global id " + globalIdString + " already exists in this dataverse; skipping."); } // For harvested datasets, there should always only be one version. - // We will replace the current version with the imported version. - // @todo or should we just destroy any extra versions quietly? if (existingDataset.getVersions().size() != 1) { throw new ImportException("Error importing Harvested Dataset, existing dataset has " + existingDataset.getVersions().size() + " versions"); } - + + // We will attempt to import the new version, and replace the + // current, already existing version with it. harvestedVersion = parser.parseDatasetVersion(obj.getJsonObject("datasetVersion")); - Dataset tempDataset = createTemporaryHarvestedDataset(harvestedVersion); - // Temporarily attach the temporary dataset to the parent Collection: - // (this will be needed for the purposes of looking up the configured - // metadatablocks and such) - tempDataset.setOwner(owner); + + // For the purposes of validation, the version needs to be attached + // to a non-null dataset. 
We will reate a throwaway temporary + // dataset for this: + harvestedDataset = createTemporaryHarvestedDataset(harvestedVersion); } + harvestedDataset.setOwner(owner); + // Either a full new import, or an update of an existing harvested // Dataset, perform some cleanup on the new version imported from the // parsed metadata: @@ -359,28 +360,19 @@ public Dataset doImportHarvestedDataset(DataverseRequest dataverseRequest, Harve harvestedVersion.setReleaseTime(oaiDateStamp); } - // Check data against required constraints - List> violations = harvestedVersion.validateRequired(); - if (!violations.isEmpty()) { - // ... and fill the missing required values with "NA"s: - for (ConstraintViolation v : violations) { - DatasetField f = v.getRootBean(); - f.setSingleValue(DatasetField.NA_VALUE); - } - } - // is this the right place to call tidyUpFields()? // usually it is called within the body of the create/update commands // later on. DatasetFieldUtil.tidyUpFields(harvestedVersion.getDatasetFields(), true); - // Check data against validation constraints - // Similarly to how we handle missing required values (above), we - // replace invalid values with NA when harvesting. + // Check data against validation constraints. + // Make an attempt to sanitize any invalid fields encountered - + // missing required fields or invalid values, by filling the values + // with NAs. boolean sanitized = validateAndSanitizeVersionMetadata(harvestedVersion, cleanupLog); - // Note: our sanitizing approach, of replacing invalid values with + // Note: this sanitizing approach, of replacing invalid values with // "NA" does not work with certain fields. For example, using it to // populate a GeoBox coordinate value will result in an invalid // field. So we will attempt to re-validate the santized version. From 0a83bc374ab9c4cccdf7d8e7b9fbe74ce4948848 Mon Sep 17 00:00:00 2001 From: qqmyers Date: Mon, 23 Sep 2024 11:22:08 -0400 Subject: [PATCH 9/9] typo in comment --- .../edu/harvard/iq/dataverse/api/imports/ImportServiceBean.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/api/imports/ImportServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/api/imports/ImportServiceBean.java index 73d47b18b9a..66f48bfb872 100644 --- a/src/main/java/edu/harvard/iq/dataverse/api/imports/ImportServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/api/imports/ImportServiceBean.java @@ -343,7 +343,7 @@ public Dataset doImportHarvestedDataset(DataverseRequest dataverseRequest, Harve harvestedVersion = parser.parseDatasetVersion(obj.getJsonObject("datasetVersion")); // For the purposes of validation, the version needs to be attached - // to a non-null dataset. We will reate a throwaway temporary + // to a non-null dataset. We will create a throwaway temporary // dataset for this: harvestedDataset = createTemporaryHarvestedDataset(harvestedVersion); }
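
Editorial note: the patches above replace the old destroy-and-recreate flow with an in-place update keyed on each file's storageIdentifier, so that the database ids of re-harvested DataFiles (and therefore their Solr documents) survive a re-harvest. Below is a minimal, self-contained sketch of that reconciliation idea under stated assumptions: HarvestedFile, reconcile() and the example URLs are hypothetical stand-ins, not the real Dataverse entities; the actual logic lives in UpdateHarvestedDatasetCommand.execute().

    import java.util.ArrayList;
    import java.util.HashMap;
    import java.util.List;
    import java.util.Map;

    public class HarvestReconciliationSketch {

        // Hypothetical stand-in for DataFile/FileMetadata, keyed by the same
        // storageIdentifier the command relies on (for a file harvested from
        // another Dataverse that is effectively its remote data access URL):
        record HarvestedFile(Long dbId, String storageIdentifier, String checksum) {}

        // Reuse an existing row whenever the re-harvested version still contains a
        // file with the same storage identifier; collect everything else for
        // deletion (and, in the real command, for purging from Solr).
        static List<HarvestedFile> reconcile(List<HarvestedFile> existing,
                                             List<HarvestedFile> reharvested,
                                             List<HarvestedFile> toDelete) {
            Map<String, HarvestedFile> existingByStorageId = new HashMap<>();
            for (HarvestedFile f : existing) {
                if (f.storageIdentifier() != null && !f.storageIdentifier().isEmpty()) {
                    existingByStorageId.put(f.storageIdentifier(), f);
                }
            }

            List<HarvestedFile> result = new ArrayList<>();
            for (HarvestedFile newFile : reharvested) {
                HarvestedFile match = existingByStorageId.remove(newFile.storageIdentifier());
                if (match != null) {
                    // Keep the existing database id, but adopt the newly-harvested
                    // checksum (mirrors how the command copies content type, size, etc.):
                    result.add(new HarvestedFile(match.dbId(), newFile.storageIdentifier(), newFile.checksum()));
                } else {
                    result.add(newFile); // genuinely new file, no database id yet
                }
            }

            // Whatever is still in the map was not re-harvested and should be removed:
            toDelete.addAll(existingByStorageId.values());
            return result;
        }

        public static void main(String[] args) {
            List<HarvestedFile> existing = List.of(
                    new HarvestedFile(101L, "https://remote.example/api/access/datafile/1", "aaa"),
                    new HarvestedFile(102L, "https://remote.example/api/access/datafile/2", "bbb"));
            List<HarvestedFile> reharvested = List.of(
                    new HarvestedFile(null, "https://remote.example/api/access/datafile/2", "bbb-updated"),
                    new HarvestedFile(null, "https://remote.example/api/access/datafile/3", "ccc"));

            List<HarvestedFile> toDelete = new ArrayList<>();
            System.out.println(reconcile(existing, reharvested, toDelete)); // keeps id 102
            System.out.println(toDelete);                                   // contains id 101
        }
    }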
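
A second editorial sketch: patch 8 folds the required-field NA substitution into a single validate-and-sanitize pass (validateAndSanitizeVersionMetadata) and notes that the NA value itself can be invalid for structured fields such as a GeoBox coordinate, which is why the sanitized version is validated a second time. The code below only illustrates that pattern; the Field type, the validity predicate, and the "westLongitude" field name are toy stand-ins for Dataverse's DatasetField machinery and DatasetField.NA_VALUE.

    import java.util.List;
    import java.util.function.Predicate;

    public class SanitizeThenRevalidateSketch {

        // Stand-in for DatasetField.NA_VALUE:
        static final String NA = "NA";

        record Field(String name, String value) {}

        // Replace anything the validator rejects with NA, mimicking the
        // harvest-time sanitizing described in the patch:
        static List<Field> sanitize(List<Field> fields, Predicate<Field> isValid) {
            return fields.stream()
                    .map(f -> isValid.test(f) ? f : new Field(f.name(), NA))
                    .toList();
        }

        public static void main(String[] args) {
            // Toy validator: values must be non-blank, and a coordinate-like field
            // must parse as a number - so NA is *still* invalid for it (the GeoBox case):
            Predicate<Field> isValid = f -> {
                if (f.value() == null || f.value().isBlank()) return false;
                if (f.name().equals("westLongitude")) {
                    try { Double.parseDouble(f.value()); } catch (NumberFormatException e) { return false; }
                }
                return true;
            };

            List<Field> sanitized = sanitize(
                    List.of(new Field("title", "My Dataset"),
                            new Field("author", " "),
                            new Field("westLongitude", "not-a-number")),
                    isValid);

            // The second validation pass is what catches fields where NA does not help;
            // in the real import code such a failure makes the record non-importable:
            boolean stillValid = sanitized.stream().allMatch(isValid);
            System.out.println(sanitized);
            System.out.println("valid after sanitizing: " + stillValid); // false, because of westLongitude
        }
    }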