Versions API improvements (9763) #9883

Merged: 31 commits, Oct 18, 2023

Commits
a835f5d
added pagination to the /versions api. dropped the files section from…
landreev Aug 18, 2023
de35ae7
added left join hints to the full filemetadatas lookup. #9763
landreev Aug 21, 2023
4cd62eb
(ongoing experiments; a lot of these changes are temporary and will b…
landreev Aug 22, 2023
1c93978
Merge branch 'develop' into 9763-lookup-optimizations
landreev Aug 22, 2023
4c28979
work in progress.
landreev Aug 29, 2023
ccd6b7d
made the "includeFiles" option true by default, cleaned up the ".find…
landreev Aug 30, 2023
2d27c03
intermediate changes for the adjusted citation date. #9763
landreev Aug 31, 2023
e08f26a
Merge branch 'develop' into 9763-lookup-optimizations
landreev Aug 31, 2023
7b1e799
Additional changes needed for the optimized "embargo publication date…
landreev Sep 6, 2023
fd30fd5
removing a comment (#9763)
landreev Sep 6, 2023
b74affc
a short release note (#9763)
landreev Sep 6, 2023
2324fe1
changed the guide to reflect the fact that the includeFiles flag defa…
landreev Sep 6, 2023
35835e4
extended the release note. (#9763)
landreev Sep 6, 2023
9a9d7d6
cosmetic change in the release note (#9763)
landreev Sep 6, 2023
d465b20
cosmetic change, comment text (#9763)
landreev Sep 6, 2023
ee36dee
removed a noisy logging line that got checked in by mistake in an ear…
landreev Sep 6, 2023
77dc0b5
Merge branch 'develop' into 9763-lookup-optimizations
landreev Sep 11, 2023
bfe7f9c
RestAssured tests for the new functionality added to the /versions ap…
landreev Sep 13, 2023
8e894c3
added another test, for the pagination functionality in the /versions…
landreev Sep 13, 2023
b9e99f3
typo in a comment. #9763
landreev Sep 13, 2023
f164a68
more typos in comments. (#9763)
landreev Sep 13, 2023
66ab0c0
Merge branch 'develop' into 9763-lookup-optimizations
landreev Oct 11, 2023
b5be877
Merge branch 'develop' into 9763-lookup-optimizations
landreev Oct 11, 2023
18cdf13
stripping more dead code in the version service bean (my experimental…
landreev Oct 11, 2023
381ddf5
more commented-out code that needed to be removed before finalizing t…
landreev Oct 11, 2023
4377e12
Merge branch 'develop' into 9763-lookup-optimizations
landreev Oct 11, 2023
4b5ad8f
rename sql script #9763
pdurbin Oct 13, 2023
402ccfb
Merge branch 'develop' into 9763-lookup-optimizations #9763
pdurbin Oct 13, 2023
f47867e
renaming the flyway script since 6.0.0.1 has already been merged. (#9…
landreev Oct 15, 2023
a1349ba
Merge branch '9763-lookup-optimizations' of https://github.com/IQSS/d…
landreev Oct 15, 2023
037e2d9
Merge branch 'develop' into 9763-lookup-optimizations
landreev Oct 15, 2023
8 changes: 8 additions & 0 deletions doc/release-notes/9763-versions-api-improvements.md
@@ -0,0 +1,8 @@
# Improvements in the /versions API

- optional pagination has been added to `/api/datasets/{id}/versions` that may be useful in datasets with a large number of versions;
- a new flag `includeFiles` has been added to both `/api/datasets/{id}/versions` and `/api/datasets/{id}/versions/{vid}` (true by default), providing an option to drop the file information from the output;
- when files are requested to be included, some database lookup optimizations have been added to improve the performance on datasets with large numbers of files.

This is reflected in the [Dataset Versions API](https://guides.dataverse.org/en/9763-lookup-optimizations/api/native-api.html#dataset-versions-api) section of the Guide.

11 changes: 9 additions & 2 deletions doc/sphinx-guides/source/api/native-api.rst
@@ -889,6 +889,10 @@ It returns a list of versions with their metadata, and file list:
]
}

The optional ``includeFiles`` parameter specifies whether the files should be listed in the output. It defaults to ``true``, preserving backward compatibility. (Note that for a dataset with a large number of versions and/or files, including the file information can dramatically increase the volume of the output.) A separate ``/files`` API can be used to list the files, or a subset thereof, in a given version.

The optional ``offset`` and ``limit`` parameters can be used to specify the range of the versions list to be shown. This can be used to paginate through the list in a dataset with a large number of versions.
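As a sketch (using this guide's placeholder server and dataset id), the pagination parameters can be combined with ``includeFiles`` to page through a long version list; the snippet below only constructs and prints the request URL:

```shell
# Sketch: request the second "page" of 10 versions, skipping the first 10,
# and omit the per-file metadata to keep the output compact.
SERVER_URL="https://demo.dataverse.org"
ID=24
URL="$SERVER_URL/api/datasets/$ID/versions?offset=10&limit=10&includeFiles=false"
echo "$URL"
# To run it against a live server:
# curl "$URL"
```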


Get Version of a Dataset
~~~~~~~~~~~~~~~~~~~~~~~~
@@ -901,13 +905,16 @@
export ID=24
export VERSION=1.0

curl "$SERVER_URL/api/datasets/$ID/versions/$VERSION"
curl "$SERVER_URL/api/datasets/$ID/versions/$VERSION?includeFiles=false"

The fully expanded example above (without environment variables) looks like this:

.. code-block:: bash

curl "https://demo.dataverse.org/api/datasets/24/versions/1.0"
curl "https://demo.dataverse.org/api/datasets/24/versions/1.0?includeFiles=false"

The optional ``includeFiles`` parameter specifies whether the files should be listed in the output (defaults to ``true``). Note that a separate ``/files`` API can be used for listing the files, or a subset thereof in a given version.
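As an illustration (again with the guide's placeholder server, dataset id, and version), the version metadata and its file list can be fetched separately; the snippet only prints the two request URLs:

```shell
# Sketch: fetch version metadata without the file list, then retrieve the
# file list separately via the /files endpoint.
SERVER_URL="https://demo.dataverse.org"
ID=24
VERSION=1.0
METADATA_URL="$SERVER_URL/api/datasets/$ID/versions/$VERSION?includeFiles=false"
FILES_URL="$SERVER_URL/api/datasets/$ID/versions/$VERSION/files"
echo "$METADATA_URL"
echo "$FILES_URL"
# To run against a live server:
# curl "$METADATA_URL"
# curl "$FILES_URL"
```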


.. _export-dataset-metadata-api:

33 changes: 20 additions & 13 deletions src/main/java/edu/harvard/iq/dataverse/Dataset.java
@@ -158,6 +158,23 @@ public void setCitationDateDatasetFieldType(DatasetFieldType citationDateDataset
this.citationDateDatasetFieldType = citationDateDatasetFieldType;
}

// Per DataCite best practices, the citation date of a dataset may need
// to be adjusted to reflect the latest embargo availability date of any
// file within the first published version.
// If any files are embargoed in the first version, this date will be
// calculated and cached here upon its publication, in the
// FinalizeDatasetPublicationCommand.
private Timestamp embargoCitationDate;

public Timestamp getEmbargoCitationDate() {
return embargoCitationDate;
}

public void setEmbargoCitationDate(Timestamp embargoCitationDate) {
this.embargoCitationDate = embargoCitationDate;
}



@ManyToOne
@JoinColumn(name="template_id",nullable = true)
@@ -676,20 +693,10 @@ public Timestamp getCitationDate() {
Timestamp citationDate = null;
//Only calculate if this dataset doesn't use an alternate date field for publication date
if (citationDateDatasetFieldType == null) {
List<DatasetVersion> versions = this.versions;
// TODO - is this ever not version 1.0 (or draft if not published yet)
DatasetVersion oldest = versions.get(versions.size() - 1);
citationDate = super.getPublicationDate();
if (oldest.isPublished()) {
List<FileMetadata> fms = oldest.getFileMetadatas();
for (FileMetadata fm : fms) {
Embargo embargo = fm.getDataFile().getEmbargo();
if (embargo != null) {
Timestamp embDate = Timestamp.valueOf(embargo.getDateAvailable().atStartOfDay());
if (citationDate.compareTo(embDate) < 0) {
citationDate = embDate;
}
}
if (embargoCitationDate != null) {
if (citationDate.compareTo(embargoCitationDate) < 0) {
return embargoCitationDate;
}
}
}
@@ -137,7 +137,7 @@ public Dataset findDeep(Object pk) {
.setHint("eclipselink.left-join-fetch", "o.files.roleAssignments")
.getSingleResult();
}

public List<Dataset> findByOwnerId(Long ownerId) {
return findByOwnerId(ownerId, false);
}
8 changes: 7 additions & 1 deletion src/main/java/edu/harvard/iq/dataverse/DatasetVersion.java
@@ -68,7 +68,13 @@
query = "SELECT OBJECT(o) FROM DatasetVersion AS o WHERE o.dataset.harvestedFrom IS NULL and o.releaseTime IS NOT NULL and o.archivalCopyLocation IS NULL"
),
@NamedQuery(name = "DatasetVersion.findById",
query = "SELECT o FROM DatasetVersion o LEFT JOIN FETCH o.fileMetadatas WHERE o.id=:id")})
query = "SELECT o FROM DatasetVersion o LEFT JOIN FETCH o.fileMetadatas WHERE o.id=:id"),
@NamedQuery(name = "DatasetVersion.findByDataset",
query = "SELECT o FROM DatasetVersion o WHERE o.dataset.id=:datasetId ORDER BY o.versionNumber DESC, o.minorVersionNumber DESC"),
@NamedQuery(name = "DatasetVersion.findReleasedByDataset",
query = "SELECT o FROM DatasetVersion o WHERE o.dataset.id=:datasetId AND o.versionState=edu.harvard.iq.dataverse.DatasetVersion.VersionState.RELEASED ORDER BY o.versionNumber DESC, o.minorVersionNumber DESC")/*,
@NamedQuery(name = "DatasetVersion.findVersionElements",
query = "SELECT o.id, o.versionState, o.versionNumber, o.minorVersionNumber FROM DatasetVersion o WHERE o.dataset.id=:datasetId ORDER BY o.versionNumber DESC, o.minorVersionNumber DESC")*/})


@Entity
@@ -166,9 +166,44 @@ public DatasetVersion findDeep(Object pk) {
.setHint("eclipselink.left-join-fetch", "o.fileMetadatas.datasetVersion")
.setHint("eclipselink.left-join-fetch", "o.fileMetadatas.dataFile.releaseUser")
.setHint("eclipselink.left-join-fetch", "o.fileMetadatas.dataFile.creator")
.setHint("eclipselink.left-join-fetch", "o.fileMetadatas.dataFile.dataFileTags")
.getSingleResult();
}


/**
* Performs the same database lookup as the one behind Dataset.getVersions().
* Additionally, provides the arguments for selecting a partial list of
* (length-offset) versions for pagination, plus the ability to pre-select
* only the publicly-viewable versions.
* It is recommended that individual software components utilize the
* ListVersionsCommand, instead of calling this service method directly.
* @param datasetId
* @param offset for pagination through long lists of versions
* @param length for pagination through long lists of versions
* @param includeUnpublished when true, include unpublished versions (drafts and deaccessioned) as well
* @return (partial) list of versions
*/
public List<DatasetVersion> findVersions(Long datasetId, Integer offset, Integer length, boolean includeUnpublished) {
TypedQuery<DatasetVersion> query;
if (includeUnpublished) {
query = em.createNamedQuery("DatasetVersion.findByDataset", DatasetVersion.class);
} else {
query = em.createNamedQuery("DatasetVersion.findReleasedByDataset", DatasetVersion.class);
}

query.setParameter("datasetId", datasetId);

if (offset != null) {
query.setFirstResult(offset);
}
if (length != null) {
query.setMaxResults(length);
}

return query.getResultList();
}

public DatasetVersion findByFriendlyVersionNumber(Long datasetId, String friendlyVersionNumber) {
Long majorVersionNumber = null;
Long minorVersionNumber = null;
39 changes: 25 additions & 14 deletions src/main/java/edu/harvard/iq/dataverse/api/Datasets.java
@@ -265,7 +265,7 @@ public Response getDataset(@Context ContainerRequestContext crc, @PathParam("id"
MakeDataCountLoggingServiceBean.MakeDataCountEntry entry = new MakeDataCountEntry(uriInfo, headers, dvRequestService, retrieved);
mdcLogService.logEntry(entry);
}
return ok(jsonbuilder.add("latestVersion", (latest != null) ? json(latest) : null));
return ok(jsonbuilder.add("latestVersion", (latest != null) ? json(latest, true) : null));
}, getRequestUser(crc));
}

@@ -471,22 +471,33 @@ public Response useDefaultCitationDate(@Context ContainerRequestContext crc, @Pa
@GET
@AuthRequired
@Path("{id}/versions")
public Response listVersions(@Context ContainerRequestContext crc, @PathParam("id") String id ) {
return response( req ->
ok( execCommand( new ListVersionsCommand(req, findDatasetOrDie(id)) )
public Response listVersions(@Context ContainerRequestContext crc, @PathParam("id") String id, @QueryParam("includeFiles") Boolean includeFiles, @QueryParam("limit") Integer limit, @QueryParam("offset") Integer offset) {

return response( req -> {
Dataset dataset = findDatasetOrDie(id);

return ok( execCommand( new ListVersionsCommand(req, dataset, offset, limit, (includeFiles == null ? true : includeFiles)) )
.stream()
.map( d -> json(d) )
.collect(toJsonArray())), getRequestUser(crc));
.map( d -> json(d, includeFiles == null ? true : includeFiles) )
.collect(toJsonArray()));
}, getRequestUser(crc));
}

@GET
@AuthRequired
@Path("{id}/versions/{versionId}")
public Response getVersion(@Context ContainerRequestContext crc, @PathParam("id") String datasetId, @PathParam("versionId") String versionId, @Context UriInfo uriInfo, @Context HttpHeaders headers) {
public Response getVersion(@Context ContainerRequestContext crc, @PathParam("id") String datasetId, @PathParam("versionId") String versionId, @QueryParam("includeFiles") Boolean includeFiles, @Context UriInfo uriInfo, @Context HttpHeaders headers) {
return response( req -> {
DatasetVersion dsv = getDatasetVersionOrDie(req, versionId, findDatasetOrDie(datasetId), uriInfo, headers);
return (dsv == null || dsv.getId() == null) ? notFound("Dataset version not found")
: ok(json(dsv));

if (dsv == null || dsv.getId() == null) {
return notFound("Dataset version not found");
}

if (includeFiles == null ? true : includeFiles) {
dsv = datasetversionService.findDeep(dsv.getId());
}
return ok(json(dsv, includeFiles == null ? true : includeFiles));
}, getRequestUser(crc));
}

@@ -783,7 +794,7 @@ public Response updateDraftVersion(@Context ContainerRequestContext crc, String
}
managedVersion = execCommand(new CreateDatasetVersionCommand(req, ds, incomingVersion));
}
return ok( json(managedVersion) );
return ok( json(managedVersion, true) );

} catch (JsonParseException ex) {
logger.log(Level.SEVERE, "Semantic error parsing dataset version Json: " + ex.getMessage(), ex);
@@ -1018,7 +1029,7 @@ private Response processDatasetFieldDataDelete(String jsonBody, String id, Datav


DatasetVersion managedVersion = execCommand(new UpdateDatasetVersionCommand(ds, req)).getLatestVersion();
return ok(json(managedVersion));
return ok(json(managedVersion, true));

} catch (JsonParseException ex) {
logger.log(Level.SEVERE, "Semantic error parsing dataset update Json: " + ex.getMessage(), ex);
@@ -1167,7 +1178,7 @@ private Response processDatasetUpdate(String jsonBody, String id, DataverseReque
}
DatasetVersion managedVersion = execCommand(new UpdateDatasetVersionCommand(ds, req)).getLatestVersion();

return ok(json(managedVersion));
return ok(json(managedVersion, true));

} catch (JsonParseException ex) {
logger.log(Level.SEVERE, "Semantic error parsing dataset update Json: " + ex.getMessage(), ex);
@@ -3942,9 +3953,9 @@ public Response getPrivateUrlDatasetVersion(@PathParam("privateUrlToken") String
JsonObjectBuilder responseJson;
if (isAnonymizedAccess) {
List<String> anonymizedFieldTypeNamesList = new ArrayList<>(Arrays.asList(anonymizedFieldTypeNames.split(",\\s")));
responseJson = json(dsv, anonymizedFieldTypeNamesList);
responseJson = json(dsv, anonymizedFieldTypeNamesList, true);
} else {
responseJson = json(dsv);
responseJson = json(dsv, true);
}
return ok(responseJson);
}
@@ -523,7 +523,7 @@ public static boolean validateDatasetMetadataExternally(Dataset ds, String execu
// for the filter to whitelist by these attributes.

try {
jsonMetadata = json(ds).add("datasetVersion", json(ds.getLatestVersion()))
jsonMetadata = json(ds).add("datasetVersion", json(ds.getLatestVersion(), true))
.add("sourceAddress", sourceAddressLabel)
.add("userIdentifier", userIdentifier)
.add("parentAlias", ds.getOwner().getAlias())
@@ -10,6 +10,7 @@
import edu.harvard.iq.dataverse.DatasetVersionUser;
import edu.harvard.iq.dataverse.Dataverse;
import edu.harvard.iq.dataverse.DvObject;
import edu.harvard.iq.dataverse.Embargo;
import edu.harvard.iq.dataverse.UserNotification;
import edu.harvard.iq.dataverse.authorization.Permission;
import edu.harvard.iq.dataverse.authorization.users.AuthenticatedUser;
@@ -117,9 +118,37 @@ public Dataset execute(CommandContext ctxt) throws CommandException {
// is this the first publication of the dataset?
if (theDataset.getPublicationDate() == null) {
theDataset.setReleaseUser((AuthenticatedUser) getUser());
}
if ( theDataset.getPublicationDate() == null ) {

theDataset.setPublicationDate(new Timestamp(new Date().getTime()));

// if there are any embargoed files in this version, we will save
// the latest availability date as the "embargoCitationDate" for future
// reference (if the files are not available yet, as of publishing of
// the dataset, this date will be used as the "citation date" of the dataset,
// instead of the publicationDate, in compliance with the DataCite
// best practices).
// The code below replicates the logic that used to be in the method
// Dataset.getCitationDate() that calculated this adjusted date in real time.

Timestamp latestEmbargoDate = null;
for (DataFile dataFile : theDataset.getFiles()) {
// this is the first version of the dataset that is being published.
// therefore we can iterate through .getFiles() instead of obtaining
// the DataFiles by going through the FileMetadatas in the current version.
Embargo embargo = dataFile.getEmbargo();
if (embargo != null) {
// "dateAvailable" is not nullable in the Embargo class, no need for a null check
Timestamp embargoDate = Timestamp.valueOf(embargo.getDateAvailable().atStartOfDay());
if (latestEmbargoDate == null || latestEmbargoDate.compareTo(embargoDate) < 0) {
latestEmbargoDate = embargoDate;
}
}
}
// the above loop could be easily replaced with a database query;
// but we iterate through .getFiles() elsewhere in the command, when
// updating and/or registering the files, so it should not result in
// an extra performance hit.
theDataset.setEmbargoCitationDate(latestEmbargoDate);
}

//Clear any external status
@@ -14,6 +14,7 @@
import edu.harvard.iq.dataverse.engine.command.DataverseRequest;
import edu.harvard.iq.dataverse.engine.command.RequiredPermissions;
import edu.harvard.iq.dataverse.engine.command.exception.CommandException;
import edu.harvard.iq.dataverse.engine.command.exception.CommandExecutionException;
import java.util.LinkedList;
import java.util.List;

@@ -23,23 +24,57 @@
*/
// No permission needed to view published dvObjects
@RequiredPermissions({})
public class ListVersionsCommand extends AbstractCommand<List<DatasetVersion>>{
public class ListVersionsCommand extends AbstractCommand<List<DatasetVersion>> {

private final Dataset ds;
private final Integer limit;
private final Integer offset;
private final Boolean deepLookup;

public ListVersionsCommand(DataverseRequest aRequest, Dataset aDataset) {
super(aRequest, aDataset);
ds = aDataset;
}
public ListVersionsCommand(DataverseRequest aRequest, Dataset aDataset) {
this(aRequest, aDataset, null, null);
}

public ListVersionsCommand(DataverseRequest aRequest, Dataset aDataset, Integer offset, Integer limit) {
this(aRequest, aDataset, offset, limit, false);
}

public ListVersionsCommand(DataverseRequest aRequest, Dataset aDataset, Integer offset, Integer limit, boolean deepLookup) {
super(aRequest, aDataset);
ds = aDataset;
this.offset = offset;
this.limit = limit;
this.deepLookup = deepLookup;
}

@Override
public List<DatasetVersion> execute(CommandContext ctxt) throws CommandException {
List<DatasetVersion> outputList = new LinkedList<>();
for ( DatasetVersion dsv : ds.getVersions() ) {
if (dsv.isReleased() || ctxt.permissions().request( getRequest() ).on(ds).has(Permission.EditDataset)) {
outputList.add(dsv);
@Override
public List<DatasetVersion> execute(CommandContext ctxt) throws CommandException {

boolean includeUnpublished = ctxt.permissions().request(getRequest()).on(ds).has(Permission.EditDataset);

if (offset == null && limit == null) {

List<DatasetVersion> outputList = new LinkedList<>();
for (DatasetVersion dsv : ds.getVersions()) {
if (dsv.isReleased() || includeUnpublished) {
if (deepLookup) {
// @todo: when "deep"/extended lookup is requested, and
// we call .findDeep() to look up each version again,
// there is probably a more economical way to obtain the
// numeric ids of the versions, by a direct single query,
// rather than go through ds.getVersions() like we are now.
dsv = ctxt.datasetVersion().findDeep(dsv.getId());
if (dsv == null) {
throw new CommandExecutionException("Failed to look up full list of dataset versions", this);
}
}
outputList.add(dsv);
}
}
}
return outputList;
}
return outputList;
} else {
// Only a partial list (one "page"-worth) of versions is being requested
return ctxt.datasetVersion().findVersions(ds.getId(), offset, limit, includeUnpublished);
}
}
}