From 45ccdff7adec9007bb0bbe31ee0b4e3a23a3e413 Mon Sep 17 00:00:00 2001 From: Stephen Kraffmiller Date: Tue, 28 Sep 2021 16:41:35 -0400 Subject: [PATCH 01/24] #8097 speed up file indexing --- .../iq/dataverse/FileVersionDifference.java | 21 ++++++++-- .../iq/dataverse/search/IndexServiceBean.java | 40 ++++++++++++++----- 2 files changed, 47 insertions(+), 14 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/FileVersionDifference.java b/src/main/java/edu/harvard/iq/dataverse/FileVersionDifference.java index 8c5a549f619..2046b61107f 100644 --- a/src/main/java/edu/harvard/iq/dataverse/FileVersionDifference.java +++ b/src/main/java/edu/harvard/iq/dataverse/FileVersionDifference.java @@ -10,7 +10,6 @@ import java.util.ArrayList; import java.util.List; import java.util.Objects; -import java.util.ResourceBundle; /** * @@ -50,7 +49,7 @@ public boolean compareMetadata(FileMetadata newFileMetadata, FileMetadata origin and it updates the FileVersionDifference object, which is used to display the differences on the dataset versions tab. The return value is used by the index service bean to mark whether a file needs to be re-indexed in the context of a dataset update. When there are changes (after v4.19) to the file metadata data model, this method must be updated. - retVal of True means metadatas are equal. + retVal of True means metadatas are equal. */ boolean retVal = true; @@ -68,6 +67,7 @@ When there are changes (after v4.19) to the file metadata data model this method if (this.originalFileMetadata == null && this.newFileMetadata.getDataFile() != null ){ //File Added + if (!details) return false; retVal = false; updateDifferenceSummary( "", BundleUtil.getStringFromBundle("file.versionDifferences.fileGroupTitle"), 1, 0, 0, 0); } @@ -75,6 +75,7 @@ When there are changes (after v4.19) to the file metadata data model this method //Check to see if File replaced if (originalFileMetadata != null && newFileMetadata.getDataFile() != null && originalFileMetadata.getDataFile() != null &&!this.originalFileMetadata.getDataFile().equals(this.newFileMetadata.getDataFile())){ + if (!details) return false; updateDifferenceSummary( "", BundleUtil.getStringFromBundle("file.versionDifferences.fileGroupTitle"), 0, 0, 0, 1); retVal = false; } @@ -83,6 +84,8 @@ When there are changes (after v4.19) to the file metadata data model this method if (!newFileMetadata.getLabel().equals(originalFileMetadata.getLabel())) { if (details) { differenceDetailItems.add(new FileDifferenceDetailItem(BundleUtil.getStringFromBundle("file.versionDifferences.fileNameDetailTitle"), originalFileMetadata.getLabel(), newFileMetadata.getLabel())); + } else{ + return false; } updateDifferenceSummary(BundleUtil.getStringFromBundle("file.versionDifferences.fileMetadataGroupTitle"), BundleUtil.getStringFromBundle("file.versionDifferences.fileNameDetailTitle"), 0, 1, 0, 0); @@ -97,6 +100,8 @@ When there are changes (after v4.19) to the file metadata data model this method && !newFileMetadata.getDescription().equals(originalFileMetadata.getDescription())) { if (details) { differenceDetailItems.add(new FileDifferenceDetailItem(BundleUtil.getStringFromBundle("file.versionDifferences.descriptionDetailTitle"), originalFileMetadata.getDescription(), newFileMetadata.getDescription())); + } else { + return false; } updateDifferenceSummary(BundleUtil.getStringFromBundle("file.versionDifferences.fileMetadataGroupTitle"), BundleUtil.getStringFromBundle("file.versionDifferences.descriptionDetailTitle"), 0, 1, 0, 0); @@ -107,6 +112,8 @@ When there are
changes (after v4.19) to the file metadata data model this method ) { if (details) { differenceDetailItems.add(new FileDifferenceDetailItem(BundleUtil.getStringFromBundle("file.versionDifferences.descriptionDetailTitle"), "", newFileMetadata.getDescription())); + } else { + return false; } updateDifferenceSummary(BundleUtil.getStringFromBundle("file.versionDifferences.fileMetadataGroupTitle"), BundleUtil.getStringFromBundle("file.versionDifferences.descriptionDetailTitle"), 1, 0, 0, 0); @@ -117,6 +124,8 @@ When there are changes (after v4.19) to the file metadata data model this method ) { if (details) { differenceDetailItems.add(new FileDifferenceDetailItem(BundleUtil.getStringFromBundle("file.versionDifferences.descriptionDetailTitle"), originalFileMetadata.getDescription(), "" )); + } else { + return false; } updateDifferenceSummary(BundleUtil.getStringFromBundle("file.versionDifferences.fileMetadataGroupTitle"), BundleUtil.getStringFromBundle("file.versionDifferences.descriptionDetailTitle"), 0, 0, 1, 0); @@ -130,6 +139,8 @@ When there are changes (after v4.19) to the file metadata data model this method && !newFileMetadata.getProvFreeForm().equals(originalFileMetadata.getProvFreeForm())) { if (details) { differenceDetailItems.add(new FileDifferenceDetailItem(BundleUtil.getStringFromBundle("file.versionDifferences.provenanceDetailTitle"), originalFileMetadata.getProvFreeForm(), newFileMetadata.getProvFreeForm())); + } else { + return false; } updateDifferenceSummary(BundleUtil.getStringFromBundle("file.versionDifferences.fileMetadataGroupTitle"), BundleUtil.getStringFromBundle("file.versionDifferences.provenanceDetailTitle"), 0, 1, 0, 0); @@ -140,6 +151,8 @@ When there are changes (after v4.19) to the file metadata data model this method ) { if (details) { differenceDetailItems.add(new FileDifferenceDetailItem(BundleUtil.getStringFromBundle("file.versionDifferences.provenanceDetailTitle"), "", newFileMetadata.getProvFreeForm())); + } else { + return false; } updateDifferenceSummary(BundleUtil.getStringFromBundle("file.versionDifferences.fileMetadataGroupTitle"), BundleUtil.getStringFromBundle("file.versionDifferences.provenanceDetailTitle"), 1, 0, 0, 0); @@ -150,6 +163,8 @@ When there are changes (after v4.19) to the file metadata data model this method ) { if (details) { differenceDetailItems.add(new FileDifferenceDetailItem(BundleUtil.getStringFromBundle("file.versionDifferences.provenanceDetailTitle"), originalFileMetadata.getProvFreeForm(), "" )); + } else { + return false; } updateDifferenceSummary(BundleUtil.getStringFromBundle("file.versionDifferences.fileMetadataGroupTitle"), BundleUtil.getStringFromBundle("file.versionDifferences.provenanceDetailTitle"), 0, 0, 1, 0); @@ -170,7 +185,7 @@ When there are changes (after v4.19) to the file metadata data model this method } if (!value1.equals(value2)) { - + if (!details) return false; int added = 0; int deleted = 0; diff --git a/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java index 7fbb953299e..bd73774f454 100644 --- a/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java @@ -2,6 +2,7 @@ import edu.harvard.iq.dataverse.ControlledVocabularyValue; import edu.harvard.iq.dataverse.DataFile; +import edu.harvard.iq.dataverse.DataFileServiceBean; import edu.harvard.iq.dataverse.DataFileTag; import edu.harvard.iq.dataverse.Dataset; import 
edu.harvard.iq.dataverse.DatasetField; @@ -117,6 +118,8 @@ public class IndexServiceBean { SettingsServiceBean settingsService; @EJB SolrClientService solrClientService; + @EJB + DataFileServiceBean dataFileService; @EJB VariableServiceBean variableService; @@ -937,22 +940,35 @@ private String addOrUpdateDataset(IndexableDataset indexableDataset, Set d logger.fine( "We are indexing a draft version of a dataset that has a released version. We'll be checking file metadatas if they are exact clones of the released versions."); } + Date date=java.util.Calendar.getInstance().getTime(); + System.out.print("Start file check: " + date ); for (FileMetadata fileMetadata : fileMetadatas) { boolean indexThisMetadata = true; + if (checkForDuplicateMetadata) { - - logger.fine("Checking if this file metadata is a duplicate."); + logger.fine("Checking if this file metadata is a duplicate."); + if (fileMetadata.getDataFile() != null) { + FileMetadata findReleasedFileMetadata = dataFileService.findFileMetadataByDatasetVersionIdAndDataFileId(dataset.getReleasedVersion().getId(), fileMetadata.getDataFile().getId()); + if (findReleasedFileMetadata != null) { + if ((fileMetadata.getDataFile().isRestricted() == findReleasedFileMetadata.getDataFile().isRestricted())) { + if (fileMetadata.contentEquals(findReleasedFileMetadata) + && variableMetadataUtil.compareVariableMetadata(findReleasedFileMetadata, fileMetadata)) { + indexThisMetadata = false; + logger.fine("This file metadata hasn't changed since the released version; skipping indexing."); + } else { + logger.fine("This file metadata has changed since the released version; we want to index it!"); + } + } else { + logger.fine("This file's restricted status has changed since the released version; we want to index it!"); + } + } + } + + /* for (FileMetadata releasedFileMetadata : dataset.getReleasedVersion().getFileMetadatas()) { if (fileMetadata.getDataFile() != null && fileMetadata.getDataFile().equals(releasedFileMetadata.getDataFile())) { - /* - * Duplicate if metadata matches and, for full text indexing and the - * SearchFields.ACCESS field, if the restricted status of the file hasn't - * changed. To address the case where full text indexing was on when a file was - * not restricted and it is now restricted and full text indexing has been shut - * off, we need to check for the change in restricted status regardless of - * whether full text indexing is on now. 
- */ + if ((fileMetadata.getDataFile().isRestricted() == releasedFileMetadata.getDataFile().isRestricted())) { if (fileMetadata.contentEquals(releasedFileMetadata) && variableMetadataUtil.compareVariableMetadata(releasedFileMetadata,fileMetadata) @@ -968,6 +984,7 @@ private String addOrUpdateDataset(IndexableDataset indexableDataset, Set d break; } } + */ } if (indexThisMetadata) { @@ -1242,7 +1259,8 @@ private String addOrUpdateDataset(IndexableDataset indexableDataset, Set d } } } - + Date date=java.util.Calendar.getInstance().getTime(); + System.out.print("End file check: " + date ); try { solrClientService.getSolrClient().add(docs); solrClientService.getSolrClient().commit(); From a8ca8d1376d33d34a64a3099f5487a6ed8c79ee4 Mon Sep 17 00:00:00 2001 From: Stephen Kraffmiller Date: Thu, 7 Oct 2021 17:02:49 -0400 Subject: [PATCH 02/24] #8097 post file docs every 100 --- .../iq/dataverse/search/IndexServiceBean.java | 63 +++++++++++++------ 1 file changed, 45 insertions(+), 18 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java index bd73774f454..a3136f428d9 100644 --- a/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java @@ -713,7 +713,7 @@ private String addOrUpdateDataset(IndexableDataset indexableDataset, Set d Dataset dataset = indexableDataset.getDatasetVersion().getDataset(); logger.fine("adding or updating Solr document for dataset id " + dataset.getId()); Collection docs = new ArrayList<>(); - + Collection fileDocs = new ArrayList<>(); SolrInputDocument solrInputDocument = new SolrInputDocument(); String datasetSolrDocId = indexableDataset.getSolrDocId(); solrInputDocument.addField(SearchFields.ID, datasetSolrDocId); @@ -937,13 +937,21 @@ private String addOrUpdateDataset(IndexableDataset indexableDataset, Set d boolean checkForDuplicateMetadata = false; if (datasetVersion.isDraft() && dataset.isReleased() && dataset.getReleasedVersion() != null) { checkForDuplicateMetadata = true; + System.out.print("We are indexing a draft version of a dataset that has a released version. We'll be checking file metadatas if they are exact clones of the released versions."); logger.fine( "We are indexing a draft version of a dataset that has a released version. We'll be checking file metadatas if they are exact clones of the released versions."); } - Date date=java.util.Calendar.getInstance().getTime(); - System.out.print("Start file check: " + date ); + Date startdate=java.util.Calendar.getInstance().getTime(); + System.out.print("Start file check: " + startdate ); + int count = 0; for (FileMetadata fileMetadata : fileMetadatas) { - + count++; + Date loopdate=java.util.Calendar.getInstance().getTime(); + Double diff = new Double( (loopdate.getTime() - startdate.getTime())) ; + diff = diff/1000.; + Double dcount = new Double(count); + System.out.print(" fileMetadata: " + fileMetadata.getId() + " " + count + " " + diff + " " + dcount/diff + " " + loopdate); + boolean indexThisMetadata = true; if (checkForDuplicateMetadata) { @@ -986,17 +994,17 @@ private String addOrUpdateDataset(IndexableDataset indexableDataset, Set d } */ } + System.out.print(" fileMetadata: " + fileMetadata.getId() + " " + count + " index? 
" + indexThisMetadata); if (indexThisMetadata) { - SolrInputDocument datafileSolrInputDocument = new SolrInputDocument(); Long fileEntityId = fileMetadata.getDataFile().getId(); datafileSolrInputDocument.addField(SearchFields.ENTITY_ID, fileEntityId); - datafileSolrInputDocument.addField(SearchFields.DATAVERSE_VERSION_INDEXED_BY, dataverseVersion); + datafileSolrInputDocument.addField(SearchFields.DATAVERSE_VERSION_INDEXED_BY, dataverseVersion); //common datafileSolrInputDocument.addField(SearchFields.IDENTIFIER, fileEntityId); - datafileSolrInputDocument.addField(SearchFields.PERSISTENT_URL, dataset.getPersistentURL()); - datafileSolrInputDocument.addField(SearchFields.TYPE, "files"); - datafileSolrInputDocument.addField(SearchFields.CATEGORY_OF_DATAVERSE, dataset.getDataverseContext().getIndexableCategoryName()); + datafileSolrInputDocument.addField(SearchFields.PERSISTENT_URL, dataset.getPersistentURL()); //common + datafileSolrInputDocument.addField(SearchFields.TYPE, "files"); //common + datafileSolrInputDocument.addField(SearchFields.CATEGORY_OF_DATAVERSE, dataset.getDataverseContext().getIndexableCategoryName()); //common /* Full-text indexing using Apache Tika */ if (doFullTextIndexing) { @@ -1078,7 +1086,7 @@ private String addOrUpdateDataset(IndexableDataset indexableDataset, Set d datafileSolrInputDocument.addField(SearchFields.NAME_SORT, filenameCompleteFinal); datafileSolrInputDocument.addField(SearchFields.FILE_NAME, filenameCompleteFinal); - datafileSolrInputDocument.addField(SearchFields.DATASET_VERSION_ID, datasetVersion.getId()); + datafileSolrInputDocument.addField(SearchFields.DATASET_VERSION_ID, datasetVersion.getId()); //common /** * for rules on sorting files see @@ -1176,16 +1184,16 @@ private String addOrUpdateDataset(IndexableDataset indexableDataset, Set d datafileSolrInputDocument.addField(SearchFields.FILE_DESCRIPTION, fileMetadata.getDescription()); datafileSolrInputDocument.addField(SearchFields.FILE_PERSISTENT_ID, fileMetadata.getDataFile().getGlobalId().toString()); datafileSolrInputDocument.addField(SearchFields.UNF, fileMetadata.getDataFile().getUnf()); - datafileSolrInputDocument.addField(SearchFields.SUBTREE, dataversePaths); + datafileSolrInputDocument.addField(SearchFields.SUBTREE, dataversePaths); //common // datafileSolrInputDocument.addField(SearchFields.HOST_DATAVERSE, // dataFile.getOwner().getOwner().getName()); // datafileSolrInputDocument.addField(SearchFields.PARENT_NAME, // dataFile.getDataset().getTitle()); - datafileSolrInputDocument.addField(SearchFields.PARENT_ID, fileMetadata.getDataFile().getOwner().getId()); - datafileSolrInputDocument.addField(SearchFields.PARENT_IDENTIFIER, fileMetadata.getDataFile().getOwner().getGlobalId().toString()); - datafileSolrInputDocument.addField(SearchFields.PARENT_CITATION, fileMetadata.getDataFile().getOwner().getCitation()); + datafileSolrInputDocument.addField(SearchFields.PARENT_ID, fileMetadata.getDataFile().getOwner().getId()); //common + datafileSolrInputDocument.addField(SearchFields.PARENT_IDENTIFIER, fileMetadata.getDataFile().getOwner().getGlobalId().toString());//common + datafileSolrInputDocument.addField(SearchFields.PARENT_CITATION, fileMetadata.getDataFile().getOwner().getCitation()); //common - datafileSolrInputDocument.addField(SearchFields.PARENT_NAME, parentDatasetTitle); + datafileSolrInputDocument.addField(SearchFields.PARENT_NAME, parentDatasetTitle); //common // If this is a tabular data file -- i.e., if there are data // variables associated with this file, we index the variable 
@@ -1254,14 +1262,32 @@ private String addOrUpdateDataset(IndexableDataset indexableDataset, Set d if (indexableDataset.isFilesShouldBeIndexed()) { filesIndexed.add(fileSolrDocId); - docs.add(datafileSolrInputDocument); + fileDocs.add(datafileSolrInputDocument); + if (count % 100 == 0) { + try { + solrClientService.getSolrClient().add(fileDocs); + solrClientService.getSolrClient().commit(); + fileDocs.clear(); + } catch (SolrServerException | IOException ex) { + if (ex.getCause() instanceof SolrServerException) { + throw new SolrServerException(ex); + } else if (ex.getCause() instanceof IOException) { + throw new IOException(ex); + } + } + + Date date=java.util.Calendar.getInstance().getTime(); + System.out.print("***************Writing file docs: " + count + " " + date ); + } } } } } - Date date=java.util.Calendar.getInstance().getTime(); + Date date=java.util.Calendar.getInstance().getTime(); System.out.print("End file check: " + date ); try { + solrClientService.getSolrClient().add(fileDocs); + solrClientService.getSolrClient().commit(); solrClientService.getSolrClient().add(docs); solrClientService.getSolrClient().commit(); } catch (SolrServerException | IOException ex) { @@ -1271,7 +1297,8 @@ private String addOrUpdateDataset(IndexableDataset indexableDataset, Set d throw new IOException(ex); } } - + date=java.util.Calendar.getInstance().getTime(); + System.out.print("after solr service: " + date ); Long dsId = dataset.getId(); /// Dataset updatedDataset = /// (Dataset)dvObjectService.updateContentIndexTime(dataset); From d94b3f03e3b16b149868d7ca9fc9711e649b4f9d Mon Sep 17 00:00:00 2001 From: Stephen Kraffmiller Date: Fri, 8 Oct 2021 14:47:54 -0400 Subject: [PATCH 03/24] #8097 only run compare once --- .../java/edu/harvard/iq/dataverse/FileMetadata.java | 2 +- .../harvard/iq/dataverse/FileVersionDifference.java | 13 ++++++++++++- .../iq/dataverse/search/IndexServiceBean.java | 8 ++++++-- 3 files changed, 19 insertions(+), 4 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/FileMetadata.java b/src/main/java/edu/harvard/iq/dataverse/FileMetadata.java index 0b2a92fe06a..31bff046318 100644 --- a/src/main/java/edu/harvard/iq/dataverse/FileMetadata.java +++ b/src/main/java/edu/harvard/iq/dataverse/FileMetadata.java @@ -531,7 +531,7 @@ public boolean contentEquals(FileMetadata other) { public boolean compareContent(FileMetadata other){ FileVersionDifference diffObj = new FileVersionDifference(this, other, false); - return diffObj.compareMetadata(this, other); + return diffObj.isSame(); } @Override diff --git a/src/main/java/edu/harvard/iq/dataverse/FileVersionDifference.java b/src/main/java/edu/harvard/iq/dataverse/FileVersionDifference.java index 2046b61107f..e0dea739edc 100644 --- a/src/main/java/edu/harvard/iq/dataverse/FileVersionDifference.java +++ b/src/main/java/edu/harvard/iq/dataverse/FileVersionDifference.java @@ -20,6 +20,9 @@ public final class FileVersionDifference { private FileMetadata newFileMetadata; private FileMetadata originalFileMetadata; private boolean details = false; + private boolean same = false; + + private List differenceSummaryGroups = new ArrayList<>(); private List differenceDetailItems = new ArrayList<>(); @@ -36,7 +39,7 @@ public FileVersionDifference(FileMetadata newFileMetadata, FileMetadata original this.originalFileMetadata = originalFileMetadata; this.details = details; - compareMetadata(newFileMetadata, originalFileMetadata); + this.same = compareMetadata(newFileMetadata, originalFileMetadata); //Compare versions - File Metadata first } 
@@ -269,6 +272,14 @@ public void setOriginalFileMetadata(FileMetadata originalFileMetadata) { this.originalFileMetadata = originalFileMetadata; } + public boolean isSame() { + return same; + } + + public void setSame(boolean same) { + this.same = same; + } + public List getDifferenceSummaryGroups() { return differenceSummaryGroups; diff --git a/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java index 62683a40d07..32a313fe807 100644 --- a/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java @@ -947,13 +947,14 @@ private String addOrUpdateDataset(IndexableDataset indexableDataset, Set d Date startdate=java.util.Calendar.getInstance().getTime(); System.out.print("Start file check: " + startdate ); int count = 0; + int countIgnore = 0; for (FileMetadata fileMetadata : fileMetadatas) { count++; Date loopdate=java.util.Calendar.getInstance().getTime(); Double diff = new Double( (loopdate.getTime() - startdate.getTime())) ; diff = diff/1000.; Double dcount = new Double(count); - System.out.print(" fileMetadata: " + fileMetadata.getId() + " " + count + " " + diff + " " + dcount/diff + " " + loopdate); + boolean indexThisMetadata = true; @@ -966,6 +967,7 @@ private String addOrUpdateDataset(IndexableDataset indexableDataset, Set d if (fileMetadata.contentEquals(findReleasedFileMetadata) && variableMetadataUtil.compareVariableMetadata(findReleasedFileMetadata, fileMetadata)) { indexThisMetadata = false; + countIgnore++; logger.fine("This file metadata hasn't changed since the released version; skipping indexing."); } else { logger.fine("This file metadata has changed since the released version; we want to index it!"); @@ -997,7 +999,9 @@ private String addOrUpdateDataset(IndexableDataset indexableDataset, Set d } */ } - System.out.print(" fileMetadata: " + fileMetadata.getId() + " " + count + " index? " + indexThisMetadata); + int include = count -countIgnore; + System.out.print(" count: " + count + " " + diff + " per second " + dcount/diff + " " + loopdate + " indexed count: " + include); + if (indexThisMetadata) { SolrInputDocument datafileSolrInputDocument = new SolrInputDocument(); From a3dbf5dddf1704e39fdb27f9aa3b7c11c37861ed Mon Sep 17 00:00:00 2001 From: Stephen Kraffmiller Date: Wed, 13 Oct 2021 16:14:09 -0400 Subject: [PATCH 04/24] #8097 index pub before draft remove debug code --- .../iq/dataverse/search/IndexServiceBean.java | 76 ++++--------------- 1 file changed, 14 insertions(+), 62 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java index 32a313fe807..c257071ab1d 100644 --- a/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java @@ -627,16 +627,18 @@ public Future indexDataset(Dataset dataset, boolean doNormalSolrDocClean for (FileMetadata fm : latestVersion.getFileMetadatas()) { datafilesInDraftVersion.add(fm.getDataFile().getId()); } - String indexDraftResult = addOrUpdateDataset(indexableDraftVersion); - results.append("The latest version is a working copy (latestVersionState: ") - .append(latestVersionStateString).append(") and will be indexed as ") - .append(solrIdDraftDataset).append(" (limited visibility). 
Result: ").append(indexDraftResult).append("\n"); + desiredCards.put(DatasetVersion.VersionState.RELEASED, true); IndexableDataset indexableReleasedVersion = new IndexableDataset(releasedVersion); String indexReleasedVersionResult = addOrUpdateDataset(indexableReleasedVersion, datafilesInDraftVersion); results.append("There is a published version we will attempt to index. Result: ").append(indexReleasedVersionResult).append("\n"); + String indexDraftResult = addOrUpdateDataset(indexableDraftVersion); + results.append("The latest version is a working copy (latestVersionState: ") + .append(latestVersionStateString).append(") and will be indexed as ") + .append(solrIdDraftDataset).append(" (limited visibility). Result: ").append(indexDraftResult).append("\n"); + desiredCards.put(DatasetVersion.VersionState.DEACCESSIONED, false); if (doNormalSolrDocCleanUp) { String deleteDeaccessionedResult = removeDeaccessioned(dataset); @@ -940,21 +942,11 @@ private String addOrUpdateDataset(IndexableDataset indexableDataset, Set d boolean checkForDuplicateMetadata = false; if (datasetVersion.isDraft() && dataset.isReleased() && dataset.getReleasedVersion() != null) { checkForDuplicateMetadata = true; - System.out.print("We are indexing a draft version of a dataset that has a released version. We'll be checking file metadatas if they are exact clones of the released versions."); logger.fine( "We are indexing a draft version of a dataset that has a released version. We'll be checking file metadatas if they are exact clones of the released versions."); } - Date startdate=java.util.Calendar.getInstance().getTime(); - System.out.print("Start file check: " + startdate ); - int count = 0; - int countIgnore = 0; + for (FileMetadata fileMetadata : fileMetadatas) { - count++; - Date loopdate=java.util.Calendar.getInstance().getTime(); - Double diff = new Double( (loopdate.getTime() - startdate.getTime())) ; - diff = diff/1000.; - Double dcount = new Double(count); - boolean indexThisMetadata = true; @@ -967,7 +959,6 @@ private String addOrUpdateDataset(IndexableDataset indexableDataset, Set d if (fileMetadata.contentEquals(findReleasedFileMetadata) && variableMetadataUtil.compareVariableMetadata(findReleasedFileMetadata, fileMetadata)) { indexThisMetadata = false; - countIgnore++; logger.fine("This file metadata hasn't changed since the released version; skipping indexing."); } else { logger.fine("This file metadata has changed since the released version; we want to index it!"); @@ -977,31 +968,8 @@ private String addOrUpdateDataset(IndexableDataset indexableDataset, Set d } } } - - /* - for (FileMetadata releasedFileMetadata : dataset.getReleasedVersion().getFileMetadatas()) { - if (fileMetadata.getDataFile() != null && fileMetadata.getDataFile().equals(releasedFileMetadata.getDataFile())) { - - if ((fileMetadata.getDataFile().isRestricted() == releasedFileMetadata.getDataFile().isRestricted())) { - if (fileMetadata.contentEquals(releasedFileMetadata) - && variableMetadataUtil.compareVariableMetadata(releasedFileMetadata,fileMetadata) - ) { - indexThisMetadata = false; - logger.fine("This file metadata hasn't changed since the released version; skipping indexing."); - } else { - logger.fine("This file metadata has changed since the released version; we want to index it!"); - } - } else { - logger.fine("This file's restricted status has changed since the released version; we want to index it!"); - } - break; - } - } - */ } - int include = count -countIgnore; - System.out.print(" count: " + count + " " + diff + " per 
second " + dcount/diff + " " + loopdate + " indexed count: " + include); - + if (indexThisMetadata) { SolrInputDocument datafileSolrInputDocument = new SolrInputDocument(); @@ -1270,31 +1238,16 @@ private String addOrUpdateDataset(IndexableDataset indexableDataset, Set d if (indexableDataset.isFilesShouldBeIndexed()) { filesIndexed.add(fileSolrDocId); fileDocs.add(datafileSolrInputDocument); - if (count % 100 == 0) { - try { - solrClientService.getSolrClient().add(fileDocs); - solrClientService.getSolrClient().commit(); - fileDocs.clear(); - } catch (SolrServerException | IOException ex) { - if (ex.getCause() instanceof SolrServerException) { - throw new SolrServerException(ex); - } else if (ex.getCause() instanceof IOException) { - throw new IOException(ex); - } - } - - Date date=java.util.Calendar.getInstance().getTime(); - System.out.print("***************Writing file docs: " + count + " " + date ); - } } } } } - Date date=java.util.Calendar.getInstance().getTime(); - System.out.print("End file check: " + date ); + try { - solrClientService.getSolrClient().add(fileDocs); - solrClientService.getSolrClient().commit(); + if (!fileDocs.isEmpty()) { + solrClientService.getSolrClient().add(fileDocs); + solrClientService.getSolrClient().commit(); + } solrClientService.getSolrClient().add(docs); solrClientService.getSolrClient().commit(); } catch (SolrServerException | IOException ex) { @@ -1304,8 +1257,7 @@ private String addOrUpdateDataset(IndexableDataset indexableDataset, Set d throw new IOException(ex); } } - date=java.util.Calendar.getInstance().getTime(); - System.out.print("after solr service: " + date ); + Long dsId = dataset.getId(); /// Dataset updatedDataset = /// (Dataset)dvObjectService.updateContentIndexTime(dataset); From 029c3d06a1366ccf92f2047a32a2fda968ed052c Mon Sep 17 00:00:00 2001 From: Stephen Kraffmiller Date: Thu, 14 Oct 2021 11:25:35 -0400 Subject: [PATCH 05/24] #8097 remove doc commits --- .../edu/harvard/iq/dataverse/search/IndexServiceBean.java | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java index c257071ab1d..cd5266e0cc6 100644 --- a/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java @@ -715,7 +715,6 @@ private String addOrUpdateDataset(IndexableDataset indexableDataset, Set d Dataset dataset = indexableDataset.getDatasetVersion().getDataset(); logger.fine("adding or updating Solr document for dataset id " + dataset.getId()); Collection docs = new ArrayList<>(); - Collection fileDocs = new ArrayList<>(); SolrInputDocument solrInputDocument = new SolrInputDocument(); String datasetSolrDocId = indexableDataset.getSolrDocId(); solrInputDocument.addField(SearchFields.ID, datasetSolrDocId); @@ -1237,17 +1236,13 @@ private String addOrUpdateDataset(IndexableDataset indexableDataset, Set d if (indexableDataset.isFilesShouldBeIndexed()) { filesIndexed.add(fileSolrDocId); - fileDocs.add(datafileSolrInputDocument); + docs.add(datafileSolrInputDocument); } } } } try { - if (!fileDocs.isEmpty()) { - solrClientService.getSolrClient().add(fileDocs); - solrClientService.getSolrClient().commit(); - } solrClientService.getSolrClient().add(docs); solrClientService.getSolrClient().commit(); } catch (SolrServerException | IOException ex) { From 1ca3427109d8ab1e4b893aa82c5b5e2f5796d41a Mon Sep 17 00:00:00 2001 From: Stephen Kraffmiller 
Date: Thu, 14 Oct 2021 11:28:57 -0400 Subject: [PATCH 06/24] #8097 remove reminder comments --- .../iq/dataverse/search/IndexServiceBean.java | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java index cd5266e0cc6..edba314c1fd 100644 --- a/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java @@ -974,11 +974,11 @@ private String addOrUpdateDataset(IndexableDataset indexableDataset, Set d SolrInputDocument datafileSolrInputDocument = new SolrInputDocument(); Long fileEntityId = fileMetadata.getDataFile().getId(); datafileSolrInputDocument.addField(SearchFields.ENTITY_ID, fileEntityId); - datafileSolrInputDocument.addField(SearchFields.DATAVERSE_VERSION_INDEXED_BY, dataverseVersion); //common + datafileSolrInputDocument.addField(SearchFields.DATAVERSE_VERSION_INDEXED_BY, dataverseVersion); datafileSolrInputDocument.addField(SearchFields.IDENTIFIER, fileEntityId); - datafileSolrInputDocument.addField(SearchFields.PERSISTENT_URL, dataset.getPersistentURL()); //common - datafileSolrInputDocument.addField(SearchFields.TYPE, "files"); //common - datafileSolrInputDocument.addField(SearchFields.CATEGORY_OF_DATAVERSE, dataset.getDataverseContext().getIndexableCategoryName()); //common + datafileSolrInputDocument.addField(SearchFields.PERSISTENT_URL, dataset.getPersistentURL()); + datafileSolrInputDocument.addField(SearchFields.TYPE, "files"); + datafileSolrInputDocument.addField(SearchFields.CATEGORY_OF_DATAVERSE, dataset.getDataverseContext().getIndexableCategoryName()); /* Full-text indexing using Apache Tika */ if (doFullTextIndexing) { @@ -1060,7 +1060,7 @@ private String addOrUpdateDataset(IndexableDataset indexableDataset, Set d datafileSolrInputDocument.addField(SearchFields.NAME_SORT, filenameCompleteFinal); datafileSolrInputDocument.addField(SearchFields.FILE_NAME, filenameCompleteFinal); - datafileSolrInputDocument.addField(SearchFields.DATASET_VERSION_ID, datasetVersion.getId()); //common + datafileSolrInputDocument.addField(SearchFields.DATASET_VERSION_ID, datasetVersion.getId()); /** * for rules on sorting files see @@ -1158,16 +1158,16 @@ private String addOrUpdateDataset(IndexableDataset indexableDataset, Set d datafileSolrInputDocument.addField(SearchFields.FILE_DESCRIPTION, fileMetadata.getDescription()); datafileSolrInputDocument.addField(SearchFields.FILE_PERSISTENT_ID, fileMetadata.getDataFile().getGlobalId().toString()); datafileSolrInputDocument.addField(SearchFields.UNF, fileMetadata.getDataFile().getUnf()); - datafileSolrInputDocument.addField(SearchFields.SUBTREE, dataversePaths); //common + datafileSolrInputDocument.addField(SearchFields.SUBTREE, dataversePaths); // datafileSolrInputDocument.addField(SearchFields.HOST_DATAVERSE, // dataFile.getOwner().getOwner().getName()); // datafileSolrInputDocument.addField(SearchFields.PARENT_NAME, // dataFile.getDataset().getTitle()); - datafileSolrInputDocument.addField(SearchFields.PARENT_ID, fileMetadata.getDataFile().getOwner().getId()); //common - datafileSolrInputDocument.addField(SearchFields.PARENT_IDENTIFIER, fileMetadata.getDataFile().getOwner().getGlobalId().toString());//common - datafileSolrInputDocument.addField(SearchFields.PARENT_CITATION, fileMetadata.getDataFile().getOwner().getCitation()); //common + datafileSolrInputDocument.addField(SearchFields.PARENT_ID, 
fileMetadata.getDataFile().getOwner().getId()); + datafileSolrInputDocument.addField(SearchFields.PARENT_IDENTIFIER, fileMetadata.getDataFile().getOwner().getGlobalId().toString()); + datafileSolrInputDocument.addField(SearchFields.PARENT_CITATION, fileMetadata.getDataFile().getOwner().getCitation()); - datafileSolrInputDocument.addField(SearchFields.PARENT_NAME, parentDatasetTitle); //common + datafileSolrInputDocument.addField(SearchFields.PARENT_NAME, parentDatasetTitle); // If this is a tabular data file -- i.e., if there are data // variables associated with this file, we index the variable From eca09c7efbc10538ee19db6286c276b3b76d6a49 Mon Sep 17 00:00:00 2001 From: Stephen Kraffmiller Date: Thu, 28 Oct 2021 16:00:17 -0400 Subject: [PATCH 07/24] #8097 use map lookup --- .../search/IndexBatchServiceBean.java | 12 ++++- .../iq/dataverse/search/IndexServiceBean.java | 46 ++++++++++++------- 2 files changed, 39 insertions(+), 19 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/search/IndexBatchServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/search/IndexBatchServiceBean.java index 5171b1a864a..a1d6ff1a3c2 100644 --- a/src/main/java/edu/harvard/iq/dataverse/search/IndexBatchServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/search/IndexBatchServiceBean.java @@ -1,12 +1,15 @@ package edu.harvard.iq.dataverse.search; +import edu.harvard.iq.dataverse.Dataset; import edu.harvard.iq.dataverse.DatasetServiceBean; import edu.harvard.iq.dataverse.Dataverse; import edu.harvard.iq.dataverse.DataverseServiceBean; import edu.harvard.iq.dataverse.DvObjectServiceBean; import edu.harvard.iq.dataverse.util.SystemConfig; import java.io.IOException; +import java.sql.Timestamp; import java.util.ArrayList; +import java.util.Date; import java.util.List; import java.util.concurrent.Future; import java.util.logging.Level; @@ -205,7 +208,11 @@ public Future indexAllOrSubset(long numPartitions, long partitionId, boo try { datasetIndexCount++; logger.info("indexing dataset " + datasetIndexCount + " of " + datasetIds.size() + " (id=" + id + ")"); - Future result = indexService.indexDatasetInNewTransaction(id); + + Dataset dataset = datasetService.find(id); + Future result = indexService.indexDatasetObjectInNewTransaction(dataset); + + // Future result = indexService.indexDatasetInNewTransaction(id); } catch (Exception e) { //We want to keep running even after an exception so throw some more info into the log datasetFailureCount++; @@ -268,8 +275,9 @@ public void indexDataverseRecursively(Dataverse dataverse) { for (Long childId : datasetChildren) { try { datasetIndexCount++; + Dataset dataset = datasetService.find(childId); logger.info("indexing dataset " + datasetIndexCount + " of " + datasetChildren.size() + " (id=" + childId + ")"); - indexService.indexDatasetInNewTransaction(childId); + indexService.indexDatasetObjectInNewTransaction(dataset); } catch (Exception e) { //We want to keep running even after an exception so throw some more info into the log datasetFailureCount++; diff --git a/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java index 4bb521e218e..614d2e458ac 100644 --- a/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java @@ -41,6 +41,7 @@ import java.util.Calendar; import java.util.Collection; import java.util.Date; +import java.util.HashMap; import java.util.HashSet; import 
java.util.LinkedHashMap; import java.util.List; @@ -331,6 +332,15 @@ public Future indexDatasetInNewTransaction(Long datasetId) throws SolrS dataset = null; return ret; } + + @TransactionAttribute(REQUIRES_NEW) + public Future indexDatasetObjectInNewTransaction(Dataset dataset) throws SolrServerException, IOException{ //Dataset dataset) { + boolean doNormalSolrDocCleanUp = false; + // return indexDataset(dataset, doNormalSolrDocCleanUp); + Future ret = indexDataset(dataset, doNormalSolrDocCleanUp); + dataset = null; + return ret; + } @Asynchronous public Future asyncIndexDataset(Dataset dataset, boolean doNormalSolrDocCleanUp) throws SolrServerException, IOException { @@ -938,9 +948,15 @@ private String addOrUpdateDataset(IndexableDataset indexableDataset, Set d List filesIndexed = new ArrayList<>(); if (datasetVersion != null) { List fileMetadatas = datasetVersion.getFileMetadatas(); + List releasedFileMetadatas = new ArrayList<>(); + Map fileMap = new HashMap<>(); boolean checkForDuplicateMetadata = false; if (datasetVersion.isDraft() && dataset.isReleased() && dataset.getReleasedVersion() != null) { checkForDuplicateMetadata = true; + releasedFileMetadatas = dataset.getReleasedVersion().getFileMetadatas(); + for(FileMetadata released: releasedFileMetadatas){ + fileMap.put(released.getDataFile().getId(), released); + } logger.fine( "We are indexing a draft version of a dataset that has a released version. We'll be checking file metadatas if they are exact clones of the released versions."); } @@ -948,27 +964,23 @@ private String addOrUpdateDataset(IndexableDataset indexableDataset, Set d for (FileMetadata fileMetadata : fileMetadatas) { boolean indexThisMetadata = true; - - if (checkForDuplicateMetadata) { - logger.fine("Checking if this file metadata is a duplicate."); - if (fileMetadata.getDataFile() != null) { - FileMetadata findReleasedFileMetadata = dataFileService.findFileMetadataByDatasetVersionIdAndDataFileId(dataset.getReleasedVersion().getId(), fileMetadata.getDataFile().getId()); - if (findReleasedFileMetadata != null) { - if ((fileMetadata.getDataFile().isRestricted() == findReleasedFileMetadata.getDataFile().isRestricted())) { - if (fileMetadata.contentEquals(findReleasedFileMetadata) - && variableMetadataUtil.compareVariableMetadata(findReleasedFileMetadata, fileMetadata)) { - indexThisMetadata = false; - logger.fine("This file metadata hasn't changed since the released version; skipping indexing."); - } else { - logger.fine("This file metadata has changed since the released version; we want to index it!"); - } + if (checkForDuplicateMetadata && !releasedFileMetadatas.isEmpty()) { + logger.fine("Checking if this file metadata is a duplicate."); + FileMetadata getFromMap = fileMap.get(fileMetadata.getDataFile().getId()); + if (getFromMap != null) { + if ((fileMetadata.getDataFile().isRestricted() == getFromMap.getDataFile().isRestricted())) { + if (fileMetadata.contentEquals(getFromMap) + && variableMetadataUtil.compareVariableMetadata(getFromMap, fileMetadata)) { + indexThisMetadata = false; + logger.fine("This file metadata hasn't changed since the released version; skipping indexing."); } else { - logger.fine("This file's restricted status has changed since the released version; we want to index it!"); + logger.fine("This file metadata has changed since the released version; we want to index it!"); } + } else { + logger.fine("This file's restricted status has changed since the released version; we want to index it!"); } } - } - + } if (indexThisMetadata) { 
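Patch 07 trades the per-file findFileMetadataByDatasetVersionIdAndDataFileId() query, one database round trip per file, for a HashMap keyed by DataFile id that is built in a single pass over the released version's file metadatas; each duplicate check then becomes a constant-time get(). A self-contained sketch of the swap follows; Item is a hypothetical stand-in, since the real FileMetadata entities carry far more state.

import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class MapLookupSketch {

    /** Hypothetical stand-in for FileMetadata: an owning-file id plus one comparable field. */
    static class Item {
        final Long fileId;
        final String label;
        Item(Long fileId, String label) { this.fileId = fileId; this.label = label; }
    }

    public static void main(String[] args) {
        List<Item> released = List.of(new Item(1L, "a.txt"), new Item(2L, "b.txt"));
        List<Item> draft = List.of(new Item(2L, "b.txt"), new Item(3L, "c.txt"));

        // Build the lookup table once, before the indexing loop.
        Map<Long, Item> releasedById = new HashMap<>();
        for (Item it : released) {
            releasedById.put(it.fileId, it);
        }

        // Inside the loop, each check is a get() instead of a scan or a query.
        for (Item it : draft) {
            Item match = releasedById.get(it.fileId);
            boolean unchanged = match != null && match.label.equals(it.label);
            System.out.println("file " + it.fileId + " unchanged since release: " + unchanged);
        }
    }
}

The map costs O(m) extra memory but turns the whole duplicate check from O(n) database queries (or O(n*m) comparisons in the original nested loop) into O(n + m) work.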
SolrInputDocument datafileSolrInputDocument = new SolrInputDocument(); From ed23eae1d6656a39a62067aa68d51f0a783e6ef9 Mon Sep 17 00:00:00 2001 From: Stephen Kraffmiller Date: Mon, 1 Nov 2021 13:57:18 -0400 Subject: [PATCH 08/24] #8097 return to dataset id passed to index in new transaction --- .../iq/dataverse/search/IndexBatchServiceBean.java | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/search/IndexBatchServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/search/IndexBatchServiceBean.java index a1d6ff1a3c2..5092f834108 100644 --- a/src/main/java/edu/harvard/iq/dataverse/search/IndexBatchServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/search/IndexBatchServiceBean.java @@ -208,11 +208,7 @@ public Future indexAllOrSubset(long numPartitions, long partitionId, boo try { datasetIndexCount++; logger.info("indexing dataset " + datasetIndexCount + " of " + datasetIds.size() + " (id=" + id + ")"); - - Dataset dataset = datasetService.find(id); - Future result = indexService.indexDatasetObjectInNewTransaction(dataset); - - // Future result = indexService.indexDatasetInNewTransaction(id); + Future result = indexService.indexDatasetInNewTransaction(id); } catch (Exception e) { //We want to keep running even after an exception so throw some more info into the log datasetFailureCount++; @@ -275,9 +271,8 @@ public void indexDataverseRecursively(Dataverse dataverse) { for (Long childId : datasetChildren) { try { datasetIndexCount++; - Dataset dataset = datasetService.find(childId); logger.info("indexing dataset " + datasetIndexCount + " of " + datasetChildren.size() + " (id=" + childId + ")"); - indexService.indexDatasetObjectInNewTransaction(dataset); + indexService.indexDatasetInNewTransaction(childId); } catch (Exception e) { //We want to keep running even after an exception so throw some more info into the log datasetFailureCount++; From 61ce33240acda64e6d8ffcae9a1e43c9d73d9217 Mon Sep 17 00:00:00 2001 From: Stephen Kraffmiller Date: Wed, 3 Nov 2021 16:42:19 -0400 Subject: [PATCH 09/24] #8097 order datasets to index by number of files --- .../iq/dataverse/DatasetServiceBean.java | 38 +++++++++++++++++++ .../search/IndexBatchServiceBean.java | 2 +- 2 files changed, 39 insertions(+), 1 deletion(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/DatasetServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/DatasetServiceBean.java index 288575d5462..9dcbbd344e5 100644 --- a/src/main/java/edu/harvard/iq/dataverse/DatasetServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/DatasetServiceBean.java @@ -225,6 +225,44 @@ public List findAllOrSubset(long numPartitions, long partitionId, boolean return typedQuery.getResultList(); } + /** + * For docs, see the equivalent method on the DataverseServiceBean. + * @param numPartitions + * @param partitionId + * @param skipIndexed + * @return a list of datasets + * @see DataverseServiceBean#findAllOrSubset(long, long, boolean) + */ + public List findAllOrSubsetOrderByFileMetadata(boolean skipIndexed) { + + String skipClause = skipIndexed ? 
"AND o.indexTime is null " : ""; + Query query = em.createNativeQuery(" Select distinct(o.id), count(f.id) as numFiles FROM dvobject o " + + "left join dvobject f on f.owner_id = o.id where o.dtype = 'Dataset' " + + skipClause + + " group by o.id " + + "ORDER BY count(f.id) asc, o.id"); + + List queryResults; + queryResults = query.getResultList(); + + List retVal = new ArrayList(); + for (Object[] result : queryResults) { + Long dsId; + if (result[0] != null) { + try { + dsId = Long.parseLong(result[0].toString()) ; + } catch (Exception ex) { + dsId = null; + } + if (dsId == null) { + continue; + } + retVal.add(dsId); + } + } + return retVal; + } + /** * Merges the passed dataset to the persistence context. * @param ds the dataset whose new state we want to persist. diff --git a/src/main/java/edu/harvard/iq/dataverse/search/IndexBatchServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/search/IndexBatchServiceBean.java index 5092f834108..db29bde0270 100644 --- a/src/main/java/edu/harvard/iq/dataverse/search/IndexBatchServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/search/IndexBatchServiceBean.java @@ -203,7 +203,7 @@ public Future indexAllOrSubset(long numPartitions, long partitionId, boo int datasetIndexCount = 0; int datasetFailureCount = 0; - List datasetIds = datasetService.findAllOrSubset(numPartitions, partitionId, skipIndexed); + List datasetIds = datasetService.findAllOrSubsetOrderByFileMetadata(skipIndexed); for (Long id : datasetIds) { try { datasetIndexCount++; From ab1d0e84140fab61b04665af6b05228b2bee4273 Mon Sep 17 00:00:00 2001 From: Stephen Kraffmiller Date: Tue, 9 Nov 2021 10:00:22 -0500 Subject: [PATCH 10/24] #8097 rename method for clarity --- .../edu/harvard/iq/dataverse/DatasetServiceBean.java | 9 ++++++++- .../iq/dataverse/search/IndexBatchServiceBean.java | 2 +- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/DatasetServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/DatasetServiceBean.java index 9dcbbd344e5..8ebdc4745e6 100644 --- a/src/main/java/edu/harvard/iq/dataverse/DatasetServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/DatasetServiceBean.java @@ -233,7 +233,14 @@ public List findAllOrSubset(long numPartitions, long partitionId, boolean * @return a list of datasets * @see DataverseServiceBean#findAllOrSubset(long, long, boolean) */ - public List findAllOrSubsetOrderByFileMetadata(boolean skipIndexed) { + public List findAllOrSubsetOrderByFilesOwned(boolean skipIndexed) { + /* + Disregards deleted or replaced files when determining 'size' of dataset. + Could possibly make more efficient by getting file metadata counts + of latest published/draft version. + Also disregards partitioning which is no longer supported. + SEK - 11/09/2021 + */ String skipClause = skipIndexed ? 
"AND o.indexTime is null " : ""; Query query = em.createNativeQuery(" Select distinct(o.id), count(f.id) as numFiles FROM dvobject o " + diff --git a/src/main/java/edu/harvard/iq/dataverse/search/IndexBatchServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/search/IndexBatchServiceBean.java index db29bde0270..34c145fa6e8 100644 --- a/src/main/java/edu/harvard/iq/dataverse/search/IndexBatchServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/search/IndexBatchServiceBean.java @@ -203,7 +203,7 @@ public Future indexAllOrSubset(long numPartitions, long partitionId, boo int datasetIndexCount = 0; int datasetFailureCount = 0; - List datasetIds = datasetService.findAllOrSubsetOrderByFileMetadata(skipIndexed); + List datasetIds = datasetService.findAllOrSubsetOrderByFilesOwned(skipIndexed); for (Long id : datasetIds) { try { datasetIndexCount++; From 6b0be92f127993cd031504f284686ca0dd91b6e8 Mon Sep 17 00:00:00 2001 From: Philip Durbin Date: Mon, 15 Nov 2021 15:39:42 -0500 Subject: [PATCH 11/24] remove invalid characters from input #8018 --- doc/release-notes/8018-invalid-characters.md | 1 + .../iq/dataverse/DatasetFieldValidator.java | 26 ++++++- .../V5.8.0.1__8018-invalid-characters.sql | 5 ++ .../dataverse/DatasetFieldValidatorTest.java | 29 ++++++++ .../iq/dataverse/api/InvalidCharactersIT.java | 73 +++++++++++++++++++ tests/integration-tests.txt | 2 +- 6 files changed, 134 insertions(+), 2 deletions(-) create mode 100644 doc/release-notes/8018-invalid-characters.md create mode 100644 src/main/resources/db/migration/V5.8.0.1__8018-invalid-characters.sql create mode 100644 src/test/java/edu/harvard/iq/dataverse/api/InvalidCharactersIT.java diff --git a/doc/release-notes/8018-invalid-characters.md b/doc/release-notes/8018-invalid-characters.md new file mode 100644 index 00000000000..4b1d011eb02 --- /dev/null +++ b/doc/release-notes/8018-invalid-characters.md @@ -0,0 +1 @@ +Reindex Solr and reexport all exports after deployment because invalid characters are removed in the database by a SQL upgrade script. diff --git a/src/main/java/edu/harvard/iq/dataverse/DatasetFieldValidator.java b/src/main/java/edu/harvard/iq/dataverse/DatasetFieldValidator.java index c685fcb3e54..f9679e6c382 100644 --- a/src/main/java/edu/harvard/iq/dataverse/DatasetFieldValidator.java +++ b/src/main/java/edu/harvard/iq/dataverse/DatasetFieldValidator.java @@ -11,6 +11,8 @@ import edu.harvard.iq.dataverse.util.BundleUtil; import java.util.Collections; import java.util.List; +import java.util.regex.Matcher; +import java.util.regex.Pattern; import org.apache.commons.lang3.StringUtils; @@ -27,7 +29,29 @@ public void initialize(ValidateDatasetFieldType constraintAnnotation) { @Override public boolean isValid(DatasetField value, ConstraintValidatorContext context) { context.disableDefaultConstraintViolation(); // we do this so we can have different messages depending on the different issue - + + // If invalid characters are found, mutate the value by removing them. 
+ if (value != null && value.getValue() != null) { + String invalidCharacters = "[\f\u0002]"; + Pattern p = Pattern.compile(invalidCharacters); + Matcher m = p.matcher(value.getValue()); + boolean invalidCharactersFound = m.find(); + if (invalidCharactersFound) { + List datasetFieldValues = value.getDatasetFieldValues(); + List controlledVocabularyValues = value.getControlledVocabularyValues(); + if (!datasetFieldValues.isEmpty()) { + datasetFieldValues.get(0).setValue(value.getValue().replaceAll(invalidCharacters, "")); + } else if (controlledVocabularyValues != null && !controlledVocabularyValues.isEmpty()) { + // This controlledVocabularyValues logic comes from value.getValue(). + // Controlled vocabularies shouldn't have invalid characters in them + // but if they do, we can add a "replace" here. Some untested, commented code below. + // if (controlledVocabularyValues.get(0) != null) { + // controlledVocabularyValues.get(0).setStrValue(value.getValue().replaceAll(invalidCharacters, "")); + // } + } + } + } + DatasetFieldType dsfType = value.getDatasetFieldType(); //SEK Additional logic turns off validation for templates if (isTemplateDatasetField(value)){ diff --git a/src/main/resources/db/migration/V5.8.0.1__8018-invalid-characters.sql b/src/main/resources/db/migration/V5.8.0.1__8018-invalid-characters.sql new file mode 100644 index 00000000000..7e71857ecab --- /dev/null +++ b/src/main/resources/db/migration/V5.8.0.1__8018-invalid-characters.sql @@ -0,0 +1,5 @@ +-- This list of characters also appears in DatasetFieldValidator.java +-- Remove character: form feed (\f) +UPDATE datasetfieldvalue SET value = regexp_replace(value, E'\f', '', 'g'); +-- Remove character: start of text (\u0002) +UPDATE datasetfieldvalue SET value = regexp_replace(value, U&'\0002', '', 'g'); diff --git a/src/test/java/edu/harvard/iq/dataverse/DatasetFieldValidatorTest.java b/src/test/java/edu/harvard/iq/dataverse/DatasetFieldValidatorTest.java index e82fc02e37a..75b280861ca 100644 --- a/src/test/java/edu/harvard/iq/dataverse/DatasetFieldValidatorTest.java +++ b/src/test/java/edu/harvard/iq/dataverse/DatasetFieldValidatorTest.java @@ -123,4 +123,33 @@ private void testCompoundDatasetField(String test, boolean requiredParent, boole assertEquals( test, expectedOutcome, datasetFieldValidator.isValid(child1DatasetField, constraintValidatorContext)); } + @Test + public void testRemoveInvalidCharacters() { + assertEquals("test", removeInvalidPrimitive("test")); + assertEquals("test", removeInvalidPrimitive("te\fst")); + assertEquals("test", removeInvalidPrimitive("te\u0002st")); + assertEquals("test", removeInvalidPrimitive("\fte\u0002st\f")); + } + + private String removeInvalidPrimitive(String value) { + Dataverse dataverse = new Dataverse(); + Dataset dataset = new Dataset(); + dataset.setOwner(dataverse); + DatasetVersion dsv = new DatasetVersion(); + dsv.setDataset(dataset); + + DatasetFieldType primitiveDSFType = new DatasetFieldType("primitive", DatasetFieldType.FieldType.TEXT, false); + boolean required = false; + primitiveDSFType.setRequired(required); + + DatasetField testDatasetField = new DatasetField(); + testDatasetField.setDatasetVersion(dsv); + testDatasetField.setDatasetFieldType(primitiveDSFType); + testDatasetField.setSingleValue(value); + + DatasetFieldValidator datasetFieldValidator = new DatasetFieldValidator(); + datasetFieldValidator.isValid(testDatasetField, constraintValidatorContext); + return testDatasetField.getValue(); + } + } diff --git 
a/src/test/java/edu/harvard/iq/dataverse/api/InvalidCharactersIT.java b/src/test/java/edu/harvard/iq/dataverse/api/InvalidCharactersIT.java new file mode 100644 index 00000000000..2fb412ef1cc --- /dev/null +++ b/src/test/java/edu/harvard/iq/dataverse/api/InvalidCharactersIT.java @@ -0,0 +1,73 @@ +package edu.harvard.iq.dataverse.api; + +import com.jayway.restassured.RestAssured; +import com.jayway.restassured.response.Response; +import java.io.File; +import java.io.IOException; +import java.nio.file.Path; +import java.nio.file.Paths; +import javax.json.Json; +import javax.json.JsonObjectBuilder; +import static javax.ws.rs.core.Response.Status.CREATED; +import static javax.ws.rs.core.Response.Status.OK; +import org.hamcrest.Matchers; +import org.junit.BeforeClass; +import org.junit.Test; + +public class InvalidCharactersIT { + + @BeforeClass + public static void setUp() { + RestAssured.baseURI = UtilIT.getRestAssuredBaseUri(); + } + + @Test + public void testInvalidCharacters() throws IOException { + Response createUser = UtilIT.createRandomUser(); + createUser.then().assertThat().statusCode(OK.getStatusCode()); + String apiToken = UtilIT.getApiTokenFromResponse(createUser); + String username = UtilIT.getUsernameFromResponse(createUser); + + UtilIT.makeSuperUser(username).then().assertThat().statusCode(OK.getStatusCode()); + + Response createUserNoPrivs = UtilIT.createRandomUser(); + createUserNoPrivs.then().assertThat().statusCode(OK.getStatusCode()); + String apiTokenNoPrivs = UtilIT.getApiTokenFromResponse(createUserNoPrivs); + + Response createDataverseResponse = UtilIT.createRandomDataverse(apiToken); + createDataverseResponse.prettyPrint(); + createDataverseResponse.then().assertThat() + .statusCode(CREATED.getStatusCode()); + + String dataverseAlias = UtilIT.getAliasFromResponse(createDataverseResponse); + + Response createDataset = UtilIT.createRandomDatasetViaNativeApi(dataverseAlias, apiToken); + createDataset.prettyPrint(); + createDataset.then().assertThat() + .statusCode(CREATED.getStatusCode()); + + Integer datasetId = UtilIT.getDatasetIdFromResponse(createDataset); + String datasetPid = UtilIT.getDatasetPersistentIdFromResponse(createDataset); + String badCharacter = "(\f)"; // form feed (also \u000C) +// badCharacter = "{\u0002}"; // start of text, reported problem with exports. + + JsonObjectBuilder jsonUpdateObject = Json.createObjectBuilder().add("fields", + Json.createArrayBuilder() + .add(Json.createObjectBuilder() + .add("typeName", "title") + .add("value", "MyTitle " + badCharacter) + )); + String jsonUpdateString = jsonUpdateObject.build().toString(); + Path jsonUpdatePath = Paths.get(java.nio.file.Files.createTempDirectory(null) + File.separator + "update.json"); + java.nio.file.Files.write(jsonUpdatePath, jsonUpdateString.getBytes()); + Response addDataToBadData = UtilIT.updateFieldLevelDatasetMetadataViaNative(datasetPid, jsonUpdatePath.toString(), apiToken); + addDataToBadData.prettyPrint(); + addDataToBadData.then().assertThat() + .statusCode(OK.getStatusCode()) + // The \f has been removed. 
+ .body("data.metadataBlocks.citation.fields[0].value", Matchers.equalTo("MyTitle ()")); + + + } + +} diff --git a/tests/integration-tests.txt b/tests/integration-tests.txt index d6edfa4d70e..341a39d0086 100644 --- a/tests/integration-tests.txt +++ b/tests/integration-tests.txt @@ -1 +1 @@ -DataversesIT,DatasetsIT,SwordIT,AdminIT,BuiltinUsersIT,UsersIT,UtilIT,ConfirmEmailIT,FileMetadataIT,FilesIT,SearchIT,InReviewWorkflowIT,HarvestingServerIT,MoveIT,MakeDataCountApiIT,FileTypeDetectionIT,EditDDIIT,ExternalToolsIT,AccessIT,DuplicateFilesIT,DownloadFilesIT,LinkIT,DeleteUsersIT,DeactivateUsersIT,AuxiliaryFilesIT +DataversesIT,DatasetsIT,SwordIT,AdminIT,BuiltinUsersIT,UsersIT,UtilIT,ConfirmEmailIT,FileMetadataIT,FilesIT,SearchIT,InReviewWorkflowIT,HarvestingServerIT,MoveIT,MakeDataCountApiIT,FileTypeDetectionIT,EditDDIIT,ExternalToolsIT,AccessIT,DuplicateFilesIT,DownloadFilesIT,LinkIT,DeleteUsersIT,DeactivateUsersIT,AuxiliaryFilesIT,InvalidCharactersIT From 2d8fe44e54dd06fb05a1898ab2b2029daf3914ec Mon Sep 17 00:00:00 2001 From: Philip Durbin Date: Wed, 17 Nov 2021 10:16:42 -0500 Subject: [PATCH 12/24] Remove U+FFFE (not a character), prevents metadata editing #8018 --- .../java/edu/harvard/iq/dataverse/DatasetFieldValidator.java | 2 +- .../db/migration/V5.8.0.1__8018-invalid-characters.sql | 2 ++ .../edu/harvard/iq/dataverse/DatasetFieldValidatorTest.java | 1 + 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/DatasetFieldValidator.java b/src/main/java/edu/harvard/iq/dataverse/DatasetFieldValidator.java index f9679e6c382..3ded24d7a59 100644 --- a/src/main/java/edu/harvard/iq/dataverse/DatasetFieldValidator.java +++ b/src/main/java/edu/harvard/iq/dataverse/DatasetFieldValidator.java @@ -32,7 +32,7 @@ public boolean isValid(DatasetField value, ConstraintValidatorContext context) { // If invalid characters are found, mutate the value by removing them. 
if (value != null && value.getValue() != null) { - String invalidCharacters = "[\f\u0002]"; + String invalidCharacters = "[\f\u0002\ufffe]"; Pattern p = Pattern.compile(invalidCharacters); Matcher m = p.matcher(value.getValue()); boolean invalidCharactersFound = m.find(); diff --git a/src/main/resources/db/migration/V5.8.0.1__8018-invalid-characters.sql b/src/main/resources/db/migration/V5.8.0.1__8018-invalid-characters.sql index 7e71857ecab..c54da1e65bf 100644 --- a/src/main/resources/db/migration/V5.8.0.1__8018-invalid-characters.sql +++ b/src/main/resources/db/migration/V5.8.0.1__8018-invalid-characters.sql @@ -3,3 +3,5 @@ UPDATE datasetfieldvalue SET value = regexp_replace(value, E'\f', '', 'g'); -- Remove character: start of text (\u0002) UPDATE datasetfieldvalue SET value = regexp_replace(value, U&'\0002', '', 'g'); +-- Remove character: not a character (\ufffe) +UPDATE datasetfieldvalue SET value = regexp_replace(value, U&'\FFFE', '', 'g'); diff --git a/src/test/java/edu/harvard/iq/dataverse/DatasetFieldValidatorTest.java b/src/test/java/edu/harvard/iq/dataverse/DatasetFieldValidatorTest.java index 75b280861ca..99482dd9401 100644 --- a/src/test/java/edu/harvard/iq/dataverse/DatasetFieldValidatorTest.java +++ b/src/test/java/edu/harvard/iq/dataverse/DatasetFieldValidatorTest.java @@ -129,6 +129,7 @@ public void testRemoveInvalidCharacters() { assertEquals("test", removeInvalidPrimitive("te\fst")); assertEquals("test", removeInvalidPrimitive("te\u0002st")); assertEquals("test", removeInvalidPrimitive("\fte\u0002st\f")); + assertEquals("test", removeInvalidPrimitive("te\ufffest")); } private String removeInvalidPrimitive(String value) { From 024f4be252b9df5f7513a719b10a1c2e6173b4db Mon Sep 17 00:00:00 2001 From: Philip Durbin Date: Mon, 22 Nov 2021 14:01:00 -0500 Subject: [PATCH 13/24] rename SQL script #8018 --- ...valid-characters.sql => V5.8.0.2__8018-invalid-characters.sql} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename src/main/resources/db/migration/{V5.8.0.1__8018-invalid-characters.sql => V5.8.0.2__8018-invalid-characters.sql} (100%) diff --git a/src/main/resources/db/migration/V5.8.0.1__8018-invalid-characters.sql b/src/main/resources/db/migration/V5.8.0.2__8018-invalid-characters.sql similarity index 100% rename from src/main/resources/db/migration/V5.8.0.1__8018-invalid-characters.sql rename to src/main/resources/db/migration/V5.8.0.2__8018-invalid-characters.sql From 8c751e43922ddb83cdccf694b1339d7921491144 Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Tue, 23 Nov 2021 09:13:38 -0500 Subject: [PATCH 14/24] keyboard access to download menu just a checkpoint - only this submenu supported Conflicts: src/main/webapp/filesFragment.xhtml --- src/main/webapp/file-download-button-fragment.xhtml | 3 ++- src/main/webapp/filesFragment.xhtml | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/main/webapp/file-download-button-fragment.xhtml b/src/main/webapp/file-download-button-fragment.xhtml index b4d8756c833..64e6cd61463 100644 --- a/src/main/webapp/file-download-button-fragment.xhtml +++ b/src/main/webapp/file-download-button-fragment.xhtml @@ -203,7 +203,8 @@