Skip to content

Commit

Permalink
Merge branch 'develop' into 8236-required-subfields IQSS#8236
Browse files Browse the repository at this point in the history
  • Loading branch information
pdurbin committed Dec 1, 2021
2 parents bb2f3cc + 521c230 commit e3fe3b8
Show file tree
Hide file tree
Showing 21 changed files with 331 additions and 49 deletions.
1 change: 1 addition & 0 deletions doc/release-notes/8018-invalid-characters.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Reindex Solr and reexport all exports after deployment because invalid characters are removed in the database by a SQL upgrade script.
6 changes: 6 additions & 0 deletions doc/release-notes/8097-indexall-performance.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
### Indexing performance on datasets with large numbers of files

We discovered that whenever a full reindexing needs to be performed, datasets with large numbers of files take an exceptionally long time to index (for example, in the IQSS repository it takes several hours for a dataset that has 25,000 files). In situations where the Solr index needs to be erased and rebuilt from scratch (such as a Solr version upgrade, or a corrupt index, etc.) this can significantly delay the repopulation of the search catalog.

We are still investigating the reasons behind this performance issue. For now, even though some improvements have been made, a dataset with thousands of files is still going to take a long time to index. But we've made a simple change to the reindexing process, to index any such datasets at the very end of the batch, after all the datasets with fewer files have been reindexed. This does not improve the overall reindexing time, but will repopulate the bulk of the search index much faster for the users of the installation.

3 changes: 3 additions & 0 deletions doc/release-notes/8261-geojson.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
.geojson files are now correctly identified as GeoJSON files rather than "unknown".

A full re-index is needed to update the facets, as well as optionally running the "redetect file type" API on existing GeoJSON files.
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
import edu.harvard.iq.dataverse.util.BundleUtil;
import java.util.Collections;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.lang3.StringUtils;


Expand All @@ -27,7 +29,29 @@ public void initialize(ValidateDatasetFieldType constraintAnnotation) {
@Override
public boolean isValid(DatasetField value, ConstraintValidatorContext context) {
context.disableDefaultConstraintViolation(); // we do this so we can have different messages depending on the different issue


// If invalid characters are found, mutate the value by removing them.
if (value != null && value.getValue() != null) {
String invalidCharacters = "[\f\u0002\ufffe]";
Pattern p = Pattern.compile(invalidCharacters);
Matcher m = p.matcher(value.getValue());
boolean invalidCharactersFound = m.find();
if (invalidCharactersFound) {
List<DatasetFieldValue> datasetFieldValues = value.getDatasetFieldValues();
List<ControlledVocabularyValue> controlledVocabularyValues = value.getControlledVocabularyValues();
if (!datasetFieldValues.isEmpty()) {
datasetFieldValues.get(0).setValue(value.getValue().replaceAll(invalidCharacters, ""));
} else if (controlledVocabularyValues != null && !controlledVocabularyValues.isEmpty()) {
// This controlledVocabularyValues logic comes from value.getValue().
                // Controlled vocabularies shouldn't have invalid characters in them,
                // but if they do, we can add a "replace" here. Some untested, commented code below.
// if (controlledVocabularyValues.get(0) != null) {
// controlledVocabularyValues.get(0).setStrValue(value.getValue().replaceAll(invalidCharacters, ""));
// }
}
}
}

DatasetFieldType dsfType = value.getDatasetFieldType();
//SEK Additional logic turns off validation for templates
if (isTemplateDatasetField(value)){
Expand Down
45 changes: 45 additions & 0 deletions src/main/java/edu/harvard/iq/dataverse/DatasetServiceBean.java
Original file line number Diff line number Diff line change
Expand Up @@ -225,6 +225,51 @@ public List<Long> findAllOrSubset(long numPartitions, long partitionId, boolean
return typedQuery.getResultList();
}

/**
 * Returns the ids of all datasets (optionally skipping ones that are already
 * indexed), ordered so that datasets owning the fewest files come first.
 * Datasets with very large file counts are slow to index, so pushing them to
 * the end of the batch lets the bulk of the search index repopulate faster.
 * For background, see the equivalent method on the DataverseServiceBean.
 *
 * @param skipIndexed when true, datasets that already have an indexTime are excluded
 * @return a list of dataset ids, ascending by number of owned files (ties broken by id)
 * @see DataverseServiceBean#findAllOrSubset(long, long, boolean)
 */
public List<Long> findAllOrSubsetOrderByFilesOwned(boolean skipIndexed) {
    /*
    Disregards deleted or replaced files when determining 'size' of dataset.
    Could possibly make more efficient by getting file metadata counts
    of latest published/draft version.
    Also disregards partitioning which is no longer supported.
    SEK - 11/09/2021
    */

    String skipClause = skipIndexed ? "AND o.indexTime is null " : "";
    Query query = em.createNativeQuery(" Select distinct(o.id), count(f.id) as numFiles FROM dvobject o " +
            "left join dvobject f on f.owner_id = o.id where o.dtype = 'Dataset' "
            + skipClause
            + " group by o.id "
            + "ORDER BY count(f.id) asc, o.id");

    List<Object[]> queryResults = query.getResultList();

    List<Long> retVal = new ArrayList<>();
    for (Object[] result : queryResults) {
        // result[0] is the dataset id; skip null or unparseable rows.
        if (result[0] == null) {
            continue;
        }
        try {
            retVal.add(Long.parseLong(result[0].toString()));
        } catch (NumberFormatException ex) {
            // Defensive: a native-query id column that isn't numeric is skipped.
        }
    }
    return retVal;
}

/**
* Merges the passed dataset to the persistence context.
* @param ds the dataset whose new state we want to persist.
Expand Down
2 changes: 1 addition & 1 deletion src/main/java/edu/harvard/iq/dataverse/FileMetadata.java
Original file line number Diff line number Diff line change
Expand Up @@ -532,7 +532,7 @@ public boolean contentEquals(FileMetadata other) {

/**
 * Compares this file metadata's content with another's, without collecting
 * detail items, and reports whether the two are identical.
 *
 * @param other the file metadata to compare against
 * @return true when the two metadatas are the same
 */
public boolean compareContent(FileMetadata other) {
    // A summary-only (details=false) difference is sufficient for an equality check.
    return new FileVersionDifference(this, other, false).isSame();
}

@Override
Expand Down
34 changes: 30 additions & 4 deletions src/main/java/edu/harvard/iq/dataverse/FileVersionDifference.java
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@
import java.util.ArrayList;
import java.util.List;
import java.util.Objects;
import java.util.ResourceBundle;

/**
*
Expand All @@ -21,6 +20,9 @@ public final class FileVersionDifference {
private FileMetadata newFileMetadata;
private FileMetadata originalFileMetadata;
private boolean details = false;
private boolean same = false;



private List<FileDifferenceSummaryGroup> differenceSummaryGroups = new ArrayList<>();
private List<FileDifferenceDetailItem> differenceDetailItems = new ArrayList<>();
Expand All @@ -37,7 +39,7 @@ public FileVersionDifference(FileMetadata newFileMetadata, FileMetadata original
this.originalFileMetadata = originalFileMetadata;
this.details = details;

compareMetadata(newFileMetadata, originalFileMetadata);
this.same = compareMetadata(newFileMetadata, originalFileMetadata);
//Compare versions - File Metadata first

}
Expand All @@ -50,7 +52,7 @@ public boolean compareMetadata(FileMetadata newFileMetadata, FileMetadata origin
and it updates the FileVersionDifference object which is used to display the differences on the dataset versions tab.
The return value is used by the index service bean to mark whether a file needs to be re-indexed in the context of a dataset update.
When there are changes (after v4.19) to the file metadata data model this method must be updated.
retVal of True means metadatas are equal.
retVal of True means metadatas are equal.
*/

boolean retVal = true;
Expand All @@ -68,13 +70,15 @@ When there are changes (after v4.19)to the file metadata data model this method

if (this.originalFileMetadata == null && this.newFileMetadata.getDataFile() != null ){
//File Added
if (!details) return false;
retVal = false;
updateDifferenceSummary( "", BundleUtil.getStringFromBundle("file.versionDifferences.fileGroupTitle"), 1, 0, 0, 0);
}

//Check to see if File replaced
if (originalFileMetadata != null &&
newFileMetadata.getDataFile() != null && originalFileMetadata.getDataFile() != null &&!this.originalFileMetadata.getDataFile().equals(this.newFileMetadata.getDataFile())){
if (!details) return false;
updateDifferenceSummary( "", BundleUtil.getStringFromBundle("file.versionDifferences.fileGroupTitle"), 0, 0, 0, 1);
retVal = false;
}
Expand All @@ -83,6 +87,8 @@ When there are changes (after v4.19)to the file metadata data model this method
if (!newFileMetadata.getLabel().equals(originalFileMetadata.getLabel())) {
if (details) {
differenceDetailItems.add(new FileDifferenceDetailItem(BundleUtil.getStringFromBundle("file.versionDifferences.fileNameDetailTitle"), originalFileMetadata.getLabel(), newFileMetadata.getLabel()));
} else{
return false;
}
updateDifferenceSummary(BundleUtil.getStringFromBundle("file.versionDifferences.fileMetadataGroupTitle"),
BundleUtil.getStringFromBundle("file.versionDifferences.fileNameDetailTitle"), 0, 1, 0, 0);
Expand All @@ -97,6 +103,8 @@ When there are changes (after v4.19)to the file metadata data model this method
&& !newFileMetadata.getDescription().equals(originalFileMetadata.getDescription())) {
if (details) {
differenceDetailItems.add(new FileDifferenceDetailItem(BundleUtil.getStringFromBundle("file.versionDifferences.descriptionDetailTitle"), originalFileMetadata.getDescription(), newFileMetadata.getDescription()));
} else {
return false;
}
updateDifferenceSummary(BundleUtil.getStringFromBundle("file.versionDifferences.fileMetadataGroupTitle"),
BundleUtil.getStringFromBundle("file.versionDifferences.descriptionDetailTitle"), 0, 1, 0, 0);
Expand All @@ -107,6 +115,8 @@ When there are changes (after v4.19)to the file metadata data model this method
) {
if (details) {
differenceDetailItems.add(new FileDifferenceDetailItem(BundleUtil.getStringFromBundle("file.versionDifferences.descriptionDetailTitle"), "", newFileMetadata.getDescription()));
} else {
return false;
}
updateDifferenceSummary(BundleUtil.getStringFromBundle("file.versionDifferences.fileMetadataGroupTitle"),
BundleUtil.getStringFromBundle("file.versionDifferences.descriptionDetailTitle"), 1, 0, 0, 0);
Expand All @@ -117,6 +127,8 @@ When there are changes (after v4.19)to the file metadata data model this method
) {
if (details) {
differenceDetailItems.add(new FileDifferenceDetailItem(BundleUtil.getStringFromBundle("file.versionDifferences.descriptionDetailTitle"), originalFileMetadata.getDescription(), "" ));
} else {
return false;
}
updateDifferenceSummary(BundleUtil.getStringFromBundle("file.versionDifferences.fileMetadataGroupTitle"),
BundleUtil.getStringFromBundle("file.versionDifferences.descriptionDetailTitle"), 0, 0, 1, 0);
Expand All @@ -130,6 +142,8 @@ When there are changes (after v4.19)to the file metadata data model this method
&& !newFileMetadata.getProvFreeForm().equals(originalFileMetadata.getProvFreeForm())) {
if (details) {
differenceDetailItems.add(new FileDifferenceDetailItem(BundleUtil.getStringFromBundle("file.versionDifferences.provenanceDetailTitle"), originalFileMetadata.getProvFreeForm(), newFileMetadata.getProvFreeForm()));
} else {
return false;
}
updateDifferenceSummary(BundleUtil.getStringFromBundle("file.versionDifferences.fileMetadataGroupTitle"),
BundleUtil.getStringFromBundle("file.versionDifferences.provenanceDetailTitle"), 0, 1, 0, 0);
Expand All @@ -140,6 +154,8 @@ When there are changes (after v4.19)to the file metadata data model this method
) {
if (details) {
differenceDetailItems.add(new FileDifferenceDetailItem(BundleUtil.getStringFromBundle("file.versionDifferences.provenanceDetailTitle"), "", newFileMetadata.getProvFreeForm()));
} else {
return false;
}
updateDifferenceSummary(BundleUtil.getStringFromBundle("file.versionDifferences.fileMetadataGroupTitle"),
BundleUtil.getStringFromBundle("file.versionDifferences.provenanceDetailTitle"), 1, 0, 0, 0);
Expand All @@ -150,6 +166,8 @@ When there are changes (after v4.19)to the file metadata data model this method
) {
if (details) {
differenceDetailItems.add(new FileDifferenceDetailItem(BundleUtil.getStringFromBundle("file.versionDifferences.provenanceDetailTitle"), originalFileMetadata.getProvFreeForm(), "" ));
} else {
return false;
}
updateDifferenceSummary(BundleUtil.getStringFromBundle("file.versionDifferences.fileMetadataGroupTitle"),
BundleUtil.getStringFromBundle("file.versionDifferences.provenanceDetailTitle"), 0, 0, 1, 0);
Expand All @@ -170,7 +188,7 @@ When there are changes (after v4.19)to the file metadata data model this method
}

if (!value1.equals(value2)) {

if (!details) return false;
int added = 0;
int deleted = 0;

Expand Down Expand Up @@ -254,6 +272,14 @@ public void setOriginalFileMetadata(FileMetadata originalFileMetadata) {
this.originalFileMetadata = originalFileMetadata;
}

/**
 * Reports whether the two compared file metadatas were found to be identical
 * (the result of the compareMetadata call made in the constructor).
 *
 * @return true when no differences were detected
 */
public boolean isSame() {
    return same;
}

/**
 * Overrides the computed sameness flag.
 *
 * @param same the value to store
 */
public void setSame(boolean same) {
    this.same = same;
}


public List<FileDifferenceSummaryGroup> getDifferenceSummaryGroups() {
return differenceSummaryGroups;
Expand Down
Original file line number Diff line number Diff line change
@@ -1,12 +1,15 @@
package edu.harvard.iq.dataverse.search;

import edu.harvard.iq.dataverse.Dataset;
import edu.harvard.iq.dataverse.DatasetServiceBean;
import edu.harvard.iq.dataverse.Dataverse;
import edu.harvard.iq.dataverse.DataverseServiceBean;
import edu.harvard.iq.dataverse.DvObjectServiceBean;
import edu.harvard.iq.dataverse.util.SystemConfig;
import java.io.IOException;
import java.sql.Timestamp;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.concurrent.Future;
import java.util.logging.Level;
Expand Down Expand Up @@ -200,7 +203,7 @@ public Future<String> indexAllOrSubset(long numPartitions, long partitionId, boo

int datasetIndexCount = 0;
int datasetFailureCount = 0;
List<Long> datasetIds = datasetService.findAllOrSubset(numPartitions, partitionId, skipIndexed);
List<Long> datasetIds = datasetService.findAllOrSubsetOrderByFilesOwned(skipIndexed);
for (Long id : datasetIds) {
try {
datasetIndexCount++;
Expand Down
Loading

0 comments on commit e3fe3b8

Please sign in to comment.