diff --git a/doc/release-notes/9992-harvest-metadata-values-not-in-cvv-list.md b/doc/release-notes/9992-harvest-metadata-values-not-in-cvv-list.md new file mode 100644 index 00000000000..88ca6cf0e79 --- /dev/null +++ b/doc/release-notes/9992-harvest-metadata-values-not-in-cvv-list.md @@ -0,0 +1,4 @@ +The API endpoint `api/harvest/clients/{harvestingClientNickname}` has been extended to include the following fields: + +- `allowHarvestingMissingCVV`: enable/disable allowing datasets to be harvested with Controlled Vocabulary Values that existed in the originating Dataverse Project but are not in the harvesting Dataverse Project. Default is false. +Note: This setting is only available to the API and not currently accessible/settable via the UI \ No newline at end of file diff --git a/doc/sphinx-guides/source/api/native-api.rst b/doc/sphinx-guides/source/api/native-api.rst index 70d73ae3c98..144e3ac8e5e 100644 --- a/doc/sphinx-guides/source/api/native-api.rst +++ b/doc/sphinx-guides/source/api/native-api.rst @@ -4693,7 +4693,8 @@ The following optional fields are supported: - set: The OAI set on the remote server. If not supplied, will default to none, i.e., "harvest everything". - style: Defaults to "default" - a generic OAI archive. (Make sure to use "dataverse" when configuring harvesting from another Dataverse installation). - customHeaders: This can be used to configure this client with a specific HTTP header that will be added to every OAI request. This is to accommodate a use case where the remote server requires this header to supply some form of a token in order to offer some content not available to other clients. See the example below. Multiple headers can be supplied separated by `\\n` - actual "backslash" and "n" characters, not a single "new line" character. - +- allowHarvestingMissingCVV: Flag to allow datasets to be harvested with Controlled Vocabulary Values that existed in the originating Dataverse Project but are not in the harvesting Dataverse Project. (Default is false). Currently only settable using API. + Generally, the API will accept the output of the GET version of the API for an existing client as valid input, but some fields will be ignored. For example, as of writing this there is no way to configure a harvesting schedule via this API. An example JSON file would look like this:: @@ -4706,7 +4707,8 @@ An example JSON file would look like this:: "archiveDescription": "Moissonné depuis la collection LMOPS de l'entrepôt Zenodo. En cliquant sur ce jeu de données, vous serez redirigé vers Zenodo.", "metadataFormat": "oai_dc", "customHeaders": "x-oai-api-key: xxxyyyzzz", - "set": "user-lmops" + "set": "user-lmops", + "allowHarvestingMissingCVV":true } Something important to keep in mind about this API is that, unlike the harvesting clients GUI, it will create a client with the values supplied without making any attempts to validate them in real time. In other words, for the `harvestUrl` it will accept anything that looks like a well-formed url, without making any OAI calls to verify that the name of the set and/or the metadata format entered are supported by it. This is by design, to give an admin an option to still be able to create a client, in a rare case when it cannot be done via the GUI because of some real time failures in an exchange with an otherwise valid OAI server. This however puts the responsibility on the admin to supply the values already confirmed to be valid. diff --git a/src/main/java/edu/harvard/iq/dataverse/DatasetField.java b/src/main/java/edu/harvard/iq/dataverse/DatasetField.java index c836a20893f..31e7758c7d5 100644 --- a/src/main/java/edu/harvard/iq/dataverse/DatasetField.java +++ b/src/main/java/edu/harvard/iq/dataverse/DatasetField.java @@ -595,7 +595,8 @@ public boolean removeBlankDatasetFieldValues() { return true; } } else { // controlled vocab - if (this.getControlledVocabularyValues().isEmpty()) { + // during harvesting some CVV are put in getDatasetFieldValues. we don't want to remove those + if (this.getControlledVocabularyValues().isEmpty() && this.getDatasetFieldValues().isEmpty()) { return true; } } diff --git a/src/main/java/edu/harvard/iq/dataverse/harvest/client/HarvestingClient.java b/src/main/java/edu/harvard/iq/dataverse/harvest/client/HarvestingClient.java index 40db55f2a0c..0667f5594ce 100644 --- a/src/main/java/edu/harvard/iq/dataverse/harvest/client/HarvestingClient.java +++ b/src/main/java/edu/harvard/iq/dataverse/harvest/client/HarvestingClient.java @@ -243,6 +243,14 @@ public String getCustomHttpHeaders() { public void setCustomHttpHeaders(String customHttpHeaders) { this.customHttpHeaders = customHttpHeaders; } + + private boolean allowHarvestingMissingCVV; + public boolean getAllowHarvestingMissingCVV() { + return allowHarvestingMissingCVV; + } + public void setAllowHarvestingMissingCVV(boolean allowHarvestingMissingCVV) { + this.allowHarvestingMissingCVV = allowHarvestingMissingCVV; + } // TODO: do we need "orphanRemoval=true"? -- L.A. 4.4 // TODO: should it be @OrderBy("startTime")? -- L.A. 4.4 diff --git a/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java index 5716b39e85c..b45ee3368ef 100644 --- a/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java @@ -958,22 +958,39 @@ public SolrInputDocuments toSolrDocs(IndexableDataset indexableDataset, SetFeature Request/Idea: Harvest metadata values that aren't from a list of controlled values #9992 + */ + if (dsf.getControlledVocabularyValues().isEmpty()) { + for (DatasetFieldValue dfv : dsf.getDatasetFieldValues()) { + if (dfv.getValue().equals(DatasetField.NA_VALUE)) { + continue; + } + solrInputDocument.addField(solrFieldSearchable, dfv.getValue()); - // Index in all used languages (display and metadata languages - if (!dsfType.isAllowMultiples() || langs.isEmpty()) { - solrInputDocument.addField(solrFieldSearchable, controlledVocabularyValue.getStrValue()); - } else { - for(String locale: langs) { - solrInputDocument.addField(solrFieldSearchable, controlledVocabularyValue.getLocaleStrValue(locale)); + if (dsfType.getSolrField().isFacetable()) { + solrInputDocument.addField(solrFieldFacetable, dfv.getValue()); } } + } else { + for (ControlledVocabularyValue controlledVocabularyValue : dsf.getControlledVocabularyValues()) { + if (controlledVocabularyValue.getStrValue().equals(DatasetField.NA_VALUE)) { + continue; + } - if (dsfType.getSolrField().isFacetable()) { - solrInputDocument.addField(solrFieldFacetable, controlledVocabularyValue.getStrValue()); + // Index in all used languages (display and metadata languages + if (!dsfType.isAllowMultiples() || langs.isEmpty()) { + solrInputDocument.addField(solrFieldSearchable, controlledVocabularyValue.getStrValue()); + } else { + for(String locale: langs) { + solrInputDocument.addField(solrFieldSearchable, controlledVocabularyValue.getLocaleStrValue(locale)); + } + } + + if (dsfType.getSolrField().isFacetable()) { + solrInputDocument.addField(solrFieldFacetable, controlledVocabularyValue.getStrValue()); + } } } } else if (dsfType.getFieldType().equals(DatasetFieldType.FieldType.TEXTBOX)) { diff --git a/src/main/java/edu/harvard/iq/dataverse/util/json/JsonParser.java b/src/main/java/edu/harvard/iq/dataverse/util/json/JsonParser.java index 2141fcc5fca..a0bd2fff295 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/json/JsonParser.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/json/JsonParser.java @@ -38,7 +38,6 @@ import java.text.ParseException; import java.util.ArrayList; import java.util.Arrays; -import java.util.Collections; import java.util.Date; import java.util.HashMap; import java.util.HashSet; @@ -69,7 +68,8 @@ public class JsonParser { MetadataBlockServiceBean blockService; SettingsServiceBean settingsService; LicenseServiceBean licenseService; - HarvestingClient harvestingClient = null; + HarvestingClient harvestingClient = null; + boolean allowHarvestingMissingCVV = false; /** * if lenient, we will accept alternate spellings for controlled vocabulary values @@ -93,6 +93,7 @@ public JsonParser(DatasetFieldServiceBean datasetFieldSvc, MetadataBlockServiceB this.settingsService = settingsService; this.licenseService = licenseService; this.harvestingClient = harvestingClient; + this.allowHarvestingMissingCVV = harvestingClient != null && harvestingClient.getAllowHarvestingMissingCVV(); } public JsonParser() { @@ -737,39 +738,23 @@ public DatasetField parseField(JsonObject json, Boolean testType) throws JsonPar ret.setDatasetFieldType(type); - - if (type.isCompound()) { - List vals = parseCompoundValue(type, json, testType); - for (DatasetFieldCompoundValue dsfcv : vals) { - dsfcv.setParentDatasetField(ret); - } - ret.setDatasetFieldCompoundValues(vals); + if (type.isCompound()) { + parseCompoundValue(ret, type, json, testType); } else if (type.isControlledVocabulary()) { - List vals = parseControlledVocabularyValue(type, json); - for (ControlledVocabularyValue cvv : vals) { - cvv.setDatasetFieldType(type); - } - ret.setControlledVocabularyValues(vals); - + parseControlledVocabularyValue(ret, type, json); } else { - // primitive - - List values = parsePrimitiveValue(type, json); - for (DatasetFieldValue val : values) { - val.setDatasetField(ret); - } - ret.setDatasetFieldValues(values); - } + parsePrimitiveValue(ret, type, json); + } return ret; } - public List parseCompoundValue(DatasetFieldType compoundType, JsonObject json) throws JsonParseException { - return parseCompoundValue(compoundType, json, true); + public void parseCompoundValue(DatasetField dsf, DatasetFieldType compoundType, JsonObject json) throws JsonParseException { + parseCompoundValue(dsf, compoundType, json, true); } - public List parseCompoundValue(DatasetFieldType compoundType, JsonObject json, Boolean testType) throws JsonParseException { + public void parseCompoundValue(DatasetField dsf, DatasetFieldType compoundType, JsonObject json, Boolean testType) throws JsonParseException { List vocabExceptions = new ArrayList<>(); List vals = new LinkedList<>(); if (compoundType.isAllowMultiples()) { @@ -836,18 +821,17 @@ public List parseCompoundValue(DatasetFieldType compo if (!vocabExceptions.isEmpty()) { throw new CompoundVocabularyException( "Invalid controlled vocabulary in compound field ", vocabExceptions, vals); } - return vals; + + for (DatasetFieldCompoundValue dsfcv : vals) { + dsfcv.setParentDatasetField(dsf); + } + dsf.setDatasetFieldCompoundValues(vals); } - public List parsePrimitiveValue(DatasetFieldType dft , JsonObject json) throws JsonParseException { + public void parsePrimitiveValue(DatasetField dsf, DatasetFieldType dft , JsonObject json) throws JsonParseException { Map cvocMap = datasetFieldSvc.getCVocConf(true); - boolean extVocab=false; - if(cvocMap.containsKey(dft.getId())) { - extVocab=true; - } - - + boolean extVocab = cvocMap.containsKey(dft.getId()); List vals = new LinkedList<>(); if (dft.isAllowMultiples()) { try { @@ -856,7 +840,7 @@ public List parsePrimitiveValue(DatasetFieldType dft , JsonOb throw new JsonParseException("Invalid values submitted for " + dft.getName() + ". It should be an array of values."); } for (JsonString val : json.getJsonArray("value").getValuesAs(JsonString.class)) { - DatasetFieldValue datasetFieldValue = new DatasetFieldValue(); + DatasetFieldValue datasetFieldValue = new DatasetFieldValue(dsf); datasetFieldValue.setDisplayOrder(vals.size() - 1); datasetFieldValue.setValue(val.getString().trim()); if(extVocab) { @@ -875,6 +859,7 @@ public List parsePrimitiveValue(DatasetFieldType dft , JsonOb } DatasetFieldValue datasetFieldValue = new DatasetFieldValue(); datasetFieldValue.setValue(json.getString("value", "").trim()); + datasetFieldValue.setDatasetField(dsf); if(extVocab) { if(!datasetFieldSvc.isValidCVocValue(dft, datasetFieldValue.getValue())) { throw new JsonParseException("Invalid values submitted for " + dft.getName() + " which is limited to specific vocabularies."); @@ -884,7 +869,7 @@ public List parsePrimitiveValue(DatasetFieldType dft , JsonOb vals.add(datasetFieldValue); } - return vals; + dsf.setDatasetFieldValues(vals); } public Workflow parseWorkflow(JsonObject json) throws JsonParseException { @@ -929,31 +914,35 @@ private String jsonValueToString(JsonValue jv) { default: return jv.toString(); } } - - public List parseControlledVocabularyValue(DatasetFieldType cvvType, JsonObject json) throws JsonParseException { + + public void parseControlledVocabularyValue(DatasetField dsf, DatasetFieldType cvvType, JsonObject json) throws JsonParseException { + List vals = new LinkedList<>(); try { if (cvvType.isAllowMultiples()) { try { json.getJsonArray("value").getValuesAs(JsonObject.class); } catch (ClassCastException cce) { throw new JsonParseException("Invalid values submitted for " + cvvType.getName() + ". It should be an array of values."); - } - List vals = new LinkedList<>(); + } for (JsonString strVal : json.getJsonArray("value").getValuesAs(JsonString.class)) { String strValue = strVal.getString(); ControlledVocabularyValue cvv = datasetFieldSvc.findControlledVocabularyValueByDatasetFieldTypeAndStrValue(cvvType, strValue, lenient); if (cvv == null) { - throw new ControlledVocabularyException("Value '" + strValue + "' does not exist in type '" + cvvType.getName() + "'", cvvType, strValue); - } - // Only add value to the list if it is not a duplicate - if (strValue.equals("Other")) { - System.out.println("vals = " + vals + ", contains: " + vals.contains(cvv)); + if (allowHarvestingMissingCVV) { + // we need to process these as primitive values + logger.warning("Value '" + strValue + "' does not exist in type '" + cvvType.getName() + "'. Processing as primitive per setting override."); + parsePrimitiveValue(dsf, cvvType, json); + return; + } else { + throw new ControlledVocabularyException("Value '" + strValue + "' does not exist in type '" + cvvType.getName() + "'", cvvType, strValue); + } } + cvv.setDatasetFieldType(cvvType); + // Only add value to the list if it is not a duplicate if (!vals.contains(cvv)) { vals.add(cvv); } } - return vals; } else { try { @@ -964,13 +953,23 @@ public List parseControlledVocabularyValue(DatasetFie String strValue = json.getString("value", ""); ControlledVocabularyValue cvv = datasetFieldSvc.findControlledVocabularyValueByDatasetFieldTypeAndStrValue(cvvType, strValue, lenient); if (cvv == null) { - throw new ControlledVocabularyException("Value '" + strValue + "' does not exist in type '" + cvvType.getName() + "'", cvvType, strValue); + if (allowHarvestingMissingCVV) { + // we need to process this as a primitive value + logger.warning(">>>> Value '" + strValue + "' does not exist in type '" + cvvType.getName() + "'. Processing as primitive per setting override."); + parsePrimitiveValue(dsf, cvvType , json); + return; + } else { + throw new ControlledVocabularyException("Value '" + strValue + "' does not exist in type '" + cvvType.getName() + "'", cvvType, strValue); + } } - return Collections.singletonList(cvv); + cvv.setDatasetFieldType(cvvType); + vals.add(cvv); } } catch (ClassCastException cce) { throw new JsonParseException("Invalid values submitted for " + cvvType.getName()); } + + dsf.setControlledVocabularyValues(vals); } Date parseDate(String str) throws ParseException { @@ -1001,6 +1000,7 @@ public String parseHarvestingClient(JsonObject obj, HarvestingClient harvestingC harvestingClient.setMetadataPrefix(obj.getString("metadataFormat",null)); harvestingClient.setHarvestingSet(obj.getString("set",null)); harvestingClient.setCustomHttpHeaders(obj.getString("customHeaders", null)); + harvestingClient.setAllowHarvestingMissingCVV(obj.getBoolean("allowHarvestingMissingCVV", false)); return dataverseAlias; } diff --git a/src/main/java/edu/harvard/iq/dataverse/util/json/JsonPrinter.java b/src/main/java/edu/harvard/iq/dataverse/util/json/JsonPrinter.java index f6a64e5fe6a..005ae2f2892 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/json/JsonPrinter.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/json/JsonPrinter.java @@ -949,6 +949,7 @@ public static JsonObjectBuilder json(HarvestingClient harvestingClient) { add("schedule", harvestingClient.isScheduled() ? harvestingClient.getScheduleDescription() : "none"). add("status", harvestingClient.isHarvestingNow() ? "inProgress" : "inActive"). add("customHeaders", harvestingClient.getCustomHttpHeaders()). + add("allowHarvestingMissingCVV", harvestingClient.getAllowHarvestingMissingCVV()). add("lastHarvest", harvestingClient.getLastHarvestTime() == null ? null : harvestingClient.getLastHarvestTime().toString()). add("lastResult", harvestingClient.getLastResult()). add("lastSuccessful", harvestingClient.getLastSuccessfulHarvestTime() == null ? null : harvestingClient.getLastSuccessfulHarvestTime().toString()). diff --git a/src/main/resources/db/migration/V6.1.0.6.sql b/src/main/resources/db/migration/V6.1.0.6.sql new file mode 100644 index 00000000000..c9942fb8480 --- /dev/null +++ b/src/main/resources/db/migration/V6.1.0.6.sql @@ -0,0 +1,2 @@ +-- Add flag to allow harvesting client to handle missing CVV values +ALTER TABLE harvestingclient ADD COLUMN IF NOT EXISTS allowharvestingmissingcvv BOOLEAN; diff --git a/src/test/java/edu/harvard/iq/dataverse/api/HarvestingClientsIT.java b/src/test/java/edu/harvard/iq/dataverse/api/HarvestingClientsIT.java index d5388e510d2..4466182b435 100644 --- a/src/test/java/edu/harvard/iq/dataverse/api/HarvestingClientsIT.java +++ b/src/test/java/edu/harvard/iq/dataverse/api/HarvestingClientsIT.java @@ -2,12 +2,13 @@ import java.util.logging.Logger; -import org.junit.jupiter.api.Test; - import io.restassured.RestAssured; import static io.restassured.RestAssured.given; import io.restassured.path.json.JsonPath; import io.restassured.response.Response; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; import static jakarta.ws.rs.core.Response.Status.CREATED; import static jakarta.ws.rs.core.Response.Status.UNAUTHORIZED; @@ -17,7 +18,6 @@ import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.*; import static org.junit.jupiter.api.Assertions.assertTrue; -import org.junit.jupiter.api.BeforeAll; /** * This class tests Harvesting Client functionality. @@ -37,11 +37,12 @@ public class HarvestingClientsIT { private static final String ARCHIVE_URL = "https://demo.dataverse.org"; private static final String HARVEST_METADATA_FORMAT = "oai_dc"; private static final String ARCHIVE_DESCRIPTION = "RestAssured harvesting client test"; - private static final String CONTROL_OAI_SET = "controlTestSet"; - private static final int DATASETS_IN_CONTROL_SET = 7; + private static final String CONTROL_OAI_SET = "controlTestSet2"; + private static final int DATASETS_IN_CONTROL_SET = 8; private static String normalUserAPIKey; private static String adminUserAPIKey; - private static String harvestCollectionAlias; + private static String harvestCollectionAlias; + String clientApiPath = null; @BeforeAll public static void setUpClass() { @@ -54,6 +55,16 @@ public static void setUpClass() { setupCollection(); } + @AfterEach + public void cleanup() { + if (clientApiPath != null) { + Response deleteResponse = given() + .header(UtilIT.API_TOKEN_HTTP_HEADER, adminUserAPIKey) + .delete(clientApiPath); + clientApiPath = null; + System.out.println("deleteResponse.getStatusCode(): " + deleteResponse.getStatusCode()); + } + } private static void setupUsers() { Response cu0 = UtilIT.createRandomUser(); @@ -157,9 +168,19 @@ public void testCreateEditDeleteClient() throws InterruptedException { logger.info("rDelete.getStatusCode(): " + rDelete.getStatusCode()); assertEquals(OK.getStatusCode(), rDelete.getStatusCode()); } - + + @Test + public void testHarvestingClientRun_AllowHarvestingMissingCVV_False() throws InterruptedException { + harvestingClientRun(false); + } @Test - public void testHarvestingClientRun() throws InterruptedException { + public void testHarvestingClientRun_AllowHarvestingMissingCVV_True() throws InterruptedException { + harvestingClientRun(true); + } + + private void harvestingClientRun(boolean allowHarvestingMissingCVV) throws InterruptedException { + int expectedNumberOfSetsHarvested = allowHarvestingMissingCVV ? DATASETS_IN_CONTROL_SET : DATASETS_IN_CONTROL_SET - 1; + // This test will create a client and attempt to perform an actual // harvest and validate the resulting harvested content. @@ -170,14 +191,15 @@ public void testHarvestingClientRun() throws InterruptedException { String nickName = "h" + UtilIT.getRandomString(6); - String clientApiPath = String.format(HARVEST_CLIENTS_API+"%s", nickName); + clientApiPath = String.format(HARVEST_CLIENTS_API+"%s", nickName); String clientJson = String.format("{\"dataverseAlias\":\"%s\"," + "\"type\":\"oai\"," + "\"harvestUrl\":\"%s\"," + "\"archiveUrl\":\"%s\"," + "\"set\":\"%s\"," + + "\"allowHarvestingMissingCVV\":%s," + "\"metadataFormat\":\"%s\"}", - harvestCollectionAlias, HARVEST_URL, ARCHIVE_URL, CONTROL_OAI_SET, HARVEST_METADATA_FORMAT); + harvestCollectionAlias, HARVEST_URL, ARCHIVE_URL, CONTROL_OAI_SET, allowHarvestingMissingCVV, HARVEST_METADATA_FORMAT); Response createResponse = given() .header(UtilIT.API_TOKEN_HTTP_HEADER, adminUserAPIKey) @@ -242,7 +264,7 @@ public void testHarvestingClientRun() throws InterruptedException { assertEquals(harvestTimeStamp, responseJsonPath.getString("data.lastNonEmpty")); // d) Confirm that the correct number of datasets have been harvested: - assertEquals(DATASETS_IN_CONTROL_SET, responseJsonPath.getInt("data.lastDatasetsHarvested")); + assertEquals(expectedNumberOfSetsHarvested, responseJsonPath.getInt("data.lastDatasetsHarvested")); // ok, it looks like the harvest has completed successfully. break; @@ -258,15 +280,6 @@ public void testHarvestingClientRun() throws InterruptedException { // datasets have been harvested. This may or may not be necessary, seeing // how we have already confirmed the number of successfully harvested // datasets from the control set; somewhat hard to imagine a practical - // situation where that would not be enough (?). - - // Cleanup: delete the client - - Response deleteResponse = given() - .header(UtilIT.API_TOKEN_HTTP_HEADER, adminUserAPIKey) - .delete(clientApiPath); - System.out.println("deleteResponse.getStatusCode(): " + deleteResponse.getStatusCode()); - assertEquals(OK.getStatusCode(), deleteResponse.getStatusCode()); - + // situation where that would not be enough (?). } }