Skip to content

Commit

Permalink
Merge pull request #10708 from QualitativeDataRepository/QDR-PIDBugFixes
Browse files Browse the repository at this point in the history
QDR: PID bug fixes
  • Loading branch information
ofahimIQSS authored Oct 28, 2024
2 parents 5f5126a + a7e0d87 commit 64ac076
Show file tree
Hide file tree
Showing 7 changed files with 71 additions and 23 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
MDC Citation retrieval with the PID settings has been fixed.
DOI parsing in Dataverse is case insensitive, improving interaction with services that may change the case.
Warnings related to managed/excluded PID lists for PID providers have been reduced
23 changes: 19 additions & 4 deletions src/main/java/edu/harvard/iq/dataverse/api/MakeDataCountApi.java
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,9 @@
import java.io.IOException;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.util.Iterator;
import java.util.List;
Expand Down Expand Up @@ -152,10 +155,17 @@ public Response updateCitationsForDataset(@PathParam("id") String id) throws IOE
// DataCite wants "doi=", not "doi:".
String authorityPlusIdentifier = persistentId.replaceFirst("doi:", "");
// Request max page size and then loop to handle multiple pages
URL url = new URL(JvmSettings.DATACITE_REST_API_URL.lookup() +
URL url = null;
try {
url = new URI(JvmSettings.DATACITE_REST_API_URL.lookup(pidProvider.getId()) +
"/events?doi=" +
authorityPlusIdentifier +
"&source=crossref&page[size]=1000");
"&source=crossref&page[size]=1000").toURL();
} catch (URISyntaxException e) {
//Nominally this means a config error/ bad DATACITE_REST_API_URL for this provider
logger.warning("Unable to create URL for " + persistentId + ", pidProvider " + pidProvider.getId());
return error(Status.INTERNAL_SERVER_ERROR, "Unable to create DataCite URL to retrieve citations.");
}
logger.fine("Retrieving Citations from " + url.toString());
boolean nextPage = true;
JsonArrayBuilder dataBuilder = Json.createArrayBuilder();
Expand All @@ -178,7 +188,12 @@ public Response updateCitationsForDataset(@PathParam("id") String id) throws IOE
dataBuilder.add(iter.next());
}
if (links.containsKey("next")) {
url = new URL(links.getString("next"));
try {
url = new URI(links.getString("next")).toURL();
} catch (URISyntaxException e) {
logger.warning("Unable to create URL from DataCite response: " + links.getString("next"));
return error(Status.INTERNAL_SERVER_ERROR, "Unable to retrieve all results from DataCite");
}
} else {
nextPage = false;
}
Expand All @@ -187,7 +202,7 @@ public Response updateCitationsForDataset(@PathParam("id") String id) throws IOE
JsonArray allData = dataBuilder.build();
List<DatasetExternalCitations> datasetExternalCitations = datasetExternalCitationsService.parseCitations(allData);
/*
* ToDo: If this is the only source of citations, we should remove all the existing ones for the dataset and repopuate them.
* ToDo: If this is the only source of citations, we should remove all the existing ones for the dataset and repopulate them.
* As is, this call doesn't remove old citations if there are now none (legacy issue if we decide to stop counting certain types of citation
* as we've done for 'hasPart').
* If there are some, this call individually checks each one and if a matching item exists, it removes it and adds it back. Faster and better to delete all and
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@

import edu.harvard.iq.dataverse.Dataset;
import edu.harvard.iq.dataverse.DatasetServiceBean;
import edu.harvard.iq.dataverse.GlobalId;
import edu.harvard.iq.dataverse.pidproviders.PidUtil;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
Expand Down Expand Up @@ -40,7 +43,8 @@ public class DatasetExternalCitationsServiceBean implements java.io.Serializable
Arrays.asList(
"cites",
"references",
"supplements"));
"supplements",
"is-supplement-to"));
static ArrayList<String> outboundRelationships = new ArrayList<String>(
Arrays.asList(
"is-cited-by",
Expand All @@ -59,22 +63,21 @@ public List<DatasetExternalCitations> parseCitations(JsonArray citations) {
if (inboundRelationships.contains(relationship)) {
Dataset localDs = null;
if (objectUri.contains("doi")) {
String globalId = objectUri.replace("https://", "").replace("doi.org/", "doi:").toUpperCase().replace("DOI:", "doi:");
localDs = datasetService.findByGlobalId(globalId);
localDs = datasetService.findByGlobalId(objectUri);
exCit.setDataset(localDs);
}
exCit.setCitedByUrl(subjectUri);

if (localDs != null && !exCit.getCitedByUrl().isEmpty()) {
datasetExternalCitations.add(exCit);
}
}
if (outboundRelationships.contains(relationship)) {
Dataset localDs = null;
if (subjectUri.contains("doi")) {
String globalId = subjectUri.replace("https://", "").replace("doi.org/", "doi:").toUpperCase().replace("DOI:", "doi:");
localDs = datasetService.findByGlobalId(globalId);
localDs = datasetService.findByGlobalId(subjectUri);
exCit.setDataset(localDs);

}
exCit.setCitedByUrl(objectUri);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,9 +36,9 @@ public abstract class AbstractPidProvider implements PidProvider {

private String datafilePidFormat = null;

private HashSet<String> managedSet;
protected HashSet<String> managedSet = new HashSet<String>();

private HashSet<String> excludedSet;
protected HashSet<String> excludedSet = new HashSet<String>();

private String id;
private String label;
Expand All @@ -47,8 +47,6 @@ protected AbstractPidProvider(String id, String label, String protocol) {
this.id = id;
this.label = label;
this.protocol = protocol;
this.managedSet = new HashSet<String>();
this.excludedSet = new HashSet<String>();
}

protected AbstractPidProvider(String id, String label, String protocol, String authority, String shoulder,
Expand All @@ -60,8 +58,12 @@ protected AbstractPidProvider(String id, String label, String protocol, String a
this.shoulder = shoulder;
this.identifierGenerationStyle = identifierGenerationStyle;
this.datafilePidFormat = datafilePidFormat;
this.managedSet = new HashSet<String>(Arrays.asList(managedList.split(",\\s")));
this.excludedSet = new HashSet<String>(Arrays.asList(excludedList.split(",\\s")));
if(!managedList.isEmpty()) {
this.managedSet.addAll(Arrays.asList(managedList.split(",\\s")));
}
if(!excludedList.isEmpty()) {
this.excludedSet.addAll(Arrays.asList(excludedList.split(",\\s")));
}
if (logger.isLoggable(Level.FINE)) {
Iterator<String> iter = managedSet.iterator();
while (iter.hasNext()) {
Expand Down Expand Up @@ -313,10 +315,17 @@ protected GlobalId parsePersistentId(String protocol, String identifierString) {
}

public GlobalId parsePersistentId(String protocol, String authority, String identifier) {
return parsePersistentId(protocol, authority, identifier, false);
}

public GlobalId parsePersistentId(String protocol, String authority, String identifier, boolean isCaseInsensitive) {
logger.fine("Parsing: " + protocol + ":" + authority + getSeparator() + identifier + " in " + getId());
if (!PidProvider.isValidGlobalId(protocol, authority, identifier)) {
return null;
}
if(isCaseInsensitive) {
identifier = identifier.toUpperCase();
}
// Check authority/identifier if this is a provider that manages specific
// identifiers
// /is not one of the unmanaged providers that has null authority
Expand All @@ -333,7 +342,7 @@ public GlobalId parsePersistentId(String protocol, String authority, String iden
logger.fine("managed in " + getId() + ": " + getManagedSet().contains(cleanIdentifier));
logger.fine("excluded from " + getId() + ": " + getExcludedSet().contains(cleanIdentifier));

if (!(((authority.equals(getAuthority()) && identifier.startsWith(getShoulder()))
if (!(((authority.equals(getAuthority()) && identifier.startsWith(getShoulder().toUpperCase()))
|| getManagedSet().contains(cleanIdentifier)) && !getExcludedSet().contains(cleanIdentifier))) {
return null;
}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package edu.harvard.iq.dataverse.pidproviders.doi;

import java.util.Arrays;
import java.util.HashSet;
import java.util.Map;
import java.util.logging.Level;
import java.util.logging.Logger;
Expand All @@ -26,9 +27,24 @@ public abstract class AbstractDOIProvider extends AbstractPidProvider {

public AbstractDOIProvider(String id, String label, String providerAuthority, String providerShoulder, String identifierGenerationStyle, String datafilePidFormat, String managedList, String excludedList) {
super(id, label, DOI_PROTOCOL, providerAuthority, providerShoulder, identifierGenerationStyle, datafilePidFormat, managedList, excludedList);
//Create case insensitive (converted toUpperCase) managedSet and excludedSet
managedSet = clean(managedSet, "managed");
excludedSet = clean(excludedSet, "excluded");
}

//For Unmanged provider
private HashSet<String> clean(HashSet<String> originalSet, String setName) {
HashSet<String> cleanSet = new HashSet<String>();
for(String entry: originalSet) {
if(entry.startsWith(DOI_PROTOCOL)) {
cleanSet.add(DOI_PROTOCOL + entry.substring(DOI_PROTOCOL.length()).toUpperCase());
} else {
logger.warning("Non-DOI found in " + setName + " set of pidProvider id: " + getId() + ": " + entry + ". Entry is being dropped.");
}
}
return cleanSet;
}

//For Unmanaged provider
public AbstractDOIProvider(String name, String label) {
super(name, label, DOI_PROTOCOL);
}
Expand Down Expand Up @@ -67,7 +83,7 @@ public GlobalId parsePersistentId(String protocol, String authority, String iden
if (!DOI_PROTOCOL.equals(protocol)) {
return null;
}
return super.parsePersistentId(protocol, authority, identifier);
return super.parsePersistentId(protocol, authority, identifier, true);
}

public String getUrlPrefix() {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -226,8 +226,7 @@ protected String getProviderKeyName() {

@Override
public String getProviderType() {
// TODO Auto-generated method stub
return null;
return TYPE;
}

public String getMdsUrl() {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -250,9 +250,12 @@ public void testDOIParsing() throws IOException {
assertEquals(pid1String, pid3.asString());
assertEquals("dc1", pid3.getProviderId());

String pid4String = "doi:10.5072/FK3ABCDEF";
//Also test case insensitive
String pid4String = "doi:10.5072/fk3ABCDEF";
GlobalId pid4 = PidUtil.parseAsGlobalID(pid4String);
assertEquals(pid4String, pid4.asString());
// Lower case is recognized by converting to upper case internally, so we need to test vs. the upper case identifier
// I.e. we are verifying that the lower case string is parsed the same as the upper case string, both give an internal upper case PID representation
assertEquals("doi:10.5072/FK3ABCDEF", pid4.asString());
assertEquals("dc2", pid4.getProviderId());

String pid5String = "doi:10.5072/FK2ABCDEF";
Expand Down

0 comments on commit 64ac076

Please sign in to comment.