From 97e042d4f74deb70aded8d255f666a5f75d46c8f Mon Sep 17 00:00:00 2001
From: Camelia Dumitru <62257307+Camelia-Orcid@users.noreply.github.com>
Date: Tue, 4 Jun 2024 15:59:51 +0100
Subject: [PATCH] Added extra columns for names locations (#7038)
* Added extra columns for names and locations
* Added the dbms to fix failing tests
* Separate changeset for hsqldb to fix failing tests
* Fixed formatting to follow ORCID rules
---
.../jpa/entities/OrgDisambiguatedEntity.java | 26 +-
.../src/main/resources/db-master.xml | 3 +-
.../org_disambiguated_extra_columns.xml | 21 +
.../loader/source/ror/RorOrgLoadSource.java | 919 +++++++++---------
4 files changed, 499 insertions(+), 470 deletions(-)
create mode 100644 orcid-persistence/src/main/resources/db/updates/org_disambiguated_extra_columns.xml
diff --git a/orcid-persistence/src/main/java/org/orcid/persistence/jpa/entities/OrgDisambiguatedEntity.java b/orcid-persistence/src/main/java/org/orcid/persistence/jpa/entities/OrgDisambiguatedEntity.java
index 431fd0263a7..9379cd2db44 100644
--- a/orcid-persistence/src/main/java/org/orcid/persistence/jpa/entities/OrgDisambiguatedEntity.java
+++ b/orcid-persistence/src/main/java/org/orcid/persistence/jpa/entities/OrgDisambiguatedEntity.java
@@ -211,4 +211,28 @@ public void setMemberChosenOrgDisambiguatedEntity(MemberChosenOrgDisambiguatedEn
this.memberChosenOrgDisambiguatedEntity = memberChosenOrgDisambiguatedEntity;
}
-}
+ @Column(name = "locations_json") // NOTE(review): this mapping is repeated on getLocationsJson() below - confirm which access type the entity uses and keep a single annotation
+ private String locationsJson; // serialized ROR "locations" array, as set by RorOrgLoadSource.processInstitute
+ // Returns the stored locations JSON string, or null when none has been set.
+ @Column(name = "locations_json")
+ public String getLocationsJson() {
+ return locationsJson;
+ }
+ // Replaces the stored locations JSON string; the value is not validated here.
+ public void setLocationsJson(String locationsJson) {
+ this.locationsJson = locationsJson;
+ }
+ // NOTE(review): names_json is mapped twice as well (field and getter) - same question as for locations_json.
+ @Column(name = "names_json")
+ private String namesJson; // serialized ROR "names" array, as set by RorOrgLoadSource.processInstitute
+ // Returns the stored names JSON string, or null when none has been set.
+ @Column(name = "names_json")
+ public String getNamesJson() {
+ return namesJson;
+ }
+ // Replaces the stored names JSON string; the value is not validated here.
+ public void setNamesJson(String namesJson) {
+ this.namesJson = namesJson;
+ }
+
+}
\ No newline at end of file
diff --git a/orcid-persistence/src/main/resources/db-master.xml b/orcid-persistence/src/main/resources/db-master.xml
index 034c50c878d..50751220853 100644
--- a/orcid-persistence/src/main/resources/db-master.xml
+++ b/orcid-persistence/src/main/resources/db-master.xml
@@ -387,5 +387,6 @@
-
+
+
diff --git a/orcid-persistence/src/main/resources/db/updates/org_disambiguated_extra_columns.xml b/orcid-persistence/src/main/resources/db/updates/org_disambiguated_extra_columns.xml
new file mode 100644
index 00000000000..e0b2613f4c1
--- /dev/null
+++ b/orcid-persistence/src/main/resources/db/updates/org_disambiguated_extra_columns.xml
@@ -0,0 +1,21 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/orcid-scheduler-web/src/main/java/org/orcid/scheduler/loader/source/ror/RorOrgLoadSource.java b/orcid-scheduler-web/src/main/java/org/orcid/scheduler/loader/source/ror/RorOrgLoadSource.java
index 8e9b4f2bd46..28da2f9a727 100644
--- a/orcid-scheduler-web/src/main/java/org/orcid/scheduler/loader/source/ror/RorOrgLoadSource.java
+++ b/orcid-scheduler-web/src/main/java/org/orcid/scheduler/loader/source/ror/RorOrgLoadSource.java
@@ -46,473 +46,456 @@
@Component
public class RorOrgLoadSource implements OrgLoadSource {
- private static final Logger LOGGER = LoggerFactory.getLogger(RorOrgLoadSource.class);
-
- private static final String WIKIPEDIA_URL = "wikipedia_url";
-
- @Value("${org.orcid.core.orgs.ror.enabled:true}")
- private boolean enabled;
-
- @Value("${org.orcid.core.orgs.clients.userAgent}")
- private String userAgent;
-
- @Resource(name = "rorOrgDataClient")
- private OrgDataClient orgDataClient;
-
- @Value("${org.orcid.core.orgs.ror.localZipPath:/tmp/grid/ror.zip}")
- private String zipFilePath;
-
- @Value("${org.orcid.core.orgs.ror.localDataPath:/tmp/grid/ror.json}")
- private String localDataPath;
-
- @Value("${org.orcid.core.orgs.ror.indexAllEnabled:false}")
- private boolean indexAllEnabled;
-
- @Resource
- private OrgDisambiguatedDao orgDisambiguatedDao;
-
- @Resource
- private OrgDisambiguatedManager orgDisambiguatedManager;
-
- @Resource
- private OrgDisambiguatedExternalIdentifierDao orgDisambiguatedExternalIdentifierDao;
-
- @Value("${org.orcid.core.orgs.ror.zenodoRecordsUrl:https://zenodo.org/api/records?communities=ror-data}")
- private String rorZenodoRecordsUrl;
-
- @Resource
- private FileRotator fileRotator;
-
- private Set UPDATED_RORS;
-
- @Override
- public String getSourceName() {
- return "ROR";
- }
-
- @Override
- public boolean loadOrgData() {
- if (!enabled) {
- throw new LoadSourceDisabledException(getSourceName());
- }
-
- return loadData();
- }
-
- @Override
- public boolean downloadOrgData() {
- try {
- fileRotator.removeFileIfExists(zipFilePath);
- fileRotator.removeFileIfExists(localDataPath);
-
- ZenodoRecords zenodoRecords = orgDataClient.get(rorZenodoRecordsUrl + "&sort=mostrecent&size=1", userAgent,
- ZenodoRecords.class);
- ZenodoRecordsHit zenodoHit = zenodoRecords.getHits().getHits().get(0);
-
- boolean success = false;
-
- // we are returning the collection ordered by mostrecent and size 1, we need to
- // get the last element in the list that has the last version
- String zenodoUrl = zenodoHit.getFiles()
- .get(zenodoHit.getFiles().size() > 0 ? zenodoHit.getFiles().size() - 1 : 0).getLinks().getSelf();
- LOGGER.info("Retrieving ROR data from: " + zenodoUrl);
- success = orgDataClient.downloadFile(zenodoUrl, userAgent, zipFilePath);
-
- try {
- LOGGER.info("Unzipping ROR ....");
- unzipData();
- } catch (IOException e) {
- LOGGER.error("Error unzipping Zenodo ROR data", e);
- throw new RuntimeException(e);
- }
- return success;
- } catch (Exception e) {
- LOGGER.error("Error downloading Zenodo ROR data", e);
- return false;
- }
- }
-
- private void unzipData() throws IOException {
- byte[] buffer = new byte[1024];
- ZipInputStream zis = new ZipInputStream(new FileInputStream(zipFilePath));
- ZipEntry zipEntry = zis.getNextEntry();
- while (zipEntry != null) {
- String zipEntryName = zipEntry.getName();
- if (zipEntryName.endsWith("v2.json")) {
- File jsonData = new File(localDataPath);
- FileOutputStream fos = new FileOutputStream(jsonData);
- int len;
- while ((len = zis.read(buffer)) > 0) {
- fos.write(buffer, 0, len);
- }
- fos.close();
- break;
- }
- zipEntry = zis.getNextEntry();
- }
- zis.closeEntry();
- zis.close();
- }
-
- private boolean loadData() {
- try {
- LOGGER.info("Loading ROR data...");
- Instant start = Instant.now();
- File fileToLoad = new File(localDataPath);
- if (!fileToLoad.exists()) {
- LOGGER.error("File {} doesn't exist", localDataPath);
- return false;
- }
-
- // ror returns the JSON as Array of institutes
- JsonNode rootNode = JsonUtils.read(fileToLoad);
- UPDATED_RORS = new HashSet();
-
- rootNode.forEach(institute -> {
- String sourceId = institute.get("id").isNull() ? null : institute.get("id").asText();
- String status = institute.get("status").isNull() ? null : institute.get("status").asText();
- if ("active".equalsIgnoreCase(status) || "inactive".equalsIgnoreCase(status)) {
- ArrayNode namesNode = institute.get("names").isNull() ? null : (ArrayNode) institute.get("names");
- String name = null;
- if (namesNode != null) {
- for (JsonNode nameJson : namesNode) {
- ArrayNode nameTypes = nameJson.get("types").isNull() ? null
- : (ArrayNode) nameJson.get("types");
- for (JsonNode nameType : nameTypes) {
- if (StringUtils.equalsIgnoreCase(nameType.asText(), "ror_display")) {
- name = nameJson.get("value").asText();
- break;
- }
- }
- }
- }
-
- StringJoiner sj = new StringJoiner(",");
- String orgType = null;
- if (!institute.get("types").isNull()) {
- ((ArrayNode) institute.get("types")).forEach(x -> sj.add(x.textValue()));
- orgType = sj.toString();
- }
-
-
- //location node
-
- ArrayNode locationsNode = institute.get("locations").isNull() ? null : (ArrayNode) institute.get("locations");
- Iso3166Country country = null;
- String region = null;
- String city = null;
- if (locationsNode != null) {
- for (JsonNode locationJson : locationsNode) {
- JsonNode geoDetailsNode = locationJson.get("geonames_details").isNull() ? null
- : (JsonNode) locationJson.get("geonames_details");
-
- if(geoDetailsNode !=null) {
- String countryCode = geoDetailsNode.get("country_code").isNull() ? null
- : geoDetailsNode.get("country_code").asText();
- country = StringUtils.isBlank(countryCode) ? null : Iso3166Country.fromValue(countryCode);
- //for now storing just the first location
- city = geoDetailsNode.get("name").isNull() ? null
- : geoDetailsNode.get("name").asText();
- if(country != null) {
- break;
- }
- }
-
- }
- }
-
-
- ArrayNode urls = institute.get("links").isNull() ? null : (ArrayNode) institute.get("links");
- // Use the first URL
- String url = (urls != null && urls.size() > 0) ? urls.get(0).asText() : null;
-
- // Creates or updates an institute
- OrgDisambiguatedEntity entity = processInstitute(sourceId, name, country, city, region, url,
- orgType);
-
- // Creates external identifiers
- processExternalIdentifiers(entity, institute);
- } else if ("redirected".equals(status)) {
- String primaryId = institute.get("redirect").isNull() ? null : institute.get("redirect").asText();
- deprecateOrg(sourceId, primaryId);
- } else if ("withdrawn".equals(status) || "obsolete".equals(status)) {
- obsoleteOrg(sourceId);
- } else {
- LOGGER.error("Illegal status '" + status + "' for institute " + sourceId);
- }
- });
-
- // Check if any RORs with external identifiers updated and group them
- groupRORsWithUpdatedExternalModifiers();
-
- LOGGER.info("Time taken to process the data: {}", Duration.between(start, Instant.now()).toString());
- return true;
- } catch (Exception e) {
- LOGGER.error("Error loading ROR data", e);
- return false;
- }
- }
-
- private OrgDisambiguatedEntity processInstitute(String sourceId, String name, Iso3166Country country, String city,
- String region, String url, String orgType) {
- OrgDisambiguatedEntity existingBySourceId = orgDisambiguatedDao.findBySourceIdAndSourceType(sourceId,
- OrgDisambiguatedSourceType.ROR.name());
- if (existingBySourceId != null) {
- if (entityChanged(existingBySourceId, name, country.value(), city, region, url, orgType)
- || indexAllEnabled) {
- existingBySourceId.setCity(city);
- existingBySourceId.setCountry(country.name());
- existingBySourceId.setName(name);
- existingBySourceId.setOrgType(orgType);
- existingBySourceId.setRegion(region);
- existingBySourceId.setUrl(url);
- existingBySourceId.setIndexingStatus(IndexingStatus.PENDING);
- try {
- // mark group for indexing
- new OrgGrouping(existingBySourceId, orgDisambiguatedManager)
- .markGroupForIndexing(orgDisambiguatedDao);
-
- } catch (Exception ex) {
- LOGGER.error(
- "Error when grouping by ROR and marking group orgs for reindexing, eating the exception",
- ex);
- }
- orgDisambiguatedManager.updateOrgDisambiguated(existingBySourceId);
- }
- return existingBySourceId;
- }
-
- // Create a new disambiguated org
- OrgDisambiguatedEntity newOrg = createDisambiguatedOrg(sourceId, name, orgType, country, city, region, url);
- try {
- // mark group for indexing
- new OrgGrouping(newOrg, orgDisambiguatedManager).markGroupForIndexing(orgDisambiguatedDao);
- } catch (Exception ex) {
- LOGGER.error("Error when grouping by ROR and removing related orgs solr index, eating the exception", ex);
- }
- return newOrg;
- }
-
- private void processExternalIdentifiers(OrgDisambiguatedEntity org, JsonNode institute) {
- ArrayNode nodes = institute.get("external_ids") == null ? null : (ArrayNode) institute.get("external_ids");
- if (nodes!= null) {
- for(JsonNode entry:nodes){
- String identifierTypeName = entry.get("type").asText().toUpperCase();
- String preferredId = entry.get("preferred").isNull() ? null
- : entry.get("preferred").asText();
- if (StringUtils.equalsIgnoreCase(OrgDisambiguatedSourceType.GRID.name(), identifierTypeName)) {
- JsonNode extId = (JsonNode) entry.get("all");
- setExternalId(org, identifierTypeName, preferredId, extId);
- UPDATED_RORS.add(org.getId());
- } else {
- ArrayNode elements = (ArrayNode) entry.get("all");
- for (JsonNode extId : elements) {
- setExternalId(org, identifierTypeName, preferredId, extId);
- UPDATED_RORS.add(org.getId());
- }
- }
- }
- }
- }
-
- private void setExternalId(OrgDisambiguatedEntity org, String identifierTypeName, String preferredId,
- JsonNode extId) {
- // If the external identifier doesn't exists yet
- OrgDisambiguatedExternalIdentifierEntity existingExternalId = orgDisambiguatedExternalIdentifierDao
- .findByDetails(org.getId(), extId.asText(), identifierTypeName);
- Boolean preferred = extId.asText().equals(preferredId);
- if (existingExternalId == null) {
- if (preferred) {
- createExternalIdentifier(org, extId.asText(), identifierTypeName, true);
- } else {
- createExternalIdentifier(org, extId.asText(), identifierTypeName, false);
- }
- } else {
- if (existingExternalId.getPreferred() != preferred) {
- existingExternalId.setPreferred(preferred);
- orgDisambiguatedManager.updateOrgDisambiguatedExternalIdentifier(existingExternalId);
- LOGGER.info("External identifier for {} with ext id {} and type {} was updated",
- new Object[] { org.getId(), extId.asText(), identifierTypeName });
- } else {
- LOGGER.info("External identifier for {} with ext id {} and type {} already exists",
- new Object[] { org.getId(), extId.asText(), identifierTypeName });
- }
- }
- }
-
- /**
- * Indicates if an entity changed his address, url or org type
- *
- * @return true if the entity has changed.
- */
- private boolean entityChanged(OrgDisambiguatedEntity entity, String name, String countryCode, String city,
- String region, String url, String orgType) {
- // Check name
- if (StringUtils.isNotBlank(name)) {
- if (!name.equalsIgnoreCase(entity.getName()))
- return true;
- } else if (StringUtils.isNotBlank(entity.getName())) {
- return true;
- }
- // Check country
- if (StringUtils.isNotBlank(countryCode)) {
- if (entity.getCountry() == null || !countryCode.equals(entity.getCountry())) {
- return true;
- }
- } else if (entity.getCountry() != null) {
- return true;
- }
- // Check city
- if (StringUtils.isNotBlank(city)) {
- if (entity.getCity() == null || !city.equals(entity.getCity())) {
- return true;
- }
- } else if (StringUtils.isNotBlank(entity.getCity())) {
- return true;
- }
- // Check region
- if (StringUtils.isNotBlank(region)) {
- if (entity.getRegion() == null || !region.equals(entity.getRegion())) {
- return true;
- }
- } else if (StringUtils.isNotBlank(entity.getRegion())) {
- return true;
- }
- // Check url
- if (StringUtils.isNotBlank(url)) {
- if (entity.getUrl() == null || !url.equals(entity.getUrl())) {
- return true;
- }
- } else if (StringUtils.isNotBlank(entity.getUrl())) {
- return true;
- }
- // Check org_type
- if (StringUtils.isNotBlank(orgType)) {
- if (entity.getOrgType() == null || !orgType.equals(entity.getOrgType())) {
- return true;
- }
- } else if (StringUtils.isNotBlank(entity.getOrgType())) {
- return true;
- }
-
- return false;
- }
-
- /**
- * Creates a disambiguated ORG in the org_disambiguated table
- */
- private OrgDisambiguatedEntity createDisambiguatedOrg(String sourceId, String name, String orgType,
- Iso3166Country country, String city, String region, String url) {
- LOGGER.info("Creating disambiguated org {}", name);
- OrgDisambiguatedEntity orgDisambiguatedEntity = new OrgDisambiguatedEntity();
- orgDisambiguatedEntity.setName(name);
- orgDisambiguatedEntity.setCountry(country != null ? country.name() : null);
- orgDisambiguatedEntity.setCity(city);
- orgDisambiguatedEntity.setRegion(region);
- orgDisambiguatedEntity.setUrl(url);
- orgDisambiguatedEntity.setOrgType(orgType);
- orgDisambiguatedEntity.setSourceId(sourceId);
- orgDisambiguatedEntity.setSourceType(OrgDisambiguatedSourceType.ROR.name());
- orgDisambiguatedManager.createOrgDisambiguated(orgDisambiguatedEntity);
- return orgDisambiguatedEntity;
- }
-
- /**
- * Creates an external identifier in the org_disambiguated_external_identifier
- * table
- */
- private boolean createExternalIdentifier(OrgDisambiguatedEntity disambiguatedOrg, String identifier,
- String externalIdType, Boolean preferred) {
- LOGGER.info("Creating external identifier for {}", disambiguatedOrg.getId());
- OrgDisambiguatedExternalIdentifierEntity externalIdentifier = new OrgDisambiguatedExternalIdentifierEntity();
- externalIdentifier.setIdentifier(identifier);
- externalIdentifier.setIdentifierType(externalIdType);
- externalIdentifier.setOrgDisambiguated(disambiguatedOrg);
- externalIdentifier.setPreferred(preferred);
- orgDisambiguatedManager.createOrgDisambiguatedExternalIdentifier(externalIdentifier);
- return true;
- }
-
- /**
- * Mark an existing org as DEPRECATED
- */
- private void deprecateOrg(String sourceId, String primarySourceId) {
- LOGGER.info("Deprecating org {} for {}", sourceId, primarySourceId);
- OrgDisambiguatedEntity existingEntity = orgDisambiguatedDao.findBySourceIdAndSourceType(sourceId,
- OrgDisambiguatedSourceType.ROR.name());
- if (existingEntity != null) {
- if (existingEntity.getStatus() == null
- || !existingEntity.getStatus().equals(OrganizationStatus.DEPRECATED.name())
- || !existingEntity.getSourceParentId().equals(primarySourceId)) {
- existingEntity.setStatus(OrganizationStatus.DEPRECATED.name());
- existingEntity.setSourceParentId(primarySourceId);
- existingEntity.setIndexingStatus(IndexingStatus.PENDING);
- orgDisambiguatedManager.updateOrgDisambiguated(existingEntity);
- }
- } else {
- OrgDisambiguatedEntity deprecatedEntity = new OrgDisambiguatedEntity();
- deprecatedEntity.setSourceType(OrgDisambiguatedSourceType.ROR.name());
- deprecatedEntity.setStatus(OrganizationStatus.DEPRECATED.name());
- deprecatedEntity.setSourceId(sourceId);
- deprecatedEntity.setSourceParentId(primarySourceId);
- // We don't need to index it
- deprecatedEntity.setIndexingStatus(IndexingStatus.DONE);
- orgDisambiguatedManager.createOrgDisambiguated(deprecatedEntity);
- }
- }
-
- /**
- * Mark an existing org as OBSOLETE
- */
- private void obsoleteOrg(String sourceId) {
- LOGGER.info("Marking or as obsolete {}", sourceId);
- OrgDisambiguatedEntity existingEntity = orgDisambiguatedDao.findBySourceIdAndSourceType(sourceId,
- OrgDisambiguatedSourceType.ROR.name());
- if (existingEntity != null) {
- if (existingEntity.getStatus() == null
- || !existingEntity.getStatus().equals(OrganizationStatus.OBSOLETE.name())) {
- existingEntity.setStatus(OrganizationStatus.OBSOLETE.name());
- existingEntity.setIndexingStatus(IndexingStatus.PENDING);
- orgDisambiguatedManager.updateOrgDisambiguated(existingEntity);
- new OrgGrouping(existingEntity, orgDisambiguatedManager)
- .ungroupObsoleteRorForIndexing(orgDisambiguatedDao);
- }
- } else {
- OrgDisambiguatedEntity obsoletedEntity = new OrgDisambiguatedEntity();
- obsoletedEntity.setSourceType(OrgDisambiguatedSourceType.ROR.name());
- obsoletedEntity.setStatus(OrganizationStatus.OBSOLETE.name());
- obsoletedEntity.setSourceId(sourceId);
- // We don't need to index it
- obsoletedEntity.setIndexingStatus(IndexingStatus.DONE);
- orgDisambiguatedManager.createOrgDisambiguated(obsoletedEntity);
- new OrgGrouping(obsoletedEntity, orgDisambiguatedManager)
- .ungroupObsoleteRorForIndexing(orgDisambiguatedDao);
- }
- }
-
- @Override
- public boolean isEnabled() {
- return enabled;
- }
-
- private void groupRORsWithUpdatedExternalModifiers() {
- for (Long id : UPDATED_RORS) {
- OrgDisambiguatedEntity entity = orgDisambiguatedDao.find(id);
- if (entity != null) {
- entity.setIndexingStatus(IndexingStatus.PENDING);
- try {
- // mark group for indexing
- new OrgGrouping(entity, orgDisambiguatedManager).markGroupForIndexing(orgDisambiguatedDao);
-
- } catch (Exception ex) {
- LOGGER.error(
- "Error when grouping by ROR and marking group orgs for reindexing, eating the exception",
- ex);
- }
- entity = orgDisambiguatedManager.updateOrgDisambiguated(entity);
-
- }
- }
- }
+ private static final Logger LOGGER = LoggerFactory.getLogger(RorOrgLoadSource.class);
+ // NOTE(review): WIKIPEDIA_URL is not referenced in the visible part of this class - confirm it is still used.
+ private static final String WIKIPEDIA_URL = "wikipedia_url";
+ // Switch for this source, true by default; loadOrgData() throws LoadSourceDisabledException when it is false.
+ @Value("${org.orcid.core.orgs.ror.enabled:true}")
+ private boolean enabled;
+ // Value handed to orgDataClient as the user agent of the Zenodo requests; no default is configured.
+ @Value("${org.orcid.core.orgs.clients.userAgent}")
+ private String userAgent;
+ // Client used by downloadOrgData() to query the Zenodo records URL and to download the ROR zip.
+ @Resource(name = "rorOrgDataClient")
+ private OrgDataClient orgDataClient;
+ // Where the downloaded ROR zip is stored; defaults to /tmp/grid/ror.zip.
+ @Value("${org.orcid.core.orgs.ror.localZipPath:/tmp/grid/ror.zip}")
+ private String zipFilePath;
+ // Where unzipData() writes the extracted JSON and loadData() reads it; defaults to /tmp/grid/ror.json.
+ @Value("${org.orcid.core.orgs.ror.localDataPath:/tmp/grid/ror.json}")
+ private String localDataPath;
+ // When true, processInstitute() rewrites existing orgs and marks them PENDING even if entityChanged() reports no difference; false by default.
+ @Value("${org.orcid.core.orgs.ror.indexAllEnabled:false}")
+ private boolean indexAllEnabled;
+ // DAO used to look up orgs by source id and source type.
+ @Resource
+ private OrgDisambiguatedDao orgDisambiguatedDao;
+ // Manager through which orgs and their external identifiers are created and updated.
+ @Resource
+ private OrgDisambiguatedManager orgDisambiguatedManager;
+ // DAO used by setExternalId() to find an already stored external identifier.
+ @Resource
+ private OrgDisambiguatedExternalIdentifierDao orgDisambiguatedExternalIdentifierDao;
+ // Zenodo records query URL; downloadOrgData() appends "&sort=mostrecent&size=1" to it.
+ @Value("${org.orcid.core.orgs.ror.zenodoRecordsUrl:https://zenodo.org/api/records?communities=ror-data}")
+ private String rorZenodoRecordsUrl;
+ // Used by downloadOrgData() to remove the previous zip and JSON files before downloading.
+ @Resource
+ private FileRotator fileRotator;
+
+    /**
+     * Ids of the orgs whose external identifiers were processed during the current load.
+     * Re-created by loadData() on every run and filled by processExternalIdentifiers().
+     * Typed as Set&lt;Long&gt; because the ids are iterated as Long values.
+     */
+    private Set<Long> UPDATED_RORS;
+
+    /**
+     * Returns the fixed name that identifies this load source.
+     *
+     * @return the literal "ROR"
+     */
+    @Override
+    public String getSourceName() {
+        final String sourceName = "ROR";
+        return sourceName;
+    }
+
+    /**
+     * Loads the previously downloaded ROR data when this source is enabled.
+     *
+     * @return the result of loadData()
+     * @throws LoadSourceDisabledException when the source is disabled by configuration
+     */
+    @Override
+    public boolean loadOrgData() {
+        if (enabled) {
+            return loadData();
+        }
+        throw new LoadSourceDisabledException(getSourceName());
+    }
+
+    /**
+     * Downloads the most recent ROR dump published on Zenodo and extracts its JSON file.
+     * <p>
+     * The previous zip and JSON files are removed first. Every failure is logged and reported
+     * through the return value; no exception leaves this method.
+     *
+     * @return true when the zip was downloaded and unzipped, false otherwise
+     */
+    @Override
+    public boolean downloadOrgData() {
+        try {
+            fileRotator.removeFileIfExists(zipFilePath);
+            fileRotator.removeFileIfExists(localDataPath);
+
+            ZenodoRecords zenodoRecords = orgDataClient.get(rorZenodoRecordsUrl + "&sort=mostrecent&size=1", userAgent, ZenodoRecords.class);
+            ZenodoRecordsHit zenodoHit = zenodoRecords.getHits().getHits().get(0);
+
+            int fileCount = zenodoHit.getFiles().size();
+            if (fileCount == 0) {
+                // Explicit check instead of relying on an IndexOutOfBoundsException from get(0)
+                LOGGER.error("Latest Zenodo ROR record lists no files to download");
+                return false;
+            }
+
+            // The query is sorted by mostrecent with size 1; the last file of that record
+            // is the one holding the latest version
+            String zenodoUrl = zenodoHit.getFiles().get(fileCount - 1).getLinks().getSelf();
+            LOGGER.info("Retrieving ROR data from: {}", zenodoUrl);
+            if (!orgDataClient.downloadFile(zenodoUrl, userAgent, zipFilePath)) {
+                // Nothing usable to unzip, the result would have been false anyway
+                LOGGER.error("Unable to download ROR data from {}", zenodoUrl);
+                return false;
+            }
+
+            try {
+                LOGGER.info("Unzipping ROR ....");
+                unzipData();
+            } catch (IOException e) {
+                // Reported once here instead of being wrapped and logged a second time below
+                LOGGER.error("Error unzipping Zenodo ROR data", e);
+                return false;
+            }
+            return true;
+        } catch (Exception e) {
+            LOGGER.error("Error downloading Zenodo ROR data", e);
+            return false;
+        }
+    }
+
+    /**
+     * Extracts the first entry whose name ends in "v2.json" from the zip at zipFilePath
+     * and writes it to localDataPath. Nothing is written when the zip has no such entry.
+     *
+     * @throws IOException when the zip cannot be read or the JSON file cannot be written
+     */
+    private void unzipData() throws IOException {
+        byte[] buffer = new byte[1024];
+        // try-with-resources closes both streams even when reading or writing fails
+        try (ZipInputStream zis = new ZipInputStream(new FileInputStream(zipFilePath))) {
+            for (ZipEntry zipEntry = zis.getNextEntry(); zipEntry != null; zipEntry = zis.getNextEntry()) {
+                if (!zipEntry.getName().endsWith("v2.json")) {
+                    continue;
+                }
+                try (FileOutputStream fos = new FileOutputStream(new File(localDataPath))) {
+                    int len;
+                    while ((len = zis.read(buffer)) > 0) {
+                        fos.write(buffer, 0, len);
+                    }
+                }
+                // Only the first matching entry is extracted
+                return;
+            }
+        }
+    }
+
+    /**
+     * Reads the ROR JSON dump at localDataPath and applies every record to the stored orgs.
+     * <p>
+     * Records with status active or inactive are created or updated together with their
+     * external identifiers, redirected records are deprecated, and withdrawn or obsolete
+     * records are marked obsolete. Any other status is logged as an error.
+     *
+     * @return true when the whole file was processed; false when the file is missing or
+     *         an exception interrupts the processing
+     */
+    private boolean loadData() {
+        try {
+            LOGGER.info("Loading ROR data...");
+            Instant start = Instant.now();
+            File fileToLoad = new File(localDataPath);
+            if (!fileToLoad.exists()) {
+                LOGGER.error("File {} doesn't exist", localDataPath);
+                return false;
+            }
+
+            // ror returns the JSON as Array of institutes
+            JsonNode rootNode = JsonUtils.read(fileToLoad);
+            UPDATED_RORS = new HashSet<Long>();
+
+            rootNode.forEach(institute -> {
+                String sourceId = institute.get("id").isNull() ? null : institute.get("id").asText();
+                String status = institute.get("status").isNull() ? null : institute.get("status").asText();
+                if ("active".equalsIgnoreCase(status) || "inactive".equalsIgnoreCase(status)) {
+                    ArrayNode namesNode = institute.get("names").isNull() ? null : (ArrayNode) institute.get("names");
+                    String name = null;
+                    String namesJson = null;
+
+                    if (namesNode != null) {
+                        for (JsonNode nameJson : namesNode) {
+                            JsonNode nameTypes = nameJson.get("types");
+                            if (nameTypes == null || nameTypes.isNull()) {
+                                // A name without types cannot be the display name; iterating
+                                // a null reference here used to abort the whole load
+                                continue;
+                            }
+                            for (JsonNode nameType : nameTypes) {
+                                if (StringUtils.equalsIgnoreCase(nameType.asText(), "ror_display")) {
+                                    name = nameJson.get("value").asText();
+                                    break;
+                                }
+                            }
+                        }
+                        // The complete names array is stored as is
+                        namesJson = namesNode.toString();
+                    }
+
+                    StringJoiner sj = new StringJoiner(",");
+                    String orgType = null;
+                    if (!institute.get("types").isNull()) {
+                        ((ArrayNode) institute.get("types")).forEach(x -> sj.add(x.textValue()));
+                        orgType = sj.toString();
+                    }
+
+                    // location node
+                    ArrayNode locationsNode = institute.get("locations").isNull() ? null : (ArrayNode) institute.get("locations");
+                    Iso3166Country country = null;
+                    // region is never assigned from the ROR data in this method
+                    String region = null;
+                    String city = null;
+                    String locationsJson = null;
+
+                    if (locationsNode != null) {
+                        for (JsonNode locationJson : locationsNode) {
+                            JsonNode geoDetailsNode = locationJson.get("geonames_details").isNull() ? null : locationJson.get("geonames_details");
+
+                            if (geoDetailsNode != null) {
+                                String countryCode = geoDetailsNode.get("country_code").isNull() ? null : geoDetailsNode.get("country_code").asText();
+                                country = StringUtils.isBlank(countryCode) ? null : Iso3166Country.fromValue(countryCode);
+                                // for now storing just the first location
+                                city = geoDetailsNode.get("name").isNull() ? null : geoDetailsNode.get("name").asText();
+                                if (country != null) {
+                                    break;
+                                }
+                            }
+                        }
+                        // The complete locations array is stored as is
+                        locationsJson = locationsNode.toString();
+                    }
+
+                    ArrayNode urls = institute.get("links").isNull() ? null : (ArrayNode) institute.get("links");
+                    // Use the first URL
+                    String url = (urls != null && urls.size() > 0) ? urls.get(0).asText() : null;
+
+                    // Creates or updates an institute
+                    OrgDisambiguatedEntity entity = processInstitute(sourceId, name, country, city, region, url, orgType, locationsJson, namesJson);
+
+                    // Creates external identifiers
+                    processExternalIdentifiers(entity, institute);
+                } else if ("redirected".equals(status)) {
+                    String primaryId = institute.get("redirect").isNull() ? null : institute.get("redirect").asText();
+                    deprecateOrg(sourceId, primaryId);
+                } else if ("withdrawn".equals(status) || "obsolete".equals(status)) {
+                    obsoleteOrg(sourceId);
+                } else {
+                    LOGGER.error("Illegal status '{}' for institute {}", status, sourceId);
+                }
+            });
+
+            // Check if any RORs with external identifiers updated and group them
+            groupRORsWithUpdatedExternalModifiers();
+
+            LOGGER.info("Time taken to process the data: {}", Duration.between(start, Instant.now()).toString());
+            return true;
+        } catch (Exception e) {
+            LOGGER.error("Error loading ROR data", e);
+            return false;
+        }
+    }
+
+    /**
+     * Creates the ROR org identified by sourceId, or updates it when it already exists.
+     * <p>
+     * An existing org is rewritten and marked PENDING for indexing when one of its basic
+     * fields or one of its JSON columns differs from the incoming values, or when
+     * indexAllEnabled is set. Failures while marking the group for indexing are logged
+     * and do not interrupt the processing.
+     *
+     * @param sourceId      ROR id of the org
+     * @param name          display name, may be null
+     * @param country       resolved country, may be null when no location provides one
+     * @param city          city name, may be null
+     * @param region        region, may be null
+     * @param url           first link of the record, may be null
+     * @param orgType       comma separated org types, may be null
+     * @param locationsJson serialized "locations" array, may be null
+     * @param namesJson     serialized "names" array, may be null
+     * @return the existing entity (updated or untouched) or the newly created one
+     */
+    private OrgDisambiguatedEntity processInstitute(String sourceId, String name, Iso3166Country country, String city, String region, String url, String orgType,
+            String locationsJson, String namesJson) {
+        OrgDisambiguatedEntity existingBySourceId = orgDisambiguatedDao.findBySourceIdAndSourceType(sourceId, OrgDisambiguatedSourceType.ROR.name());
+        if (existingBySourceId != null) {
+            // country is null when no location carries a usable country code
+            String countryCode = country != null ? country.value() : null;
+            // entityChanged() does not know about the JSON columns, so they are compared here
+            boolean jsonChanged = !StringUtils.equals(locationsJson, existingBySourceId.getLocationsJson())
+                    || !StringUtils.equals(namesJson, existingBySourceId.getNamesJson());
+            if (entityChanged(existingBySourceId, name, countryCode, city, region, url, orgType) || jsonChanged || indexAllEnabled) {
+                existingBySourceId.setCity(city);
+                existingBySourceId.setCountry(country != null ? country.name() : null);
+                existingBySourceId.setName(name);
+                existingBySourceId.setOrgType(orgType);
+                existingBySourceId.setRegion(region);
+                existingBySourceId.setUrl(url);
+                existingBySourceId.setLocationsJson(locationsJson);
+                existingBySourceId.setNamesJson(namesJson);
+
+                existingBySourceId.setIndexingStatus(IndexingStatus.PENDING);
+                try {
+                    // mark group for indexing
+                    new OrgGrouping(existingBySourceId, orgDisambiguatedManager).markGroupForIndexing(orgDisambiguatedDao);
+                } catch (Exception ex) {
+                    LOGGER.error("Error when grouping by ROR and marking group orgs for reindexing, eating the exception", ex);
+                }
+                orgDisambiguatedManager.updateOrgDisambiguated(existingBySourceId);
+            }
+            return existingBySourceId;
+        }
+
+        // Create a new disambiguated org
+        OrgDisambiguatedEntity newOrg = createDisambiguatedOrg(sourceId, name, orgType, country, city, region, url);
+        if (locationsJson != null || namesJson != null) {
+            // createDisambiguatedOrg() has no parameters for the JSON columns, so they are
+            // stored right after the creation instead of waiting for the next load
+            newOrg.setLocationsJson(locationsJson);
+            newOrg.setNamesJson(namesJson);
+            orgDisambiguatedManager.updateOrgDisambiguated(newOrg);
+        }
+        try {
+            // mark group for indexing
+            new OrgGrouping(newOrg, orgDisambiguatedManager).markGroupForIndexing(orgDisambiguatedDao);
+        } catch (Exception ex) {
+            LOGGER.error("Error when grouping by ROR and removing related orgs solr index, eating the exception", ex);
+        }
+        return newOrg;
+    }
+
+    /**
+     * Stores the external identifiers listed under "external_ids" of a ROR record for the given org
+     * and remembers the org id in UPDATED_RORS for the grouping step.
+     * <p>
+     * The "all" value of an entry is accepted both as an array of identifiers and as a single
+     * value, whatever the identifier type. Entries without an "all" value are skipped.
+     *
+     * @param org       org the identifiers belong to
+     * @param institute ROR record being processed
+     */
+    private void processExternalIdentifiers(OrgDisambiguatedEntity org, JsonNode institute) {
+        JsonNode nodes = institute.get("external_ids");
+        if (nodes == null || !nodes.isArray()) {
+            // Missing field or JSON null: nothing to store
+            return;
+        }
+        for (JsonNode entry : nodes) {
+            String identifierTypeName = entry.get("type").asText().toUpperCase();
+            JsonNode preferredNode = entry.get("preferred");
+            String preferredId = (preferredNode == null || preferredNode.isNull()) ? null : preferredNode.asText();
+            JsonNode all = entry.get("all");
+            if (all == null || all.isNull()) {
+                continue;
+            }
+            if (all.isArray()) {
+                for (JsonNode extId : all) {
+                    setExternalId(org, identifierTypeName, preferredId, extId);
+                    UPDATED_RORS.add(org.getId());
+                }
+            } else {
+                // Single value, the shape the GRID branch used to assume
+                setExternalId(org, identifierTypeName, preferredId, all);
+                UPDATED_RORS.add(org.getId());
+            }
+        }
+    }
+
+    /**
+     * Creates the external identifier for the org when it is not stored yet; otherwise
+     * updates its preferred flag when that flag differs from the incoming data.
+     *
+     * @param org                org the identifier belongs to
+     * @param identifierTypeName upper-cased identifier type
+     * @param preferredId        identifier flagged as preferred for this type, may be null
+     * @param extId              node holding the identifier value
+     */
+    private void setExternalId(OrgDisambiguatedEntity org, String identifierTypeName, String preferredId, JsonNode extId) {
+        String identifier = extId.asText();
+        boolean preferred = identifier.equals(preferredId);
+        OrgDisambiguatedExternalIdentifierEntity existingExternalId = orgDisambiguatedExternalIdentifierDao.findByDetails(org.getId(), identifier,
+                identifierTypeName);
+        if (existingExternalId == null) {
+            // The external identifier doesn't exist yet
+            createExternalIdentifier(org, identifier, identifierTypeName, preferred);
+            return;
+        }
+        // Compared by value: '!=' between two boxed Booleans would compare references.
+        // This form is safe whether getPreferred() returns boolean or Boolean, including null.
+        if (!Boolean.valueOf(preferred).equals(existingExternalId.getPreferred())) {
+            existingExternalId.setPreferred(preferred);
+            orgDisambiguatedManager.updateOrgDisambiguatedExternalIdentifier(existingExternalId);
+            LOGGER.info("External identifier for {} with ext id {} and type {} was updated", org.getId(), identifier, identifierTypeName);
+        } else {
+            LOGGER.info("External identifier for {} with ext id {} and type {} already exists", org.getId(), identifier, identifierTypeName);
+        }
+    }
+
+    /**
+     * Tells whether the stored entity differs from the incoming name, country, city,
+     * region, url or org type.
+     * <p>
+     * The name is compared ignoring case, the other values exactly. A blank incoming
+     * value counts as a change only when the stored value is present (non-null for the
+     * country, non-blank for the others).
+     *
+     * @return true if the entity has changed.
+     */
+    private boolean entityChanged(OrgDisambiguatedEntity entity, String name, String countryCode, String city, String region, String url, String orgType) {
+        // Evaluated left to right, stopping at the first difference found
+        return
+                // name
+                (StringUtils.isNotBlank(name) ? !name.equalsIgnoreCase(entity.getName()) : StringUtils.isNotBlank(entity.getName()))
+                // country
+                || (StringUtils.isNotBlank(countryCode) ? !countryCode.equals(entity.getCountry()) : entity.getCountry() != null)
+                // city
+                || (StringUtils.isNotBlank(city) ? !city.equals(entity.getCity()) : StringUtils.isNotBlank(entity.getCity()))
+                // region
+                || (StringUtils.isNotBlank(region) ? !region.equals(entity.getRegion()) : StringUtils.isNotBlank(entity.getRegion()))
+                // url
+                || (StringUtils.isNotBlank(url) ? !url.equals(entity.getUrl()) : StringUtils.isNotBlank(entity.getUrl()))
+                // org_type
+                || (StringUtils.isNotBlank(orgType) ? !orgType.equals(entity.getOrgType()) : StringUtils.isNotBlank(entity.getOrgType()));
+    }
+
+    /**
+     * Builds a new ROR org from the given values and hands it to the manager for creation
+     * in the org_disambiguated table.
+     *
+     * @return the entity that was passed to the manager
+     */
+    private OrgDisambiguatedEntity createDisambiguatedOrg(String sourceId, String name, String orgType, Iso3166Country country, String city, String region, String url) {
+        LOGGER.info("Creating disambiguated org {}", name);
+        String countryName = null;
+        if (country != null) {
+            countryName = country.name();
+        }
+
+        OrgDisambiguatedEntity created = new OrgDisambiguatedEntity();
+        created.setName(name);
+        created.setCountry(countryName);
+        created.setCity(city);
+        created.setRegion(region);
+        created.setUrl(url);
+        created.setOrgType(orgType);
+        created.setSourceId(sourceId);
+        created.setSourceType(OrgDisambiguatedSourceType.ROR.name());
+
+        orgDisambiguatedManager.createOrgDisambiguated(created);
+        return created;
+    }
+
+ /**
+  * Creates an external identifier entity that links the given identifier
+  * and identifier type to the disambiguated org, and passes it to the org
+  * disambiguated manager for creation.
+  *
+  * @return always true.
+  */
+ private boolean createExternalIdentifier(OrgDisambiguatedEntity disambiguatedOrg, String identifier, String externalIdType, Boolean preferred) {
+     LOGGER.info("Creating external identifier for {}", disambiguatedOrg.getId());
+     OrgDisambiguatedExternalIdentifierEntity newExtId = new OrgDisambiguatedExternalIdentifierEntity();
+     newExtId.setOrgDisambiguated(disambiguatedOrg);
+     newExtId.setIdentifierType(externalIdType);
+     newExtId.setIdentifier(identifier);
+     newExtId.setPreferred(preferred);
+     orgDisambiguatedManager.createOrgDisambiguatedExternalIdentifier(newExtId);
+     return true;
+ }
+
+ /**
+  * Marks the ROR org identified by sourceId as DEPRECATED, recording
+  * primarySourceId as its parent. An existing entity is updated (and set
+  * to pending indexing) only when it is not already deprecated with that
+  * same parent. When no entity is found, a new deprecated one is created.
+  *
+  * @param sourceId
+  *            the source id of the org being deprecated
+  * @param primarySourceId
+  *            the source id stored as the parent of the deprecated org
+  */
+ private void deprecateOrg(String sourceId, String primarySourceId) {
+     LOGGER.info("Deprecating org {} for {}", sourceId, primarySourceId);
+     String deprecatedStatus = OrganizationStatus.DEPRECATED.name();
+     OrgDisambiguatedEntity existingEntity = orgDisambiguatedDao.findBySourceIdAndSourceType(sourceId, OrgDisambiguatedSourceType.ROR.name());
+     if (existingEntity != null) {
+         // Both comparisons are null-safe: an entity that is already
+         // deprecated may have no parent id stored, which previously
+         // caused a NullPointerException here
+         boolean alreadyDeprecated = deprecatedStatus.equals(existingEntity.getStatus());
+         boolean sameParent = java.util.Objects.equals(existingEntity.getSourceParentId(), primarySourceId);
+         if (!alreadyDeprecated || !sameParent) {
+             existingEntity.setStatus(deprecatedStatus);
+             existingEntity.setSourceParentId(primarySourceId);
+             existingEntity.setIndexingStatus(IndexingStatus.PENDING);
+             orgDisambiguatedManager.updateOrgDisambiguated(existingEntity);
+         }
+     } else {
+         OrgDisambiguatedEntity deprecatedEntity = new OrgDisambiguatedEntity();
+         deprecatedEntity.setSourceType(OrgDisambiguatedSourceType.ROR.name());
+         deprecatedEntity.setStatus(deprecatedStatus);
+         deprecatedEntity.setSourceId(sourceId);
+         deprecatedEntity.setSourceParentId(primarySourceId);
+         // We don't need to index it
+         deprecatedEntity.setIndexingStatus(IndexingStatus.DONE);
+         orgDisambiguatedManager.createOrgDisambiguated(deprecatedEntity);
+     }
+ }
+
+ /**
+ * Mark an existing org as OBSOLETE
+ */
+ private void obsoleteOrg(String sourceId) {
+ LOGGER.info("Marking or as obsolete {}", sourceId);
+ OrgDisambiguatedEntity existingEntity = orgDisambiguatedDao.findBySourceIdAndSourceType(sourceId, OrgDisambiguatedSourceType.ROR.name());
+ if (existingEntity != null) {
+ if (existingEntity.getStatus() == null || !existingEntity.getStatus().equals(OrganizationStatus.OBSOLETE.name())) {
+ existingEntity.setStatus(OrganizationStatus.OBSOLETE.name());
+ existingEntity.setIndexingStatus(IndexingStatus.PENDING);
+ orgDisambiguatedManager.updateOrgDisambiguated(existingEntity);
+ new OrgGrouping(existingEntity, orgDisambiguatedManager).ungroupObsoleteRorForIndexing(orgDisambiguatedDao);
+ }
+ } else {
+ OrgDisambiguatedEntity obsoletedEntity = new OrgDisambiguatedEntity();
+ obsoletedEntity.setSourceType(OrgDisambiguatedSourceType.ROR.name());
+ obsoletedEntity.setStatus(OrganizationStatus.OBSOLETE.name());
+ obsoletedEntity.setSourceId(sourceId);
+ // We don't need to index it
+ obsoletedEntity.setIndexingStatus(IndexingStatus.DONE);
+ orgDisambiguatedManager.createOrgDisambiguated(obsoletedEntity);
+ new OrgGrouping(obsoletedEntity, orgDisambiguatedManager).ungroupObsoleteRorForIndexing(orgDisambiguatedDao);
+ }
+ }
+
+ /**
+  * @return the value of the enabled flag held by this load source.
+  */
+ @Override
+ public boolean isEnabled() {
+     return this.enabled;
+ }
+
+ /**
+  * Goes through the ids collected in UPDATED_RORS. Each org found by the
+  * DAO is set to pending indexing, its group is marked for indexing through
+  * OrgGrouping, and the org is then saved. A failure while marking the
+  * group is logged and does not prevent the org from being saved.
+  */
+ private void groupRORsWithUpdatedExternalModifiers() {
+     for (Long rorId : UPDATED_RORS) {
+         OrgDisambiguatedEntity updatedOrg = orgDisambiguatedDao.find(rorId);
+         if (updatedOrg == null) {
+             continue;
+         }
+         updatedOrg.setIndexingStatus(IndexingStatus.PENDING);
+         try {
+             // mark group for indexing
+             OrgGrouping grouping = new OrgGrouping(updatedOrg, orgDisambiguatedManager);
+             grouping.markGroupForIndexing(orgDisambiguatedDao);
+         } catch (Exception ex) {
+             LOGGER.error("Error when grouping by ROR and marking group orgs for reindexing, eating the exception", ex);
+         }
+         orgDisambiguatedManager.updateOrgDisambiguated(updatedOrg);
+     }
+ }
}
\ No newline at end of file