From 116b6562351e8cae551f26ca242ae9e2fd44c27e Mon Sep 17 00:00:00 2001 From: "c.dumitru@orcid.org" Date: Thu, 19 Oct 2023 09:57:13 +0300 Subject: [PATCH 1/5] Added the code to clean up duplicated external identifiers on entity update --- .../core/manager/OrgDisambiguatedManager.java | 2 + .../impl/OrgDisambiguatedManagerImpl.java | 42 +++++++++++++++++++ 2 files changed, 44 insertions(+) diff --git a/orcid-core/src/main/java/org/orcid/core/manager/OrgDisambiguatedManager.java b/orcid-core/src/main/java/org/orcid/core/manager/OrgDisambiguatedManager.java index 3dd8c72cad0..76851b41b23 100644 --- a/orcid-core/src/main/java/org/orcid/core/manager/OrgDisambiguatedManager.java +++ b/orcid-core/src/main/java/org/orcid/core/manager/OrgDisambiguatedManager.java @@ -36,5 +36,7 @@ public interface OrgDisambiguatedManager { void createOrgDisambiguatedExternalIdentifier(OrgDisambiguatedExternalIdentifierEntity identifier); public List findOrgDisambiguatedIdsForSameExternalIdentifier(String identifier, String type); + + public void cleanDuplicatedExternalIdentifiersForOrgDisambiguated(OrgDisambiguatedEntity orgDisambiguatedEntity); } diff --git a/orcid-core/src/main/java/org/orcid/core/manager/impl/OrgDisambiguatedManagerImpl.java b/orcid-core/src/main/java/org/orcid/core/manager/impl/OrgDisambiguatedManagerImpl.java index 2a074b34149..90a70ec4788 100644 --- a/orcid-core/src/main/java/org/orcid/core/manager/impl/OrgDisambiguatedManagerImpl.java +++ b/orcid-core/src/main/java/org/orcid/core/manager/impl/OrgDisambiguatedManagerImpl.java @@ -67,6 +67,9 @@ public class OrgDisambiguatedManagerImpl implements OrgDisambiguatedManager { @Value("${org.orcid.persistence.messaging.updated.disambiguated_org.solr:indexDisambiguatedOrgs}") private String updateSolrQueueName; + + @Value("${org.orcid.core.cleanExtIdsOnOrgUpdate:true}") + private boolean cleanDuplicateExtIdOnOrgUpdate; @Resource(name = "jmsMessageSender") private JmsMessageSender messaging; @@ -219,6 +222,9 @@ public List 
searchOrgsFromSolrForSelfService(String searchTerm @Override public OrgDisambiguatedEntity updateOrgDisambiguated(OrgDisambiguatedEntity orgDisambiguatedEntity) { normalizeExternalIdentifiers(orgDisambiguatedEntity); + if(cleanDuplicateExtIdOnOrgUpdate) { + cleanDuplicatedExternalIdentifiersForOrgDisambiguated(orgDisambiguatedEntity); + } return orgDisambiguatedDao.merge(orgDisambiguatedEntity); } @@ -346,5 +352,41 @@ private void normalizeExternalIdentifiers(OrgDisambiguatedEntity orgDisambiguate } } } + + public void cleanDuplicatedExternalIdentifiersForOrgDisambiguated(OrgDisambiguatedEntity orgDisambiguatedEntity) { + if (orgDisambiguatedEntity.getExternalIdentifiers() != null) { + HashMap extIdsMapping = new HashMap(); + String extIdentifierKey ; + OrgDisambiguatedExternalIdentifierEntity mappedExtIdentifier; + List duplicatedExtIdentifiersToBeRemoved = new ArrayList(); + for (OrgDisambiguatedExternalIdentifierEntity identifier : orgDisambiguatedEntity.getExternalIdentifiers()) { + extIdentifierKey = identifier.getIdentifierType() + "::" + identifier.getIdentifier(); + if(extIdsMapping.containsKey(extIdentifierKey)) { + + if(!identifier.getPreferred()) { + duplicatedExtIdentifiersToBeRemoved.add(identifier); + } + else { + mappedExtIdentifier = extIdsMapping.get(extIdentifierKey); + duplicatedExtIdentifiersToBeRemoved.add(mappedExtIdentifier); + extIdsMapping.put(extIdentifierKey, identifier); + } + + } + } + //remove the duplicates from DB + try { + LOGGER.info("About to remove " + duplicatedExtIdentifiersToBeRemoved.size() + " duplicate external Ids for Disambiguated Org " + orgDisambiguatedEntity.getId() ); + duplicatedExtIdentifiersToBeRemoved.stream().forEach((e) -> { + orgDisambiguatedExternalIdentifierDao.remove(e); + LOGGER.debug("Removed ext id " + e.getIdentifierType() + "::" + e.getIdentifier() + "::" + e.getId()); + }); + + } catch (Exception ex) { + LOGGER.error("Exception when removing duplicate external ids for Disambiguated Org " + 
orgDisambiguatedEntity.getId(), ex); + } + } + + } } From bbfe9bc5ca4bf8239f0758ead648a6481df02bc7 Mon Sep 17 00:00:00 2001 From: "c.dumitru@orcid.org" Date: Thu, 19 Oct 2023 10:41:52 +0300 Subject: [PATCH 2/5] Formatting --- .../impl/OrgDisambiguatedManagerImpl.java | 65 +++++++++---------- 1 file changed, 31 insertions(+), 34 deletions(-) diff --git a/orcid-core/src/main/java/org/orcid/core/manager/impl/OrgDisambiguatedManagerImpl.java b/orcid-core/src/main/java/org/orcid/core/manager/impl/OrgDisambiguatedManagerImpl.java index 90a70ec4788..caa208d0e28 100644 --- a/orcid-core/src/main/java/org/orcid/core/manager/impl/OrgDisambiguatedManagerImpl.java +++ b/orcid-core/src/main/java/org/orcid/core/manager/impl/OrgDisambiguatedManagerImpl.java @@ -52,7 +52,7 @@ public class OrgDisambiguatedManagerImpl implements OrgDisambiguatedManager { @Resource private OrgDao orgDao; - + @Resource private OrgDisambiguatedExternalIdentifierDao orgDisambiguatedExternalIdentifierDao; @@ -67,7 +67,7 @@ public class OrgDisambiguatedManagerImpl implements OrgDisambiguatedManager { @Value("${org.orcid.persistence.messaging.updated.disambiguated_org.solr:indexDisambiguatedOrgs}") private String updateSolrQueueName; - + @Value("${org.orcid.core.cleanExtIdsOnOrgUpdate:true}") private boolean cleanDuplicateExtIdOnOrgUpdate; @@ -79,7 +79,7 @@ public class OrgDisambiguatedManagerImpl implements OrgDisambiguatedManager { @Value("${org.orcid.persistence.messaging.updated.disambiguated_org.indexing.batchSize:1000}") private int indexingBatchSize; - + @Override synchronized public void processOrgsForIndexing() { LOGGER.info("About to process disambiguated orgs for indexing"); @@ -106,7 +106,7 @@ synchronized public void markOrgsForIndexingAsGroup() { entities = orgDisambiguatedDaoReadOnly.findOrgsToGroup(startIndex, indexingBatchSize); LOGGER.info("GROUP: Found chunk of {} disambiguated orgs for indexing as group", entities.size()); for (OrgDisambiguatedEntity entity : entities) { - + new 
OrgGrouping(entity, this).markGroupForIndexing(orgDisambiguatedDao); } startIndex = startIndex + indexingBatchSize; @@ -150,7 +150,7 @@ private OrgDisambiguatedSolrDocument convertEntityToDocument(OrgDisambiguatedEnt document.setOrgDisambiguatedPopularity(entity.getPopularity()); Set orgNames = new HashSet<>(); orgNames.add(entity.getName()); - + List orgs = orgDao.findByOrgDisambiguatedId(entity.getId()); if (orgs != null) { for (OrgEntity org : orgs) { @@ -222,7 +222,7 @@ public List searchOrgsFromSolrForSelfService(String searchTerm @Override public OrgDisambiguatedEntity updateOrgDisambiguated(OrgDisambiguatedEntity orgDisambiguatedEntity) { normalizeExternalIdentifiers(orgDisambiguatedEntity); - if(cleanDuplicateExtIdOnOrgUpdate) { + if (cleanDuplicateExtIdOnOrgUpdate) { cleanDuplicatedExternalIdentifiersForOrgDisambiguated(orgDisambiguatedEntity); } return orgDisambiguatedDao.merge(orgDisambiguatedEntity); @@ -277,17 +277,16 @@ public void updateOrgDisambiguatedExternalIdentifier(OrgDisambiguatedExternalIde orgDisambiguatedExternalIdentifierDao.merge(identifier); } - public List findOrgDisambiguatedIdsForSameExternalIdentifier( String identifier, String type ) { + public List findOrgDisambiguatedIdsForSameExternalIdentifier(String identifier, String type) { List orgDisambiguatedIds = new ArrayList(); List extIds = orgDisambiguatedExternalIdentifierDao.findByIdentifierIdAndType(identifier, type); - extIds.stream().forEach((e) -> - { - OrgDisambiguatedEntity de = e.getOrgDisambiguated(); - // Group only if it is not a RINGGOLD org - if(de != null && !OrgDisambiguatedSourceType.RINGGOLD.name().equals(de.getSourceType())) { - orgDisambiguatedIds.add(convertEntity(de)); - } - }); + extIds.stream().forEach((e) -> { + OrgDisambiguatedEntity de = e.getOrgDisambiguated(); + // Group only if it is not a RINGGOLD org + if (de != null && !OrgDisambiguatedSourceType.RINGGOLD.name().equals(de.getSourceType())) { + orgDisambiguatedIds.add(convertEntity(de)); + } + }); 
return orgDisambiguatedIds; } @@ -352,41 +351,39 @@ private void normalizeExternalIdentifiers(OrgDisambiguatedEntity orgDisambiguate } } } - + public void cleanDuplicatedExternalIdentifiersForOrgDisambiguated(OrgDisambiguatedEntity orgDisambiguatedEntity) { if (orgDisambiguatedEntity.getExternalIdentifiers() != null) { HashMap extIdsMapping = new HashMap(); - String extIdentifierKey ; + String extIdentifierKey; OrgDisambiguatedExternalIdentifierEntity mappedExtIdentifier; List duplicatedExtIdentifiersToBeRemoved = new ArrayList(); for (OrgDisambiguatedExternalIdentifierEntity identifier : orgDisambiguatedEntity.getExternalIdentifiers()) { - extIdentifierKey = identifier.getIdentifierType() + "::" + identifier.getIdentifier(); - if(extIdsMapping.containsKey(extIdentifierKey)) { - - if(!identifier.getPreferred()) { - duplicatedExtIdentifiersToBeRemoved.add(identifier); - } - else { - mappedExtIdentifier = extIdsMapping.get(extIdentifierKey); - duplicatedExtIdentifiersToBeRemoved.add(mappedExtIdentifier); - extIdsMapping.put(extIdentifierKey, identifier); - } - + extIdentifierKey = identifier.getIdentifierType() + "::" + identifier.getIdentifier(); + if (extIdsMapping.containsKey(extIdentifierKey)) { + + if (!identifier.getPreferred()) { + duplicatedExtIdentifiersToBeRemoved.add(identifier); + } else { + mappedExtIdentifier = extIdsMapping.get(extIdentifierKey); + duplicatedExtIdentifiersToBeRemoved.add(mappedExtIdentifier); + extIdsMapping.put(extIdentifierKey, identifier); + } + } } - //remove the duplicates from DB + // remove the duplicates from DB try { - LOGGER.info("About to remove " + duplicatedExtIdentifiersToBeRemoved.size() + " duplicate external Ids for Disambiguated Org " + orgDisambiguatedEntity.getId() ); + LOGGER.info("About to remove " + duplicatedExtIdentifiersToBeRemoved.size() + " duplicate external Ids for Disambiguated Org " + + orgDisambiguatedEntity.getId()); duplicatedExtIdentifiersToBeRemoved.stream().forEach((e) -> { 
orgDisambiguatedExternalIdentifierDao.remove(e); LOGGER.debug("Removed ext id " + e.getIdentifierType() + "::" + e.getIdentifier() + "::" + e.getId()); }); - + } catch (Exception ex) { LOGGER.error("Exception when removing duplicate external ids for Disambiguated Org " + orgDisambiguatedEntity.getId(), ex); } } - } - } From 1ea45a4b597e9531bd3efcde9238f6bfdfc9b89e Mon Sep 17 00:00:00 2001 From: "c.dumitru@orcid.org" Date: Thu, 19 Oct 2023 16:18:56 +0300 Subject: [PATCH 3/5] added the duplicate check at create external id step as well --- .../impl/OrgDisambiguatedManagerImpl.java | 40 ++++++++++++++----- 1 file changed, 31 insertions(+), 9 deletions(-) diff --git a/orcid-core/src/main/java/org/orcid/core/manager/impl/OrgDisambiguatedManagerImpl.java b/orcid-core/src/main/java/org/orcid/core/manager/impl/OrgDisambiguatedManagerImpl.java index caa208d0e28..8b5101019db 100644 --- a/orcid-core/src/main/java/org/orcid/core/manager/impl/OrgDisambiguatedManagerImpl.java +++ b/orcid-core/src/main/java/org/orcid/core/manager/impl/OrgDisambiguatedManagerImpl.java @@ -11,6 +11,7 @@ import javax.annotation.Resource; import javax.transaction.Transactional; +import org.apache.commons.lang.StringUtils; import org.apache.commons.lang3.tuple.Pair; import org.orcid.core.manager.OrgDisambiguatedManager; import org.orcid.core.messaging.JmsMessageSender; @@ -268,7 +269,27 @@ public OrgDisambiguated findInDB(String idValue, String idType) { @Override public void createOrgDisambiguatedExternalIdentifier(OrgDisambiguatedExternalIdentifierEntity identifier) { normalizeExternalIdentifier(identifier); - orgDisambiguatedExternalIdentifierDao.persist(identifier); + boolean toPersist = true; + OrgDisambiguatedEntity orgDisambiguatedEntity = identifier.getOrgDisambiguated(); + if (orgDisambiguatedEntity.getExternalIdentifiers() != null) { + String extIdentifierKeyToAdd = identifier.getIdentifierType() + "::" + identifier.getIdentifier(); + String extIdentifierKey; + for 
(OrgDisambiguatedExternalIdentifierEntity identifier1 : orgDisambiguatedEntity.getExternalIdentifiers()) { + extIdentifierKey = identifier1.getIdentifierType() + "::" + identifier1.getIdentifier(); + if (StringUtils.equals(extIdentifierKeyToAdd, extIdentifierKey)) { + toPersist = false; + break; + } + } + } + if (cleanDuplicateExtIdOnOrgUpdate) { + cleanDuplicatedExternalIdentifiersForOrgDisambiguated(orgDisambiguatedEntity); + } + // check if in the current external id list the identifier already + if (toPersist) { + orgDisambiguatedExternalIdentifierDao.persist(identifier); + } + } @Override @@ -373,17 +394,18 @@ public void cleanDuplicatedExternalIdentifiersForOrgDisambiguated(OrgDisambiguat } } // remove the duplicates from DB - try { - LOGGER.info("About to remove " + duplicatedExtIdentifiersToBeRemoved.size() + " duplicate external Ids for Disambiguated Org " - + orgDisambiguatedEntity.getId()); - duplicatedExtIdentifiersToBeRemoved.stream().forEach((e) -> { + + LOGGER.info( + "About to remove " + duplicatedExtIdentifiersToBeRemoved.size() + " duplicate external Ids for Disambiguated Org " + orgDisambiguatedEntity.getId()); + duplicatedExtIdentifiersToBeRemoved.stream().forEach((e) -> { + try { orgDisambiguatedExternalIdentifierDao.remove(e); LOGGER.debug("Removed ext id " + e.getIdentifierType() + "::" + e.getIdentifier() + "::" + e.getId()); - }); + } catch (Exception ex) { + LOGGER.error("Exception when removing duplicate external ids for Disambiguated Org " + orgDisambiguatedEntity.getId(), ex); + } + }); - } catch (Exception ex) { - LOGGER.error("Exception when removing duplicate external ids for Disambiguated Org " + orgDisambiguatedEntity.getId(), ex); - } } } } From 7885983e3f2d7576af7e129b7cea0cd6cddfab51 Mon Sep 17 00:00:00 2001 From: "c.dumitru@orcid.org" Date: Thu, 19 Oct 2023 17:22:27 +0300 Subject: [PATCH 4/5] set the cleanup prop to false --- .../core/manager/impl/OrgDisambiguatedManagerImpl.java | 8 ++++---- 1 file changed, 4 insertions(+), 4 
deletions(-) diff --git a/orcid-core/src/main/java/org/orcid/core/manager/impl/OrgDisambiguatedManagerImpl.java b/orcid-core/src/main/java/org/orcid/core/manager/impl/OrgDisambiguatedManagerImpl.java index 8b5101019db..05f789d67fa 100644 --- a/orcid-core/src/main/java/org/orcid/core/manager/impl/OrgDisambiguatedManagerImpl.java +++ b/orcid-core/src/main/java/org/orcid/core/manager/impl/OrgDisambiguatedManagerImpl.java @@ -69,8 +69,8 @@ public class OrgDisambiguatedManagerImpl implements OrgDisambiguatedManager { @Value("${org.orcid.persistence.messaging.updated.disambiguated_org.solr:indexDisambiguatedOrgs}") private String updateSolrQueueName; - @Value("${org.orcid.core.cleanExtIdsOnOrgUpdate:true}") - private boolean cleanDuplicateExtIdOnOrgUpdate; + @Value("${org.orcid.core.cleanExtIdsForOrg:false}") + private boolean cleanDuplicateExtIdForOrg; @Resource(name = "jmsMessageSender") private JmsMessageSender messaging; @@ -223,7 +223,7 @@ public List searchOrgsFromSolrForSelfService(String searchTerm @Override public OrgDisambiguatedEntity updateOrgDisambiguated(OrgDisambiguatedEntity orgDisambiguatedEntity) { normalizeExternalIdentifiers(orgDisambiguatedEntity); - if (cleanDuplicateExtIdOnOrgUpdate) { + if (cleanDuplicateExtIdForOrg) { cleanDuplicatedExternalIdentifiersForOrgDisambiguated(orgDisambiguatedEntity); } return orgDisambiguatedDao.merge(orgDisambiguatedEntity); @@ -282,7 +282,7 @@ public void createOrgDisambiguatedExternalIdentifier(OrgDisambiguatedExternalIde } } } - if (cleanDuplicateExtIdOnOrgUpdate) { + if (cleanDuplicateExtIdForOrg) { cleanDuplicatedExternalIdentifiersForOrgDisambiguated(orgDisambiguatedEntity); } // check if in the current external id list the identifier already From d9af48dd23b2a8f4380193d900b8dfeb1199d253 Mon Sep 17 00:00:00 2001 From: "c.dumitru@orcid.org" Date: Thu, 19 Oct 2023 17:28:21 +0300 Subject: [PATCH 5/5] checking for null org --- .../orcid/core/manager/impl/OrgDisambiguatedManagerImpl.java | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/orcid-core/src/main/java/org/orcid/core/manager/impl/OrgDisambiguatedManagerImpl.java b/orcid-core/src/main/java/org/orcid/core/manager/impl/OrgDisambiguatedManagerImpl.java index 05f789d67fa..03faaeb9208 100644 --- a/orcid-core/src/main/java/org/orcid/core/manager/impl/OrgDisambiguatedManagerImpl.java +++ b/orcid-core/src/main/java/org/orcid/core/manager/impl/OrgDisambiguatedManagerImpl.java @@ -271,7 +271,7 @@ public void createOrgDisambiguatedExternalIdentifier(OrgDisambiguatedExternalIde normalizeExternalIdentifier(identifier); boolean toPersist = true; OrgDisambiguatedEntity orgDisambiguatedEntity = identifier.getOrgDisambiguated(); - if (orgDisambiguatedEntity.getExternalIdentifiers() != null) { + if (orgDisambiguatedEntity != null && orgDisambiguatedEntity.getExternalIdentifiers() != null) { String extIdentifierKeyToAdd = identifier.getIdentifierType() + "::" + identifier.getIdentifier(); String extIdentifierKey; for (OrgDisambiguatedExternalIdentifierEntity identifier1 : orgDisambiguatedEntity.getExternalIdentifiers()) {