citation count converter fix
ivanhb committed Jul 11, 2024
1 parent 2082006 commit 776bf89
Showing 1 changed file with 27 additions and 27 deletions.
54 changes: 27 additions & 27 deletions scripts/citation_count_converter.py
@@ -12,10 +12,11 @@
import redis

from oc.index.utils.logging import get_logger
from oc.index.utils.config import get_config

csv.field_size_limit(sys.maxsize)

ANYID = ["doi","issn","isbn","pmid","pmcid","url","wikidata","wikipedia","jid","arxiv"]
_config = get_config()
conf_br_ids = _config.get("cnc", "br_ids").split(",")

'''
Regex to get all the IDs in the Meta CSV dump
@@ -45,7 +46,9 @@ def re_get_ids(val, identifiers, multi_ids = False, group_ids= False):
'''
To create the omid map using the original OC META CSV dump
'''
def get_omid_map(fzip, wanted_id):
def get_omid_map(fzip):
global conf_br_ids

omid_map = dict()
with ZipFile(fzip) as archive:
logger.info("Total number of files in the archive is:"+str(len(archive.namelist())))
@@ -57,18 +60,17 @@ def get_omid_map(fzip, wanted_id):
omid_ids = re_get_ids(o_row["id"],["omid"])
if len(omid_ids) > 0:
omid = omid_ids[0].replace("omid:","")
other_ids = re_get_ids(o_row["id"], ANYID)
for any_id in other_ids:
if any_id.startswith(wanted_id):
any_id = any_id.replace(wanted_id+":","")
omid_map[omid] = any_id
other_ids = re_get_ids(o_row["id"], conf_br_ids)
omid_map[omid] = set(other_ids)
return omid_map

'''
To create the omid map using the META BRs index (in CSV)
The META BRs index should be previously generated using 'meta2redis' command
'''
def read_omid_map(f_omidmap):
global conf_br_ids

omid_map = defaultdict(set)
with open(f_omidmap, mode='r') as file:
csv_reader = csv.reader(file)
@@ -77,32 +79,34 @@ def read_omid_map(f_omidmap):
br_omid, anyids = row
br_omid = "br/"+br_omid
for _id in anyids.split("; "):
for anyid_pref in ANYID:
for anyid_pref in conf_br_ids:
if _id.startswith(anyid_pref):
omid_map[br_omid].add( _id )

return omid_map

def main():
global _config

parser = argparse.ArgumentParser(description='Converts the citation count dump of OpenCitations Index based on BR OMIDs to any other ID (e.g., DOI, PMID)')
parser.add_argument('--citations', required=True, help='Path to the CSV file containing the citation count in the OpenCitations INDEX expressed as OMID: [COUNT] (*Note: generated by citationcount_dump_gen)')
parser.add_argument('--redisindex', help='Redis DB storing all the citations of opencitations (*Note: generated by meta2redis)')
parser.add_argument('--citations', required=True, help='Path to the CSV file containing the citation count in the OpenCitations INDEX expressed as OMID: [COUNT] (*Note: generated by cits2redis)')
parser.add_argument('--redisindex', help='Redis DB storing all the citations of opencitations (*Note: populated by cits2redis)')
parser.add_argument('--metabrs', help='Path to CSV dump containing the index/map of all BR in Meta (OMIDs) (*Note: generated by meta2redis)')
parser.add_argument('--metacsv', help='Path to the directory containing the ZIP CSV dump of OC Meta')
parser.add_argument('--id', default='doi', help='Convert OMID(s) to a given ID')
parser.add_argument('--out', default='anyid_citation_count.csv', help='Path to the output CSV file')
parser.add_argument('--out', default='./', help='Path to the output destination dir')
args = parser.parse_args()
logger = get_logger()

anyid_pref = args.id

# Build OMID map
# DICT => { <OMID>: <anyid_pref>:<ANYID> }
# DICT => { <OMID>: <anyid_pref>:<anyid_val> }
logger.info("Build OMID map ...")
if args.metabrs:
omid_map = read_omid_map(args.metabrs)
elif args.metacsv:
omid_map = get_omid_map(args.metacsv, anyid_pref)
omid_map = get_omid_map(args.metacsv)


# Redis DB storing OC Index citations
@@ -152,20 +156,16 @@ def main():

l_citing_anyids = [omid_map["br/"+__c] for __c in citing_omids if "br/"+__c in omid_map]

__element_count = dict()
for s in l_citing_anyids:
for elem in s:
if elem in __element_count:
__element_count[elem] += 1
else:
__element_count[elem] = 1

unique_citing_anyids = []
for s in l_citing_anyids:
if all(__element_count[elem] == 1 for elem in s):
# check the unique citing anyids
_c_intersection = 0
for __unique in unique_citing_anyids:
_c_intersection += len(__unique.intersection(s))
# if there is no common anyids with the other citing entities
if _c_intersection == 0:
unique_citing_anyids.append(s)


cits_count = len(unique_citing_anyids)

else:
@@ -194,16 +194,16 @@ def main():


# dump anyid - citation count
logger.info('Saving '+args.out+', storing the citation counts of '+anyid_pref+' BRs ...')
with open(args.out, mode='w', newline='') as output_csvfile:
logger.info('Saving the citation counts of '+anyid_pref+' BRs ...')
with open(args.out+anyid_pref+"_citation_count.csv", mode='w', newline='') as output_csvfile:
writer = csv.writer(output_csvfile)
writer.writerow([anyid_pref, 'citation_count'])
for c in [(k,anyid_citation_count[k]) for k in anyid_citation_count]:
writer.writerow([c[0],str(c[1])])

# dump duplicates
logger.info('Saving duplicated BR entities ...')
with open(anyid_pref+"_dupilcates.csv", mode='w', newline='') as output_csvfile:
with open(args.out+anyid_pref+"_dupilcates.csv", mode='w', newline='') as output_csvfile:
writer = csv.writer(output_csvfile)
writer.writerow([anyid_pref, 'num_duplicates'])
for c in [ (any_id,multi_any_ids[any_id]) for any_id in multi_any_ids]:
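The first half of this change drops the hard-coded ANYID list and instead reads the accepted identifier prefixes from the "br_ids" entry of the "cnc" section of the OpenCitations Index configuration, so the identifier schemes the converter considers are now configurable. The lines below are only a minimal sketch of that behaviour: they use Python's standard configparser and an invented br_ids value instead of oc.index.utils.config.get_config().

import configparser

# Hypothetical configuration; the real values come from the Index config file.
config = configparser.ConfigParser()
config.read_string("[cnc]\nbr_ids = doi,pmid,pmcid,issn,isbn\n")
conf_br_ids = config.get("cnc", "br_ids").split(",")

def keep_configured_ids(anyids):
    # Keep only identifiers whose prefix is listed in br_ids,
    # mirroring the prefix check done in read_omid_map().
    return {_id for _id in anyids for pref in conf_br_ids if _id.startswith(pref)}

print(keep_configured_ids(["doi:10.1234/abc", "wikidata:Q42", "pmid:123"]))
# -> {'doi:10.1234/abc', 'pmid:123'} (wikidata is ignored unless added to br_ids)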

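The counting fix itself replaces the per-identifier frequency dictionary: each citing entity now contributes a set of external identifiers, and a set is counted only if it shares no identifier with a set that was already counted, so a citing work that appears in Meta as several bibliographic resources is not counted more than once. A stripped-down sketch of that logic, using invented identifiers, looks like this:

l_citing_anyids = [
    {"doi:10.1/a"},
    {"doi:10.1/a", "pmid:111"},  # same citing work as above, so it must not be counted again
    {"pmid:222"},
]

unique_citing_anyids = []
for s in l_citing_anyids:
    # number of identifiers shared with citing entities already counted
    overlap = sum(len(kept.intersection(s)) for kept in unique_citing_anyids)
    if overlap == 0:
        unique_citing_anyids.append(s)

cits_count = len(unique_citing_anyids)
print(cits_count)  # 2: the second set overlaps with the first and is skipped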
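Note also that --out now takes a destination directory (default "./") rather than a single output file: the file names are built from the value of --id, so the converter writes <out>/doi_citation_count.csv plus <out>/doi_dupilcates.csv listing BRs that share an external identifier. Assuming the script is run directly rather than through a packaged command, and with purely illustrative paths, an invocation could look like:

python scripts/citation_count_converter.py --citations omid_citation_count.csv --metabrs meta_brs.csv --id doi --out ./dumps/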