citation count converter fix
ivanhb committed Jul 11, 2024
1 parent 2082006 commit 776bf89
Showing 1 changed file with 27 additions and 27 deletions.
54 changes: 27 additions & 27 deletions scripts/citation_count_converter.py
@@ -12,10 +12,11 @@
import redis

from oc.index.utils.logging import get_logger
from oc.index.utils.config import get_config

csv.field_size_limit(sys.maxsize)

ANYID = ["doi","issn","isbn","pmid","pmcid","url","wikidata","wikipedia","jid","arxiv"]
_config = get_config()
conf_br_ids = _config.get("cnc", "br_ids").split(",")

'''
Regex to get all the IDs in the Meta CSV dump
@@ -45,7 +46,9 @@ def re_get_ids(val, identifiers, multi_ids = False, group_ids= False):
'''
To create the omid map using the original OC META CSV dump
'''
def get_omid_map(fzip, wanted_id):
def get_omid_map(fzip):
global conf_br_ids

omid_map = dict()
with ZipFile(fzip) as archive:
logger.info("Total number of files in the archive is:"+str(len(archive.namelist())))
@@ -57,18 +60,17 @@ def get_omid_map(fzip, wanted_id):
omid_ids = re_get_ids(o_row["id"],["omid"])
if len(omid_ids) > 0:
omid = omid_ids[0].replace("omid:","")
other_ids = re_get_ids(o_row["id"], ANYID)
for any_id in other_ids:
if any_id.startswith(wanted_id):
any_id = any_id.replace(wanted_id+":","")
omid_map[omid] = any_id
other_ids = re_get_ids(o_row["id"], conf_br_ids)
omid_map[omid] = set(other_ids)
return omid_map

'''
To create the omid map using the META BRs index (in CSV)
The META BRs index should be previously generated using 'meta2redis' command
'''
def read_omid_map(f_omidmap):
global conf_br_ids

omid_map = defaultdict(set)
with open(f_omidmap, mode='r') as file:
csv_reader = csv.reader(file)
@@ -77,32 +79,34 @@ def read_omid_map(f_omidmap):
br_omid, anyids = row
br_omid = "br/"+br_omid
for _id in anyids.split("; "):
for anyid_pref in ANYID:
for anyid_pref in conf_br_ids:
if _id.startswith(anyid_pref):
omid_map[br_omid].add( _id )

return omid_map

def main():
global _config

parser = argparse.ArgumentParser(description='Converts the citation count dump of OpenCitations Index based on BR OMIDs to any other ID (e.g., DOI, PMID)')
parser.add_argument('--citations', required=True, help='Path to the CSV file containing the citation count in the OpenCitations INDEX expressed as OMID: [COUNT] (*Note: generated by citationcount_dump_gen)')
parser.add_argument('--redisindex', help='Redis DB storing all the citations of opencitations (*Note: generated by meta2redis)')
parser.add_argument('--citations', required=True, help='Path to the CSV file containing the citation count in the OpenCitations INDEX expressed as OMID: [COUNT] (*Note: generated by cits2redis)')
parser.add_argument('--redisindex', help='Redis DB storing all the citations of opencitations (*Note: populated by cits2redis)')
parser.add_argument('--metabrs', help='Path to CSV dump containing the index/map of all BR in Meta (OMIDs) (*Note: generated by meta2redis)')
parser.add_argument('--metacsv', help='Path to the directory containing the ZIP CSV dump of OC Meta')
parser.add_argument('--id', default='doi', help='Convert OMID(s) to a given ID')
parser.add_argument('--out', default='anyid_citation_count.csv', help='Path to the output CSV file')
parser.add_argument('--out', default='./', help='Path to the output destination dir')
args = parser.parse_args()
logger = get_logger()

anyid_pref = args.id

# Build OMID map
# DICT => { <OMID>: <anyid_pref>:<ANYID> }
# DICT => { <OMID>: <anyid_pref>:<anyid_val> }
logger.info("Build OMID map ...")
if args.metabrs:
omid_map = read_omid_map(args.metabrs)
elif args.metacsv:
omid_map = get_omid_map(args.metacsv, anyid_pref)
omid_map = get_omid_map(args.metacsv)


# Redis DB storing OC Index citations
@@ -152,20 +156,16 @@ def main():

l_citing_anyids = [omid_map["br/"+__c] for __c in citing_omids if "br/"+__c in omid_map]

__element_count = dict()
for s in l_citing_anyids:
for elem in s:
if elem in __element_count:
__element_count[elem] += 1
else:
__element_count[elem] = 1

unique_citing_anyids = []
for s in l_citing_anyids:
if all(__element_count[elem] == 1 for elem in s):
# check the unique citing anyids
_c_intersection = 0
for __unique in unique_citing_anyids:
_c_intersection += len(__unique.intersection(s))
# if there is no common anyids with the other citing entities
if _c_intersection == 0:
unique_citing_anyids.append(s)


cits_count = len(unique_citing_anyids)

else:
@@ -194,16 +194,16 @@ def main():


# dump anyid - citation count
logger.info('Saving '+args.out+', storing the citation counts of '+anyid_pref+' BRs ...')
with open(args.out, mode='w', newline='') as output_csvfile:
logger.info('Saving the citation counts of '+anyid_pref+' BRs ...')
with open(args.out+anyid_pref+"_citation_count.csv", mode='w', newline='') as output_csvfile:
writer = csv.writer(output_csvfile)
writer.writerow([anyid_pref, 'citation_count'])
for c in [(k,anyid_citation_count[k]) for k in anyid_citation_count]:
writer.writerow([c[0],str(c[1])])

# dump duplicates
logger.info('Saving duplicated BR entities ...')
with open(anyid_pref+"_dupilcates.csv", mode='w', newline='') as output_csvfile:
with open(args.out+anyid_pref+"_dupilcates.csv", mode='w', newline='') as output_csvfile:
writer = csv.writer(output_csvfile)
writer.writerow([anyid_pref, 'num_duplicates'])
for c in [ (any_id,multi_any_ids[any_id]) for any_id in multi_any_ids]:
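The first half of this change drops the hard-coded ANYID list and instead reads the accepted identifier prefixes from the "br_ids" entry of the "cnc" section of the OpenCitations Index configuration, so the identifier schemes the converter considers are now configurable. The lines below are only a minimal sketch of that behaviour: they use Python's standard configparser and an invented br_ids value instead of oc.index.utils.config.get_config().

import configparser

# Hypothetical configuration; the real values come from the Index config file.
config = configparser.ConfigParser()
config.read_string("[cnc]\nbr_ids = doi,pmid,pmcid,issn,isbn\n")
conf_br_ids = config.get("cnc", "br_ids").split(",")

def keep_configured_ids(anyids):
    # Keep only identifiers whose prefix is listed in br_ids,
    # mirroring the prefix check done in read_omid_map().
    return {_id for _id in anyids for pref in conf_br_ids if _id.startswith(pref)}

print(keep_configured_ids(["doi:10.1234/abc", "wikidata:Q42", "pmid:123"]))
# -> {'doi:10.1234/abc', 'pmid:123'} (wikidata is ignored unless added to br_ids)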

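The counting fix itself replaces the per-identifier frequency dictionary: each citing entity now contributes a set of external identifiers, and a set is counted only if it shares no identifier with a set that was already counted, so a citing work that appears in Meta as several bibliographic resources is not counted more than once. A stripped-down sketch of that logic, using invented identifiers, looks like this:

l_citing_anyids = [
    {"doi:10.1/a"},
    {"doi:10.1/a", "pmid:111"},  # same citing work as above, so it must not be counted again
    {"pmid:222"},
]

unique_citing_anyids = []
for s in l_citing_anyids:
    # number of identifiers shared with citing entities already counted
    overlap = sum(len(kept.intersection(s)) for kept in unique_citing_anyids)
    if overlap == 0:
        unique_citing_anyids.append(s)

cits_count = len(unique_citing_anyids)
print(cits_count)  # 2: the second set overlaps with the first and is skipped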
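Note also that --out now takes a destination directory (default "./") rather than a single output file: the file names are built from the value of --id, so the converter writes <out>/doi_citation_count.csv plus <out>/doi_dupilcates.csv listing BRs that share an external identifier. Assuming the script is run directly rather than through a packaged command, and with purely illustrative paths, an invocation could look like:

python scripts/citation_count_converter.py --citations omid_citation_count.csv --metabrs meta_brs.csv --id doi --out ./dumps/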