From 2c7b97ea7bcf5e9b983407ba58f59271ebfdecfe Mon Sep 17 00:00:00 2001 From: Ludo Pulles <ludo.pulles@gmail.com> Date: Sat, 28 Sep 2024 20:04:53 +0200 Subject: [PATCH 1/3] Fix imports (?) --- add.py | 19 ++++++++----------- add_doi_crossref.py | 17 ++++++++--------- check_many_authors_keys.py | 20 ++++++-------------- 3 files changed, 22 insertions(+), 34 deletions(-) diff --git a/add.py b/add.py index 7796f06..ffc6fe1 100644 --- a/add.py +++ b/add.py @@ -4,8 +4,12 @@ folders "lib" and "db" """ -import sys +import argparse +import logging import os +import shutil +import sys +import time scriptdir = os.path.dirname(os.path.realpath(__file__)) sys.path.append(os.path.join(scriptdir, "..", "lib")) @@ -15,13 +19,8 @@ import mybibtex.generator import confs_years -import logging -import shutil -import argparse -import time - import config -from config import * +from config import confs_missing_years mybibtex.generator.config = config logging.basicConfig(level=logging.DEBUG) @@ -31,10 +30,8 @@ def add(filenames: list[str]): parser = mybibtex.parser.Parser() parser.parse_file("db/abbrev0.bib") parser.parse_file("db/crypto_db.bib") - db = parser.parse_file("db/crypto_conf_list.bib") - - for filename in filenames: - db = parser.parse_file(filename) + parser.parse_file("db/crypto_conf_list.bib") + db = parser.parse_files(filenames) conf_years = confs_years.get_confs_years_inter(db, confs_missing_years) diff --git a/add_doi_crossref.py b/add_doi_crossref.py index 5c5d720..f4cef05 100644 --- a/add_doi_crossref.py +++ b/add_doi_crossref.py @@ -4,8 +4,15 @@ folders "lib" and "db" """ -import sys +import argparse +import json +import logging import os +import shutil +import sys +import time +import urllib.parse +import urllib.request scriptdir = os.path.dirname(os.path.realpath(__file__)) sys.path.append(os.path.join(scriptdir, "..", "lib")) @@ -16,14 +23,6 @@ import mybibtex.generator import confs_years -import logging -import shutil -import argparse -import time -import urllib.request, urllib.parse, urllib.error -import urllib.request, urllib.error, urllib.parse -import json - import config from config import * diff --git a/check_many_authors_keys.py b/check_many_authors_keys.py index 6428fad..a29ecdf 100644 --- a/check_many_authors_keys.py +++ b/check_many_authors_keys.py @@ -4,29 +4,21 @@ folders "lib" and "db" """ -import collections -import sys +import argparse +import logging import os +import re +import sys + +from pybtex.bibtex.utils import split_name_list scriptdir = os.path.dirname(os.path.realpath(__file__)) sys.path.append(os.path.join(scriptdir, "..", "lib")) sys.path.append(os.path.join(scriptdir, "..", "db")) import mybibtex.parser -import mybibtex.database import mybibtex.generator -import confs_years - -import argparse -import logging -import re -import shutil -import time - -from pybtex.bibtex.utils import split_name_list - import config -from config import * mybibtex.generator.config = config logging.basicConfig(level=logging.DEBUG) From 23cc94b903e56ce3609ff94f0086796051f42530 Mon Sep 17 00:00:00 2001 From: Ludo Pulles <ludo.pulles@gmail.com> Date: Sat, 28 Sep 2024 20:05:03 +0200 Subject: [PATCH 2/3] Check multiple spellings for people --- fix_von_prefix.py | 121 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 121 insertions(+) create mode 100644 fix_von_prefix.py diff --git a/fix_von_prefix.py b/fix_von_prefix.py new file mode 100644 index 0000000..e274358 --- /dev/null +++ b/fix_von_prefix.py @@ -0,0 +1,121 @@ +#!/usr/bin/env python3 +""" +This script needs to be run in the root folder containing the +folders "lib" and "db" +""" + +import argparse +import os +import re +import sys +# import shutil +# import time + +scriptdir = os.path.dirname(os.path.realpath(__file__)) +sys.path.append(os.path.join(scriptdir, "..", "lib")) +sys.path.append(os.path.join(scriptdir, "..", "db")) + +import mybibtex.parser +import mybibtex.database +import mybibtex.generator +from confs_years import get_confs_years_inter + + +import header +import config +from config import confs_missing_years + +mybibtex.generator.config = config + + +def read_database(on_db=False): + filename = "db/crypto_db.bib" if on_db else "db/crypto_db_test.bib" + + # Read the database + parser = mybibtex.parser.Parser(encoding="utf8", person_fields=['author']) + return parser.parse_files([ + "db/abbrev0.bib", + "db/crypto_conf_list.bib", + filename, + ]) + + +def write_database(db, confs_years): + for expand_crossrefs in [False, True]: + outname = "db/crypto.bib" if expand_crossrefs else "db/crypto_crossref.bib" + with open(outname, "w") as out, open("db/crypto_misc.bib") as fin: + out.write(header.get_header(config, "gen.py", confs_years)) + mybibtex.generator.bibtex_gen(out, db, expand_crossrefs=expand_crossrefs, + include_crossrefs=not expand_crossrefs, + remove_empty_fields=True) + out.write("\n\n") + for line in fin: + out.write(line) + + +def namestrip(s): + s = s.replace('{', '') + s = s.replace('}', '') + return re.sub('\\\\.', '', s).strip() + + +def fix_von_prefix(on_db=False): + db = read_database(on_db) + print("crypto_db is read.", file=sys.stderr) + + people = set() + for entrykey, entry in db.entries.items(): + # key = str(entrykey) + if 'author' in entry.persons: + for author in entry.persons['author']: + people.add(author) + + people = sorted(list(people), key=lambda x: str(x)) + stripped_people = dict() + + for p in people: + parts = ( + p.get_part('first') + + p.get_part('prelast') + + p.get_part('last') + + p.get_part('lineage') + ) + stripped_p = ' '.join(map(namestrip, parts)) + + if stripped_p not in stripped_people: + stripped_people[stripped_p] = [] + stripped_people[stripped_p].append(p) + # lastname = p.get_part_as_text('last') + # if re.match('^{[a-z]', lastname): + # print(p) + + for strip, ps in stripped_people.items(): + if len(ps) > 1: + print(strip, "has multiple hits:") + for p in ps: + print(" ", p) + + return db + + +def main(): + parser = argparse.ArgumentParser("Fix von prefices") + parser.add_argument("--db", action="store_true", + help="Run on actual crypto_db.bib") + parser.add_argument("--out", action="store_true", + help="Write to db/crypto[_crossref].bib") + args = parser.parse_args() + + # Make a backup + # shutil.copy("db/crypto_db.bib", + # f"db/crypto_db.bib.{int(time.time()):0>12d}") + # Run the command + db = fix_von_prefix(on_db=args.db) + if args.out: + confs_years = get_confs_years_inter(db, confs_missing_years) + write_database(db, confs_years) + + + +if __name__ == "__main__": + main() From fe7828b069cd351befda20ae715b79b61b43601e Mon Sep 17 00:00:00 2001 From: Ludo Pulles <ludo.pulles@gmail.com> Date: Wed, 2 Oct 2024 11:11:35 +0200 Subject: [PATCH 3/3] All scripts used --- fix_lineage.py | 126 ++++++++++++++++++++++++++++ fix_von_prefix.py | 133 ++++++++++++++++-------------- format_entry.py | 203 ++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 402 insertions(+), 60 deletions(-) create mode 100644 fix_lineage.py create mode 100644 format_entry.py diff --git a/fix_lineage.py b/fix_lineage.py new file mode 100644 index 0000000..0a93dbd --- /dev/null +++ b/fix_lineage.py @@ -0,0 +1,126 @@ +#!/usr/bin/env python3 +""" +This script needs to be run in the root folder containing the +folders "lib" and "db" +""" + +import argparse +import os +import re +import sys +# import shutil +# import time + +scriptdir = os.path.dirname(os.path.realpath(__file__)) +sys.path.append(os.path.join(scriptdir, "..", "lib")) +sys.path.append(os.path.join(scriptdir, "..", "db")) + +import mybibtex.parser +import mybibtex.database +import mybibtex.generator +from confs_years import get_confs_years_inter + +import header +import config + +mybibtex.generator.config = config + + +def read_database(on_db=False): + filename = "db/crypto_db.bib" if on_db else "db/test.bib" + + # Read the database + parser = mybibtex.parser.Parser(encoding="utf8", person_fields=['author']) + parser.parse_files(["db/abbrev0.bib", "db/crypto_conf_list.bib"]) + + with open(filename, encoding="utf8") as f: + preamble = "" + + cookie = f.tell() + line = f.readline() + while line == '\n' or line.startswith('%'): + preamble += line + cookie = f.tell() + line = f.readline() + f.seek(cookie) + return preamble, parser.parse_file(f) + + +def write_database(preamble, db, confs_years, on_db=False): + filename = ("db/crypto_db.bib" if on_db else "db/test.bib") + ".out" + with open(filename, "w") as out: + out.write(preamble) + mybibtex.generator.bibtex_gen(out, db, expand_crossrefs=False, + include_crossrefs=False, + remove_empty_fields=True) + + +################################################################################ + + +def namestrip(s): + s = s.replace('{', '') + s = s.replace('}', '') + return re.sub('\\\\.', '', s).strip() + + +def fix_lineage(person): + lastname = person.get_part_as_text('last') + lineage = person.get_part_as_text('lineage') + orig_name = lastname + ' ' + lineage if lineage else lastname + + moves = 0 + while re.search(' (Jr\\.|Sr\\.|II|III|IV)}?$', lastname): + lastname = person._last[-1] + moves += 1 + idx = lastname.rfind(' ') + assert idx >= 0 and lastname[-1] == '}' + + person._lineage.append(lastname[idx + 1:-1]) + person._last[-1] = lastname[:idx] + '}' + lastname = person.get_part_as_text('last') + + if re.match('^(II|III|IV)$', lastname): + print("WARNING: Lineage is seen as lastname for ", person) + + if moves > 0: + # Check if we can remove the {lastname}. + if re.match('{[A-Za-z]*}', person.get_part_as_text('last')): + assert len(person._last) == 1 + person._last[0] = person._last[0][1:-1] + + lastname = person.get_part_as_text('last') + lineage = person.get_part_as_text('lineage') + print("Changed \"", orig_name, "\" to \"", lastname + ', ' + lineage, "\"", sep="") + + +def run(db): + for entrykey, entry in db.entries.items(): + if 'author' in entry.persons: + for author in entry.persons['author']: + fix_lineage(author) + + +def main(): + parser = argparse.ArgumentParser("Fix von prefices") + parser.add_argument("--db", action="store_true", + help="Run on actual crypto_db.bib") + args = parser.parse_args() + + # Make a backup + # shutil.copy("db/crypto_db.bib", + # f"db/crypto_db.bib.{int(time.time()):0>12d}") + # Run the command + + print("Reading bibtex source file...", file=sys.stderr, flush=True) + preamble, db = read_database(args.db) + print("Bibtex source file is read!", file=sys.stderr, flush=True) + + run(db) + + confs_years = get_confs_years_inter(db, config.confs_missing_years) + write_database(preamble, db, confs_years, args.db) + + +if __name__ == "__main__": + main() diff --git a/fix_von_prefix.py b/fix_von_prefix.py index e274358..805b3ba 100644 --- a/fix_von_prefix.py +++ b/fix_von_prefix.py @@ -20,7 +20,6 @@ import mybibtex.generator from confs_years import get_confs_years_inter - import header import config from config import confs_missing_years @@ -29,28 +28,35 @@ def read_database(on_db=False): - filename = "db/crypto_db.bib" if on_db else "db/crypto_db_test.bib" + filename = "db/crypto_db.bib" if on_db else "db/test.bib" # Read the database parser = mybibtex.parser.Parser(encoding="utf8", person_fields=['author']) - return parser.parse_files([ - "db/abbrev0.bib", - "db/crypto_conf_list.bib", - filename, - ]) - - -def write_database(db, confs_years): - for expand_crossrefs in [False, True]: - outname = "db/crypto.bib" if expand_crossrefs else "db/crypto_crossref.bib" - with open(outname, "w") as out, open("db/crypto_misc.bib") as fin: - out.write(header.get_header(config, "gen.py", confs_years)) - mybibtex.generator.bibtex_gen(out, db, expand_crossrefs=expand_crossrefs, - include_crossrefs=not expand_crossrefs, - remove_empty_fields=True) - out.write("\n\n") - for line in fin: - out.write(line) + parser.parse_files(["db/abbrev0.bib", "db/crypto_conf_list.bib"]) + + with open(filename, encoding="utf8") as f: + preamble = "" + + cookie = f.tell() + line = f.readline() + while line == '\n' or line.startswith('%'): + preamble += line + cookie = f.tell() + line = f.readline() + f.seek(cookie) + return preamble, parser.parse_file(f) + + +def write_database(preamble, db, confs_years, on_db=False): + filename = ("db/crypto_db.bib" if on_db else "db/test.bib") + ".out" + with open(filename, "w") as out: + out.write(preamble) + mybibtex.generator.bibtex_gen(out, db, expand_crossrefs=False, + include_crossrefs=False, + remove_empty_fields=True) + + +################################################################################ def namestrip(s): @@ -59,62 +65,69 @@ def namestrip(s): return re.sub('\\\\.', '', s).strip() -def fix_von_prefix(on_db=False): - db = read_database(on_db) - print("crypto_db is read.", file=sys.stderr) - - people = set() +def fix_von_prefix_person(person): + prelast = person.get_part_as_text('prelast') + lastname = person.get_part_as_text('last') + orig_name = prelast + ' ' + lastname if prelast else lastname + + """ + Current exceptions are: +Exception: {abhi} {shelat} +Exception: Bruno {d'Ausbourg} +Exception: Sabah {al-Binali} + """ + + moves = 0 + while re.match('^{\'?[a-z]', lastname): + moves += 1 + idx = person._last[0].find(' ') + if idx == -1: + # Should be one of the names above. + print("Exception: ", person) + return + + person._prelast.append(person._last[0][1:idx]) # person._last[0] + person._last[0] = '{' + person._last[0][idx + 1:] + # person._last = person._last[1:] + lastname = person.get_part_as_text('last') + + if moves > 0: + # Check if we can remove the {lastname}. + if re.match('{[A-Za-z]*}', person.get_part_as_text('last')): + person._last[0] = person._last[0][1:] + person._last[-1] = person._last[-1][:-1] + + prelast = person.get_part_as_text('prelast') + lastname = person.get_part_as_text('last') + print("Changed \"", orig_name, "\" to \"", prelast + ' ' + lastname, "\"", sep="") + + +def run(db): for entrykey, entry in db.entries.items(): - # key = str(entrykey) if 'author' in entry.persons: for author in entry.persons['author']: - people.add(author) - - people = sorted(list(people), key=lambda x: str(x)) - stripped_people = dict() - - for p in people: - parts = ( - p.get_part('first') - + p.get_part('prelast') - + p.get_part('last') - + p.get_part('lineage') - ) - stripped_p = ' '.join(map(namestrip, parts)) - - if stripped_p not in stripped_people: - stripped_people[stripped_p] = [] - stripped_people[stripped_p].append(p) - # lastname = p.get_part_as_text('last') - # if re.match('^{[a-z]', lastname): - # print(p) - - for strip, ps in stripped_people.items(): - if len(ps) > 1: - print(strip, "has multiple hits:") - for p in ps: - print(" ", p) - - return db + fix_von_prefix_person(author) def main(): parser = argparse.ArgumentParser("Fix von prefices") parser.add_argument("--db", action="store_true", help="Run on actual crypto_db.bib") - parser.add_argument("--out", action="store_true", - help="Write to db/crypto[_crossref].bib") args = parser.parse_args() # Make a backup # shutil.copy("db/crypto_db.bib", # f"db/crypto_db.bib.{int(time.time()):0>12d}") # Run the command - db = fix_von_prefix(on_db=args.db) - if args.out: - confs_years = get_confs_years_inter(db, confs_missing_years) - write_database(db, confs_years) + print("Reading bibtex source file...", file=sys.stderr, flush=True) + preamble, db = read_database(args.db) + print("Bibtex source file is read!", file=sys.stderr, flush=True) + + run(db) + + confs_years = get_confs_years_inter(db, confs_missing_years) + write_database(preamble, db, confs_years, args.db) if __name__ == "__main__": diff --git a/format_entry.py b/format_entry.py new file mode 100644 index 0000000..ed057ca --- /dev/null +++ b/format_entry.py @@ -0,0 +1,203 @@ +#!/usr/bin/env python3 +""" +This script needs to be run in the root folder containing the +folders "lib" and "db". + +Note: either can be called from a different script, or from command line! +TODO: let db_import/import.py use this one! +""" +from copy import copy +from unidecode import unidecode + +import argparse +import os +import re +import sys +# import shutil +# import time + +scriptdir = os.path.dirname(os.path.realpath(__file__)) +sys.path.append(os.path.join(scriptdir, "..", "lib")) +sys.path.append(os.path.join(scriptdir, "..", "db")) + +import mybibtex.parser +import mybibtex.database +import mybibtex.generator +from confs_years import get_confs_years_inter + +import config + +mybibtex.generator.config = config + + +def read_database(on_db=False): + filename = "db/crypto_db.bib" if on_db else "db/test.bib" + + # Read the database + parser = mybibtex.parser.Parser(encoding="utf8", person_fields=['author']) + parser.parse_files(["db/abbrev0.bib", "db/crypto_conf_list.bib"]) + + with open(filename, encoding="utf8") as f: + preamble = "" + + cookie = f.tell() + line = f.readline() + while line == '\n' or line.startswith('%'): + preamble += line + cookie = f.tell() + line = f.readline() + f.seek(cookie) + return preamble, parser.parse_file(f) + + +def write_database(preamble, db, confs_years, on_db=False): + filename = ("db/crypto_db.bib" if on_db else "db/test.bib") + ".out" + with open(filename, "w") as out: + out.write(preamble) + mybibtex.generator.bibtex_gen(out, db, expand_crossrefs=False, + include_crossrefs=False, + remove_empty_fields=True) + + +################################################################################ + + +def strip_accents(s): + """ + Return a string similar to `s` but with all accents removed. + These can be unicode characters or LaTeX escaped. + """ + original_s = s + + # Remove any UTF-8 characters, although most of them are escaped with LaTeX. + s = unidecode(s) + + # Expand \ss to 'ss', because it's a ringel-s, like in: + # [PoPETS:Gross21]: author="Thomas Gro{\ss}" + s = s.replace('\\ss', 'ss') + + # Remove \textcommabelow, like in + # [EPRINT:RosButSim23](https://eprint.iacr.org/2023/124) + s = s.replace('\\textcommabelow', '') + + # Replace Danish \O, \o by respectively O, o: + s = re.sub('\\\\o', 'o', re.sub('\\\\O', 'O', s)) + + # Also remove all LaTeX escape characters, and gobble up the character after it. + s = re.sub('\\\\.', '', s) + + if s.find('.') >= 0: + print(f"WARNING: Spurious '.' found in \"{original_s}\".") + + # Remove '{', '}', '(', ')', "'", '-', '~', ' ', etc.: + # 1) "-" can occur in [EPRINT:Hall-Andersen19]: Mathias Hall-Andersen + # 2) "'" can occur in [CANS:BelONe13]: Adam O'Neill + # but watch out for [C:ChaCreDam87]: + # here "Cr{\'e}peau" should be replaced by "Crepeau"! + # 3) "~" can occur in [SAC:OhESco06] with name "Colm {{\'O}~h{\'E}igeartaigh}". + # 4) "(" can occur in [WISA:YanKimOhL16] with name "Il Seok {Oh(Luke)}". + return re.sub('[^a-zA-Z]', '', s) + # https://www.w3schools.com/python/ref_string_translate.asp + # return s.translate(str.maketrans('', '', "{}()'-~. ")) + + +_warned_authors = set() + + +def author_abbreviation(author): + """ + Return the name of the author that can be used as an entry key for a bibtex entry. + If there are some von prefices, these are abbreviated to one letter. + + Note: this is only for one single author. + Use authors_abbreviation instead if you have a list of all authors. + """ + von = author.prelast() + lastname = strip_accents(author.get_part_as_text('last')) + + if not 'A' <= lastname[0] <= 'Z': + if author not in _warned_authors: + print("WARNING: odd lastname: \"", lastname, "\"; ", str(author), sep="") + _warned_authors.add(author) + + return "".join(strip_accents(x)[0] for x in von) + lastname + + +def authors_abbreviation(authors): + """return the author bibtext key part""" + if len(authors) <= 0: + print("ERROR: Entry with no author => replaced by ???") + return "???" + + if len(authors) == 1: + # The key contains the last name. + return author_abbreviation(authors[0]) + + if len(authors) <= 3: + # The key contains the first three letters of each last name. + return "".join(author_abbreviation(a)[:3] for a in authors) + + # The key contains the first letter of the last name of the first six authors. + # (any von prefices are removed) + return "".join(strip_accents(a.last()[0])[0] for a in authors[:6]) + + +def run(db): + # Warning: if we modify the entry name, + # it might conflict with one that we already saw overriding that entry. + # In any case, it is not a smart idea to modify a dict while iterating over it, + # so just compute the new result in new_db. + new_db = mybibtex.database.BibliographyData() + + for entrykey, entry in db.entries.items(): + if 'author' not in entry.persons: + new_db.add_entry(entrykey, entry) + continue + + authors = entry.persons['author'] + + new_key = copy(entrykey) + new_key.auth = authors_abbreviation(authors) + key_a = copy(new_key) + new_key.dis, key_a.dis = '', 'a' + + if new_key in new_db.entries: + # We require disambiguation: turn KEY into KEYa and KEYb. + new_db.entries[key_a] = new_db.entries.pop(new_key) + new_key.dis = 'b' + elif key_a in new_db.entries: + # Find the next disambiguation string ("", "a", "b", ...) that is unused. + # May be needed if same authors and same year occur for different papers. + new_key.dis = 'b' + while new_key in new_db.entries: + # Having more than 26 of the same names is weird... That will NEVER happen! + assert new_key.dis[0] < 'z' + new_key.dis = chr(ord(new_key.dis[0]) + 1) + new_db.add_entry(new_key, entry) + return new_db + + +def main(): + parser = argparse.ArgumentParser("Fix von prefices") + parser.add_argument("--db", action="store_true", + help="Run on actual crypto_db.bib") + args = parser.parse_args() + + # Make a backup + # shutil.copy("db/crypto_db.bib", + # f"db/crypto_db.bib.{int(time.time()):0>12d}") + # Run the command + + print("LOG: Reading bibtex source file...", file=sys.stderr, flush=True) + preamble, db = read_database(args.db) + print("LOG: Bibtex source file is read!\n", file=sys.stderr, flush=True) + + new_db = run(db) + del db + print("LOG: Output bibtex to output file...") + confs_years = get_confs_years_inter(new_db, config.confs_missing_years) + write_database(preamble, new_db, confs_years, args.db) + + +if __name__ == "__main__": + main()