From c0ae3962019075378e2a4d6723938687c32e6d88 Mon Sep 17 00:00:00 2001
From: andrazrepar
Date: Thu, 28 Jul 2022 11:06:00 +0200
Subject: [PATCH] add custom scripts to master

---
 custom_scripts/csv2tei-general-windowsOS.py | 100 ++++++++++++++
 custom_scripts/remap_pos.py                 | 153 +++++++++++++++++++++
 2 files changed, 253 insertions(+)
 create mode 100644 custom_scripts/csv2tei-general-windowsOS.py
 create mode 100644 custom_scripts/remap_pos.py

diff --git a/custom_scripts/csv2tei-general-windowsOS.py b/custom_scripts/csv2tei-general-windowsOS.py
new file mode 100644
index 0000000..2c73eaa
--- /dev/null
+++ b/custom_scripts/csv2tei-general-windowsOS.py
@@ -0,0 +1,100 @@
+#!/usr/bin/env python3
+# Convert a CSV/TSV table of senses into ./output.xml.
+#
+# Usage: csv2tei-general-windowsOS.py INPUT_TABLE
+#
+# Required columns: lemma, pos, definition. Optional: entry_id, sense_id.
+# ./teiHeader.xml is read from the working directory when it exists.
+#
+# NOTE(review): several string literals below (the teiHeader checks, the
+# templates in write_entries and the output wrapper at the end) are empty or
+# whitespace-only in the text under review, so e.g. startswith('') always
+# passes. Compare them with the repository copy before applying this patch.
+import sys
+
+import pandas as pd
+
+input_filename = sys.argv[1]
+# .tsv input is read as tab-separated; otherwise sep=None leaves delimiter
+# detection to pandas.
+df = pd.read_csv(input_filename, index_col=False,
+                 sep='\t' if input_filename.endswith('.tsv') else None)
+
+expected_columns = ['lemma', 'pos', 'definition']
+if not all(col in df for col in expected_columns):
+    sys.exit('ERROR: Table requires columns: ' + str(expected_columns))
+expected_columns = ['entry_id', 'sense_id']
+if not all(col in df for col in expected_columns):
+    print('WARNING: Table could have (additionally) columns: ' + str(expected_columns))
+
+try:
+    head_xml = open('teiHeader.xml').read()
+except IOError:
+    print('WARNING: Missing ./teiHeader.xml (containing ``). '
+          'Will use blank.', file=sys.stderr)
+    head_xml = '\n'
+else:
+    if not head_xml.strip().startswith(''):
+        sys.exit('ERROR: File teiHeader.xml does not start with ""')
+    if not head_xml.strip().endswith(''):
+        sys.exit('ERROR: File teiHeader.xml does not end with ""')
+
+
+# Accepted part-of-speech tags; values are upper-cased before the lookup.
+UD_POS = ['ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ',
+          'DET', 'INTJ', 'NOUN', 'NUM', 'PART',
+          'PRON', 'PROPN', 'PUNCT', 'SCONJ',
+          'SYM', 'VERB', 'X']
+for pos in df['pos'].unique():
+    if pos.upper() not in UD_POS:
+        sys.exit(f'ERROR: POS values must be from Univ.Deps.v2. '
+                 f'Got {pos!r}, expected one of: {UD_POS}')
+
+# Optional id columns take part in the ordering only when they are present.
+sort_keys = (
+    (['entry_id'] if 'entry_id' in df else []) +
+    ['lemma', 'pos'] +
+    (['sense_id'] if 'sense_id' in df else [])
+)
+df.sort_values(sort_keys, inplace=True, ignore_index=True)
+
+have_entry_id = 'entry_id' in df
+have_sense_id = 'sense_id' in df
+
+if have_entry_id:
+    assert df['entry_id'].nunique() == df['lemma'].nunique(),\
+        'Number of `entry_id` must match number of distinct `lemma`'
+if have_sense_id:
+    assert df['sense_id'].nunique() == len(df), \
+        'Every sense requires unique `sense_id`'
+
+
+def write_entries(write_out):
+    """Call write_out once per (lemma, pos) group with that group's text."""
+    for (lemma, pos), subdf in df.groupby(['lemma', 'pos']):
+        entry_id = f' xml:id="{subdf["entry_id"].iloc[0]}"' if have_entry_id else ''
+        senses = []
+        for n, sense in enumerate(subdf.itertuples(index=False), 1):
+            sense_id = f' xml:id="{sense.sense_id}"' if have_sense_id else ''
+            senses.append(f'{sense.definition}')
+        senses = '\n'.join(senses)
+        write_out(f'''\
+
+{lemma}
+{pos}
+{senses}
+
+''')
+
+
+with open('output.xml', 'w', encoding="utf-8") as fd:
+    fd.write('''
+
+
+
+
+''')
+    fd.write(head_xml)
+    fd.write('\n\n')
+    write_entries(fd.write)
+    fd.write('\n')
diff --git a/custom_scripts/remap_pos.py b/custom_scripts/remap_pos.py
new file mode 100644
index 0000000..5ac18f2
--- /dev/null
+++ b/custom_scripts/remap_pos.py
@@ -0,0 +1,153 @@
+import sys
+from lxml import etree
+
+HELP = """
+USAGE:
+python3 remap_pos.py [command] INPUT_XML [ACRONYM]
+
+COMMANDS:
+export: Exports all pos element values from INPUT_XML into .csv file as original_pos.
+        Output file FILE_NAME-export.csv is saved into the same directory as input file.
+
+remap: First run export command and add new tei_pos values to FILE_NAME-export.csv file. Remember to save it as .csv
+       format and keep it in the same directory as input file. This script will remap all pos values from
+       FILE_NAME-export.csv file and save them into FILE_NAME-remapped.xml file in the same directory.
+       Add ACRONYM to be used in the ids of entries and senses.
+
+DEPENDENCIES:
+Make sure you have lxml installed. You can install it with the following command:
+pip3 install lxml
+"""
+
+
+def prepare_ns(xml_tree):
+    """Return the root's namespace map with the default namespace keyed "_".
+
+    The entry stored under None (the default namespace) is moved to "_" so it
+    can be passed to xpath(); "_" maps to None when the root has no default
+    namespace.
+    """
+    nsmap = xml_tree.getroot().nsmap
+    nsmap["_"] = nsmap.pop(None, None)
+    return nsmap
+
+
+def xpath_ns(xml_tree, xpath, nsmap):
+    """Evaluate xpath on xml_tree, adding the "_" prefix when one is mapped.
+
+    nsmap is a mapping as returned by prepare_ns(). The rewrite is purely
+    textual: each "/" becomes "/_:" and the leading "./_:" this produces is
+    turned back into "./", so ".//entry" is evaluated as ".//_:entry".
+    """
+    if nsmap["_"] is None:
+        return xml_tree.xpath(xpath)
+    prefixed = xpath.replace("/", "/_:").replace("./_:", "./")
+    return xml_tree.xpath(prefixed, namespaces=nsmap)
+
+
+def export(filename):
+    """Write the distinct pos values of filename to FILE_NAME-export.csv.
+
+    The output name is filename without its last four characters plus
+    "-export.csv". Each value becomes one "value," row below the header, with
+    an empty value for gram elements that have no text.
+    """
+    parser = etree.XMLParser(encoding='utf-8')
+    # Go through the module-level `etree` name: `lxml` itself is only bound
+    # when this file runs as a script, so lxml.etree.parse() raised NameError
+    # for callers that import this module.
+    xml_tree = etree.parse(filename, parser)
+    nsmap = prepare_ns(xml_tree)
+    result = xpath_ns(xml_tree, ".//gramGrp/gram[@type='pos']", nsmap)
+    out_filename = filename[:-4] + "-export.csv"
+    pos = {r.text for r in result}
+    with open(out_filename, 'w', encoding='utf-8') as file:
+        file.write("original_pos, tei_pos\n")
+        for r in pos:
+            r = "" if r is None else r
+            file.write(f"{r},\n")
+    print(f"Exports have been saved to {out_filename}. Remember to save it as .csv format after editing.")
+
+
+def remap(filename, acronym):
+    """Apply the mapping in FILE_NAME-export.csv and write FILE_NAME-remapped.xml.
+
+    Every row after the header is split at its last comma into the original
+    value and its replacement. Each pos gram keeps its previous text in an
+    "orig" attribute ("pos" when it had none); grams whose text has no row in
+    the CSV are left unchanged. Entry and sense ids are then regenerated with
+    add_ids() using acronym.
+    """
+    csv_filename = filename[:-4] + "-export.csv"
+    pos_map = dict()
+    with open(csv_filename, encoding='utf-8') as file:
+        csv_line = file.readlines()
+        for line in csv_line[1:]:
+            if line.strip() == "":
+                continue
+            # Split at the last comma so original values may contain commas.
+            k, _, v = line.rpartition(",")
+            pos_map[k.strip()] = v.strip()
+    parser = etree.XMLParser(encoding='utf-8')
+    # Module-level `etree` is always bound; the bare `lxml` name is not.
+    xml_tree = etree.parse(filename, parser)
+    nsmap = prepare_ns(xml_tree)
+    result = xpath_ns(xml_tree, ".//gramGrp/gram[@type='pos']", nsmap)
+    for r in result:
+        r.attrib["orig"] = "pos" if r.text is None else str(r.text).strip()
+        k = str(r.text).strip()
+        try:
+            r.text = pos_map[k]
+        except (KeyError, ValueError):
+            # No row for this value, or a replacement lxml will not store.
+            continue
+    out_filename = filename[:-4] + "-remapped.xml"
+    add_ids(xml_tree, acronym)
+    xml_tree.write(out_filename, encoding='utf-8', pretty_print=True)
+    print(f"Your xml has been remapped and saved to {out_filename}")
+
+
+def add_ids(xml_tree, acronym):
+    """Give every entry and sense a generated xml:id, keeping any previous one.
+
+    Entry ids are "{acronym}_{headword}_{n}_{pos}" with n counting entries from
+    1 and pos falling back to "pos" when the entry has no gram text; sense ids
+    append "_{m}" with m counting the entry's senses from 1. An existing xml:id
+    is copied to xml:orig_id before it is overwritten.
+    """
+    xml_id = "{http://www.w3.org/XML/1998/namespace}id"
+    xml_orig_id = "{http://www.w3.org/XML/1998/namespace}orig_id"
+    nsmap = prepare_ns(xml_tree)
+    entries = xpath_ns(xml_tree, ".//entry", nsmap)
+    for entry_counter, entry in enumerate(entries, 1):
+        hw = xpath_ns(entry, ".//form/orth", nsmap)[0].text
+        grams = xpath_ns(entry, ".//gramGrp/gram", nsmap)
+        pos = grams[0].text if grams else None
+        if pos is None:
+            pos = "pos"
+        if xml_id in entry.attrib:
+            entry.attrib[xml_orig_id] = entry.attrib[xml_id]
+        id_str = f"{acronym}_{hw}_{entry_counter}_{pos}"
+        entry.attrib[xml_id] = id_str
+        senses = xpath_ns(entry, ".//sense", nsmap)
+        for sense_counter, sense in enumerate(senses, 1):
+            if xml_id in sense.attrib:
+                sense.attrib[xml_orig_id] = sense.attrib[xml_id]
+            sense.attrib[xml_id] = f"{id_str}_{sense_counter}"
+
+
+if __name__ == "__main__":
+    if ("export" not in sys.argv and "remap" not in sys.argv) or len(sys.argv) < 3:
+        print(HELP)
+        sys.exit(0)
+
+    import lxml.etree
+
+    if "export" in sys.argv:
+        export(sys.argv[2])
+    elif "remap" in sys.argv:
+        if len(sys.argv) != 4:
+            sys.argv.append("ACRO")
+        remap(sys.argv[2], sys.argv[3])
+    else:
+        print("You shouldn't be here...")
\ No newline at end of file