-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
c223374
commit c0ae396
Showing
2 changed files
with
233 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,85 @@ | ||
#!/usr/bin/env python3 | ||
import sys | ||
|
||
import pandas as pd | ||
|
||
# Load the dictionary table named on the command line. Files ending in
# ".tsv" are read as tab-separated; for anything else ``sep=None`` is passed
# so that pandas detects the delimiter itself.
input_filename = sys.argv[1]
df = pd.read_csv(input_filename, index_col=False,
                 sep='\t' if input_filename.endswith('.tsv') else None)

# Mandatory columns: abort when any of them is missing.
expected_columns = ['lemma', 'pos', 'definition']
if not all(col in df for col in expected_columns):
    sys.exit('ERROR: Table requires columns: ' + str(expected_columns))
# Optional columns: only warn. Diagnostics go to stderr so that they are
# reported on the same stream as the other warning below.
expected_columns = ['entry_id', 'sense_id']
if not all(col in df for col in expected_columns):
    print('WARNING: Table could have (additionally) columns: ' + str(expected_columns),
          file=sys.stderr)

# Read the optional TEI header fragment from the working directory.
try:
    # The generated document is written as UTF-8, so read the header the same
    # way; the context manager closes the handle deterministically.
    with open('teiHeader.xml', encoding='utf-8') as header_file:
        head_xml = header_file.read()
except IOError:
    print('WARNING: Missing ./teiHeader.xml (containing `<teiHeader/>`). '
          'Will use blank.', file=sys.stderr)
    head_xml = '\n'
else:
    if not head_xml.strip().startswith('<teiHeader>'):
        sys.exit('ERROR: File teiHeader.xml does not start with "<teiHeader>"')
    if not head_xml.strip().endswith('</teiHeader>'):
        sys.exit('ERROR: File teiHeader.xml does not end with "</teiHeader>"')

# Accepted part-of-speech tags (compared case-insensitively).
UD_POS = ['ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ',
          'DET', 'INTJ', 'NOUN', 'NUM', 'PART',
          'PRON', 'PROPN', 'PUNCT', 'SCONJ',
          'SYM', 'VERB', 'X']
for pos in df['pos'].unique():
    # str() guards against non-string cells (an empty cell is read as a float
    # NaN), which would otherwise raise AttributeError on ``.upper()`` instead
    # of producing the error message below.
    if str(pos).upper() not in UD_POS:
        sys.exit(f'ERROR: POS values must be from Univ.Deps.v2. '
                 f'Got {pos!r}, expected one of: {UD_POS}')

# Sort by whichever id columns are present, then by lemma and POS.
sort_keys = (
    (['entry_id'] if 'entry_id' in df else []) +
    ['lemma', 'pos'] +
    (['sense_id'] if 'sense_id' in df else [])
)
df.sort_values(sort_keys, inplace=True, ignore_index=True)

have_entry_id = 'entry_id' in df
have_sense_id = 'sense_id' in df

# Validate ids with explicit exits rather than ``assert``: assertions are
# stripped under ``python -O`` and the rest of this script reports invalid
# input via ``sys.exit('ERROR: ...')``.
if have_entry_id and df['entry_id'].nunique() != df['lemma'].nunique():
    sys.exit('ERROR: Number of `entry_id` must match number of distinct `lemma`')
if have_sense_id and df['sense_id'].nunique() != len(df):
    sys.exit('ERROR: Every sense requires unique `sense_id`')
|
||
|
||
def write_entries(write_out, table=None):
    """Serialize dictionary rows as TEI ``<entry>`` elements.

    Rows are grouped by ``(lemma, pos)``; each group becomes one ``<entry>``
    holding the lemma, its part of speech, and one numbered ``<sense>`` per
    row. When the table has an ``entry_id`` column, the first value in the
    group is used as the entry's ``xml:id``; when it has a ``sense_id``
    column, each sense gets that row's value as its ``xml:id``.

    All cell values are XML-escaped, so characters such as ``&`` and ``<`` in
    lemmas or definitions no longer produce ill-formed output.

    Args:
        write_out: Callable that receives each serialized entry as one string.
        table: DataFrame with ``lemma``, ``pos`` and ``definition`` columns.
            Defaults to the module-level ``df``.
    """
    from xml.sax.saxutils import escape, quoteattr

    if table is None:
        table = df
    with_entry_id = 'entry_id' in table
    with_sense_id = 'sense_id' in table

    for (lemma, pos), subdf in table.groupby(['lemma', 'pos']):
        entry_attr = ''
        if with_entry_id:
            # quoteattr() escapes the value and adds the surrounding quotes.
            entry_attr = f' xml:id={quoteattr(str(subdf["entry_id"].iloc[0]))}'

        senses = []
        for n, sense in enumerate(subdf.itertuples(index=False), 1):
            sense_attr = ''
            if with_sense_id:
                sense_attr = f' xml:id={quoteattr(str(sense.sense_id))}'
            definition = escape(str(sense.definition))
            senses.append(f'<sense n="{n}"{sense_attr}><def>{definition}</def></sense>')
        senses = '\n'.join(senses)

        write_out(
            f'<entry{entry_attr}>\n'
            f'<form type="lemma"><orth>{escape(str(lemma))}</orth></form>\n'
            f'<gramGrp><gram type="pos">{escape(str(pos))}</gram></gramGrp>\n'
            f'{senses}\n'
            '</entry>\n'
        )
|
||
|
||
# Write the complete TEI document: prolog, header fragment, then all entries.
with open('output.xml', 'w', encoding="utf-8") as fd:
    # The trailing backslash suppresses the newline after the opening quotes:
    # an XML declaration is only allowed at the very start of the document,
    # so the file must not begin with a blank line.
    fd.write('''\
<?xml version="1.0" encoding="UTF-8"?>
<?xml-model href="http://www.tei-c.org/release/xml/tei/custom/schema/relaxng/tei_all.rng"
schematypens="http://relaxng.org/ns/structure/1.0" type="application/xml"?>
<!-- Validate with `xmllint -relaxng $model-href $file` -->
<TEI xmlns="http://www.tei-c.org/ns/1.0">
''')
    fd.write(head_xml)
    fd.write('\n<text><body>\n')
    write_entries(fd.write)
    fd.write('</body></text></TEI>\n')
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,148 @@ | ||
import sys | ||
from lxml import etree | ||
|
||
# Usage text printed by the command-line entry point when no valid command
# or too few arguments are given.
HELP = """
USAGE:
python3 remap_pos.py [command] <FILE_NAME> <ACRONYM>
COMMANDS:
export: Exports all pos element values from <gramGrp><gram type="pos"> into .csv file as original_pos.
Output file FILE_NAME-export.csv is saved into the same directory as input file.
remap: First run export command and add new tei_pos values to FILE_NAME-export.csv file. Remember to save it as .csv
format and keep it in the same directory as input file. This script will remap all pos values from
FILE_NAME-export.csv file and save them into FILE_NAME-remapped.xml file in the same directory.
Add <ACRONYM> to be used in the ids of entries and senses.
DEPENDENCIES:
Make sure you have lxml installed. You can install it with the following command:
pip3 install lxml
"""
|
||
|
||
def prepare_ns(xml_tree):
    """Return the root element's namespace map with the default namespace under ``"_"``.

    The entry keyed by ``None`` (the default, prefix-less namespace) is moved
    to the ``"_"`` prefix so that it can be referenced from prefixed XPath
    expressions. When the root declares no default namespace, ``"_"`` maps to
    ``None``.

    Args:
        xml_tree: Parsed tree exposing ``getroot().nsmap``.

    Returns:
        A new dict; the mapping obtained from the root element is not modified.
    """
    nsmap = dict(xml_tree.getroot().nsmap)
    # pop() with a default covers the "no default namespace" case without
    # resorting to a catch-all exception handler.
    nsmap["_"] = nsmap.pop(None, None)
    return nsmap
|
||
|
||
def xpath_ns(xml_tree, xpath, nsmap):
    """Evaluate an un-prefixed XPath against a possibly namespaced document.

    When ``nsmap["_"]`` is ``None`` the expression is evaluated unchanged.
    Otherwise ``_:`` is inserted after every ``/`` and the resulting ``./_:``
    sequence is collapsed back to ``./``, so ``.//a/b`` becomes
    ``.//_:a/_:b``; the rewritten expression is evaluated with ``nsmap``.

    Args:
        xml_tree: Object providing an ``xpath()`` method (tree or element).
        xpath: XPath expression written without namespace prefixes.
        nsmap: Namespace map containing a ``"_"`` key.

    Returns:
        Whatever ``xml_tree.xpath()`` returns for the expression.
    """
    if nsmap["_"] is not None:
        prefixed = xpath.replace("/", "/_:").replace("./_:", "./")
        return xml_tree.xpath(prefixed, namespaces=nsmap)
    return xml_tree.xpath(xpath)
|
||
|
||
def export(filename):
    """Write the distinct part-of-speech values of an XML file to a CSV template.

    Collects the text of every ``gramGrp/gram[@type='pos']`` element and
    writes one ``<value>,`` row per distinct value, under the header
    ``original_pos, tei_pos``. Elements without text are exported as an empty
    value. Rows are sorted so that repeated runs produce identical files.

    Args:
        filename: Path of the XML file. The output path is built by dropping
            the last four characters (e.g. ``.xml``) and appending
            ``-export.csv``.
    """
    parser = etree.XMLParser(encoding='utf-8')
    # Use the module-level ``etree`` import rather than ``lxml.etree``: the
    # ``lxml`` name is only bound inside the ``__main__`` guard, so the latter
    # raises NameError when this function is called from an importing module.
    xml_tree = etree.parse(filename, parser)
    nsmap = prepare_ns(xml_tree)
    grams = xpath_ns(xml_tree, ".//gramGrp/gram[@type='pos']", nsmap)
    out_filename = filename[:-4] + "-export.csv"
    pos_values = sorted({"" if gram.text is None else gram.text for gram in grams})
    with open(out_filename, 'w', encoding='utf-8') as file:
        file.write("original_pos, tei_pos\n")
        file.writelines(f"{value},\n" for value in pos_values)
    print(f"Exports have been saved to {out_filename}. Remember to save it as .csv format after editing.")
|
||
|
||
def remap(filename, acronym):
    """Rewrite part-of-speech values using the edited export CSV and add ids.

    Reads ``<filename minus last four characters>-export.csv``, skipping the
    header row and blank lines. On each remaining line, everything before the
    last comma is the original value and everything after it is the
    replacement. Every ``gramGrp/gram[@type='pos']`` element gets an ``orig``
    attribute holding its stripped original text (``"pos"`` when the element
    has no text); its text is replaced when the CSV contains a row for it.
    Ids are then regenerated with ``add_ids`` and the tree is written to
    ``<filename minus last four characters>-remapped.xml``.

    Args:
        filename: Path of the XML file to remap.
        acronym: Prefix passed to ``add_ids`` for the generated ids.
    """
    csv_filename = filename[:-4] + "-export.csv"
    pos_map = {}
    with open(csv_filename, encoding='utf-8') as file:
        next(file, None)  # skip the header row
        for line in file:
            if not line.strip():
                continue
            # Split on the LAST comma so that original values which themselves
            # contain commas stay intact.
            key, _, value = line.rpartition(",")
            pos_map[key.strip()] = value.strip()

    parser = etree.XMLParser(encoding='utf-8')
    # Use the module-level ``etree`` import rather than ``lxml.etree``: the
    # ``lxml`` name is only bound inside the ``__main__`` guard, so the latter
    # raises NameError when this function is called from an importing module.
    xml_tree = etree.parse(filename, parser)
    nsmap = prepare_ns(xml_tree)
    for gram in xpath_ns(xml_tree, ".//gramGrp/gram[@type='pos']", nsmap):
        gram.attrib["orig"] = "pos" if gram.text is None else str(gram.text).strip()
        key = str(gram.text).strip()
        # Values without a CSV row are left untouched; an explicit membership
        # test replaces the former catch-all ``except`` that hid other errors.
        if key in pos_map:
            gram.text = pos_map[key]

    out_filename = filename[:-4] + "-remapped.xml"
    add_ids(xml_tree, acronym)
    xml_tree.write(out_filename, encoding='utf-8', pretty_print=True)
    print(f"Your xml has been remapped and saved to {out_filename}")
|
||
|
||
def add_ids(xml_tree, acronym):
    """Assign generated ``xml:id`` values to every entry and its senses, in place.

    Entry ids have the form ``{acronym}_{headword}_{entry number}_{pos}``,
    where the headword is the text of the entry's first ``form/orth`` element
    and ``pos`` is the text of its first ``gramGrp/gram`` element (the literal
    ``"pos"`` when that element is missing or empty). Sense ids append
    ``_{sense number}`` to the entry id. Numbering starts at 1. An existing
    ``xml:id`` is preserved in an ``xml:orig_id`` attribute before being
    overwritten.

    Args:
        xml_tree: Parsed document whose elements are modified.
        acronym: Prefix for all generated ids.

    Raises:
        IndexError: If an entry contains no ``form/orth`` element.
    """
    xml_id = "{http://www.w3.org/XML/1998/namespace}id"
    xml_orig_id = "{http://www.w3.org/XML/1998/namespace}orig_id"

    nsmap = prepare_ns(xml_tree)
    entries = xpath_ns(xml_tree, ".//entry", nsmap)
    for entry_counter, entry in enumerate(entries, 1):
        hw = xpath_ns(entry, ".//form/orth", nsmap)[0].text

        grams = xpath_ns(entry, ".//gramGrp/gram", nsmap)
        pos = grams[0].text if grams else None
        if pos is None:
            pos = "pos"

        # Keep the previous id, if any, before overwriting it.
        if xml_id in entry.attrib:
            entry.attrib[xml_orig_id] = entry.attrib[xml_id]
        id_str = f"{acronym}_{hw}_{entry_counter}_{pos}"
        entry.attrib[xml_id] = id_str

        senses = xpath_ns(entry, ".//sense", nsmap)
        for sense_counter, sense in enumerate(senses, 1):
            if xml_id in sense.attrib:
                sense.attrib[xml_orig_id] = sense.attrib[xml_id]
            sense.attrib[xml_id] = f"{id_str}_{sense_counter}"
|
||
|
||
# Command-line entry point: ``export <FILE_NAME>`` or
# ``remap <FILE_NAME> [<ACRONYM>]``.
if __name__ == "__main__":
    # Show the usage text unless a known command and at least one further
    # argument are present.
    if ("export" not in sys.argv and "remap" not in sys.argv) or len(sys.argv) < 3:
        print(HELP)
        sys.exit(0)

    # Binds the top-level ``lxml`` name in module scope before dispatching.
    import lxml.etree

    if "export" in sys.argv:
        export(sys.argv[2])
    else:
        # The guard above guarantees "remap" is present here, so no further
        # fallback branch is needed. The acronym is optional and defaults to
        # "ACRO"; sys.argv itself is left unmodified.
        acronym = sys.argv[3] if len(sys.argv) > 3 else "ACRO"
        remap(sys.argv[2], acronym)