Skip to content

Commit

Permalink
add custom scripts to master
Browse files Browse the repository at this point in the history
  • Loading branch information
andrazrepar committed Jul 28, 2022
1 parent c223374 commit c0ae396
Show file tree
Hide file tree
Showing 2 changed files with 233 additions and 0 deletions.
85 changes: 85 additions & 0 deletions custom_scripts/csv2tei-general-windowsOS.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
#!/usr/bin/env python3
import sys

import pandas as pd

# Load the input table named on the command line and check its columns.
if len(sys.argv) < 2:
    sys.exit(f'USAGE: {sys.argv[0]} <table.csv|table.tsv>')
input_filename = sys.argv[1]

# A .tsv file is parsed with an explicit tab delimiter. For anything else the
# delimiter is sniffed (sep=None), which only the Python parsing engine can
# do; naming that engine explicitly avoids the ParserWarning pandas emits
# when it has to fall back from the default C engine.
if input_filename.endswith('.tsv'):
    read_options = {'sep': '\t'}
else:
    read_options = {'sep': None, 'engine': 'python'}
df = pd.read_csv(input_filename, index_col=False, **read_options)

# Mandatory columns: without them no entry can be produced.
expected_columns = ['lemma', 'pos', 'definition']
if not all(col in df for col in expected_columns):
    sys.exit('ERROR: Table requires columns: ' + str(expected_columns))

# Optional columns: only warn, and do so on stderr like the other warnings.
expected_columns = ['entry_id', 'sense_id']
if not all(col in df for col in expected_columns):
    print('WARNING: Table could have (additionally) columns: ' + str(expected_columns),
          file=sys.stderr)

# Read the optional, hand-written <teiHeader> from the working directory.
# The output document is written as UTF-8, so the header is decoded as UTF-8
# too instead of the platform's locale encoding; 'utf-8-sig' additionally
# drops a leading byte-order mark, which would otherwise make the
# startswith() check below fail.
try:
    with open('teiHeader.xml', encoding='utf-8-sig') as header_file:
        head_xml = header_file.read()
except IOError:
    print('WARNING: Missing ./teiHeader.xml (containing `<teiHeader/>`). '
          'Will use blank.', file=sys.stderr)
    head_xml = '\n'
except UnicodeDecodeError:
    sys.exit('ERROR: File teiHeader.xml must be UTF-8 encoded')
else:
    # The fragment is pasted verbatim into the output, so it has to be
    # exactly one <teiHeader> element.
    if not head_xml.strip().startswith('<teiHeader>'):
        sys.exit('ERROR: File teiHeader.xml does not start with "<teiHeader>"')
    if not head_xml.strip().endswith('</teiHeader>'):
        sys.exit('ERROR: File teiHeader.xml does not end with "</teiHeader>"')


# Part-of-speech tags accepted in the `pos` column (compared case-insensitively).
UD_POS = ['ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ',
          'DET', 'INTJ', 'NOUN', 'NUM', 'PART',
          'PRON', 'PROPN', 'PUNCT', 'SCONJ',
          'SYM', 'VERB', 'X']
for pos in df['pos'].unique():
    # An empty cell is read by pandas as NaN (a float), which has no
    # .upper(); report it through the regular error message instead of
    # crashing with an AttributeError.
    if not isinstance(pos, str) or pos.upper() not in UD_POS:
        sys.exit(f'ERROR: POS values must be from Univ.Deps.v2. '
                 f'Got {pos!r}, expected one of: {UD_POS}')

# Which of the optional id columns are present in the table.
have_entry_id = 'entry_id' in df
have_sense_id = 'sense_id' in df

# Sort by entry id (if any), then lemma and POS, then sense id (if any).
sort_keys = ['lemma', 'pos']
if have_entry_id:
    sort_keys.insert(0, 'entry_id')
if have_sense_id:
    sort_keys.append('sense_id')
df.sort_values(sort_keys, inplace=True, ignore_index=True)

# Validate with explicit exits rather than `assert`, which is skipped when
# Python runs with -O and would let inconsistent ids through.
# NOTE(review): write_entries() emits one <entry> per (lemma, pos) pair, so a
# lemma with several POS values would reuse its entry_id -- confirm whether
# this check should count (lemma, pos) pairs instead.
if have_entry_id and df['entry_id'].nunique() != df['lemma'].nunique():
    sys.exit('ERROR: Number of `entry_id` must match number of distinct `lemma`')
if have_sense_id and df['sense_id'].nunique() != len(df):
    sys.exit('ERROR: Every sense requires unique `sense_id`')


def write_entries(write_out, table=None):
    """Serialize the table as TEI ``<entry>`` elements.

    Rows are grouped by ``(lemma, pos)``; each group becomes one ``<entry>``
    and each row of the group one numbered ``<sense>``. All cell values are
    XML-escaped, so characters such as ``&`` or ``<`` in a lemma or
    definition no longer produce malformed output.

    Args:
        write_out: Callable that receives each serialized entry as a string
            (for example ``file.write``).
        table: DataFrame with ``lemma``, ``pos`` and ``definition`` columns
            and optionally ``entry_id`` / ``sense_id``. Defaults to the
            module-level ``df``.
    """
    # Local import keeps this function self-contained.
    from xml.sax.saxutils import escape, quoteattr

    if table is None:
        table = df
    with_entry_id = 'entry_id' in table
    with_sense_id = 'sense_id' in table

    def id_attribute(value):
        # quoteattr() escapes the value and supplies the surrounding quotes.
        return f' xml:id={quoteattr(str(value))}'

    for (lemma, pos), subdf in table.groupby(['lemma', 'pos']):
        # Every row of a group shares the entry, so the first row's id is used.
        entry_id = id_attribute(subdf['entry_id'].iloc[0]) if with_entry_id else ''
        senses = []
        for n, sense in enumerate(subdf.itertuples(index=False), 1):
            sense_id = id_attribute(sense.sense_id) if with_sense_id else ''
            definition = escape(str(sense.definition))
            senses.append(f'<sense n="{n}"{sense_id}><def>{definition}</def></sense>')
        senses = '\n'.join(senses)
        write_out(f'''\
<entry{entry_id}>
<form type="lemma"><orth>{escape(str(lemma))}</orth></form>
<gramGrp><gram type="pos">{escape(str(pos))}</gram></gramGrp>
{senses}
</entry>
''')


# Write the complete TEI document to ./output.xml.
with open('output.xml', 'w', encoding="utf-8") as fd:
    # The backslash after the opening quotes suppresses the leading newline:
    # an XML declaration is only allowed at the very start of the document,
    # so any character before it makes the file not well-formed.
    fd.write('''\
<?xml version="1.0" encoding="UTF-8"?>
<?xml-model href="http://www.tei-c.org/release/xml/tei/custom/schema/relaxng/tei_all.rng"
schematypens="http://relaxng.org/ns/structure/1.0" type="application/xml"?>
<!-- Validate with `xmllint -relaxng $model-href $file` -->
<TEI xmlns="http://www.tei-c.org/ns/1.0">
''')
    fd.write(head_xml)
    fd.write('\n<text><body>\n')
    write_entries(fd.write)
    fd.write('</body></text></TEI>\n')
148 changes: 148 additions & 0 deletions custom_scripts/remap_pos.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
import sys
from lxml import etree

# Usage text printed when the script is started without a valid command and
# file name. The command is mandatory; the acronym is optional.
HELP = """
USAGE:
python3 remap_pos.py <command> <FILE_NAME> [ACRONYM]
COMMANDS:
export: Exports all pos element values from <gramGrp><gram type="pos"> into .csv file as original_pos.
Output file FILE_NAME-export.csv is saved into the same directory as input file.
remap: First run export command and add new tei_pos values to FILE_NAME-export.csv file. Remember to save it as .csv
format and keep it in the same directory as input file. This script will remap all pos values from
FILE_NAME-export.csv file and save them into FILE_NAME-remapped.xml file in the same directory.
Add <ACRONYM> to be used in the ids of entries and senses.
DEPENDENCIES:
Make sure you have lxml installed. You can install it with the following command:
pip3 install lxml
"""


def prepare_ns(xml_tree):
    """Return the root element's namespace map in a form usable with XPath.

    The default namespace (stored under the key ``None``) is moved to the
    prefix ``"_"``. When the document declares no default namespace,
    ``"_"`` maps to ``None`` so callers can tell the two cases apart.

    Args:
        xml_tree: Parsed tree whose ``getroot().nsmap`` is a prefix -> URI
            mapping.

    Returns:
        A new dict without a ``None`` key and with a ``"_"`` key.
    """
    # Work on a copy so the mapping handed out by the tree is never modified.
    nsmap = dict(xml_tree.getroot().nsmap)
    # pop() with a default covers both cases without a catch-all `except`.
    nsmap["_"] = nsmap.pop(None, None)
    return nsmap


def xpath_ns(xml_tree, xpath, nsmap):
    """Evaluate ``xpath`` on ``xml_tree``, honouring a default namespace.

    ``nsmap`` is the mapping produced by ``prepare_ns``. If its ``"_"`` entry
    is ``None`` the expression is evaluated unchanged. Otherwise ``"_:"`` is
    inserted after every ``/`` (and removed again directly after ``./``) and
    the expression is evaluated with ``nsmap`` as the namespace mapping.
    """
    if nsmap["_"] is None:
        return xml_tree.xpath(xpath)

    # e.g. ".//gramGrp/gram" -> "./_:/_:gramGrp/_:gram" -> ".//_:gramGrp/_:gram"
    prefixed = xpath.replace("/", "/_:")
    prefixed = prefixed.replace("./_:", "./")
    return xml_tree.xpath(prefixed, namespaces=nsmap)


def export(filename):
    """Write every distinct POS value of ``filename`` to a CSV template.

    Collects the text of all ``<gramGrp><gram type="pos">`` elements and
    writes one ``value,`` row per distinct value (under the header
    ``original_pos, tei_pos``) to ``<filename minus its last 4
    characters>-export.csv``. The second column is left empty for the user
    to fill in. An element without text is exported as an empty value.

    Args:
        filename: Path of the XML dictionary to read.
    """
    parser = etree.XMLParser(encoding='utf-8')
    # Use the module-level `etree` import; the name `lxml` is only bound when
    # the file runs as a script, so `lxml.etree` fails on plain import.
    xml_tree = etree.parse(filename, parser)
    nsmap = prepare_ns(xml_tree)
    grams = xpath_ns(xml_tree, ".//gramGrp/gram[@type='pos']", nsmap)
    out_filename = filename[:-4] + "-export.csv"
    # Normalise missing text to "" before de-duplicating, and sort so the
    # exported file is the same on every run.
    pos_values = sorted({"" if gram.text is None else gram.text for gram in grams})
    with open(out_filename, 'w', encoding='utf-8') as file:
        file.write("original_pos, tei_pos\n")
        file.writelines(f"{value},\n" for value in pos_values)
    print(f"Exports have been saved to {out_filename}. Remember to save it as .csv format after editing.")


def remap(filename, acronym):
    """Replace POS values in ``filename`` using its edited export CSV.

    Reads ``<filename minus its last 4 characters>-export.csv``, where each
    row after the header is ``original_pos,tei_pos``. For every
    ``<gramGrp><gram type="pos">`` element the original value is kept in an
    ``orig`` attribute and the text is replaced when the CSV has a row for
    it. Entry and sense ids are then regenerated via ``add_ids`` and the
    result is written to ``...-remapped.xml``.

    Args:
        filename: Path of the XML dictionary to read.
        acronym: Prefix used in the generated entry and sense ids.
    """
    csv_filename = filename[:-4] + "-export.csv"
    pos_map = {}
    with open(csv_filename, encoding='utf-8') as file:
        next(file, None)  # skip the header row
        for line in file:
            if not line.strip():
                continue
            # Split on the LAST comma so original values that themselves
            # contain commas stay intact.
            original, _, replacement = line.rpartition(",")
            pos_map[original.strip()] = replacement.strip()

    parser = etree.XMLParser(encoding='utf-8')
    # Use the module-level `etree` import; the name `lxml` is only bound when
    # the file runs as a script, so `lxml.etree` fails on plain import.
    xml_tree = etree.parse(filename, parser)
    nsmap = prepare_ns(xml_tree)
    for gram in xpath_ns(xml_tree, ".//gramGrp/gram[@type='pos']", nsmap):
        # export() writes an element without text as an empty value, so it is
        # looked up under "" here (not under the string "None").
        original = "" if gram.text is None else str(gram.text).strip()
        gram.attrib["orig"] = "pos" if gram.text is None else original
        # Values without a CSV row are left untouched.
        if original in pos_map:
            gram.text = pos_map[original]

    out_filename = filename[:-4] + "-remapped.xml"
    add_ids(xml_tree, acronym)
    xml_tree.write(out_filename, encoding='utf-8', pretty_print=True)
    print(f"Your xml has been remapped and saved to {out_filename}")


def add_ids(xml_tree, acronym):
    """Assign generated ``xml:id`` values to all entries and their senses.

    Each ``<entry>`` gets the id ``{acronym}_{headword}_{n}_{pos}``, where
    ``n`` is the 1-based position of the entry in the document, and each of
    its ``<sense>`` elements gets that id plus ``_{m}`` (1-based position
    within the entry). An id that was already present is preserved in an
    ``xml:orig_id`` attribute. The tree is modified in place.

    Args:
        xml_tree: Parsed XML tree to update.
        acronym: Prefix for the generated ids.

    Raises:
        IndexError: If an entry has no ``form/orth`` element.
    """
    xml_id = "{http://www.w3.org/XML/1998/namespace}id"
    xml_orig_id = "{http://www.w3.org/XML/1998/namespace}orig_id"

    nsmap = prepare_ns(xml_tree)
    entries = xpath_ns(xml_tree, ".//entry", nsmap)
    for entry_counter, entry in enumerate(entries, 1):
        hw = xpath_ns(entry, ".//form/orth", nsmap)[0].text
        # NOTE(review): this takes the first <gram> of any type, not only
        # type="pos" -- confirm that is intended.
        grams = xpath_ns(entry, ".//gramGrp/gram", nsmap)
        pos = grams[0].text if grams else None
        if pos is None:
            # Placeholder for a missing or empty POS element.
            pos = "pos"

        # Keep the previous id, if there was one.
        if xml_id in entry.attrib:
            entry.attrib[xml_orig_id] = entry.attrib[xml_id]
        id_str = f"{acronym}_{hw}_{entry_counter}_{pos}"
        entry.attrib[xml_id] = id_str

        senses = xpath_ns(entry, ".//sense", nsmap)
        for sense_counter, sense in enumerate(senses, 1):
            if xml_id in sense.attrib:
                sense.attrib[xml_orig_id] = sense.attrib[xml_id]
            sense.attrib[xml_id] = f"{id_str}_{sense_counter}"


if __name__ == "__main__":
    # Command line: remap_pos.py <command> <FILE_NAME> [ACRONYM]
    # The command is read from its own position, so a file or acronym that
    # happens to be called "export" or "remap" is not mistaken for it.
    command = sys.argv[1] if len(sys.argv) > 1 else None
    if command not in ("export", "remap") or len(sys.argv) < 3:
        print(HELP)
        sys.exit(0)

    # Binds the package name `lxml` at module level for code in this file
    # that refers to the parser as `lxml.etree`.
    import lxml.etree

    if command == "export":
        export(sys.argv[2])
    else:
        # The acronym is optional and defaults to "ACRO".
        acronym = sys.argv[3] if len(sys.argv) > 3 else "ACRO"
        remap(sys.argv[2], acronym)

0 comments on commit c0ae396

Please sign in to comment.