From 83c656f4ffd2bf1bbc55e2300ea0567aaad958bc Mon Sep 17 00:00:00 2001
From: andrazrepar
Date: Thu, 28 Jul 2022 11:02:01 +0200
Subject: [PATCH 1/3] add custom scripts

---
 custom_scripts/csv2tei-general-windowsOS.py |  85 +++++++++++
 custom_scripts/remap_pos.py                 | 148 ++++++++++++++++++++
 2 files changed, 233 insertions(+)
 create mode 100644 custom_scripts/csv2tei-general-windowsOS.py
 create mode 100644 custom_scripts/remap_pos.py

diff --git a/custom_scripts/csv2tei-general-windowsOS.py b/custom_scripts/csv2tei-general-windowsOS.py
new file mode 100644
index 0000000..2c73eaa
--- /dev/null
+++ b/custom_scripts/csv2tei-general-windowsOS.py
@@ -0,0 +1,85 @@
+#!/usr/bin/env python3
+import sys
+
+import pandas as pd
+
+input_filename = sys.argv[1]
+df = pd.read_csv(input_filename, index_col=False,
+                 sep='\t' if input_filename.endswith('.tsv') else None)
+
+expected_columns = ['lemma', 'pos', 'definition']
+if not all(col in df for col in expected_columns):
+    sys.exit('ERROR: Table requires columns: ' + str(expected_columns))
+expected_columns = ['entry_id', 'sense_id']
+if not all(col in df for col in expected_columns):
+    print('WARNING: Table may additionally contain columns: ' + str(expected_columns))
+
+try:
+    head_xml = open('teiHeader.xml').read()
+except IOError:
+    print('WARNING: Missing ./teiHeader.xml (containing `<teiHeader>...</teiHeader>`). '
+          'Will use blank.', file=sys.stderr)
+    head_xml = '<teiHeader></teiHeader>\n'
+else:
+    if not head_xml.strip().startswith('<teiHeader'):
+        sys.exit('ERROR: File teiHeader.xml does not start with "<teiHeader>"')
+    if not head_xml.strip().endswith('</teiHeader>'):
+        sys.exit('ERROR: File teiHeader.xml does not end with "</teiHeader>"')
+
+
+UD_POS = ['ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ',
+          'DET', 'INTJ', 'NOUN', 'NUM', 'PART',
+          'PRON', 'PROPN', 'PUNCT', 'SCONJ',
+          'SYM', 'VERB', 'X']
+for pos in df['pos'].unique():
+    if pos.upper() not in UD_POS:
+        sys.exit(f'ERROR: POS values must be from Univ.Deps.v2. '
+                 f'Got {pos!r}, expected one of: {UD_POS}')
+
+sort_keys = (
+    (['entry_id'] if 'entry_id' in df else []) +
+    ['lemma', 'pos'] +
+    (['sense_id'] if 'sense_id' in df else [])
+)
+df.sort_values(sort_keys, inplace=True, ignore_index=True)
+
+have_entry_id = 'entry_id' in df
+have_sense_id = 'sense_id' in df
+
+if have_entry_id:
+    assert df['entry_id'].nunique() == df['lemma'].nunique(),\
+        'Number of `entry_id` must match number of distinct `lemma`'
+if have_sense_id:
+    assert df['sense_id'].nunique() == len(df), \
+        'Every sense requires unique `sense_id`'
+
+
+def write_entries(write_out):
+    for (lemma, pos), subdf in df.groupby(['lemma', 'pos']):
+        entry_id = f' xml:id="{subdf["entry_id"].iloc[0]}"' if have_entry_id else ''
+        senses = []
+        for n, sense in enumerate(subdf.itertuples(index=False), 1):
+            sense_id = f' xml:id="{sense.sense_id}"' if have_sense_id else ''
+            senses.append(f'<sense n="{n}"{sense_id}><def>{sense.definition}</def></sense>')
+        senses = '\n'.join(senses)
+        write_out(f'''\
+<entry{entry_id}>
+<form type="lemma"><orth>{lemma}</orth></form>
+<gramGrp><gram type="pos">{pos}</gram></gramGrp>
+{senses}
+</entry>
+''')
+
+
+with open('output.xml', 'w', encoding="utf-8") as fd:
+    fd.write('''\
+<?xml version="1.0" encoding="UTF-8"?>
+<TEI xmlns="http://www.tei-c.org/ns/1.0">
+''')
+    fd.write(head_xml)
+    fd.write('<text>\n<body>\n')
+    write_entries(fd.write)
+    fd.write('</body>\n</text>\n</TEI>\n')
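For reference, a minimal table that the converter above accepts could look like the sketch below (a tab-separated file called lexicon.tsv here; the file name and rows are purely illustrative). The required columns are lemma, pos and definition, the pos values must be Universal Dependencies tags, and the script is run with the table as its only argument, writing output.xml into the current directory:

    lemma	pos	definition
    bank	NOUN	financial institution offering monetary services
    bank	VERB	to rely or count on something

    python3 csv2tei-general-windowsOS.py lexicon.tsv

If a teiHeader.xml file is present in the working directory, its contents are copied into the generated TEI header; otherwise the script prints a warning and writes a blank header.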
diff --git a/custom_scripts/remap_pos.py b/custom_scripts/remap_pos.py
new file mode 100644
index 0000000..5ac18f2
--- /dev/null
+++ b/custom_scripts/remap_pos.py
@@ -0,0 +1,148 @@
+import sys
+from lxml import etree
+
+HELP = """
+USAGE:
+python3 remap_pos.py [command] <file> <acronym>
+
+COMMANDS:
+export: Exports all pos element values from <file> into a .csv file as original_pos.
+    Output file FILE_NAME-export.csv is saved into the same directory as the input file.
+
+remap: First run the export command and add new tei_pos values to the FILE_NAME-export.csv file. Remember to save it
+    in .csv format and keep it in the same directory as the input file. This script will remap all pos values from the
+    FILE_NAME-export.csv file and save them into FILE_NAME-remapped.xml in the same directory.
+    Add <acronym> to be used in the ids of entries and senses.
+
+DEPENDENCIES:
+Make sure you have lxml installed. You can install it with the following command:
+pip3 install lxml
+"""
+
+
+def prepare_ns(xml_tree):
+    nsmap = xml_tree.getroot().nsmap
+    try:
+        nsmap["_"] = nsmap[None]
+        nsmap.pop(None)
+    except:
+        nsmap["_"] = None
+    return nsmap
+
+
+def xpath_ns(xml_tree, xpath, nsmap):
+    if nsmap["_"] is None:
+        #result = xml_tree.xpath(".//gramGrp/gram[@type='pos']")
+        result = xml_tree.xpath(xpath)
+    else:
+        #result = xml_tree.xpath(".//_:gramGrp/_:gram[@type='pos']", namespaces=nsmap)
+        result = xml_tree.xpath(xpath.replace("/", "/_:").replace("./_:", "./"), namespaces=nsmap)
+    return result
+
+
+def export(filename):
+    parser = etree.XMLParser(encoding='utf-8')
+    xml_tree = lxml.etree.parse(filename, parser)
+    nsmap = prepare_ns(xml_tree)
+    """
+    if nsmap["_"] is None:
+        result = xml_tree.xpath(".//gramGrp/gram[@type='pos']")
+    else:
+        result = xml_tree.xpath(".//_:gramGrp/_:gram[@type='pos']", namespaces=nsmap)
+    """
+    result = xpath_ns(xml_tree, ".//gramGrp/gram[@type='pos']", nsmap)
+    out_filename = filename[:-4] + "-export.csv"
+    pos = set()
+    for r in result:
+        pos.add(r.text)
+    with open(out_filename, 'w', encoding='utf-8') as file:
+        file.write("original_pos, tei_pos\n")
+        for r in pos:
+            r = "" if r is None else r
+            file.write(f"{r},\n")
+    print(f"Exports have been saved to {out_filename}. Remember to save it in .csv format after editing.")
+
+
+def remap(filename, acronym):
+    csv_filename = filename[:-4] + "-export.csv"
+    pos_map = dict()
+    with open(csv_filename, encoding='utf-8') as file:
+        csv_line = file.readlines()
+    for line in csv_line[1:]:
+        if line.strip() == "":
+            continue
+        k = ",".join(line.split(",")[:-1]).strip()
+        v = line.split(",")[-1].strip()
+        pos_map[k] = v
+        #pos_map[line.split(",")[0].strip()] = line.split(",")[1].strip()
+    parser = etree.XMLParser(encoding='utf-8')
+    xml_tree = lxml.etree.parse(filename, parser)
+    nsmap = prepare_ns(xml_tree)
+    """
+    if nsmap["_"] is None:
+        result = xml_tree.xpath(".//gramGrp/gram[@type='pos']")
+    else:
+        result = xml_tree.xpath(".//_:gramGrp/_:gram[@type='pos']", namespaces=nsmap)
+    """
+    result = xpath_ns(xml_tree, ".//gramGrp/gram[@type='pos']", nsmap)
+    for r in result:
+        r.attrib["orig"] = "pos" if r.text is None else str(r.text).strip()
+        k = str(r.text).strip()
+        try:
+            r.text = pos_map[k]
+        except:
+            continue
+        #r.text = pos_map[r.text.split(",")[0].strip()]
+    out_filename = filename[:-4] + "-remapped.xml"
+    add_ids(xml_tree, acronym)
+    xml_tree.write(out_filename, encoding='utf-8', pretty_print=True)
+    print(f"Your xml has been remapped and saved to {out_filename}")
+
+
+def add_ids(xml_tree, acronym):
+    nsmap = prepare_ns(xml_tree)
+    result = xpath_ns(xml_tree, ".//entry", nsmap)
+    entry_counter = 1
+    for entry in result:
+        hw = xpath_ns(entry, ".//form/orth", nsmap)[0].text
+        try:
+            pos = xpath_ns(entry, ".//gramGrp/gram", nsmap)[0].text
+        except:
+            pos = "pos"
+        if pos is None:
+            pos = "pos"
+        # try to remap old id
+        try:
+            entry.attrib["{http://www.w3.org/XML/1998/namespace}orig_id"] = entry.attrib["{http://www.w3.org/XML/1998/namespace}id"]
+        except:
+            pass
+        id_str = f"{acronym}_{hw}_{entry_counter}_{pos}"
+        entry.attrib["{http://www.w3.org/XML/1998/namespace}id"] = id_str
+        sense_counter = 1
+        sense_result = xpath_ns(entry, ".//sense", nsmap)
+        for sense in sense_result:
+            # try to remap old id
+            try:
+                sense.attrib["{http://www.w3.org/XML/1998/namespace}orig_id"] = sense.attrib["{http://www.w3.org/XML/1998/namespace}id"]
+            except:
+                pass
+            sense.attrib["{http://www.w3.org/XML/1998/namespace}id"] = f"{id_str}_{sense_counter}"
+            sense_counter += 1
+        entry_counter += 1
+
+
+if __name__ == "__main__":
+    if ("export" not in sys.argv and "remap" not in sys.argv) or len(sys.argv) < 3:
+        print(HELP)
+        sys.exit(0)
+
+    import lxml.etree
+
+    if "export" in sys.argv:
+        export(sys.argv[2])
+    elif "remap" in sys.argv:
+        if len(sys.argv) != 4:
+            sys.argv.append("ACRO")
+        remap(sys.argv[2], sys.argv[3])
+    else:
+        print("You shouldn't be here...")
\ No newline at end of file
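A typical round trip with remap_pos.py, assuming the TEI file is the output.xml produced by the converter above (the file names and the MYDICT acronym are only illustrative): export the distinct pos values, fill in the tei_pos column of the generated CSV by hand, then remap, passing an acronym that will prefix the generated xml:id values of entries and senses:

    python3 remap_pos.py export output.xml
    # edit output-export.csv and add a tei_pos value next to each original_pos
    python3 remap_pos.py remap output.xml MYDICT

The remap step writes output-remapped.xml next to the input file; if the acronym is left out, the script falls back to the placeholder ACRO.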
From 0d9c74f7abecf481f0ace911bc665c9a04f02c6e Mon Sep 17 00:00:00 2001
From: andrazrepar
Date: Thu, 28 Jul 2022 11:03:17 +0200
Subject: [PATCH 2/3] remove custom scripts

---
 custom_scripts/csv2tei-general-windowsOS.py |  85 -----------
 custom_scripts/remap_pos.py                 | 148 --------------------
 2 files changed, 233 deletions(-)
 delete mode 100644 custom_scripts/csv2tei-general-windowsOS.py
 delete mode 100644 custom_scripts/remap_pos.py

diff --git a/custom_scripts/csv2tei-general-windowsOS.py b/custom_scripts/csv2tei-general-windowsOS.py
deleted file mode 100644
index 2c73eaa..0000000
--- a/custom_scripts/csv2tei-general-windowsOS.py
+++ /dev/null
@@ -1,85 +0,0 @@
-#!/usr/bin/env python3
-import sys
-
-import pandas as pd
-
-input_filename = sys.argv[1]
-df = pd.read_csv(input_filename, index_col=False,
-                 sep='\t' if input_filename.endswith('.tsv') else None)
-
-expected_columns = ['lemma', 'pos', 'definition']
-if not all(col in df for col in expected_columns):
-    sys.exit('ERROR: Table requires columns: ' + str(expected_columns))
-expected_columns = ['entry_id', 'sense_id']
-if not all(col in df for col in expected_columns):
-    print('WARNING: Table may additionally contain columns: ' + str(expected_columns))
-
-try:
-    head_xml = open('teiHeader.xml').read()
-except IOError:
-    print('WARNING: Missing ./teiHeader.xml (containing `<teiHeader>...</teiHeader>`). '
-          'Will use blank.', file=sys.stderr)
-    head_xml = '<teiHeader></teiHeader>\n'
-else:
-    if not head_xml.strip().startswith('<teiHeader'):
-        sys.exit('ERROR: File teiHeader.xml does not start with "<teiHeader>"')
-    if not head_xml.strip().endswith('</teiHeader>'):
-        sys.exit('ERROR: File teiHeader.xml does not end with "</teiHeader>"')
-
-
-UD_POS = ['ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ',
-          'DET', 'INTJ', 'NOUN', 'NUM', 'PART',
-          'PRON', 'PROPN', 'PUNCT', 'SCONJ',
-          'SYM', 'VERB', 'X']
-for pos in df['pos'].unique():
-    if pos.upper() not in UD_POS:
-        sys.exit(f'ERROR: POS values must be from Univ.Deps.v2. '
-                 f'Got {pos!r}, expected one of: {UD_POS}')
-
-sort_keys = (
-    (['entry_id'] if 'entry_id' in df else []) +
-    ['lemma', 'pos'] +
-    (['sense_id'] if 'sense_id' in df else [])
-)
-df.sort_values(sort_keys, inplace=True, ignore_index=True)
-
-have_entry_id = 'entry_id' in df
-have_sense_id = 'sense_id' in df
-
-if have_entry_id:
-    assert df['entry_id'].nunique() == df['lemma'].nunique(),\
-        'Number of `entry_id` must match number of distinct `lemma`'
-if have_sense_id:
-    assert df['sense_id'].nunique() == len(df), \
-        'Every sense requires unique `sense_id`'
-
-
-def write_entries(write_out):
-    for (lemma, pos), subdf in df.groupby(['lemma', 'pos']):
-        entry_id = f' xml:id="{subdf["entry_id"].iloc[0]}"' if have_entry_id else ''
-        senses = []
-        for n, sense in enumerate(subdf.itertuples(index=False), 1):
-            sense_id = f' xml:id="{sense.sense_id}"' if have_sense_id else ''
-            senses.append(f'<sense n="{n}"{sense_id}><def>{sense.definition}</def></sense>')
-        senses = '\n'.join(senses)
-        write_out(f'''\
-<entry{entry_id}>
-<form type="lemma"><orth>{lemma}</orth></form>
-<gramGrp><gram type="pos">{pos}</gram></gramGrp>
-{senses}
-</entry>
-''')
-
-
-with open('output.xml', 'w', encoding="utf-8") as fd:
-    fd.write('''\
-<?xml version="1.0" encoding="UTF-8"?>
-<TEI xmlns="http://www.tei-c.org/ns/1.0">
-''')
-    fd.write(head_xml)
-    fd.write('<text>\n<body>\n')
-    write_entries(fd.write)
-    fd.write('</body>\n</text>\n</TEI>\n')
diff --git a/custom_scripts/remap_pos.py b/custom_scripts/remap_pos.py
deleted file mode 100644
index 5ac18f2..0000000
--- a/custom_scripts/remap_pos.py
+++ /dev/null
@@ -1,148 +0,0 @@
-import sys
-from lxml import etree
-
-HELP = """
-USAGE:
-python3 remap_pos.py [command] <file> <acronym>
-
-COMMANDS:
-export: Exports all pos element values from <file> into a .csv file as original_pos.
-    Output file FILE_NAME-export.csv is saved into the same directory as the input file.
-
-remap: First run the export command and add new tei_pos values to the FILE_NAME-export.csv file. Remember to save it
-    in .csv format and keep it in the same directory as the input file. This script will remap all pos values from the
-    FILE_NAME-export.csv file and save them into FILE_NAME-remapped.xml in the same directory.
-    Add <acronym> to be used in the ids of entries and senses.
-
-DEPENDENCIES:
-Make sure you have lxml installed. You can install it with the following command:
-pip3 install lxml
-"""
-
-
-def prepare_ns(xml_tree):
-    nsmap = xml_tree.getroot().nsmap
-    try:
-        nsmap["_"] = nsmap[None]
-        nsmap.pop(None)
-    except:
-        nsmap["_"] = None
-    return nsmap
-
-
-def xpath_ns(xml_tree, xpath, nsmap):
-    if nsmap["_"] is None:
-        #result = xml_tree.xpath(".//gramGrp/gram[@type='pos']")
-        result = xml_tree.xpath(xpath)
-    else:
-        #result = xml_tree.xpath(".//_:gramGrp/_:gram[@type='pos']", namespaces=nsmap)
-        result = xml_tree.xpath(xpath.replace("/", "/_:").replace("./_:", "./"), namespaces=nsmap)
-    return result
-
-
-def export(filename):
-    parser = etree.XMLParser(encoding='utf-8')
-    xml_tree = lxml.etree.parse(filename, parser)
-    nsmap = prepare_ns(xml_tree)
-    """
-    if nsmap["_"] is None:
-        result = xml_tree.xpath(".//gramGrp/gram[@type='pos']")
-    else:
-        result = xml_tree.xpath(".//_:gramGrp/_:gram[@type='pos']", namespaces=nsmap)
-    """
-    result = xpath_ns(xml_tree, ".//gramGrp/gram[@type='pos']", nsmap)
-    out_filename = filename[:-4] + "-export.csv"
-    pos = set()
-    for r in result:
-        pos.add(r.text)
-    with open(out_filename, 'w', encoding='utf-8') as file:
-        file.write("original_pos, tei_pos\n")
-        for r in pos:
-            r = "" if r is None else r
-            file.write(f"{r},\n")
-    print(f"Exports have been saved to {out_filename}. Remember to save it in .csv format after editing.")
-
-
-def remap(filename, acronym):
-    csv_filename = filename[:-4] + "-export.csv"
-    pos_map = dict()
-    with open(csv_filename, encoding='utf-8') as file:
-        csv_line = file.readlines()
-    for line in csv_line[1:]:
-        if line.strip() == "":
-            continue
-        k = ",".join(line.split(",")[:-1]).strip()
-        v = line.split(",")[-1].strip()
-        pos_map[k] = v
-        #pos_map[line.split(",")[0].strip()] = line.split(",")[1].strip()
-    parser = etree.XMLParser(encoding='utf-8')
-    xml_tree = lxml.etree.parse(filename, parser)
-    nsmap = prepare_ns(xml_tree)
-    """
-    if nsmap["_"] is None:
-        result = xml_tree.xpath(".//gramGrp/gram[@type='pos']")
-    else:
-        result = xml_tree.xpath(".//_:gramGrp/_:gram[@type='pos']", namespaces=nsmap)
-    """
-    result = xpath_ns(xml_tree, ".//gramGrp/gram[@type='pos']", nsmap)
-    for r in result:
-        r.attrib["orig"] = "pos" if r.text is None else str(r.text).strip()
-        k = str(r.text).strip()
-        try:
-            r.text = pos_map[k]
-        except:
-            continue
-        #r.text = pos_map[r.text.split(",")[0].strip()]
-    out_filename = filename[:-4] + "-remapped.xml"
-    add_ids(xml_tree, acronym)
-    xml_tree.write(out_filename, encoding='utf-8', pretty_print=True)
-    print(f"Your xml has been remapped and saved to {out_filename}")
-
-
-def add_ids(xml_tree, acronym):
-    nsmap = prepare_ns(xml_tree)
-    result = xpath_ns(xml_tree, ".//entry", nsmap)
-    entry_counter = 1
-    for entry in result:
-        hw = xpath_ns(entry, ".//form/orth", nsmap)[0].text
-        try:
-            pos = xpath_ns(entry, ".//gramGrp/gram", nsmap)[0].text
-        except:
-            pos = "pos"
-        if pos is None:
-            pos = "pos"
-        # try to remap old id
-        try:
-            entry.attrib["{http://www.w3.org/XML/1998/namespace}orig_id"] = entry.attrib["{http://www.w3.org/XML/1998/namespace}id"]
-        except:
-            pass
-        id_str = f"{acronym}_{hw}_{entry_counter}_{pos}"
-        entry.attrib["{http://www.w3.org/XML/1998/namespace}id"] = id_str
-        sense_counter = 1
-        sense_result = xpath_ns(entry, ".//sense", nsmap)
-        for sense in sense_result:
-            # try to remap old id
-            try:
-                sense.attrib["{http://www.w3.org/XML/1998/namespace}orig_id"] = sense.attrib["{http://www.w3.org/XML/1998/namespace}id"]
-            except:
-                pass
-            sense.attrib["{http://www.w3.org/XML/1998/namespace}id"] = f"{id_str}_{sense_counter}"
-            sense_counter += 1
-        entry_counter += 1
-
-
-if __name__ == "__main__":
-    if ("export" not in sys.argv and "remap" not in sys.argv) or len(sys.argv) < 3:
-        print(HELP)
-        sys.exit(0)
-
-    import lxml.etree
-
-    if "export" in sys.argv:
-        export(sys.argv[2])
-    elif "remap" in sys.argv:
-        if len(sys.argv) != 4:
-            sys.argv.append("ACRO")
-        remap(sys.argv[2], sys.argv[3])
-    else:
-        print("You shouldn't be here...")
\ No newline at end of file
From 550dae6f2512a8ed6dea263ab448c21581d22788 Mon Sep 17 00:00:00 2001
From: Luka
Date: Mon, 6 Mar 2023 10:12:25 +0100
Subject: [PATCH 3/3] restart containers

---
 docker-compose.prod.yml | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/docker-compose.prod.yml b/docker-compose.prod.yml
index ea45f7d..0ff0f0a 100644
--- a/docker-compose.prod.yml
+++ b/docker-compose.prod.yml
@@ -15,6 +15,7 @@ services:
       - flask-app
     ports:
       - 5000:8080
+    restart: unless-stopped
   redis:
     image: redis
     extra_hosts:
@@ -23,6 +24,7 @@
       elexifier-network:
         aliases:
          - redis
+    restart: unless-stopped
   worker1:
     build: .
     command: celery -A app:celery worker -P threads --loglevel=info
@@ -34,6 +36,7 @@
       elexifier-network:
        aliases:
          - worker1
+    restart: unless-stopped
    depends_on:
      - redis
  worker2:
@@ -47,6 +50,7 @@
      elexifier-network:
        aliases:
          - worker2
+    restart: unless-stopped
    depends_on:
      - redis
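The restart: unless-stopped policy added to each service means the containers are brought back automatically after a crash or a Docker daemon restart, unless they were stopped explicitly. One way to check that the policy is active after redeploying (the container name is whatever the compose project generates; shown here only as a placeholder):

    docker compose -f docker-compose.prod.yml up -d
    docker inspect -f '{{.HostConfig.RestartPolicy.Name}}' <container-name>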