-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #3 from mpsilfve/fairseq-container
Fairseq container
- Loading branch information
Showing
90 changed files
with
8,844 additions
and
662 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
*.pt filter=lfs diff=lfs merge=lfs -text |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,7 @@ | ||
[submodule "backend_coling/coling2018-neural-transition-based-morphology"] | ||
path = backend_coling/coling2018-neural-transition-based-morphology | ||
url = [email protected]:mpsilfve/coling2018-neural-transition-based-morphology.git | ||
|
||
[submodule "backend_fairseq/fairseq"] | ||
path = backend_fairseq/fairseq | ||
url = [email protected]:mpsilfve/fairseq.git |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,15 +1,12 @@ | ||
FROM python:3.9 | ||
WORKDIR /backend_fairseq | ||
# RUN apt-get update && apt-get install -y \ | ||
# python3.9\ | ||
# python-pip3\ | ||
# && apt-get clean && rm -rf /var/lib/apt/lists/* | ||
RUN apt-get update && apt-get install -y \ | ||
python3.9\ | ||
python3-pip\ | ||
cython3\ | ||
&& apt-get clean && rm -rf /var/lib/apt/lists/* | ||
COPY requirements.txt . | ||
RUN pip install --no-cache-dir --upgrade pip && \ | ||
pip install --no-cache-dir -r requirements.txt | ||
# COPY fairseq ./fairseq | ||
# COPY word_model ./word_model | ||
# RUN cd fairseq && pip install --editable ./ | ||
# add run commands that will install (setup) the model | ||
# CMD ["/bin/sh"] | ||
CMD tail -f /dev/null | ||
ENV PYTHONUNBUFFERED=1 | ||
CMD /backend_fairseq/entrypoint.sh |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,43 @@ | ||
#!/bin/bash | ||
|
||
# install Fairseq | ||
pwd | ||
cd fairseq && pip install ./ && pip install --upgrade numpy | ||
pwd | ||
cd ../pretrained_models | ||
|
||
# create named pipes for the models | ||
PIPE_DIR=io/pipes | ||
if [[ -p $PIPE_DIR/glossPipe ]] | ||
then | ||
rm $PIPE_DIR/glossPipe | ||
fi | ||
mkfifo $PIPE_DIR/glossPipe | ||
|
||
if [[ -p $PIPE_DIR/glossOut ]] | ||
then | ||
rm $PIPE_DIR/glossOut | ||
fi | ||
mkfifo $PIPE_DIR/glossOut | ||
|
||
if [[ -p $PIPE_DIR/morphSegPipe ]] | ||
then | ||
rm $PIPE_DIR/morphSegPipe | ||
fi | ||
mkfifo $PIPE_DIR/morphSegPipe | ||
|
||
# init the fairseq-interactive processes | ||
OUT_DIR=io/outputs | ||
rm $OUT_DIR/gloss_out.txt | ||
rm $OUT_DIR/morph_seg_out.txt | ||
|
||
#time head -n 1 dev_small.txt | fairseq-interactive --path data/gloss/checkpoint_best.pt --beam 5 --nbest 1 --source-lang src --target-lang trg data/gloss/gloss_preprocess > out.txt | ||
|
||
# tail -f $PIPE_DIR/glossPipe | fairseq-interactive --path data/gloss/checkpoint_best.pt --beam 5 --nbest 4 \ | ||
# --source-lang src --target-lang trg data/gloss/gloss_preprocess > $OUT_DIR/gloss_out.txt & | ||
|
||
# tail -f $PIPE_DIR/morphSegPipe | fairseq-interactive --path data/morphseg/lstm/checkpoint_best.pt --beam 5 --nbest 4 \ | ||
# --source-lang src --target-lang trg data/morphseg/lstm/lstm_preprocess > $OUT_DIR/morph_seg_out.txt & | ||
|
||
# init the listener | ||
cd .. && python3 -u listen_fairseq.py |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
{"model": "fairseq", "getSeg": true, "getGloss": true, "nbest": "1", "text": ["Helthl k'uuhl siwadi'm ahl la'oo'y g\u0332o'ohl Hla Heen", "Ky'ulhl k\u0332'ay 'mesim hanak\u0332' wat as Wiladoo ant 'wahl ky'eegwihl t'a'wihlg\u0332an", "Ii sii hlguuhlxwt loot iit haboo'lt", "Needii sim g\u0332al 'wudin 'nekwt ii k\u0332'ap g\u0332ala'y tusthl bent", "Ii sildim wiltxwst", "Ha'ondii 'nekwt dii sgyethl hlgu tk'ihlxw hlguuhlxw k\u0332'ay 'mesim hanak\u0332'si,", "ii ap gyatt wilt gya'adiit, k\u0332'ap hogyag\u0332am didils wilt gya'ahl gyet", "Ii helt ant neediit wilaaxt wil 'nit hlguuhlxum t'a'wihlg\u0332an k\u0332'ay 'mesim hanak\u0332'si", "Ii lukw'il ky'aa 'wii 'nekwt", "G\u0332asg\u0332oohl hla 'wii t'ishl t'a'wihlg\u0332an tunsi", "Ii belgi gwitkw'ootxwhl enx\u0332 g\u0332anhl g\u0332ax\u0332biist g\u0332anhl huwilp", "Wihl ap gupgupxwidiihl wilt, wilt gya'ahl hli gyedihl ts'epsi, iit liksgyedinhl hli gyedihl g\u0332alts'epsi", "Iit seex\u0332diithl k'i'yhl wilp x\u0332hliphetxwit ahl hla k\u0332'aphl ts'epsi", "Iit dok\u0332s dipunhl g\u0332ax\u0332biist iit liiluxwdiit", "K'i'yhl sa iit nax\u0332'nis dipun wil yukwhl yook\u0332xwhl similoo'o", "Hets'im x\u0332jaax\u0332xwhl nax\u0332'nidiit"], "input_type": "text", "id": 1651692717, "n_sentences": 16} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,220 @@ | ||
""" | ||
Passes a job through Fairseq. | ||
Creates a job_id.dev file, which is passed to the Python wrapper for the pre-trained models. | ||
""" | ||
|
||
from __future__ import annotations | ||
import json, re | ||
import os | ||
import subprocess | ||
import cProfile | ||
import time | ||
from pretrained_models.run_fairseq import * | ||
|
||
|
||
class FairseqSubmitter: | ||
""" | ||
Submit jobs to the background process of fairseq-interactive. | ||
Keeps track of the number of examples seen by fairseq-interactive so far. | ||
""" | ||
|
||
def __init__(self): | ||
self.first_example = 0 | ||
self.last_example = -1 | ||
|
||
def process_text_sentence_batch(self, data): | ||
""" | ||
Split the job into separate sentences, and send each of them to the background fairseq process | ||
""" | ||
|
||
# model options | ||
jobid = data['id'] | ||
n_best = int(data['nbest']) | ||
n_sentences = data['n_sentences'] | ||
sentences = data['text'] | ||
|
||
getSeg = data['getSeg'] | ||
getGloss = data['getGloss'] | ||
assert getSeg or getGloss | ||
|
||
# init the subprocess | ||
if getSeg: | ||
stail, seg = init_fairseq_model('seg', n_best) | ||
if getGloss: | ||
gtail, gloss = init_fairseq_model('gloss', n_best) | ||
|
||
for i in range(n_sentences): | ||
sentence = sentences[i] | ||
n_tokens = len(sentence.split(' ')) | ||
save_path = f'/data/results/sentence_{jobid}_{i}.std.out' | ||
|
||
self.last_example += n_tokens | ||
submit_sentence(sentence, i, getSeg, getGloss, self.first_example, self.last_example, n_best, save_path) | ||
self.first_example += n_tokens | ||
|
||
# found_sentences = 0 | ||
# # while found_sentences < n_sentences: | ||
# # sentence, i, first, last, nbest, path = sentence_dict[found_sentences] | ||
# # if get_sentence(sentence, i, first, last, nbest, path): | ||
# # found_sentences += 1 | ||
# out_path = '/backend_fairseq/pretrained_models/io/outputs/gloss_out.txt' | ||
# out_file = open(out_path, 'r') | ||
# out_file.seek(0,2) | ||
# out_str = '' | ||
# while found_sentences < n_sentences: | ||
# line = out_file.readline() | ||
# if line: | ||
# out_str += line | ||
# sentence, i, first, last, nbest, path = sentence_dict[found_sentences] | ||
# if get_sentence(sentence, out_str, i, first, last, nbest, path): | ||
# found_sentences += 1 | ||
# else: | ||
# sleep(0.01) | ||
|
||
#out, err = gloss.communicate() | ||
|
||
if getSeg: | ||
seg.terminate() | ||
stail.terminate() | ||
if getGloss: | ||
gloss.terminate() | ||
gtail.terminate() | ||
|
||
self.first_example = 0 | ||
self.last_example = -1 | ||
|
||
def process_elan_annotation_batch(self, data): | ||
""" | ||
Send each annotation as its own job | ||
""" | ||
|
||
# model options | ||
jobid = data['id'] | ||
n_best = int(data['nbest']) | ||
annot = data['eaf_data'] | ||
|
||
getSeg = data['getSeg'] | ||
getGloss = data['getGloss'] | ||
assert getSeg or getGloss | ||
|
||
# init the subprocess | ||
if getSeg: | ||
stail, seg = init_fairseq_model('seg', n_best) | ||
if getGloss: | ||
gtail, gloss = init_fairseq_model('gloss', n_best) | ||
|
||
for i in range(len(annot)): | ||
text = annot[i]['annotation_text'] | ||
n_tokens = len(text.split(' ')) | ||
id = annot[i]['annotation_id'] | ||
|
||
save_path = f'/data/results/sentence_{jobid}_{i}.std.out' | ||
self.last_example += n_tokens | ||
submit_sentence(text, i, getSeg, getGloss, self.first_example, self.last_example, n_best, save_path, annotation_id=id) | ||
self.first_example += n_tokens | ||
|
||
if getSeg: | ||
stail.terminate() | ||
seg.terminate() | ||
if getGloss: | ||
gtail.terminate() | ||
gloss.terminate() | ||
|
||
self.first_example = 0 | ||
self.last_example = -1 | ||
|
||
def process_batch(self, job_id): | ||
""" | ||
Loads a job file from the server, and sends to the model | ||
""" | ||
|
||
# get the JSON contents | ||
job_path = '/data/inputs/fairseq_{}.txt'.format(job_id) | ||
with open(job_path, 'r') as new_job: | ||
new_job_contents = new_job.read() | ||
new_job_data = json.loads(new_job_contents) | ||
|
||
# process based on input type | ||
in_type = new_job_data['input_type'] | ||
if in_type == 'text': | ||
start = time.time() | ||
|
||
self.process_text_sentence_batch(new_job_data) | ||
|
||
#cProfile.run('self.process_text_sentence_batch(new_job_data)') | ||
end = time.time() | ||
print('Time elapsed:', end-start) | ||
elif in_type == "eaf_file": | ||
self.process_elan_annotation_batch(new_job_data) | ||
|
||
|
||
# def process_file(job_id): | ||
# """ | ||
# Converts a job json file into an input for the model wrapper. | ||
# Passes it to the model. | ||
# """ | ||
|
||
# # TODO: just write the files to backend_fairseq for now | ||
# job_path = '/data/inputs/fairseq_{}.txt'.format(job_id) | ||
# with open(job_path, 'r') as new_job: | ||
# new_job_contents = new_job.read() | ||
|
||
# new_job_data = json.loads(new_job_contents) | ||
|
||
# # TODO: Debug | ||
# print(new_job_data) | ||
|
||
# # check for options | ||
# n_best = int(new_job_data['nbest']) | ||
|
||
# if new_job_data['input_type'] == 'text': | ||
# text_input = new_job_data['text'] | ||
|
||
# # convert text stream into a list of sentences | ||
# sentences = [] | ||
# txt_lines = re.split('\n', text_input) | ||
# for txtl in txt_lines: | ||
# sent_lines = re.split('[.!?] ', txtl) | ||
# for sl in sent_lines: | ||
# sentences.append(sl) | ||
|
||
# annotation_seq = None | ||
|
||
# elif new_job_data['input_type'] == 'eaf_file': | ||
# eaf_data = new_job_data['eaf_data'] | ||
|
||
# # each annotation is considered a sentence | ||
# sentences = [] | ||
# annotation_list = [] | ||
# annotation_seq = [] | ||
# for annotation in eaf_data: | ||
# annotation_value = annotation['annotation_text'] | ||
# annotation_id = annotation['annotation_id'] | ||
# sentences.append(annotation_value) | ||
|
||
# # repeat the annotation value for each word, align in run_fairseq.py | ||
# for i in range(len(annotation_value.split(' '))): | ||
# annotation_seq.append(annotation_id) | ||
|
||
# annotation_list.append(annotation_id) | ||
|
||
# # overwrite job file with a list of annotations | ||
# new_job_data['annotation_list'] = annotation_list | ||
# with open(job_path, 'w') as out_file: | ||
# out_file = json.dump(new_job_data, out_file) | ||
|
||
# # Save file, send to the wrapper | ||
# # TODO: save where? | ||
# txt_path = '/data/inputs/fairseq_{}.sents'.format(job_id) | ||
# with open(txt_path, 'w') as outpt: | ||
# for sline in sentences: | ||
# outpt.write(sline + '\n') | ||
|
||
# # find task | ||
# if new_job_data['task'] == 'gloss': | ||
# call_glossing_model(txt_path, n_best, '/data/results/output_inference_json-{}.std.out'.format(job_id), annotations=annotation_seq) | ||
# # submit_glossing_text(txt_path, n_best, '/data/results/output_inference_json-{}.std.out'.format(job_id), annotations=annotation_seq) | ||
# # print('Processing ', txt_path) | ||
# elif new_job_data['task'] == 'morphseg': | ||
# call_morphseg_model(txt_path, n_best, '/data/results/output_inference_json-{}.std.out'.format(job_id), annotations=annotation_seq) |
Empty file.
3 changes: 3 additions & 0 deletions
3
backend_fairseq/pretrained_models/data/gloss/checkpoint_best.pt
Git LFS file not shown
Oops, something went wrong.