Merge pull request #3 from mpsilfve/fairseq-container
Fairseq container
mpsilfve authored May 17, 2022
2 parents c4a2c69 + e2c2bed commit 0ffaf86
Showing 90 changed files with 8,844 additions and 662 deletions.
1 change: 1 addition & 0 deletions .gitattributes
@@ -0,0 +1 @@
*.pt filter=lfs diff=lfs merge=lfs -text
4 changes: 4 additions & 0 deletions .gitmodules
@@ -1,3 +1,7 @@
[submodule "backend_coling/coling2018-neural-transition-based-morphology"]
path = backend_coling/coling2018-neural-transition-based-morphology
url = [email protected]:mpsilfve/coling2018-neural-transition-based-morphology.git

[submodule "backend_fairseq/fairseq"]
path = backend_fairseq/fairseq
url = [email protected]:mpsilfve/fairseq.git
17 changes: 7 additions & 10 deletions backend_fairseq/Dockerfile
@@ -1,15 +1,12 @@
FROM python:3.9
WORKDIR /backend_fairseq
# RUN apt-get update && apt-get install -y \
# python3.9\
# python-pip3\
# && apt-get clean && rm -rf /var/lib/apt/lists/*
RUN apt-get update && apt-get install -y \
python3.9\
python3-pip\
cython3\
&& apt-get clean && rm -rf /var/lib/apt/lists/*
COPY requirements.txt .
RUN pip install --no-cache-dir --upgrade pip && \
pip install --no-cache-dir -r requirements.txt
# COPY fairseq ./fairseq
# COPY word_model ./word_model
# RUN cd fairseq && pip install --editable ./
# add run commands that will install (setup) the model
# CMD ["/bin/sh"]
CMD tail -f /dev/null
ENV PYTHONUNBUFFERED=1
CMD /backend_fairseq/entrypoint.sh
43 changes: 43 additions & 0 deletions backend_fairseq/entrypoint.sh
@@ -0,0 +1,43 @@
#!/bin/bash

# install Fairseq
pwd
cd fairseq && pip install ./ && pip install --upgrade numpy
pwd
cd ../pretrained_models

# create named pipes for the models
PIPE_DIR=io/pipes
if [[ -p $PIPE_DIR/glossPipe ]]
then
rm $PIPE_DIR/glossPipe
fi
mkfifo $PIPE_DIR/glossPipe

if [[ -p $PIPE_DIR/glossOut ]]
then
rm $PIPE_DIR/glossOut
fi
mkfifo $PIPE_DIR/glossOut

if [[ -p $PIPE_DIR/morphSegPipe ]]
then
rm $PIPE_DIR/morphSegPipe
fi
mkfifo $PIPE_DIR/morphSegPipe

# init the fairseq-interactive processes
OUT_DIR=io/outputs
rm $OUT_DIR/gloss_out.txt
rm $OUT_DIR/morph_seg_out.txt

#time head -n 1 dev_small.txt | fairseq-interactive --path data/gloss/checkpoint_best.pt --beam 5 --nbest 1 --source-lang src --target-lang trg data/gloss/gloss_preprocess > out.txt

# tail -f $PIPE_DIR/glossPipe | fairseq-interactive --path data/gloss/checkpoint_best.pt --beam 5 --nbest 4 \
# --source-lang src --target-lang trg data/gloss/gloss_preprocess > $OUT_DIR/gloss_out.txt &

# tail -f $PIPE_DIR/morphSegPipe | fairseq-interactive --path data/morphseg/lstm/checkpoint_best.pt --beam 5 --nbest 4 \
# --source-lang src --target-lang trg data/morphseg/lstm/lstm_preprocess > $OUT_DIR/morph_seg_out.txt &

# init the listener
cd .. && python3 -u listen_fairseq.py
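
The commented-out pipelines above suggest the intended design: a `tail -f` on each named pipe feeds a long-running fairseq-interactive process, and its hypotheses are redirected to a file under io/outputs. Below is a minimal Python sketch of the producer side of that pattern, assuming those pipelines are enabled and the paths match the script above; `submit_line` and `read_new_output` are hypothetical helpers, not part of this commit.

# Paths assumed from entrypoint.sh above.
PIPE_PATH = "pretrained_models/io/pipes/glossPipe"
OUT_PATH = "pretrained_models/io/outputs/gloss_out.txt"

def submit_line(sentence: str) -> None:
    # Opening a FIFO for writing blocks until a reader (the tail -f
    # feeding fairseq-interactive) holds the other end open.
    with open(PIPE_PATH, "w") as pipe:
        pipe.write(sentence + "\n")

def read_new_output(offset: int) -> tuple[str, int]:
    # fairseq-interactive appends hypotheses to the redirected output
    # file; read from the last known offset instead of re-parsing it all.
    with open(OUT_PATH, "r") as out:
        out.seek(offset)
        chunk = out.read()
        return chunk, out.tell()
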
1 change: 1 addition & 0 deletions backend_fairseq/fairseq
Submodule fairseq added at eb228e
1 change: 1 addition & 0 deletions backend_fairseq/jobs
@@ -0,0 +1 @@
{"model": "fairseq", "getSeg": true, "getGloss": true, "nbest": "1", "text": ["Helthl k'uuhl siwadi'm ahl la'oo'y g\u0332o'ohl Hla Heen", "Ky'ulhl k\u0332'ay 'mesim hanak\u0332' wat as Wiladoo ant 'wahl ky'eegwihl t'a'wihlg\u0332an", "Ii sii hlguuhlxwt loot iit haboo'lt", "Needii sim g\u0332al 'wudin 'nekwt ii k\u0332'ap g\u0332ala'y tusthl bent", "Ii sildim wiltxwst", "Ha'ondii 'nekwt dii sgyethl hlgu tk'ihlxw hlguuhlxw k\u0332'ay 'mesim hanak\u0332'si,", "ii ap gyatt wilt gya'adiit, k\u0332'ap hogyag\u0332am didils wilt gya'ahl gyet", "Ii helt ant neediit wilaaxt wil 'nit hlguuhlxum t'a'wihlg\u0332an k\u0332'ay 'mesim hanak\u0332'si", "Ii lukw'il ky'aa 'wii 'nekwt", "G\u0332asg\u0332oohl hla 'wii t'ishl t'a'wihlg\u0332an tunsi", "Ii belgi gwitkw'ootxwhl enx\u0332 g\u0332anhl g\u0332ax\u0332biist g\u0332anhl huwilp", "Wihl ap gupgupxwidiihl wilt, wilt gya'ahl hli gyedihl ts'epsi, iit liksgyedinhl hli gyedihl g\u0332alts'epsi", "Iit seex\u0332diithl k'i'yhl wilp x\u0332hliphetxwit ahl hla k\u0332'aphl ts'epsi", "Iit dok\u0332s dipunhl g\u0332ax\u0332biist iit liiluxwdiit", "K'i'yhl sa iit nax\u0332'nis dipun wil yukwhl yook\u0332xwhl similoo'o", "Hets'im x\u0332jaax\u0332xwhl nax\u0332'nidiit"], "input_type": "text", "id": 1651692717, "n_sentences": 16}
32 changes: 29 additions & 3 deletions backend_fairseq/listen_fairseq.py
@@ -5,9 +5,14 @@
# "--dynet-seed", dynet_seed,
# "--dynet-mem", "1000"
# ])
import time, shutil
import time, shutil, os
from watchdog.observers import Observer
from watchdog.events import PatternMatchingEventHandler
from sortedcontainers import SortedList
import model_inference

# job_list is a sorted list, by job ID
job_list = SortedList()

if __name__ == "__main__":
patterns = "*"
@@ -23,12 +28,21 @@ def on_created(event):
path_components = path.split("_")
# path_components = path.split("/")

if path_components[0] == "/data/fairseq":
newPath = shutil.copy(f"{event.src_path}", ".")
if path_components[0] == "/data/inputs/fairseq":
newPath = shutil.copy(f"{event.src_path}", "./jobs")
# then remove the file from /data/fairseq
# then make a list of current jobs
# feed the next-in-line job
# pass the next job into a script

# TODO: just write the files to backend_fairseq for now
#os.remove(event.src_path)
further_path_components = path_components[1].split(".")
job_id = further_path_components[0]
job_list.add(job_id)

elif path_components[0] == "/data/results/sentence":
print("Spotted sentence!")

# add it to the sorted list
# act on the first item in the list
@@ -52,10 +66,22 @@ def on_moved(event):
my_observer = Observer()
my_observer.schedule(my_event_handler, path, recursive=go_recursively)

# object for submitting Fairseq jobs
submitter = model_inference.FairseqSubmitter()

my_observer.start()
try:
while True:
# sleep to avoid running constantly
time.sleep(1)
# process jobs in the job list
if len(job_list) > 0:
# pass job to model_inference.py
tic = time.time()
current_job = job_list.pop(0)

# process file
submitter.process_batch(current_job)
except KeyboardInterrupt:
my_observer.stop()
my_observer.join()
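
Pieced together, the listener couples a watchdog Observer with a SortedList used as a priority queue keyed by job ID. A self-contained sketch of that same pattern, with the input directory and the fairseq_<jobid>.txt naming assumed from process_batch() below:

import time
from watchdog.observers import Observer
from watchdog.events import PatternMatchingEventHandler
from sortedcontainers import SortedList

jobs = SortedList()  # job IDs, kept in sorted order

def on_created(event):
    # Assumed naming from process_batch() below:
    # /data/inputs/fairseq_<jobid>.txt
    job_id = event.src_path.rsplit("_", 1)[-1].split(".")[0]
    jobs.add(job_id)

handler = PatternMatchingEventHandler(patterns=["*"], ignore_directories=True)
handler.on_created = on_created

observer = Observer()
observer.schedule(handler, "/data/inputs", recursive=False)
observer.start()
try:
    while True:
        time.sleep(1)  # sleep to avoid busy-waiting
        if jobs:
            print("processing job", jobs.pop(0))  # lowest job ID first
except KeyboardInterrupt:
    observer.stop()
observer.join()
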
220 changes: 220 additions & 0 deletions backend_fairseq/model_inference.py
@@ -0,0 +1,220 @@
"""
Passes a job through Fairseq.
Creates a job_id.dev file, which is passed to the Python wrapper for the pre-trained models.
"""

from __future__ import annotations
import json, re
import os
import subprocess
import cProfile
import time
from pretrained_models.run_fairseq import *


class FairseqSubmitter:
"""
Submit jobs to the background process of fairseq-interactive.
Keeps track of the number of examples seen by fairseq-interactive so far.
"""

def __init__(self):
self.first_example = 0
self.last_example = -1

def process_text_sentence_batch(self, data):
"""
Split the job into separate sentences, and send each of them to the background fairseq process
"""

# model options
jobid = data['id']
n_best = int(data['nbest'])
n_sentences = data['n_sentences']
sentences = data['text']

getSeg = data['getSeg']
getGloss = data['getGloss']
assert getSeg or getGloss

# init the subprocess
if getSeg:
stail, seg = init_fairseq_model('seg', n_best)
if getGloss:
gtail, gloss = init_fairseq_model('gloss', n_best)

for i in range(n_sentences):
sentence = sentences[i]
n_tokens = len(sentence.split(' '))
save_path = f'/data/results/sentence_{jobid}_{i}.std.out'

self.last_example += n_tokens
submit_sentence(sentence, i, getSeg, getGloss, self.first_example, self.last_example, n_best, save_path)
self.first_example += n_tokens

# found_sentences = 0
# # while found_sentences < n_sentences:
# # sentence, i, first, last, nbest, path = sentence_dict[found_sentences]
# # if get_sentence(sentence, i, first, last, nbest, path):
# # found_sentences += 1
# out_path = '/backend_fairseq/pretrained_models/io/outputs/gloss_out.txt'
# out_file = open(out_path, 'r')
# out_file.seek(0,2)
# out_str = ''
# while found_sentences < n_sentences:
# line = out_file.readline()
# if line:
# out_str += line
# sentence, i, first, last, nbest, path = sentence_dict[found_sentences]
# if get_sentence(sentence, out_str, i, first, last, nbest, path):
# found_sentences += 1
# else:
# sleep(0.01)

#out, err = gloss.communicate()

if getSeg:
seg.terminate()
stail.terminate()
if getGloss:
gloss.terminate()
gtail.terminate()

self.first_example = 0
self.last_example = -1

def process_elan_annotation_batch(self, data):
"""
Send each annotation as its own job
"""

# model options
jobid = data['id']
n_best = int(data['nbest'])
annot = data['eaf_data']

getSeg = data['getSeg']
getGloss = data['getGloss']
assert getSeg or getGloss

# init the subprocess
if getSeg:
stail, seg = init_fairseq_model('seg', n_best)
if getGloss:
gtail, gloss = init_fairseq_model('gloss', n_best)

for i in range(len(annot)):
text = annot[i]['annotation_text']
n_tokens = len(text.split(' '))
id = annot[i]['annotation_id']

save_path = f'/data/results/sentence_{jobid}_{i}.std.out'
self.last_example += n_tokens
submit_sentence(text, i, getSeg, getGloss, self.first_example, self.last_example, n_best, save_path, annotation_id=id)
self.first_example += n_tokens

if getSeg:
stail.terminate()
seg.terminate()
if getGloss:
gtail.terminate()
gloss.terminate()

self.first_example = 0
self.last_example = -1

def process_batch(self, job_id):
"""
Loads a job file from the server and sends it to the model
"""

# get the JSON contents
job_path = '/data/inputs/fairseq_{}.txt'.format(job_id)
with open(job_path, 'r') as new_job:
new_job_contents = new_job.read()
new_job_data = json.loads(new_job_contents)

# process based on input type
in_type = new_job_data['input_type']
if in_type == 'text':
start = time.time()

self.process_text_sentence_batch(new_job_data)

#cProfile.run('self.process_text_sentence_batch(new_job_data)')
end = time.time()
print('Time elapsed:', end-start)
elif in_type == "eaf_file":
self.process_elan_annotation_batch(new_job_data)


# def process_file(job_id):
# """
# Converts a job json file into an input for the model wrapper.
# Passes it to the model.
# """

# # TODO: just write the files to backend_fairseq for now
# job_path = '/data/inputs/fairseq_{}.txt'.format(job_id)
# with open(job_path, 'r') as new_job:
# new_job_contents = new_job.read()

# new_job_data = json.loads(new_job_contents)

# # TODO: Debug
# print(new_job_data)

# # check for options
# n_best = int(new_job_data['nbest'])

# if new_job_data['input_type'] == 'text':
# text_input = new_job_data['text']

# # convert text stream into a list of sentences
# sentences = []
# txt_lines = re.split('\n', text_input)
# for txtl in txt_lines:
# sent_lines = re.split('[.!?] ', txtl)
# for sl in sent_lines:
# sentences.append(sl)

# annotation_seq = None

# elif new_job_data['input_type'] == 'eaf_file':
# eaf_data = new_job_data['eaf_data']

# # each annotation is considered a sentence
# sentences = []
# annotation_list = []
# annotation_seq = []
# for annotation in eaf_data:
# annotation_value = annotation['annotation_text']
# annotation_id = annotation['annotation_id']
# sentences.append(annotation_value)

# # repeat the annotation value for each word, align in run_fairseq.py
# for i in range(len(annotation_value.split(' '))):
# annotation_seq.append(annotation_id)

# annotation_list.append(annotation_id)

# # overwrite job file with a list of annotations
# new_job_data['annotation_list'] = annotation_list
# with open(job_path, 'w') as out_file:
# out_file = json.dump(new_job_data, out_file)

# # Save file, send to the wrapper
# # TODO: save where?
# txt_path = '/data/inputs/fairseq_{}.sents'.format(job_id)
# with open(txt_path, 'w') as outpt:
# for sline in sentences:
# outpt.write(sline + '\n')

# # find task
# if new_job_data['task'] == 'gloss':
# call_glossing_model(txt_path, n_best, '/data/results/output_inference_json-{}.std.out'.format(job_id), annotations=annotation_seq)
# # submit_glossing_text(txt_path, n_best, '/data/results/output_inference_json-{}.std.out'.format(job_id), annotations=annotation_seq)
# # print('Processing ', txt_path)
# elif new_job_data['task'] == 'morphseg':
# call_morphseg_model(txt_path, n_best, '/data/results/output_inference_json-{}.std.out'.format(job_id), annotations=annotation_seq)
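
Taken together with listen_fairseq.py, the intended call path is short. A hypothetical driver, assuming a job file already sits at the path process_batch() expects:

from model_inference import FairseqSubmitter

submitter = FairseqSubmitter()
# process_batch() reads /data/inputs/fairseq_<jobid>.txt, so the ID of
# the committed sample job above would be passed like this:
submitter.process_batch("1651692717")
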
Empty file.
Git LFS file not shown
