Merge pull request #3 from mpsilfve/fairseq-container
Fairseq container
mpsilfve authored May 17, 2022
2 parents c4a2c69 + e2c2bed commit 0ffaf86
Showing 90 changed files with 8,844 additions and 662 deletions.
1 change: 1 addition & 0 deletions .gitattributes
@@ -0,0 +1 @@
*.pt filter=lfs diff=lfs merge=lfs -text
4 changes: 4 additions & 0 deletions .gitmodules
@@ -1,3 +1,7 @@
[submodule "backend_coling/coling2018-neural-transition-based-morphology"]
path = backend_coling/coling2018-neural-transition-based-morphology
url = [email protected]:mpsilfve/coling2018-neural-transition-based-morphology.git

[submodule "backend_fairseq/fairseq"]
path = backend_fairseq/fairseq
url = [email protected]:mpsilfve/fairseq.git
17 changes: 7 additions & 10 deletions backend_fairseq/Dockerfile
@@ -1,15 +1,12 @@
FROM python:3.9
WORKDIR /backend_fairseq
# RUN apt-get update && apt-get install -y \
# python3.9\
# python-pip3\
# && apt-get clean && rm -rf /var/lib/apt/lists/*
RUN apt-get update && apt-get install -y \
python3.9\
python3-pip\
cython3\
&& apt-get clean && rm -rf /var/lib/apt/lists/*
COPY requirements.txt .
RUN pip install --no-cache-dir --upgrade pip && \
pip install --no-cache-dir -r requirements.txt
# COPY fairseq ./fairseq
# COPY word_model ./word_model
# RUN cd fairseq && pip install --editable ./
# add run commands that will install (setup) the model
# CMD ["/bin/sh"]
CMD tail -f /dev/null
ENV PYTHONUNBUFFERED=1
CMD /backend_fairseq/entrypoint.sh
43 changes: 43 additions & 0 deletions backend_fairseq/entrypoint.sh
@@ -0,0 +1,43 @@
#!/bin/bash

# install Fairseq
pwd
cd fairseq && pip install ./ && pip install --upgrade numpy
pwd
cd ../pretrained_models

# create named pipes for the models
PIPE_DIR=io/pipes
if [[ -p $PIPE_DIR/glossPipe ]]
then
rm $PIPE_DIR/glossPipe
fi
mkfifo $PIPE_DIR/glossPipe

if [[ -p $PIPE_DIR/glossOut ]]
then
rm $PIPE_DIR/glossOut
fi
mkfifo $PIPE_DIR/glossOut

if [[ -p $PIPE_DIR/morphSegPipe ]]
then
rm $PIPE_DIR/morphSegPipe
fi
mkfifo $PIPE_DIR/morphSegPipe

# init the fairseq-interactive processes
OUT_DIR=io/outputs
rm $OUT_DIR/gloss_out.txt
rm $OUT_DIR/morph_seg_out.txt

#time head -n 1 dev_small.txt | fairseq-interactive --path data/gloss/checkpoint_best.pt --beam 5 --nbest 1 --source-lang src --target-lang trg data/gloss/gloss_preprocess > out.txt

# tail -f $PIPE_DIR/glossPipe | fairseq-interactive --path data/gloss/checkpoint_best.pt --beam 5 --nbest 4 \
# --source-lang src --target-lang trg data/gloss/gloss_preprocess > $OUT_DIR/gloss_out.txt &

# tail -f $PIPE_DIR/morphSegPipe | fairseq-interactive --path data/morphseg/lstm/checkpoint_best.pt --beam 5 --nbest 4 \
# --source-lang src --target-lang trg data/morphseg/lstm/lstm_preprocess > $OUT_DIR/morph_seg_out.txt &

# init the listener
cd .. && python3 -u listen_fairseq.py
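
The commented-out pipelines above suggest the intended design: a `tail -f` on each named pipe feeds a long-running fairseq-interactive process, and its hypotheses are redirected to a file under io/outputs. Below is a minimal Python sketch of the producer side of that pattern, assuming those pipelines are enabled and the paths match the script above; `submit_line` and `read_new_output` are hypothetical helpers, not part of this commit.

# Paths assumed from entrypoint.sh above.
PIPE_PATH = "pretrained_models/io/pipes/glossPipe"
OUT_PATH = "pretrained_models/io/outputs/gloss_out.txt"

def submit_line(sentence: str) -> None:
    # Opening a FIFO for writing blocks until a reader (the tail -f
    # feeding fairseq-interactive) holds the other end open.
    with open(PIPE_PATH, "w") as pipe:
        pipe.write(sentence + "\n")

def read_new_output(offset: int) -> tuple[str, int]:
    # fairseq-interactive appends hypotheses to the redirected output
    # file; read from the last known offset instead of re-parsing it all.
    with open(OUT_PATH, "r") as out:
        out.seek(offset)
        chunk = out.read()
        return chunk, out.tell()
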
1 change: 1 addition & 0 deletions backend_fairseq/fairseq
Submodule fairseq added at eb228e
1 change: 1 addition & 0 deletions backend_fairseq/jobs
@@ -0,0 +1 @@
{"model": "fairseq", "getSeg": true, "getGloss": true, "nbest": "1", "text": ["Helthl k'uuhl siwadi'm ahl la'oo'y g\u0332o'ohl Hla Heen", "Ky'ulhl k\u0332'ay 'mesim hanak\u0332' wat as Wiladoo ant 'wahl ky'eegwihl t'a'wihlg\u0332an", "Ii sii hlguuhlxwt loot iit haboo'lt", "Needii sim g\u0332al 'wudin 'nekwt ii k\u0332'ap g\u0332ala'y tusthl bent", "Ii sildim wiltxwst", "Ha'ondii 'nekwt dii sgyethl hlgu tk'ihlxw hlguuhlxw k\u0332'ay 'mesim hanak\u0332'si,", "ii ap gyatt wilt gya'adiit, k\u0332'ap hogyag\u0332am didils wilt gya'ahl gyet", "Ii helt ant neediit wilaaxt wil 'nit hlguuhlxum t'a'wihlg\u0332an k\u0332'ay 'mesim hanak\u0332'si", "Ii lukw'il ky'aa 'wii 'nekwt", "G\u0332asg\u0332oohl hla 'wii t'ishl t'a'wihlg\u0332an tunsi", "Ii belgi gwitkw'ootxwhl enx\u0332 g\u0332anhl g\u0332ax\u0332biist g\u0332anhl huwilp", "Wihl ap gupgupxwidiihl wilt, wilt gya'ahl hli gyedihl ts'epsi, iit liksgyedinhl hli gyedihl g\u0332alts'epsi", "Iit seex\u0332diithl k'i'yhl wilp x\u0332hliphetxwit ahl hla k\u0332'aphl ts'epsi", "Iit dok\u0332s dipunhl g\u0332ax\u0332biist iit liiluxwdiit", "K'i'yhl sa iit nax\u0332'nis dipun wil yukwhl yook\u0332xwhl similoo'o", "Hets'im x\u0332jaax\u0332xwhl nax\u0332'nidiit"], "input_type": "text", "id": 1651692717, "n_sentences": 16}
32 changes: 29 additions & 3 deletions backend_fairseq/listen_fairseq.py
@@ -5,9 +5,14 @@
# "--dynet-seed", dynet_seed,
# "--dynet-mem", "1000"
# ])
import time, shutil
import time, shutil, os
from watchdog.observers import Observer
from watchdog.events import PatternMatchingEventHandler
from sortedcontainers import SortedList
import model_inference

# job_list is a sorted list, by job ID
job_list = SortedList()

if __name__ == "__main__":
patterns = "*"
@@ -23,12 +28,21 @@ def on_created(event):
path_components = path.split("_")
# path_components = path.split("/")

if path_components[0] == "/data/fairseq":
newPath = shutil.copy(f"{event.src_path}", ".")
if path_components[0] == "/data/inputs/fairseq":
newPath = shutil.copy(f"{event.src_path}", "./jobs")
# then remove the file from /data/fairseq
# then make a list of current jobs
# feed the next-in-line job
# pass the next job into a script

# TODO: just write the files to backend_fairseq for now
#os.remove(event.src_path)
further_path_components = path_components[1].split(".")
job_id = further_path_components[0]
job_list.add(job_id)

elif path_components[0] == "/data/results/sentence":
print("Spotted sentence!")

# add it to the sorted list
# act on the first item in the list
@@ -52,10 +66,22 @@ def on_moved(event):
my_observer = Observer()
my_observer.schedule(my_event_handler, path, recursive=go_recursively)

# object for submitting Fairseq jobs
submitter = model_inference.FairseqSubmitter()

my_observer.start()
try:
while True:
# sleep to avoid running constantly
time.sleep(1)
# process jobs in the job list
if len(job_list) > 0:
# pass job to model_inference.py
tic = time.time()
current_job = job_list.pop(0)

# process file
submitter.process_batch(current_job)
except KeyboardInterrupt:
my_observer.stop()
my_observer.join()
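
Pieced together, the listener couples a watchdog Observer with a SortedList used as a priority queue keyed by job ID. A self-contained sketch of that same pattern, with the input directory and the fairseq_<jobid>.txt naming assumed from process_batch() below:

import time
from watchdog.observers import Observer
from watchdog.events import PatternMatchingEventHandler
from sortedcontainers import SortedList

jobs = SortedList()  # job IDs, kept in sorted order

def on_created(event):
    # Assumed naming from process_batch() below:
    # /data/inputs/fairseq_<jobid>.txt
    job_id = event.src_path.rsplit("_", 1)[-1].split(".")[0]
    jobs.add(job_id)

handler = PatternMatchingEventHandler(patterns=["*"], ignore_directories=True)
handler.on_created = on_created

observer = Observer()
observer.schedule(handler, "/data/inputs", recursive=False)
observer.start()
try:
    while True:
        time.sleep(1)  # sleep to avoid busy-waiting
        if jobs:
            print("processing job", jobs.pop(0))  # lowest job ID first
except KeyboardInterrupt:
    observer.stop()
observer.join()
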
220 changes: 220 additions & 0 deletions backend_fairseq/model_inference.py
@@ -0,0 +1,220 @@
"""
Passes a job through Fairseq.
Creates a job_id.dev file, which is passed to the Python wrapper for the pre-trained models.
"""

from __future__ import annotations
import json, re
import os
import subprocess
import cProfile
import time
from pretrained_models.run_fairseq import *


class FairseqSubmitter:
"""
Submit jobs to the background process of fairseq-interactive.
Keeps track of the number of examples seen by fairseq-interactive so far.
"""

def __init__(self):
self.first_example = 0
self.last_example = -1

def process_text_sentence_batch(self, data):
"""
Split the job into separate sentences, and send each of them to the background fairseq process
"""

# model options
jobid = data['id']
n_best = int(data['nbest'])
n_sentences = data['n_sentences']
sentences = data['text']

getSeg = data['getSeg']
getGloss = data['getGloss']
assert getSeg or getGloss

# init the subprocess
if getSeg:
stail, seg = init_fairseq_model('seg', n_best)
if getGloss:
gtail, gloss = init_fairseq_model('gloss', n_best)

for i in range(n_sentences):
sentence = sentences[i]
n_tokens = len(sentence.split(' '))
save_path = f'/data/results/sentence_{jobid}_{i}.std.out'

self.last_example += n_tokens
submit_sentence(sentence, i, getSeg, getGloss, self.first_example, self.last_example, n_best, save_path)
self.first_example += n_tokens

# found_sentences = 0
# # while found_sentences < n_sentences:
# # sentence, i, first, last, nbest, path = sentence_dict[found_sentences]
# # if get_sentence(sentence, i, first, last, nbest, path):
# # found_sentences += 1
# out_path = '/backend_fairseq/pretrained_models/io/outputs/gloss_out.txt'
# out_file = open(out_path, 'r')
# out_file.seek(0,2)
# out_str = ''
# while found_sentences < n_sentences:
# line = out_file.readline()
# if line:
# out_str += line
# sentence, i, first, last, nbest, path = sentence_dict[found_sentences]
# if get_sentence(sentence, out_str, i, first, last, nbest, path):
# found_sentences += 1
# else:
# sleep(0.01)

#out, err = gloss.communicate()

if getSeg:
seg.terminate()
stail.terminate()
if getGloss:
gloss.terminate()
gtail.terminate()

self.first_example = 0
self.last_example = -1

def process_elan_annotation_batch(self, data):
"""
Send each annotation as its own job
"""

# model options
jobid = data['id']
n_best = int(data['nbest'])
annot = data['eaf_data']

getSeg = data['getSeg']
getGloss = data['getGloss']
assert getSeg or getGloss

# init the subprocess
if getSeg:
stail, seg = init_fairseq_model('seg', n_best)
if getGloss:
gtail, gloss = init_fairseq_model('gloss', n_best)

for i in range(len(annot)):
text = annot[i]['annotation_text']
n_tokens = len(text.split(' '))
id = annot[i]['annotation_id']

save_path = f'/data/results/sentence_{jobid}_{i}.std.out'
self.last_example += n_tokens
submit_sentence(text, i, getSeg, getGloss, self.first_example, self.last_example, n_best, save_path, annotation_id=id)
self.first_example += n_tokens

if getSeg:
stail.terminate()
seg.terminate()
if getGloss:
gtail.terminate()
gloss.terminate()

self.first_example = 0
self.last_example = -1

def process_batch(self, job_id):
"""
Loads a job file from the server and sends it to the model
"""

# get the JSON contents
job_path = '/data/inputs/fairseq_{}.txt'.format(job_id)
with open(job_path, 'r') as new_job:
new_job_contents = new_job.read()
new_job_data = json.loads(new_job_contents)

# process based on input type
in_type = new_job_data['input_type']
if in_type == 'text':
start = time.time()

self.process_text_sentence_batch(new_job_data)

#cProfile.run('self.process_text_sentence_batch(new_job_data)')
end = time.time()
print('Time elapsed:', end-start)
elif in_type == "eaf_file":
self.process_elan_annotation_batch(new_job_data)


# def process_file(job_id):
# """
# Converts a job json file into an input for the model wrapper.
# Passes it to the model.
# """

# # TODO: just write the files to backend_fairseq for now
# job_path = '/data/inputs/fairseq_{}.txt'.format(job_id)
# with open(job_path, 'r') as new_job:
# new_job_contents = new_job.read()

# new_job_data = json.loads(new_job_contents)

# # TODO: Debug
# print(new_job_data)

# # check for options
# n_best = int(new_job_data['nbest'])

# if new_job_data['input_type'] == 'text':
# text_input = new_job_data['text']

# # convert text stream into a list of sentences
# sentences = []
# txt_lines = re.split('\n', text_input)
# for txtl in txt_lines:
# sent_lines = re.split('[.!?] ', txtl)
# for sl in sent_lines:
# sentences.append(sl)

# annotation_seq = None

# elif new_job_data['input_type'] == 'eaf_file':
# eaf_data = new_job_data['eaf_data']

# # each annotation is considered a sentence
# sentences = []
# annotation_list = []
# annotation_seq = []
# for annotation in eaf_data:
# annotation_value = annotation['annotation_text']
# annotation_id = annotation['annotation_id']
# sentences.append(annotation_value)

# # repeat the annotation value for each word, align in run_fairseq.py
# for i in range(len(annotation_value.split(' '))):
# annotation_seq.append(annotation_id)

# annotation_list.append(annotation_id)

# # overwrite job file with a list of annotations
# new_job_data['annotation_list'] = annotation_list
# with open(job_path, 'w') as out_file:
# out_file = json.dump(new_job_data, out_file)

# # Save file, send to the wrapper
# # TODO: save where?
# txt_path = '/data/inputs/fairseq_{}.sents'.format(job_id)
# with open(txt_path, 'w') as outpt:
# for sline in sentences:
# outpt.write(sline + '\n')

# # find task
# if new_job_data['task'] == 'gloss':
# call_glossing_model(txt_path, n_best, '/data/results/output_inference_json-{}.std.out'.format(job_id), annotations=annotation_seq)
# # submit_glossing_text(txt_path, n_best, '/data/results/output_inference_json-{}.std.out'.format(job_id), annotations=annotation_seq)
# # print('Processing ', txt_path)
# elif new_job_data['task'] == 'morphseg':
# call_morphseg_model(txt_path, n_best, '/data/results/output_inference_json-{}.std.out'.format(job_id), annotations=annotation_seq)
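
Taken together with listen_fairseq.py, the intended call path is short. A hypothetical driver, assuming a job file already sits at the path process_batch() expects:

from model_inference import FairseqSubmitter

submitter = FairseqSubmitter()
# process_batch() reads /data/inputs/fairseq_<jobid>.txt, so the ID of
# the committed sample job above would be passed like this:
submitter.process_batch("1651692717")
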
Empty file.
Git LFS file not shown
