
Merge pull request #2 from RodionfromHSE/finetune
Finetune
RodionfromHSE authored Nov 19, 2023
2 parents 3e7200b + 5425ab9 commit 3cc40b3
Showing 37 changed files with 3,838 additions and 140 deletions.
2 changes: 1 addition & 1 deletion .dvc/config
@@ -2,4 +2,4 @@
autostage = true
remote = storage
['remote "storage"']
url = gdrive://1XzdLLDSWCRT57Kj9ZqYlfTk_0phVu6Fz
url = gdrive://19-PaarPhbUW27F4XXpLXS1SBu0Dvzch1
1 change: 1 addition & 0 deletions .gitignore
@@ -88,3 +88,4 @@ target/
# Mypy cache
.mypy_cache/
/data
/models
71 changes: 71 additions & 0 deletions README.md
@@ -9,6 +9,7 @@ Project Organization
├── LICENSE
├── Makefile <- Makefile with commands like `make data` or `make train`
├── README.md <- The top-level README for developers using this project.
├── conf <- Configuration files
├── data
│   ├── external <- Data from third party sources.
│   ├── interim <- Intermediate data that has been transformed.
@@ -54,4 +55,74 @@ Project Organization

--------

## Installation

```bash
pip3 install -r requirements.txt
```

## Dataset

#### Training dataset
The training dataset is based on `saier/unarxive_citrec` [hf](https://huggingface.co/datasets/saier/unarxive_citrec).

*Details*:
```yaml
Train size: 9082
Valid size: 702
Test size: 568
```
All samples are between `128` and `512` characters long (TO-DO: characters -> tokens)\
More in `notebooks/data/dataset_download.ipynb`
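
For illustration, a minimal sketch of the download-and-filter step (the `text` column name is an assumption about the schema; the notebook is the authoritative version):

```python
# Minimal sketch: download the dataset and keep samples of suitable length.
from datasets import load_dataset

dataset = load_dataset("saier/unarxive_citrec")

# Keep only samples between 128 and 512 characters long
# (the "text" column name is an assumption about the schema).
dataset = dataset.filter(lambda sample: 128 <= len(sample["text"]) <= 512)
```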

After collecting the dataset, we carefully translated the samples from English to Russian using the OpenAI API.\
Details in `notebooks/data/dataset_translate.ipynb`
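
A minimal sketch of the translation step, assuming one chat-completion call per sample (`gpt-3.5-turbo` is an assumed model; the actual prompt and batching live in the notebook):

```python
# Hedged sketch of English -> Russian translation via the OpenAI API.
from openai import OpenAI

client = OpenAI()  # reads OPENAI_API_KEY from the environment

def translate_en_to_ru(text: str) -> str:
    response = client.chat.completions.create(
        model="gpt-3.5-turbo",  # assumed model, not confirmed by the repo
        messages=[
            {"role": "system", "content": "Translate the user's text from English to Russian."},
            {"role": "user", "content": text},
        ],
    )
    return response.choices[0].message.content
```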

#### Dataset for model comparison (EvalDataset)
This dataset is based on `turkic_xwmt`, `subset=ru-en`, `split=test` [hf](https://huggingface.co/datasets/turkic_xwmt).

Dataset size: 1000
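
A one-line sketch matching the load parameters in `conf/notebooks/finetune/model_eval.yaml`:

```python
# Load the ru-en test split used for model comparison.
from datasets import load_dataset

eval_dataset = load_dataset("turkic_xwmt", name="ru-en", split="test")
```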

## Model comparison

Model comparison is based on the BLEU score of each model's translations against reference translations produced by OpenAI.

**Models**:\
transformer-en-ru: `Helsinki-NLP/opus-mt-en-ru` [hf](https://huggingface.co/Helsinki-NLP/opus-mt-en-ru)\
nnlb-1.3B-distilled: `facebook/nllb-200-distilled-1.3B` [hf](https://huggingface.co/facebook/nllb-200-distilled-1.3B)


**Results**:
```yaml
transformer-en-ru BLEU: 2.58
nnlb-1.3B-distilled BLEU: 2.55
```

Even though the difference isn't statistically significant, the transformer-en-ru model was chosen since it's faster and smaller.\
Details in `src/finetune/eval_bleu.py`
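
A sketch of the comparison, using the column names from `conf/dataset/model_eval.yaml` (`sacrebleu` is an assumption; `src/finetune/eval_bleu.py` is the actual implementation):

```python
# Hedged sketch: corpus BLEU for each candidate column against the reference.
import pandas as pd
from sacrebleu import corpus_bleu

df = pd.read_csv("data/processed/model_eval_results.csv")
references = [df["target"].tolist()]  # one reference stream per sample

for candidate in ["transformer-en-ru", "nnlb-1.3B-distilled"]:
    score = corpus_bleu(df[candidate].tolist(), references)
    print(f"{candidate} BLEU: {score.score:.2f}")
```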

## Model finetuning

Simple seq2seq fine-tuning of transformer-en-ru.\
Details in `notebooks/finetune/finetune.ipynb`.\
Model on [hf](https://huggingface.co/under-tree/transformer-en-ru)
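
The training setup mirrors `conf/params/finetune.yaml`; a condensed sketch (the `en`/`ru` column names and split names are assumptions about the dataset schema, and the notebook is authoritative):

```python
# Condensed sketch of the fine-tuning setup; details may differ from the notebook.
from datasets import load_dataset
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
)

model_name = "Helsinki-NLP/opus-mt-en-ru"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

raw = load_dataset("waleko/unarXive-en2ru")

def preprocess(batch):
    # "en"/"ru" column names are assumptions about the dataset schema.
    model_inputs = tokenizer(batch["en"], max_length=512, truncation=True)
    labels = tokenizer(text_target=batch["ru"], max_length=512, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized = raw.map(preprocess, batched=True)

# Values below are taken from conf/params/finetune.yaml.
args = Seq2SeqTrainingArguments(
    output_dir="models/opus-distilled-en-ru/finetuned",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=4,
    predict_with_generate=True,
    report_to="wandb",
    run_name="finetune",
)

trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["validation"],  # split name is an assumption
    data_collator=DataCollatorForSeq2Seq(tokenizer, model=model),
)
trainer.train()
```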

**Fine-tuned model results**:
```yaml
eval_loss: 0.656
eval_bleu: 67.197
```
(BLEU is suspiciously high)

## Translation App

**Synonyms Searcher**\
The simple version is based on a `word2vec`-style embedding model, namely `fasttext` ([link](https://fasttext.cc/docs/en/crawl-vectors.html)). We chose fastText because it handles out-of-vocabulary words.
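
A minimal usage sketch, assuming the `cc.en.100.bin` vectors referenced in `conf/model/fasttext_en.yaml`:

```python
# Hedged sketch: nearest-neighbor lookup as a stand-in for synonym search.
import fasttext

model = fasttext.load_model("models/embs/cc.en.100.bin")

# Works even for out-of-vocabulary words, since fastText builds word
# vectors from character n-grams.
print(model.get_nearest_neighbors("citation", k=5))
```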

<p><small>Project based on the <a target="_blank" href="https://drivendata.github.io/cookiecutter-data-science/">cookiecutter data science project template</a>. #cookiecutterdatascience</small></p>
1 change: 1 addition & 0 deletions conf/.gitignore
@@ -0,0 +1 @@
config.yaml
9 changes: 9 additions & 0 deletions conf/config.yaml
@@ -0,0 +1,9 @@
defaults:
- _self_
- dataset: null
- model: null
- params: null
- setup: null


root: /Users/user010/Desktop/Programming/ML/En2RuTranslator
7 changes: 7 additions & 0 deletions conf/dataset/model_eval.yaml
@@ -0,0 +1,7 @@
path: ${root}/data/processed/model_eval_results.csv

cols: # cols to be used when calculating BLEU
reference: target
candidates:
- transformer-en-ru
- nnlb-1.3B-distilled
1 change: 1 addition & 0 deletions conf/dataset/model_eval_raw.yaml
@@ -0,0 +1 @@
path: ${root}/data/processed/model_eval.csv
1 change: 1 addition & 0 deletions conf/dataset/unarxive.yaml
@@ -0,0 +1 @@
path: "waleko/unarXive-en2ru"
2 changes: 2 additions & 0 deletions conf/model/fasttext_en.yaml
@@ -0,0 +1,2 @@
path: ${root}/models/embs/cc.en.100.bin
type: fasttext
2 changes: 2 additions & 0 deletions conf/model/fasttext_ru.yaml
@@ -0,0 +1,2 @@
path: ${root}/models/embs/cc.ru.100.bin
type: fasttext
2 changes: 2 additions & 0 deletions conf/model/nnlb_1.3B.yaml
@@ -0,0 +1,2 @@
name: nnlb-1.3B-distilled
model_and_tokenizer_name: facebook/nllb-200-distilled-1.3B
4 changes: 4 additions & 0 deletions conf/model/opus_distilled_en_ru.yaml
@@ -0,0 +1,4 @@
name: opus-distilled-en-ru
model_and_tokenizer_name: "under-tree/transformer-en-ru"
output_dir: ${root}/models/${.name}/finetuned
type: seq2seq
2 changes: 2 additions & 0 deletions conf/model/opus_en_ru.yaml
@@ -0,0 +1,2 @@
name: opus-en-ru
model_and_tokenizer_name: Helsinki-NLP/opus-mt-en-ru
1 change: 1 addition & 0 deletions conf/model/random_attention_extractor.yaml
@@ -0,0 +1 @@
type: "random"
13 changes: 13 additions & 0 deletions conf/notebooks/finetune/candidates_inference.yaml
@@ -0,0 +1,13 @@
root: ???

nnlb_model:
name: nnlb-1.3B-distilled
model_and_tokenizer_name: facebook/nllb-200-distilled-1.3B

mt_model:
name: transformer-en-ru
model_and_tokenizer_name: Helsinki-NLP/opus-mt-en-ru

inference_dataset_path: ${root}/data/processed/model_eval.csv
results_path: ${root}/data/processed/model_eval_results.csv

File renamed without changes.
7 changes: 7 additions & 0 deletions conf/notebooks/finetune/model_eval.yaml
@@ -0,0 +1,7 @@
root: ???
load_dataset_params:
path: 'turkic_xwmt'
name: 'ru-en'
split: 'test'
save_path: '${root}/data/processed/model_eval.csv'

15 changes: 15 additions & 0 deletions conf/params/finetune.yaml
@@ -0,0 +1,15 @@
batch_size: 16
max_length: 512
train_args:
evaluation_strategy: epoch
learning_rate: 2e-5
per_device_train_batch_size: ${..batch_size}
per_device_eval_batch_size: ${..batch_size}
weight_decay: 0.01
save_total_limit: 3
num_train_epochs: 4
predict_with_generate: true

wandb_args:
report_to: wandb
run_name: finetune
6 changes: 6 additions & 0 deletions conf/setup/all_models_example.yaml
@@ -0,0 +1,6 @@
# @package _global_

defaults:
- /model@model1: opus_en_ru
- /model@model2: opus_distilled_en_ru
- override /dataset: unarxive
7 changes: 7 additions & 0 deletions conf/setup/finetune.yaml
@@ -0,0 +1,7 @@
# @package _global_

defaults:
- /model@pretrained: opus_en_ru
- /model@finetuned: opus_distilled_en_ru
- override /dataset: unarxive
- override /params: finetune
7 changes: 7 additions & 0 deletions conf/setup/inference.yaml
@@ -0,0 +1,7 @@
# @package _global_

defaults:
- /model@opus_model: opus_en_ru
- /model@nnlb_model: nnlb_1.3B
- /dataset@inference_dataset: model_eval_raw
- /dataset@result_dataset: model_eval
8 changes: 8 additions & 0 deletions conf/setup/prod.yaml
@@ -0,0 +1,8 @@
# @package _global_

defaults:
- /model@dest_synonym_searcher: fasttext_ru
- /model@src_synonym_searcher: fasttext_en
- /model@translator: opus_distilled_en_ru
- /model@attention_extractor: random_attention_extractor
- override /params: finetune
Empty file added custom_utils/__init__.py
25 changes: 25 additions & 0 deletions custom_utils/config_handler.py
@@ -0,0 +1,25 @@
from omegaconf import OmegaConf
import json
import typing as tp
import os
from hydra import initialize_config_dir, compose

__ROOT_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
__CONFIG_DIR = os.path.join(__ROOT_DIR, "conf")

def read_config(config_dir: str = __CONFIG_DIR, overrides: tp.Optional[tp.List[str]] = None) -> OmegaConf:
"""
:@param config_dir: path to config directory
:@param overrides: list of overrides (e.g. ["dataset=model_eval"])
:@return: OmegaConf object
"""
config_dir = os.path.abspath(config_dir)
with initialize_config_dir(config_dir=config_dir, version_base=None):
cfg = compose(config_name="config", overrides=overrides)
cfg = OmegaConf.create(OmegaConf.to_yaml(cfg, resolve=True))
return cfg

def pprint_config(cfg: OmegaConf) -> None:
"Pretty print config"
print(json.dumps(OmegaConf.to_container(cfg), indent=2))
6 changes: 3 additions & 3 deletions data.dvc
@@ -1,6 +1,6 @@
outs:
- md5: 255799c6a8913d73679631d546a9dd88.dir
nfiles: 13
- md5: a04b7051c1e5067e29c68137c321dae1.dir
nfiles: 15
hash: md5
path: data
size: 19936558
size: 21297660
6 changes: 6 additions & 0 deletions models.dvc
@@ -0,0 +1,6 @@
outs:
- md5: c32a6fc3220dba5ae7628692d397c852.dir
size: 4892930090
nfiles: 2
hash: md5
path: models
