Skip to content

Commit

Permalink
app-example
Browse files Browse the repository at this point in the history
  • Loading branch information
RodionfromHSE committed Nov 20, 2023
1 parent 43e4365 commit ba04635
Showing 1 changed file with 280 additions and 0 deletions.
280 changes: 280 additions & 0 deletions src/prod/example.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,280 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The autoreload extension is already loaded. To reload it, use:\n",
" %reload_ext autoreload\n"
]
}
],
"source": [
"%load_ext autoreload\n",
"%autoreload 2"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"/Users/user010/Desktop/Programming/ML/En2RuTranslator\n"
]
}
],
"source": [
"import os\n",
"import sys\n",
"\n",
"root_dir = os.path.abspath(os.path.join(os.getcwd(), '../..'))\n",
"print(root_dir)\n",
"assert os.path.exists(root_dir), f'Could not find root directory at {root_dir}'\n",
"sys.path.insert(0, root_dir)\n",
"\n",
"from custom_utils.config_handler import read_config, pprint_config"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{\n",
" \"root\": \"/Users/user010/Desktop/Programming/ML/En2RuTranslator\",\n",
" \"params\": {\n",
" \"batch_size\": 16,\n",
" \"max_length\": 512,\n",
" \"train_args\": {\n",
" \"evaluation_strategy\": \"epoch\",\n",
" \"learning_rate\": 2e-05,\n",
" \"per_device_train_batch_size\": 16,\n",
" \"per_device_eval_batch_size\": 16,\n",
" \"weight_decay\": 0.01,\n",
" \"save_total_limit\": 3,\n",
" \"num_train_epochs\": 4,\n",
" \"predict_with_generate\": true\n",
" },\n",
" \"wandb_args\": {\n",
" \"report_to\": \"wandb\",\n",
" \"run_name\": \"finetune\"\n",
" }\n",
" },\n",
" \"dest_synonym_searcher\": {\n",
" \"path\": \"/Users/user010/Desktop/Programming/ML/En2RuTranslator/models/embs/cc.ru.100.bin\",\n",
" \"type\": \"fasttext\"\n",
" },\n",
" \"src_synonym_searcher\": {\n",
" \"path\": \"/Users/user010/Desktop/Programming/ML/En2RuTranslator/models/embs/cc.en.100.bin\",\n",
" \"type\": \"fasttext\"\n",
" },\n",
" \"translator\": {\n",
" \"name\": \"opus-distilled-en-ru\",\n",
" \"model_and_tokenizer_name\": \"under-tree/transformer-en-ru\",\n",
" \"output_dir\": \"/Users/user010/Desktop/Programming/ML/En2RuTranslator/models/opus-distilled-en-ru/finetuned\",\n",
" \"type\": \"seq2seq\"\n",
" },\n",
" \"attention_extractor\": {\n",
" \"type\": \"random\"\n",
" }\n",
"}\n"
]
}
],
"source": [
"overrides = [\"setup=prod\"]\n",
"cfg = read_config(overrides=overrides)\n",
"pprint_config(cfg)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/user010/Desktop/Programming/ML/En2RuTranslator/venv/lib/python3.11/site-packages/transformers/models/marian/tokenization_marian.py:197: UserWarning: Recommended: pip install sacremoses.\n",
" warnings.warn(\"Recommended: pip install sacremoses.\")\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 4.22 s, sys: 2.59 s, total: 6.81 s\n",
"Wall time: 8.32 s\n"
]
}
],
"source": [
"%%time\n",
"from translator_app import TranslatorApp\n",
"\n",
"app = TranslatorApp(cfg)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/user010/Desktop/Programming/ML/En2RuTranslator/venv/lib/python3.11/site-packages/transformers/tokenization_utils_base.py:3848: UserWarning: `as_target_tokenizer` is deprecated and will be removed in v5 of Transformers. You can tokenize your labels by using the argument `text_target` of the regular `__call__` method (either in the same call as your input texts if you use the same keyword arguments, or in a separate call.\n",
" warnings.warn(\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 9.32 s, sys: 1.47 s, total: 10.8 s\n",
"Wall time: 11.3 s\n"
]
}
],
"source": [
"%%time\n",
"text = \"Oh, dear, I'm afraid I've said too much. I'm a bad, bad blogger.\"\n",
"app_output = app(text)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### App Output\n",
"\n",
"* input words\n",
"* input tokens\n",
"* src synonyms\n",
"* output words\n",
"* output tokens\n",
"* dest synonyms\n",
"* translation\n",
"* attention*\n",
"\n",
"```python\n",
"{\n",
" 'translation': str,\n",
" 'input_tokens': list[str],\n",
" 'input_words': list[str],\n",
" 'output_tokens': list[str],\n",
" 'output_words': list[str],\n",
" 'src_synonyms': list[list[str]],\n",
" 'dest_synonyms': list[list[str]],\n",
" 'attention': list[list[float]],\n",
"}\n",
"```"
]
},
{
"cell_type": "code",
"execution_count": 81,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['oh', ',', 'dear', ',', 'i', \"'m\", 'afraid', 'i', \"'ve\", 'said', 'too', 'much', '.', 'i', \"'m\", 'a', 'bad', ',', 'bad', 'blogger', '.']\n"
]
}
],
"source": [
"print(app_output['input_words'])"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['▁Oh', ',', '▁dear', ',', '▁I', \"'\", 'm', '▁afraid', '▁I', \"'\", 've', '▁said', '▁too', '▁much', '.', '▁I', \"'\", 'm', '▁a', '▁bad', ',', '▁bad', '▁blogger', '.']\n"
]
}
],
"source": [
"print(app_output['input_tokens'])"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['dearest', 'dears', 'dear.And', 'darling', 'dearies']\n"
]
}
],
"source": [
"dear_idx = app_output['input_words'].index('dear')\n",
"print(app_output['src_synonyms'][dear_idx])"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"О, дорогая, боюсь, я слишком много сказал. Я плохой, плохой блоггер.\n"
]
}
],
"source": [
"print(app_output['translation'])"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

0 comments on commit ba04635

Please sign in to comment.