From 0371e7d83f423e6e60bb0e43b3eef344fa508c5a Mon Sep 17 00:00:00 2001 From: Aaron W Chen Date: Mon, 29 Jul 2024 22:12:38 -0700 Subject: [PATCH] Finish init DVC/MLflow & refactor MeaLeon backend Trying first round of combining DVC with MLflow. DVC explicitly handles data, both raw and processed data from data cleaning and embedding transformation. MLflow handles the embedding model and classes needed for the model and model pipeline. MeaLeon backend has been refactored into a more hierarchical code structure with clearer delineation of functions and naming --- data.dvc | 6 +- nbs/16_notebook_refactor.ipynb | 415 ++++++++++-------- .../embedding_creation/apply_stanza.py | 90 ++++ .../sklearn_transformer_as_mlflow_model.py | 78 ++++ 4 files changed, 410 insertions(+), 179 deletions(-) create mode 100644 src/backend/embedding_creation/apply_stanza.py create mode 100644 src/backend/embedding_creation/sklearn_transformer_as_mlflow_model.py diff --git a/data.dvc b/data.dvc index 502ad17..9905113 100644 --- a/data.dvc +++ b/data.dvc @@ -1,5 +1,5 @@ outs: -- md5: d3e85fc804165a9de26ee51138033176.dir - size: 256748989 - nfiles: 8 +- md5: 2ce6297077793c098f42db8660fb0d0e.dir + size: 656551290 + nfiles: 12 path: data diff --git a/nbs/16_notebook_refactor.ipynb b/nbs/16_notebook_refactor.ipynb index c131e38..b88ef0c 100644 --- a/nbs/16_notebook_refactor.ipynb +++ b/nbs/16_notebook_refactor.ipynb @@ -23,74 +23,7 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "49bce3f66fb64be5821785585078927f", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.6.0.json: 0%| …" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2024-07-22 22:20:47 INFO: Downloading default packages for language: en (English) 
...\n", - "2024-07-22 22:20:48 INFO: File exists: /home/awchen/stanza_resources/en/default.zip\n", - "2024-07-22 22:20:51 INFO: Finished downloading models and saved to /home/awchen/stanza_resources.\n", - "2024-07-22 22:20:51 INFO: Checking for updates to resources.json in case models have been updated. Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "4a0c28a0b6fe44418f58e2b9dc4baecd", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.6.0.json: 0%| …" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2024-07-22 22:20:52 INFO: Loading these models for language: en (English):\n", - "======================================\n", - "| Processor | Package |\n", - "--------------------------------------\n", - "| tokenize | combined |\n", - "| pos | combined_charlm |\n", - "| lemma | combined_nocharlm |\n", - "| constituency | ptb3-revised_charlm |\n", - "| depparse | combined_charlm |\n", - "| sentiment | sstplus |\n", - "| ner | ontonotes_charlm |\n", - "======================================\n", - "\n", - "2024-07-22 22:20:52 INFO: Using device: cpu\n", - "2024-07-22 22:20:52 INFO: Loading: tokenize\n", - "2024-07-22 22:20:52 INFO: Loading: pos\n", - "2024-07-22 22:20:52 INFO: Loading: lemma\n", - "2024-07-22 22:20:52 INFO: Loading: constituency\n", - "2024-07-22 22:20:52 INFO: Loading: depparse\n", - "2024-07-22 22:20:52 INFO: Loading: sentiment\n", - "2024-07-22 22:20:53 INFO: Loading: ner\n", - "2024-07-22 22:20:53 INFO: Done loading processors!\n" - ] - } - ], + "outputs": [], "source": [ "# | hide\n", "import dagshub\n", @@ -110,7 +43,8 @@ ")\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.pipeline 
import make_pipeline\n", - "from src.custom_sklearn_text_transformer_mlflow import CustomSKLearnAnalyzer\n", + "from src.backend.embedding_creation.apply_stanza import CustomSKLearnAnalyzer\n", + "from src.backend.embedding_creation.sklearn_transformer_as_mlflow_model import CustomSKLearnWrapper\n", "import src.backend.raw_data_cleaning.raw_data_preprocessor as rdpp\n", "import stanza\n", "from tqdm import tqdm" @@ -128,55 +62,6 @@ "execution_count": null, "metadata": {}, "outputs": [ - { - "data": { - "text/html": [ - "
                                       ❗❗❗ AUTHORIZATION REQUIRED ❗❗❗                                        \n",
-       "
\n" - ], - "text/plain": [ - " \u001b[1m❗❗❗ AUTHORIZATION REQUIRED ❗❗❗\u001b[0m \n" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "3c1045da94a04943927df0436c7594e8", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Output()" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "\n", - "Open the following link in your browser to authorize the client:\n", - "https://dagshub.com/login/oauth/authorize?state=74d288e4-2633-4d72-a81e-195610799aa2&client_id=32b60ba385aa7cecf24046d8195a71c07dd345d9657977863b52e7748e0f0f28&middleman_request_id=98ffa694aa3b7e3ae32389c71e83d07870d02d6adc6f1bc3b35b09aeb66ae885\n", - "\n", - "\n" - ] - }, - { - "data": { - "text/html": [ - "
\n"
-      ],
-      "text/plain": []
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
     {
      "data": {
       "text/html": [
@@ -202,7 +87,7 @@
     "DAGSHUB_REPO_NAME = \"MeaLeon\"\n",
     "\n",
     "#@markdown Enter the name of the branch you are working on \n",
-    "BRANCH = \"NB1/notebook_refactor\"\n",
+    "BRANCH = \"init_mealeon_to_notebook_refactor\"\n",
     "dagshub.init(repo_name=DAGSHUB_REPO_NAME\n",
     "             , repo_owner=DAGSHUB_USER_NAME)"
    ]
@@ -262,7 +147,32 @@
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\u001b[?25l\u001b[32m⠋\u001b[0m Checking graph                                                 \n",
+      "Adding...                                                                       \n",
+      "!\u001b[A\n",
+      "  0% Checking cache in '/home/awchen/Repos/Projects/MeaLeon/.dvc/cache'| |0/? [0\u001b[A\n",
+      "                                                                                \u001b[A\n",
+      "!\u001b[A\n",
+      "  0%|          |Checking out ../data/raw/201706-epicur0/? [00:00