From 0371e7d83f423e6e60bb0e43b3eef344fa508c5a Mon Sep 17 00:00:00 2001
From: Aaron W Chen <awc33@cornell.edu>
Date: Mon, 29 Jul 2024 22:12:38 -0700
Subject: [PATCH] Finish init DVC/MLflow & refactor MeaLeon backend

Trying first round of combining DVC with MLflow. DVC explicitly handles data, both raw and processed data from data cleaning and embedding transformation. MLflow handles the embedding model and classes needed for the model and model pipeline.

MeaLeon backend has been refactored into a more hierarchical code structure, with clearer delineation of functions and naming
---
 data.dvc                                      |   6 +-
 nbs/16_notebook_refactor.ipynb                | 415 ++++++++++--------
 .../embedding_creation/apply_stanza.py        |  90 ++++
 .../sklearn_transformer_as_mlflow_model.py    |  78 ++++
 4 files changed, 410 insertions(+), 179 deletions(-)
 create mode 100644 src/backend/embedding_creation/apply_stanza.py
 create mode 100644 src/backend/embedding_creation/sklearn_transformer_as_mlflow_model.py

diff --git a/data.dvc b/data.dvc
index 502ad17..9905113 100644
--- a/data.dvc
+++ b/data.dvc
@@ -1,5 +1,5 @@
 outs:
-- md5: d3e85fc804165a9de26ee51138033176.dir
-  size: 256748989
-  nfiles: 8
+- md5: 2ce6297077793c098f42db8660fb0d0e.dir
+  size: 656551290
+  nfiles: 12
   path: data
diff --git a/nbs/16_notebook_refactor.ipynb b/nbs/16_notebook_refactor.ipynb
index c131e38..b88ef0c 100644
--- a/nbs/16_notebook_refactor.ipynb
+++ b/nbs/16_notebook_refactor.ipynb
@@ -23,74 +23,7 @@
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "49bce3f66fb64be5821785585078927f",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.6.0.json:   0%|   …"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "2024-07-22 22:20:47 INFO: Downloading default packages for language: en (English) ...\n",
-      "2024-07-22 22:20:48 INFO: File exists: /home/awchen/stanza_resources/en/default.zip\n",
-      "2024-07-22 22:20:51 INFO: Finished downloading models and saved to /home/awchen/stanza_resources.\n",
-      "2024-07-22 22:20:51 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES\n"
-     ]
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "4a0c28a0b6fe44418f58e2b9dc4baecd",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.6.0.json:   0%|   …"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "2024-07-22 22:20:52 INFO: Loading these models for language: en (English):\n",
-      "======================================\n",
-      "| Processor    | Package             |\n",
-      "--------------------------------------\n",
-      "| tokenize     | combined            |\n",
-      "| pos          | combined_charlm     |\n",
-      "| lemma        | combined_nocharlm   |\n",
-      "| constituency | ptb3-revised_charlm |\n",
-      "| depparse     | combined_charlm     |\n",
-      "| sentiment    | sstplus             |\n",
-      "| ner          | ontonotes_charlm    |\n",
-      "======================================\n",
-      "\n",
-      "2024-07-22 22:20:52 INFO: Using device: cpu\n",
-      "2024-07-22 22:20:52 INFO: Loading: tokenize\n",
-      "2024-07-22 22:20:52 INFO: Loading: pos\n",
-      "2024-07-22 22:20:52 INFO: Loading: lemma\n",
-      "2024-07-22 22:20:52 INFO: Loading: constituency\n",
-      "2024-07-22 22:20:52 INFO: Loading: depparse\n",
-      "2024-07-22 22:20:52 INFO: Loading: sentiment\n",
-      "2024-07-22 22:20:53 INFO: Loading: ner\n",
-      "2024-07-22 22:20:53 INFO: Done loading processors!\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "# | hide\n",
     "import dagshub\n",
@@ -110,7 +43,8 @@
     ")\n",
     "from sklearn.model_selection import train_test_split\n",
     "from sklearn.pipeline import make_pipeline\n",
-    "from src.custom_sklearn_text_transformer_mlflow import CustomSKLearnAnalyzer\n",
+    "from src.backend.embedding_creation.apply_stanza import CustomSKLearnAnalyzer\n",
+    "from src.backend.embedding_creation.sklearn_transformer_as_mlflow_model import CustomSKLearnWrapper\n",
     "import src.backend.raw_data_cleaning.raw_data_preprocessor as rdpp\n",
     "import stanza\n",
     "from tqdm import tqdm"
@@ -128,55 +62,6 @@
    "execution_count": null,
    "metadata": {},
    "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">                                       <span style=\"font-weight: bold\">❗❗❗ AUTHORIZATION REQUIRED ❗❗❗</span>                                        \n",
-       "</pre>\n"
-      ],
-      "text/plain": [
-       "                                       \u001b[1m❗❗❗ AUTHORIZATION REQUIRED ❗❗❗\u001b[0m                                        \n"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "3c1045da94a04943927df0436c7594e8",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "Output()"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "\n",
-      "\n",
-      "Open the following link in your browser to authorize the client:\n",
-      "https://dagshub.com/login/oauth/authorize?state=74d288e4-2633-4d72-a81e-195610799aa2&client_id=32b60ba385aa7cecf24046d8195a71c07dd345d9657977863b52e7748e0f0f28&middleman_request_id=98ffa694aa3b7e3ae32389c71e83d07870d02d6adc6f1bc3b35b09aeb66ae885\n",
-      "\n",
-      "\n"
-     ]
-    },
-    {
-     "data": {
-      "text/html": [
-       "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"></pre>\n"
-      ],
-      "text/plain": []
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
     {
      "data": {
       "text/html": [
@@ -202,7 +87,7 @@
     "DAGSHUB_REPO_NAME = \"MeaLeon\"\n",
     "\n",
     "#@markdown Enter the name of the branch you are working on \n",
-    "BRANCH = \"NB1/notebook_refactor\"\n",
+    "BRANCH = \"init_mealeon_to_notebook_refactor\"\n",
     "dagshub.init(repo_name=DAGSHUB_REPO_NAME\n",
     "             , repo_owner=DAGSHUB_USER_NAME)"
    ]
@@ -262,7 +147,32 @@
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\u001b[?25l\u001b[32m⠋\u001b[0m Checking graph                                                 \n",
+      "Adding...                                                                       \n",
+      "!\u001b[A\n",
+      "  0% Checking cache in '/home/awchen/Repos/Projects/MeaLeon/.dvc/cache'| |0/? [0\u001b[A\n",
+      "                                                                                \u001b[A\n",
+      "!\u001b[A\n",
+      "  0%|          |Checking out ../data/raw/201706-epicur0/? [00:00<?,    ?files/s]\u001b[A\n",
+      "  0%|          |Checking out ../data/raw/201706-epicur0/1 [00:00<?,    ?files/s]\u001b[A\n",
+      "100% Adding...|████████████████████████████████████████|1/1 [00:00,  4.23file/s]\u001b[A\n",
+      "\n",
+      "To track the changes with git, run:\n",
+      "\n",
+      "\tgit add ../data.dvc\n",
+      "\n",
+      "To enable auto staging, run:\n",
+      "\n",
+      "\tdvc config core.autostage true\n",
+      "\u001b[0m"
+     ]
+    }
+   ],
    "source": [
     "# raw data\n",
     "\n",
@@ -278,7 +188,7 @@
    "source": [
     "# ETL work (currently, data cleaning/prep)\n",
     "# how the prep works is via dataframe_preprocessor \n",
-    "cleaned_df = rdpp.raw_data_preprocessor(raw_df)\n",
+    "cleaned_df = rdpp.preprocess_dataframe(raw_df)\n",
     "cleaned_df.to_parquet(\"../data/processed/cleaned_df.parquet.gzip\", compression=\"gzip\")"
    ]
   },
@@ -286,30 +196,62 @@
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\u001b[?25l\u001b[32m⠋\u001b[0m Checking graph                                                 \n",
+      "Adding...                                                                       \n",
+      "!\u001b[A\n",
+      "  0% Checking cache in '/home/awchen/Repos/Projects/MeaLeon/.dvc/cache'| |0/? [0\u001b[A\n",
+      "                                                                                \u001b[A\n",
+      "!\u001b[A\n",
+      "  0%|          |Transferring                          0/? [00:00<?,     ?file/s]\u001b[A\n",
+      "  0%|          |Transferring                          0/1 [00:00<?,     ?file/s]\u001b[A\n",
+      "                                                                                \u001b[A\n",
+      "!\u001b[A\n",
+      "  0%|          |Checking out ../data/processed/cleaned0/? [00:00<?,    ?files/s]\u001b[A\n",
+      "  0%|          |Checking out ../data/processed/cleaned0/1 [00:00<?,    ?files/s]\u001b[A\n",
+      "100% Adding...|████████████████████████████████████████|1/1 [00:00, 17.84file/s]\u001b[A\n",
+      "\n",
+      "To track the changes with git, run:\n",
+      "\n",
+      "\tgit add ../data.dvc\n",
+      "\n",
+      "To enable auto staging, run:\n",
+      "\n",
+      "\tdvc config core.autostage true\n",
+      "\u001b[0m"
+     ]
+    }
+   ],
    "source": [
     "# add cleaned dataframe to DVC\n",
     "!dvc add \"../data/processed/cleaned_df.parquet.gzip\""
    ]
   },
   {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# logging the prepped data, ready for embedding transformation (another ETL currently done with nlp_processor, but performed with an MLflow model, then added back to DVC)\n",
-    "# embeddings can be converted to PyTorch tensors/datasets, but not the original raw text\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
+   "cell_type": "markdown",
    "metadata": {},
-   "outputs": [],
    "source": [
-    "# Prepare whole dataframe for new processing\n",
-    "from src.custom_stanza_mlflow import CustomSKLearnWrapper"
+    "Need to commit DVC/data changes to git, does that need to be done in this cell?\n",
+    "- based off of the nbdev tools currently (where it essentially runs the whole notebook), this may not be a good idea\n",
+    "- when working out of a notebook for testing, dvc maybe can pull the data, but we should not be doing the actual processing here\n",
+    "\n",
+    "In the future, can/should the data cleaning be done in dbt?\n",
+    "\n",
+    "- no, dbt is more about analytics than data cleaning, it seems\n",
+    "\n",
+    "- if text processing needed regularly, might have to put in Airflow\n",
+    "\n",
+    "---\n",
+    "\n",
+    "Now that we have converted the raw dataframe to a cleaner form with lemmatization (if needed/preferred) we can move on to the embedding transformation. Currently, this is another ETL done with `nlp_processor`, but performed with an MLflow model and this embedding transformed/vectorized data should then be added back to DVC.\n",
+    "\n",
+    "---\n",
+    "\n",
+    "In the future, we can take the embeddings and convert them to PyTorch tensors/datasets, which is not something we can do with the original raw text"
    ]
   },
   {
@@ -319,6 +261,7 @@
    "outputs": [],
    "source": [
     "#| export\n",
+    "# this is a custom function to be used with MLflow to get or create experiments (is from the MLflow team)\n",
     "def get_mlflow_experiment_id(name):\n",
     "    # this function allows us to get the experiment ID from an experiment name\n",
     "    exp = mlflow.get_experiment_by_name(name)\n",
@@ -343,27 +286,60 @@
    "source": [
     "mlflow.set_tracking_uri(f'https://dagshub.com/{DAGSHUB_USER_NAME}/MeaLeon.mlflow')\n",
     "\n",
-    "# starter idea for making an experiment name can be the git branch, but need more specificity\n",
-    "experiment_name = f\"{DAGSHUB_EMAIL}/TFIDF_up_to_quadgrams_small_sample_upload_test\"\n",
+    "# starter idea for making an experiment name, can be the git branch, but need more specificity\n",
+    "experiment_name = f\"{DAGSHUB_EMAIL}/DVC-MLflow-integration-test\"\n",
     "mlflow_exp_id = get_mlflow_experiment_id(experiment_name)\n",
     "\n",
+    "# define processed data location and data to be added to DVC\n",
+    "processed_data_base = \"../data/processed\"\n",
+    "transformed_recipes_parquet_path = processed_data_base + \"/transformed_recipes.parquet.gzip\"\n",
+    "combined_df_path = processed_data_base + \"/combined_df.parquet.gzip\"\n",
+    "\n",
+    "\n",
     "# define model location\n",
-    "# model_directory = \"/tmp/sklearn_model\"\n",
     "model_directory = \"../models/sklearn_model\"\n",
     "\n",
     "# Define the required artifacts associated with the saved custom pyfunc\n",
-    "# sklearn_path = model_directory + \"\"\n",
     "sklearn_model_path = model_directory + \"/python_model.pkl\"\n",
     "sklearn_transformer_path = model_directory + \"/sklearn_transformer.pkl\"\n",
-    "transformed_recipes_path = model_directory + \"/transformed_recipes.pkl\"\n",
-    "transformed_recipes_parquet_path = model_directory + \"/transformed_recipes.parquet\"\n",
-    "combined_df_path = model_directory + \"/combined_df.pkl\"\n",
+    "# transformed_recipes_path = model_directory + \"/transformed_recipes.pkl\"\n",
+    "combined_df_sample_path = model_directory + \"/combined_df_sample.parquet\"\n",
     "\n",
     "artifacts = {'sklearn_model': sklearn_model_path,\n",
     "             'sklearn_transformer': sklearn_transformer_path,\n",
     "            #  'transformed_recipes': transformed_recipes_path,\n",
-    "             'combined_data': combined_df_path\n",
-    "             }\n"
+    "            #  'combined_data': combined_df_path,\n",
+    "             'combined_data_sample': combined_df_sample_path\n",
+    "             }\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "  0% Checkout|                                      |0/27 [00:00<?,     ?file/s]\n",
+      "!\u001b[A\n",
+      "Building data objects from ../joblib/2022.08.23       |0.00 [00:00,      ?obj/s]\u001b[A\n",
+      "                                                                                \u001b[A\n",
+      "!\u001b[A\n",
+      "Building data objects from ../data                    |0.00 [00:00,      ?obj/s]\u001b[A\n",
+      "\u001b[33mM\u001b[0m       ..\u001b[35m/data/\u001b[0m                                              \u001b[A\n",
+      "\u001b[31mD\u001b[0m       data/raw/\u001b[1;36m201706\u001b[0m-epicurious-recipes-en.json\n",
+      "\u001b[31mD\u001b[0m       data/processed/cleaned_df.parquet.gzip\n",
+      "2 files deleted and 1 file modified\n",
+      "\u001b[0m"
+     ]
+    }
+   ],
+   "source": [
+    "# Prepare whole dataframe for new processing\n",
+    "!dvc pull"
    ]
   },
   {
@@ -619,7 +595,8 @@
     }
    ],
    "source": [
-    "whole_nlp_df = pd.read_parquet('../joblib/2024.03.19/pre_proc_df.parquet.gzip')\n",
+    "# this part can be done after a dvc pull\n",
+    "whole_nlp_df = pd.read_parquet(\"../data/processed/cleaned_df.parquet.gzip\")\n",
     "whole_nlp_df.head()"
    ]
   },
@@ -673,7 +650,7 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "100%|██████████| 34756/34756 [00:03<00:00, 10809.07it/s]\n"
+      "100%|██████████| 34756/34756 [00:03<00:00, 10450.53it/s]\n"
      ]
     },
     {
@@ -732,7 +709,7 @@
       "54a408a66529d92b2c003638               0.0    0.0  \n",
       "54a408a719925f464b3733cc               0.0    0.0  \n",
       "\n",
-      "[5 rows x 78378 columns]\n",
+      "[5 rows x 78381 columns]\n",
       "\n",
       "\n",
       "--------------------------------------------------------------------------------\n",
@@ -773,13 +750,13 @@
       "54a43ad16529d92b2c019fc3  cup basmati rice ounce brk cup sweeten flake c...  \n",
       "55d4e08063b1ba1b5534b198  tablespoon white wine vinegar brk teaspoon sug...  \n",
       "\n",
-      "[3 rows x 78379 columns]\n"
+      "[3 rows x 78382 columns]\n"
      ]
     },
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "49bfe31006a14e5abffaa11d752ef843",
+       "model_id": "413513de77ec40e097f0fe537db730da",
        "version_major": 2,
        "version_minor": 0
       },
@@ -793,7 +770,7 @@
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "aac5675f24ed4396823d8f6d9c86f8fc",
+       "model_id": "f463247aaa654948a7d7170a6d90f997",
        "version_major": 2,
        "version_minor": 0
       },
@@ -807,7 +784,7 @@
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "6f65b1308ca1475ab46a67488bfe32e1",
+       "model_id": "bf79aae1fe38472ba528d8b84d1b5f65",
        "version_major": 2,
        "version_minor": 0
       },
@@ -822,10 +799,12 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "2024/06/12 17:47:16 WARNING mlflow.utils.environment: Encountered an unexpected error while inferring pip requirements (model URI: /tmp/tmpjjchiloj/model, flavor: python_function), fall back to return ['cloudpickle==2.2.1']. Set logging level to DEBUG to see the full traceback.\n",
+      "2024/07/29 21:58:31 WARNING mlflow.utils.environment: Encountered an unexpected error while inferring pip requirements (model URI: /tmp/tmpzmn49nj8/model, flavor: python_function), fall back to return ['cloudpickle==2.2.1']. Set logging level to DEBUG to see the full traceback.\n",
+      "/home/awchen/Repos/Projects/MeaLeon/.venv/lib/python3.10/site-packages/_distutils_hack/__init__.py:18: UserWarning: Distutils was imported before Setuptools, but importing Setuptools also replaces the `distutils` module in `sys.modules`. This may lead to undesirable behaviors or errors. To avoid these issues, avoid using distutils directly, ensure that setuptools is installed in the traditional way (e.g. not an editable install), and/or make sure that setuptools is always imported before distutils.\n",
+      "  warnings.warn(\n",
       "/home/awchen/Repos/Projects/MeaLeon/.venv/lib/python3.10/site-packages/_distutils_hack/__init__.py:33: UserWarning: Setuptools is replacing distutils.\n",
       "  warnings.warn(\"Setuptools is replacing distutils.\")\n",
-      "2024/06/12 17:47:52 WARNING mlflow.models.model: Logging model metadata to the tracking server has failed. The model artifacts have been logged successfully under mlflow-artifacts:/284d65dcc09149b8b4279793753b69f9/8fca733b693542439e9366f46c40b553/artifacts. Set logging level to DEBUG via `logging.getLogger(\"mlflow\").setLevel(logging.DEBUG)` to see the full traceback.\n"
+      "2024/07/29 21:59:09 WARNING mlflow.models.model: Logging model metadata to the tracking server has failed. The model artifacts have been logged successfully under mlflow-artifacts:/5abd2670253447e0a4988212aabcf35a/e3cf27f656504b0d9b6d5a8d4ce1abb2/artifacts. Set logging level to DEBUG via `logging.getLogger(\"mlflow\").setLevel(logging.DEBUG)` to see the full traceback.\n"
      ]
     }
    ],
@@ -867,25 +846,25 @@
     "\n",
     "    print('\\n')\n",
     "    print('-' * 80)\n",
-    "    print('sklearn fit transform on ingredients:', end='\\n')\n",
+    "    print('sklearn fit transform on ingredients:')\n",
     "\n",
     "    model_input = whole_nlp_df['ingredients_lemmafied']\n",
     "\n",
     "    print('\\n')\n",
     "    print('-' * 80)\n",
-    "    print('Input Data: ', end='\\n')\n",
+    "    print('Input Data: ')\n",
     "    print(model_input)\n",
     "\n",
     "    print('\\n')\n",
     "    print('-' * 80)\n",
-    "    print('Input Data Shape: ', end='\\n')\n",
+    "    print('Input Data Shape: ')\n",
     "    print(model_input.shape)\n",
     "\n",
     "    random_sample = model_input.sample(3, random_state=200)\n",
     "\n",
     "    print('\\n')\n",
     "    print('-' * 80)\n",
-    "    print('Random 3 Records from Input Data: ', end='\\n')\n",
+    "    print('Random 3 Records from Input Data: ')\n",
     "    print(random_sample)\n",
     "\n",
     "    # Do fit transform on data\n",
@@ -903,30 +882,28 @@
     "\n",
     "    print('\\n')\n",
     "    print('-' * 80)\n",
-    "    print('Transformed Data:', end='\\n')\n",
+    "    print('Transformed Data:')\n",
     "    print(transformed_recipe.head())\n",
     "    \n",
-    "    combined_df = transformed_recipe.join(random_sample, how='inner')\n",
+    "    combined_df = transformed_recipe.join(model_input, how='inner')\n",
+    "    combined_df_sample = transformed_recipe.join(random_sample, how='inner')\n",
     "\n",
     "    print('\\n')\n",
     "    print('-' * 80)\n",
-    "    print('Random Sample of Combined Data:', end='\\n')\n",
-    "    print(combined_df.head())\n",
+    "    print('Random Sample of Combined Data:')\n",
+    "    print(combined_df_sample.head())\n",
     "\n",
     "    with open(sklearn_transformer_path, \"wb\") as fo:\n",
     "        pickle.dump(sklearn_transformer, fo)\n",
-    "    \n",
-    "    with open(transformed_recipes_path, \"wb\") as fo:\n",
-    "        pickle.dump(transformed_recipe, fo)\n",
     "\n",
-    "    transformed_recipe.to_parquet(path=transformed_recipes_parquet_path)\n",
-    "    \n",
-    "    with open(combined_df_path, 'wb') as fo:\n",
-    "        pickle.dump(combined_df, fo)\n",
+    "    transformed_recipe.to_parquet(path=transformed_recipes_parquet_path, compression=\"gzip\")\n",
     "\n",
+    "    combined_df.to_parquet(path=combined_df_path, compression=\"gzip\")\n",
+    "    \n",
+    "    combined_df_sample.to_parquet(path=combined_df_sample_path)\n",
     "\n",
     "    model_info = mlflow.pyfunc.log_model( \n",
-    "        code_path=[\"../src/\"],\n",
+    "        code_path=[\"../src/backend/\"],\n",
     "        python_model=CustomSKLearnWrapper(),\n",
     "        input_example=whole_nlp_df['ingredients_lemmafied'][0],\n",
     "        signature=signature,        \n",
@@ -934,26 +911,112 @@
     "        artifacts=artifacts\n",
     "        ) \n",
     "\n",
-    "    # since this uses a custom Stanza analyzer, we have to use a custom mlflow.Pyfunc.PythonModel\n",
-    "    "
+    "    # since this uses a custom Stanza analyzer, we have to use a custom mlflow.Pyfunc.PythonModel"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
-   "outputs": [],
-   "source": []
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\u001b[?25l\u001b[32m⠋\u001b[0m Checking graph                                                 \n",
+      "Adding...                                                                       \n",
+      "!\u001b[A\n",
+      "  0% Checking cache in '/home/awchen/Repos/Projects/MeaLeon/.dvc/cache'| |0/? [0\u001b[A\n",
+      "                                                                                \u001b[A\n",
+      "!\u001b[A\n",
+      "  0%|          |Transferring                          0/? [00:00<?,     ?file/s]\u001b[A\n",
+      "  0%|          |Transferring                          0/1 [00:00<?,     ?file/s]\u001b[A\n",
+      "                                                                                \u001b[A\n",
+      "!\u001b[A\n",
+      "  0%|          |Checking out ../data/processed/transfo0/? [00:00<?,    ?files/s]\u001b[A\n",
+      "  0%|          |Checking out ../data/processed/transfo0/1 [00:00<?,    ?files/s]\u001b[A\n",
+      "100% Adding...|████████████████████████████████████████|1/1 [00:00,  5.53file/s]\u001b[A\n",
+      "\n",
+      "To track the changes with git, run:\n",
+      "\n",
+      "\tgit add ../data.dvc\n",
+      "\n",
+      "To enable auto staging, run:\n",
+      "\n",
+      "\tdvc config core.autostage true\n",
+      "\u001b[0m"
+     ]
+    }
+   ],
+   "source": [
+    "!dvc add \"../data/processed/transformed_recipes.parquet.gzip\""
+   ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\u001b[?25l\u001b[32m⠋\u001b[0m Checking graph                                                 \n",
+      "Adding...                                                                       \n",
+      "!\u001b[A\n",
+      "  0% Checking cache in '/home/awchen/Repos/Projects/MeaLeon/.dvc/cache'| |0/? [0\u001b[A\n",
+      "                                                                                \u001b[A\n",
+      "!\u001b[A\n",
+      "  0%|          |Transferring                          0/? [00:00<?,     ?file/s]\u001b[A\n",
+      "  0%|          |Transferring                          0/1 [00:00<?,     ?file/s]\u001b[A\n",
+      "                                                                                \u001b[A\n",
+      "!\u001b[A\n",
+      "  0%|          |Checking out ../data/processed/combine0/? [00:00<?,    ?files/s]\u001b[A\n",
+      "  0%|          |Checking out ../data/processed/combine0/1 [00:00<?,    ?files/s]\u001b[A\n",
+      "100% Adding...|████████████████████████████████████████|1/1 [00:00,  5.37file/s]\u001b[A\n",
+      "\n",
+      "To track the changes with git, run:\n",
+      "\n",
+      "\tgit add ../data.dvc\n",
+      "\n",
+      "To enable auto staging, run:\n",
+      "\n",
+      "\tdvc config core.autostage true\n",
+      "\u001b[0m"
+     ]
+    }
+   ],
+   "source": [
+    "!dvc add \"../data/processed/combined_df.parquet.gzip\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/home/awchen/Repos/Projects/MeaLeon/.venv/lib/python3.10/site-packages/nbdev/export.py:73: UserWarning: Notebook '/home/awchen/Repos/Projects/MeaLeon/nbs/16_notebook_refactor.ipynb' uses `#|export` without `#|default_exp` cell.\n",
+      "Note nbdev2 no longer supports nbdev1 syntax. Run `nbdev_migrate` to upgrade.\n",
+      "See https://nbdev.fast.ai/getting_started.html for more information.\n",
+      "  warn(f\"Notebook '{nbname}' uses `#|export` without `#|default_exp` cell.\\n\"\n"
+     ]
+    }
+   ],
    "source": [
     "# | hide\n",
     "nbdev.nbdev_export()"
    ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
   }
  ],
  "metadata": {
diff --git a/src/backend/embedding_creation/apply_stanza.py b/src/backend/embedding_creation/apply_stanza.py
new file mode 100644
index 0000000..31a63ef
--- /dev/null
+++ b/src/backend/embedding_creation/apply_stanza.py
@@ -0,0 +1,90 @@
+from itertools import tee, islice
+import re
+import stanza
+
+
+class CustomSKLearnAnalyzer:
+    """
+    This class allows sklearn text transformers to incorporate a Stanza pipeline with a custom analyzer
+    """
+
+    def __init__(self, stanza_lang_str="en"):
+        """
+        Constructor method. Initializes the model with a Stanza library language
+        type. The default is "en" for English; later on, can think about adding
+        functionality to download the pretrained model/embeddings
+        """
+        self.stanza_lang_str = stanza_lang_str
+
+    def prepare_stanza_pipeline(
+        self,
+        depparse_batch_size=50,
+        depparse_min_length_to_batch_separately=50,
+        verbose=True,
+        use_gpu=True,
+        batch_size=100,
+    ):
+        """
+        Method to simplify construction of Stanza Pipeline for usage in the sklearn custom analyzer
+
+        Args:
+            Follow creation of stanza pipeline (link to their docs)
+
+            self.stanza_lang_str:
+                str for pretrained Stanza embeddings to use in the pipeline (from init)
+
+            depparse_batch_size:
+                int for batch size for processing, default is 50
+
+            depparse_min_length_to_batch_separately:
+                int for minimum string length to batch, default is 50
+
+            verbose:
+                boolean for information for readouts during processing, default is True
+
+            use_gpu:
+                boolean for using GPU for stanza, default is True,
+                set to False when not on cloud/when on a streaming computer
+
+            batch_size:
+                int for batch sizing, default is 100
+
+        Returns:
+            nlp:
+                stanza pipeline
+        """
+
+        # Perhaps down the road, this should be stored as an MLflow Artifact to be downloaded
+        # Or should this be part of the Container building at start up? If so, how would those get logged? Just as artifacts?
+        stanza.download(self.stanza_lang_str)
+
+        nlp = stanza.Pipeline(
+            self.stanza_lang_str,
+            depparse_batch_size=depparse_batch_size,
+            depparse_min_length_to_batch_separately=depparse_min_length_to_batch_separately,
+            verbose=verbose,
+            use_gpu=use_gpu,
+            batch_size=batch_size,
+        )
+
+        return nlp
+
+    @classmethod
+    def ngram_maker(self, min_ngram_length: int, max_ngram_length: int):
+        def ngrams_per_line(row: str):
+            for ln in row.split(" brk "):
+                at_least_two_english_characters_whole_words = r"(?u)\b\w{2,}\b"
+                terms = re.findall(at_least_two_english_characters_whole_words, ln)
+                for ngram_length in range(min_ngram_length, max_ngram_length + 1):
+
+                    # find and return all ngrams
+                    # for ngram in zip(*[terms[i:] for i in range(3)]):
+                    # <-- solution without a generator (works the same but has higher memory usage)
+                    for ngram in (
+                        word
+                        for i in range(len(terms) - ngram_length + 1)
+                        for word in (" ".join(terms[i : i + ngram_length]),)
+                    ):
+                        yield ngram
+
+        return ngrams_per_line
diff --git a/src/backend/embedding_creation/sklearn_transformer_as_mlflow_model.py b/src/backend/embedding_creation/sklearn_transformer_as_mlflow_model.py
new file mode 100644
index 0000000..59a1089
--- /dev/null
+++ b/src/backend/embedding_creation/sklearn_transformer_as_mlflow_model.py
@@ -0,0 +1,78 @@
+import mlflow
+import pandas as pd
+from typing import List
+
+
+class CustomSKLearnWrapper(mlflow.pyfunc.PythonModel):
+    """
+    This class allows sklearn text transformers to be logged in MLflow as a
+    custom PythonModel. It overrides the default load_context and predict methods (as required by MLflow).
+    load_context now loads pickled files representing the model itself (which requires Stanza) and the transformer (which is an sklearn object)
+    """
+
+    # def __init__(self, model):
+    #     """
+    #     Constructor method. Initializes the model with a Stanza library language
+    #     type. The default is "en" for English
+
+    #     model:          sklearn.Transformer
+    #             The sklearn text Transformer or Pipeline that ends in a
+    #             Transformer
+
+    #     later can add functionality to include pretrained models needed for Stanza
+
+    #     """
+    #     self.model = model
+
+    def load_context(self, context):
+        """
+        Method needed to override default load_context. Needs to handle different components of sklearn model
+
+        """
+        import dill as pickle
+        # dill is needed due to generators and classes in the model itself
+
+        with open(context.artifacts["sklearn_model"], "rb") as f:
+            self.model = pickle.load(f)
+
+        with open(context.artifacts["sklearn_transformer"], "rb") as f:
+            self.sklearn_transformer = pickle.load(f)
+
+    def predict(self, context, model_input: List[str], params: dict):
+        """
+        This method is needed to override the default predict.
+        It needs to function essentially as a wrapper and returns back the
+        transformed recipes
+
+        Args:
+            context:        Any
+                Not used
+
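+            model_input:    List(string) per the annotation
+                The ingredients of a single query recipe in a list (NOTE: the body calls .shape/.sample/.values/.index on it, so a pandas Series is presumably expected - TODO confirm)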
+                Need to decide if this is taking in raw text or preprocessed text
+                Leaning towards taking in raw text, doing preprocessing, and
+                logging the pre processed text as an artifact
+
+            params:         dict, optional
+                Parameters used for the model (optional)
+                Not used currently for sklearn
+
+        Returns:
+            transformed_recipe_df: DataFrame of the recipes after going through
+            the sklearn/Stanza text processing
+        """
+
+        print(model_input)
+        print(model_input.shape)
+        print(model_input.sample(3, random_state=200))
+
+        response = self.sklearn_transformer.transform(model_input.values)
+
+        transformed_recipe = pd.DataFrame(
+            response.toarray(),
+            columns=self.sklearn_transformer.get_feature_names_out(),
+            index=model_input.index,
+        )
+
+        return transformed_recipe