diff --git a/nbs/15_new_preproc_test_combined_df.ipynb b/nbs/15_new_preproc_test_combined_df.ipynb
new file mode 100644
index 0000000..9e95074
--- /dev/null
+++ b/nbs/15_new_preproc_test_combined_df.ipynb
@@ -0,0 +1,978 @@
+{
+ "cells": [
+ {
+ "cell_type": "raw",
+ "metadata": {},
+ "source": [
+ "---\n",
+ "description: test\n",
+ "output-file: template.html\n",
+ "title: Template\n",
+ "\n",
+ "---\n",
+ "\n"
+ ]
+ },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# | default_exp core"
+   ]
+  },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "087d6d4ced3c49c88ec00adb20295872",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.6.0.json: 0%| …"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "2024-05-08 19:00:54 INFO: Downloading default packages for language: en (English) ...\n",
+ "2024-05-08 19:00:55 INFO: File exists: /home/awchen/stanza_resources/en/default.zip\n",
+ "2024-05-08 19:00:58 INFO: Finished downloading models and saved to /home/awchen/stanza_resources.\n",
+ "2024-05-08 19:00:58 INFO: Checking for updates to resources.json in case models have been updated. Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES\n"
+ ]
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "8af294b5fac641219a3a46629cf99fba",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.6.0.json: 0%| …"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "2024-05-08 19:00:59 INFO: Loading these models for language: en (English):\n",
+ "======================================\n",
+ "| Processor | Package |\n",
+ "--------------------------------------\n",
+ "| tokenize | combined |\n",
+ "| pos | combined_charlm |\n",
+ "| lemma | combined_nocharlm |\n",
+ "| constituency | ptb3-revised_charlm |\n",
+ "| depparse | combined_charlm |\n",
+ "| sentiment | sstplus |\n",
+ "| ner | ontonotes_charlm |\n",
+ "======================================\n",
+ "\n",
+ "2024-05-08 19:00:59 INFO: Using device: cpu\n",
+ "2024-05-08 19:00:59 INFO: Loading: tokenize\n",
+ "2024-05-08 19:00:59 INFO: Loading: pos\n",
+ "2024-05-08 19:01:00 INFO: Loading: lemma\n",
+ "2024-05-08 19:01:00 INFO: Loading: constituency\n",
+ "2024-05-08 19:01:00 INFO: Loading: depparse\n",
+ "2024-05-08 19:01:00 INFO: Loading: sentiment\n",
+ "2024-05-08 19:01:00 INFO: Loading: ner\n",
+ "2024-05-08 19:01:01 INFO: Done loading processors!\n"
+ ]
+ }
+ ],
+ "source": [
+ "# | hide\n",
+ "# from bertopic import BERTopic\n",
+ "# from bertopic.vectorizers import OnlineCountVectorizer\n",
+ "import dagshub\n",
+ "from datetime import datetime\n",
+ "import dill as pickle\n",
+ "import dvc.api\n",
+ "# from hdbscan import HDBSCAN\n",
+ "from itertools import tee, islice, product\n",
+ "import joblib\n",
+ "import nbdev\n",
+ "from nbdev.showdoc import *\n",
+ "import pandas as pd\n",
+ "import re\n",
+ "from sentence_transformers import SentenceTransformer\n",
+ "from sklearn.feature_extraction.text import (\n",
+ " CountVectorizer\n",
+ " , TfidfTransformer\n",
+ " , TfidfVectorizer\n",
+ " , \n",
+ ")\n",
+ "from sklearn.model_selection import train_test_split\n",
+ "from sklearn.pipeline import make_pipeline\n",
+ "from src.custom_sklearn_text_transformer_mlflow import CustomSKLearnAnalyzer\n",
+ "import src.dataframe_preprocessor as dfpp\n",
+ "import stanza\n",
+ "from tqdm import tqdm\n",
+ "# from umap import UMAP"
+ ]
+ },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Set the allocator config with %env so it applies to this kernel's own process.\n",
+    "# `!export VAR=...` runs in a throwaway subshell and has no effect on the kernel.\n",
+    "%env PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:128"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# | export"
+   ]
+  },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# | hide\n",
+ "# nbdev.nbdev_export()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Data Preparation"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "9b4405a6faa044f185efdb8e5359b8e5",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.6.0.json: 0%| …"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "2024-05-08 19:01:01 INFO: Downloading default packages for language: en (English) ...\n",
+ "2024-05-08 19:01:02 INFO: File exists: /home/awchen/stanza_resources/en/default.zip\n",
+ "2024-05-08 19:01:05 INFO: Finished downloading models and saved to /home/awchen/stanza_resources.\n",
+ "2024-05-08 19:01:05 INFO: Checking for updates to resources.json in case models have been updated. Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES\n"
+ ]
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "6f43e74e5a7940a1b60662d5884ab4a2",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.6.0.json: 0%| …"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "2024-05-08 19:01:06 INFO: Loading these models for language: en (English):\n",
+ "======================================\n",
+ "| Processor | Package |\n",
+ "--------------------------------------\n",
+ "| tokenize | combined |\n",
+ "| pos | combined_charlm |\n",
+ "| lemma | combined_nocharlm |\n",
+ "| constituency | ptb3-revised_charlm |\n",
+ "| depparse | combined_charlm |\n",
+ "| sentiment | sstplus |\n",
+ "| ner | ontonotes_charlm |\n",
+ "======================================\n",
+ "\n",
+ "2024-05-08 19:01:06 INFO: Using device: cuda\n",
+ "2024-05-08 19:01:06 INFO: Loading: tokenize\n",
+ "2024-05-08 19:01:10 INFO: Loading: pos\n",
+ "2024-05-08 19:01:10 INFO: Loading: lemma\n",
+ "2024-05-08 19:01:10 INFO: Loading: constituency\n",
+ "2024-05-08 19:01:11 INFO: Loading: depparse\n",
+ "2024-05-08 19:01:11 INFO: Loading: sentiment\n",
+ "2024-05-08 19:01:13 INFO: Loading: ner\n",
+ "2024-05-08 19:01:14 INFO: Done loading processors!\n"
+ ]
+ }
+ ],
+ "source": [
+ "# instantiate stanza pipeline\n",
+ "stanza.download('en')\n",
+ "nlp = stanza.Pipeline('en', \n",
+ " depparse_batch_size=50, \n",
+ " depparse_min_length_to_batch_separately=50,\n",
+ " verbose=True,\n",
+ " use_gpu=True, # set to true when on cloud/not on streaming computer\n",
+ " batch_size=100\n",
+ " )\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Prepare whole dataframe for new processing\n",
+ "import mlflow\n",
+ "from mlflow.models import infer_signature\n",
+ "from src.custom_stanza_mlflow import CustomSKLearnWrapper"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# this function allows us to get the experiment ID from an experiment name\n",
+    "def get_experiment_id(name):\n",
+    "    \"\"\"Return the MLflow experiment ID for `name`, creating the experiment if it does not exist yet.\"\"\"\n",
+    "    exp = mlflow.get_experiment_by_name(name)\n",
+    "    if exp is None:\n",
+    "        # No experiment with this name on the tracking server -> create it and return the new ID\n",
+    "        exp_id = mlflow.create_experiment(name)\n",
+    "        return exp_id\n",
+    "    return exp.experiment_id"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+       "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal\"><span style=\"font-weight: bold\">❗❗❗ AUTHORIZATION REQUIRED ❗❗❗</span> \n",
+       "</pre>\n"
+ ],
+ "text/plain": [
+ " \u001b[1m❗❗❗ AUTHORIZATION REQUIRED ❗❗❗\u001b[0m \n"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "\n",
+ "Open the following link in your browser to authorize the client:\n",
+ "https://dagshub.com/login/oauth/authorize?state=2a72caa0-4d17-4133-b792-04bc75d86098&client_id=32b60ba385aa7cecf24046d8195a71c07dd345d9657977863b52e7748e0f0f28&middleman_request_id=9f4396584299dc580a77cbeee10d45564a42b8b6598f116f383828cec1dc79d7\n",
+ "\n",
+ "\n"
+ ]
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "a37f40520504442f9d3ed6e408a7c309",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "Output()"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n"
+ ],
+ "text/plain": []
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+       "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal\">Repository initialized!\n",
+       "</pre>\n"
+ ],
+ "text/plain": [
+ "Repository initialized!\n"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "#@markdown Enter the username of your DAGsHub account:\n",
+ "DAGSHUB_USER_NAME = \"AaronWChen\" #@param {type:\"string\"}\n",
+ "\n",
+ "#@markdown Enter the email for your DAGsHub account:\n",
+ "DAGSHUB_EMAIL = \"awc33@cornell.edu\" #@param {type:\"string\"}\n",
+ "\n",
+ "#@markdown Enter the repo name \n",
+ "DAGSHUB_REPO_NAME = \"MeaLeon\"\n",
+ "\n",
+ "#@markdown Enter the name of the branch you are working on \n",
+ "BRANCH = \"NGRAM-2/trying-sklearn-object-upload\"\n",
+ "dagshub.init(repo_name=DAGSHUB_REPO_NAME\n",
+ " , repo_owner=DAGSHUB_USER_NAME)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Starting DEV stage for TFIDF Encoded model"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "mlflow.set_tracking_uri(f'https://dagshub.com/{DAGSHUB_USER_NAME}/MeaLeon.mlflow')\n",
+ "\n",
+ "# starter idea for making an experiment name can be the git branch, but need more specificity\n",
+ "experiment_name = f\"{DAGSHUB_EMAIL}/TFIDF_up_to_quadgrams_small_sample_upload_test\"\n",
+ "mlflow_exp_id = get_experiment_id(experiment_name)\n",
+ "\n",
+ "# define model location\n",
+ "# model_directory = \"/tmp/sklearn_model\"\n",
+ "model_directory = \"../models/sklearn_model\"\n",
+ "\n",
+ "# Define the required artifacts associated with the saved custom pyfunc\n",
+ "# sklearn_path = model_directory + \"\"\n",
+ "sklearn_model_path = model_directory + \"/python_model.pkl\"\n",
+ "sklearn_transformer_path = model_directory + \"/sklearn_transformer.pkl\"\n",
+ "transformed_recipes_path = model_directory + \"/transformed_recipes.pkl\"\n",
+ "combined_df_path = model_directory + \"/combined_df.pkl\"\n",
+ "\n",
+ "artifacts = {'sklearn_model': sklearn_model_path,\n",
+ " 'sklearn_transformer': sklearn_transformer_path,\n",
+ " # 'transformed_recipes': transformed_recipes_path,\n",
+ " 'combined_data': combined_df_path\n",
+ " }\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+       "<div>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>dek</th>\n",
+       "      <th>hed</th>\n",
+       "      <th>aggregateRating</th>\n",
+       "      <th>ingredients</th>\n",
+       "      <th>prepSteps</th>\n",
+       "      <th>reviewsCount</th>\n",
+       "      <th>willMakeAgainPct</th>\n",
+       "      <th>ingredients_lemmafied</th>\n",
+       "      <th>cuisine_name</th>\n",
+       "      <th>photo_filename</th>\n",
+       "      <th>photo_credit</th>\n",
+       "      <th>author_name</th>\n",
+       "      <th>date_published</th>\n",
+       "      <th>recipe_url</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>id</th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>54a2b6b019925f464b373351</th>\n",
+       "      <td>How does fried chicken achieve No. 1 status? B...</td>\n",
+       "      <td>Pickle-Brined Fried Chicken</td>\n",
+       "      <td>3.11</td>\n",
+       "      <td>[1 tablespoons yellow mustard seeds, 1 tablesp...</td>\n",
+       "      <td>[Toast mustard and coriander seeds in a dry me...</td>\n",
+       "      <td>7</td>\n",
+       "      <td>100</td>\n",
+       "      <td>tablespoon yellow mustard seed brk tablespoon ...</td>\n",
+       "      <td>Missing Cuisine</td>\n",
+       "      <td>51247610_fried-chicken_1x1.jpg</td>\n",
+       "      <td>Michael Graydon and Nikole Herriott</td>\n",
+       "      <td>Missing Author Name</td>\n",
+       "      <td>2014-08-19 04:00:00+00:00</td>\n",
+       "      <td>https://www.epicurious.com/recipes/food/views/...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>54a408a019925f464b3733bc</th>\n",
+       "      <td>Spinaci all'Ebraica</td>\n",
+       "      <td>Spinach Jewish Style</td>\n",
+       "      <td>3.22</td>\n",
+       "      <td>[3 pounds small-leaved bulk spinach, Salt, 1/2...</td>\n",
+       "      <td>[Remove the stems and roots from the spinach. ...</td>\n",
+       "      <td>5</td>\n",
+       "      <td>80</td>\n",
+       "      <td>pound small leave bulk spinach brk salt brk cu...</td>\n",
+       "      <td>Italian</td>\n",
+       "      <td>EP_12162015_placeholders_rustic.jpg</td>\n",
+       "      <td>Photo by Chelsea Kyle, Prop Styling by Anna St...</td>\n",
+       "      <td>Edda Servi Machlin</td>\n",
+       "      <td>2008-09-09 04:00:00+00:00</td>\n",
+       "      <td>https://www.epicurious.com/recipes/food/views/...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>54a408a26529d92b2c003631</th>\n",
+       "      <td>This majestic, moist, and richly spiced honey ...</td>\n",
+       "      <td>New Year’s Honey Cake</td>\n",
+       "      <td>3.62</td>\n",
+       "      <td>[3 1/2 cups all-purpose flour, 1 tablespoon ba...</td>\n",
+       "      <td>[I like this cake best baked in a 9-inch angel...</td>\n",
+       "      <td>105</td>\n",
+       "      <td>88</td>\n",
+       "      <td>cup purpose flour brk tablespoon baking powder...</td>\n",
+       "      <td>Kosher</td>\n",
+       "      <td>EP_09022015_honeycake-2.jpg</td>\n",
+       "      <td>Photo by Chelsea Kyle, Food Styling by Anna St...</td>\n",
+       "      <td>Marcy Goldman</td>\n",
+       "      <td>2008-09-10 04:00:00+00:00</td>\n",
+       "      <td>https://www.epicurious.com/recipes/food/views/...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>54a408a66529d92b2c003638</th>\n",
+       "      <td>The idea for this sandwich came to me when my ...</td>\n",
+       "      <td>The B.L.A.Bagel with Lox and Avocado</td>\n",
+       "      <td>4.00</td>\n",
+       "      <td>[1 small ripe avocado, preferably Hass (see No...</td>\n",
+       "      <td>[A short time before serving, mash avocado and...</td>\n",
+       "      <td>7</td>\n",
+       "      <td>100</td>\n",
+       "      <td>small ripe avocado hass see note brk teaspoon ...</td>\n",
+       "      <td>Kosher</td>\n",
+       "      <td>EP_12162015_placeholders_casual.jpg</td>\n",
+       "      <td>Photo by Chelsea Kyle, Prop Styling by Rhoda B...</td>\n",
+       "      <td>Faye Levy</td>\n",
+       "      <td>2008-09-08 04:00:00+00:00</td>\n",
+       "      <td>https://www.epicurious.com/recipes/food/views/...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>54a408a719925f464b3733cc</th>\n",
+       "      <td>In 1930, Simon Agranat, the chief justice of t...</td>\n",
+       "      <td>Shakshuka a la Doktor Shakshuka</td>\n",
+       "      <td>2.71</td>\n",
+       "      <td>[2 pounds fresh tomatoes, unpeeled and cut in ...</td>\n",
+       "      <td>[1. Place the tomatoes, garlic, salt, paprika,...</td>\n",
+       "      <td>7</td>\n",
+       "      <td>83</td>\n",
+       "      <td>pound fresh tomato unpeeled cut quarter ounce ...</td>\n",
+       "      <td>Kosher</td>\n",
+       "      <td>EP_12162015_placeholders_formal.jpg</td>\n",
+       "      <td>Photo by Chelsea Kyle, Prop Styling by Rhoda B...</td>\n",
+       "      <td>Joan Nathan</td>\n",
+       "      <td>2008-09-09 04:00:00+00:00</td>\n",
+       "      <td>https://www.epicurious.com/recipes/food/views/...</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+ ],
+ "text/plain": [
+ " dek \\\n",
+ "id \n",
+ "54a2b6b019925f464b373351 How does fried chicken achieve No. 1 status? B... \n",
+ "54a408a019925f464b3733bc Spinaci all'Ebraica \n",
+ "54a408a26529d92b2c003631 This majestic, moist, and richly spiced honey ... \n",
+ "54a408a66529d92b2c003638 The idea for this sandwich came to me when my ... \n",
+ "54a408a719925f464b3733cc In 1930, Simon Agranat, the chief justice of t... \n",
+ "\n",
+ " hed \\\n",
+ "id \n",
+ "54a2b6b019925f464b373351 Pickle-Brined Fried Chicken \n",
+ "54a408a019925f464b3733bc Spinach Jewish Style \n",
+ "54a408a26529d92b2c003631 New Year’s Honey Cake \n",
+ "54a408a66529d92b2c003638 The B.L.A.Bagel with Lox and Avocado \n",
+ "54a408a719925f464b3733cc Shakshuka a la Doktor Shakshuka \n",
+ "\n",
+ " aggregateRating \\\n",
+ "id \n",
+ "54a2b6b019925f464b373351 3.11 \n",
+ "54a408a019925f464b3733bc 3.22 \n",
+ "54a408a26529d92b2c003631 3.62 \n",
+ "54a408a66529d92b2c003638 4.00 \n",
+ "54a408a719925f464b3733cc 2.71 \n",
+ "\n",
+ " ingredients \\\n",
+ "id \n",
+ "54a2b6b019925f464b373351 [1 tablespoons yellow mustard seeds, 1 tablesp... \n",
+ "54a408a019925f464b3733bc [3 pounds small-leaved bulk spinach, Salt, 1/2... \n",
+ "54a408a26529d92b2c003631 [3 1/2 cups all-purpose flour, 1 tablespoon ba... \n",
+ "54a408a66529d92b2c003638 [1 small ripe avocado, preferably Hass (see No... \n",
+ "54a408a719925f464b3733cc [2 pounds fresh tomatoes, unpeeled and cut in ... \n",
+ "\n",
+ " prepSteps \\\n",
+ "id \n",
+ "54a2b6b019925f464b373351 [Toast mustard and coriander seeds in a dry me... \n",
+ "54a408a019925f464b3733bc [Remove the stems and roots from the spinach. ... \n",
+ "54a408a26529d92b2c003631 [I like this cake best baked in a 9-inch angel... \n",
+ "54a408a66529d92b2c003638 [A short time before serving, mash avocado and... \n",
+ "54a408a719925f464b3733cc [1. Place the tomatoes, garlic, salt, paprika,... \n",
+ "\n",
+ " reviewsCount willMakeAgainPct \\\n",
+ "id \n",
+ "54a2b6b019925f464b373351 7 100 \n",
+ "54a408a019925f464b3733bc 5 80 \n",
+ "54a408a26529d92b2c003631 105 88 \n",
+ "54a408a66529d92b2c003638 7 100 \n",
+ "54a408a719925f464b3733cc 7 83 \n",
+ "\n",
+ " ingredients_lemmafied \\\n",
+ "id \n",
+ "54a2b6b019925f464b373351 tablespoon yellow mustard seed brk tablespoon ... \n",
+ "54a408a019925f464b3733bc pound small leave bulk spinach brk salt brk cu... \n",
+ "54a408a26529d92b2c003631 cup purpose flour brk tablespoon baking powder... \n",
+ "54a408a66529d92b2c003638 small ripe avocado hass see note brk teaspoon ... \n",
+ "54a408a719925f464b3733cc pound fresh tomato unpeeled cut quarter ounce ... \n",
+ "\n",
+ " cuisine_name \\\n",
+ "id \n",
+ "54a2b6b019925f464b373351 Missing Cuisine \n",
+ "54a408a019925f464b3733bc Italian \n",
+ "54a408a26529d92b2c003631 Kosher \n",
+ "54a408a66529d92b2c003638 Kosher \n",
+ "54a408a719925f464b3733cc Kosher \n",
+ "\n",
+ " photo_filename \\\n",
+ "id \n",
+ "54a2b6b019925f464b373351 51247610_fried-chicken_1x1.jpg \n",
+ "54a408a019925f464b3733bc EP_12162015_placeholders_rustic.jpg \n",
+ "54a408a26529d92b2c003631 EP_09022015_honeycake-2.jpg \n",
+ "54a408a66529d92b2c003638 EP_12162015_placeholders_casual.jpg \n",
+ "54a408a719925f464b3733cc EP_12162015_placeholders_formal.jpg \n",
+ "\n",
+ " photo_credit \\\n",
+ "id \n",
+ "54a2b6b019925f464b373351 Michael Graydon and Nikole Herriott \n",
+ "54a408a019925f464b3733bc Photo by Chelsea Kyle, Prop Styling by Anna St... \n",
+ "54a408a26529d92b2c003631 Photo by Chelsea Kyle, Food Styling by Anna St... \n",
+ "54a408a66529d92b2c003638 Photo by Chelsea Kyle, Prop Styling by Rhoda B... \n",
+ "54a408a719925f464b3733cc Photo by Chelsea Kyle, Prop Styling by Rhoda B... \n",
+ "\n",
+ " author_name date_published \\\n",
+ "id \n",
+ "54a2b6b019925f464b373351 Missing Author Name 2014-08-19 04:00:00+00:00 \n",
+ "54a408a019925f464b3733bc Edda Servi Machlin 2008-09-09 04:00:00+00:00 \n",
+ "54a408a26529d92b2c003631 Marcy Goldman 2008-09-10 04:00:00+00:00 \n",
+ "54a408a66529d92b2c003638 Faye Levy 2008-09-08 04:00:00+00:00 \n",
+ "54a408a719925f464b3733cc Joan Nathan 2008-09-09 04:00:00+00:00 \n",
+ "\n",
+ " recipe_url \n",
+ "id \n",
+ "54a2b6b019925f464b373351 https://www.epicurious.com/recipes/food/views/... \n",
+ "54a408a019925f464b3733bc https://www.epicurious.com/recipes/food/views/... \n",
+ "54a408a26529d92b2c003631 https://www.epicurious.com/recipes/food/views/... \n",
+ "54a408a66529d92b2c003638 https://www.epicurious.com/recipes/food/views/... \n",
+ "54a408a719925f464b3733cc https://www.epicurious.com/recipes/food/views/... "
+ ]
+ },
+ "execution_count": null,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "whole_nlp_df = pd.read_parquet('../joblib/2024.03.19/pre_proc_df.parquet.gzip')\n",
+ "whole_nlp_df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "\n",
+ "--------------------------------------------------------------------------------\n",
+ "sklearn fit transform on ingredients:\n",
+ "\n",
+ "\n",
+ "--------------------------------------------------------------------------------\n",
+ "Input Data: \n",
+ "id\n",
+ "54a2b6b019925f464b373351 tablespoon yellow mustard seed brk tablespoon ...\n",
+ "54a408a019925f464b3733bc pound small leave bulk spinach brk salt brk cu...\n",
+ "54a408a26529d92b2c003631 cup purpose flour brk tablespoon baking powder...\n",
+ "54a408a66529d92b2c003638 small ripe avocado hass see note brk teaspoon ...\n",
+ "54a408a719925f464b3733cc pound fresh tomato unpeeled cut quarter ounce ...\n",
+ " ... \n",
+ "59541a31bff3052847ae2107 tablespoon unsalt butter room temperature brk ...\n",
+ "5954233ad52ca90dc28200e7 tablespoon stick salt butter room temperature ...\n",
+ "595424c2109c972493636f83 tablespoon unsalted butter more greasing pan b...\n",
+ "5956638625dc3d1d829b7166 coarse salt brk lime wedge brk ounce tomato ju...\n",
+ "59566daa25dc3d1d829b7169 bottle millileter sour beer such almanac citra...\n",
+ "Name: ingredients_lemmafied, Length: 34756, dtype: object\n",
+ "\n",
+ "\n",
+ "--------------------------------------------------------------------------------\n",
+ "Input Data Shape: \n",
+ "(34756,)\n",
+ "\n",
+ "\n",
+ "--------------------------------------------------------------------------------\n",
+ "Random 3 Records from Input Data: \n",
+ "id\n",
+ "54a40caa19925f464b374017 boneless muscovy duck breast half pound total ...\n",
+ "55d4e08063b1ba1b5534b198 tablespoon white wine vinegar brk teaspoon sug...\n",
+ "54a43ad16529d92b2c019fc3 cup basmati rice ounce brk cup sweeten flake c...\n",
+ "Name: ingredients_lemmafied, dtype: object\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "100%|██████████| 34756/34756 [00:03<00:00, 10261.04it/s]\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "\n",
+ "--------------------------------------------------------------------------------\n",
+ "Transformed Data:\n",
+ " 100g 125g 13x9x2 150g 1pound 1tablespoon \\\n",
+ "id \n",
+ "54a2b6b019925f464b373351 0.0 0.0 0.0 0.0 0.0 0.0 \n",
+ "54a408a019925f464b3733bc 0.0 0.0 0.0 0.0 0.0 0.0 \n",
+ "54a408a26529d92b2c003631 0.0 0.0 0.0 0.0 0.0 0.0 \n",
+ "54a408a66529d92b2c003638 0.0 0.0 0.0 0.0 0.0 0.0 \n",
+ "54a408a719925f464b3733cc 0.0 0.0 0.0 0.0 0.0 0.0 \n",
+ "\n",
+ " 1teaspoon 200g 250g 2cup ... árbol divide \\\n",
+ "id ... \n",
+ "54a2b6b019925f464b373351 0.0 0.0 0.0 0.0 ... 0.0 \n",
+ "54a408a019925f464b3733bc 0.0 0.0 0.0 0.0 ... 0.0 \n",
+ "54a408a26529d92b2c003631 0.0 0.0 0.0 0.0 ... 0.0 \n",
+ "54a408a66529d92b2c003638 0.0 0.0 0.0 0.0 ... 0.0 \n",
+ "54a408a719925f464b3733cc 0.0 0.0 0.0 0.0 ... 0.0 \n",
+ "\n",
+ " árbol seed árbol seed remove árbol stem \\\n",
+ "id \n",
+ "54a2b6b019925f464b373351 0.0 0.0 0.0 \n",
+ "54a408a019925f464b3733bc 0.0 0.0 0.0 \n",
+ "54a408a26529d92b2c003631 0.0 0.0 0.0 \n",
+ "54a408a66529d92b2c003638 0.0 0.0 0.0 \n",
+ "54a408a719925f464b3733cc 0.0 0.0 0.0 \n",
+ "\n",
+ " árbol teaspoon árbol teaspoon crush \\\n",
+ "id \n",
+ "54a2b6b019925f464b373351 0.0 0.0 \n",
+ "54a408a019925f464b3733bc 0.0 0.0 \n",
+ "54a408a26529d92b2c003631 0.0 0.0 \n",
+ "54a408a66529d92b2c003638 0.0 0.0 \n",
+ "54a408a719925f464b3733cc 0.0 0.0 \n",
+ "\n",
+ " árbol teaspoon crush red árbol wipe \\\n",
+ "id \n",
+ "54a2b6b019925f464b373351 0.0 0.0 \n",
+ "54a408a019925f464b3733bc 0.0 0.0 \n",
+ "54a408a26529d92b2c003631 0.0 0.0 \n",
+ "54a408a66529d92b2c003638 0.0 0.0 \n",
+ "54a408a719925f464b3733cc 0.0 0.0 \n",
+ "\n",
+ " árbol wipe clean épice \n",
+ "id \n",
+ "54a2b6b019925f464b373351 0.0 0.0 \n",
+ "54a408a019925f464b3733bc 0.0 0.0 \n",
+ "54a408a26529d92b2c003631 0.0 0.0 \n",
+ "54a408a66529d92b2c003638 0.0 0.0 \n",
+ "54a408a719925f464b3733cc 0.0 0.0 \n",
+ "\n",
+ "[5 rows x 78378 columns]\n",
+ "\n",
+ "\n",
+ "--------------------------------------------------------------------------------\n",
+ "Random Sample of Combined Data:\n",
+ " 100g 125g 13x9x2 150g 1pound 1tablespoon \\\n",
+ "id \n",
+ "54a40caa19925f464b374017 0.0 0.0 0.0 0.0 0.0 0.0 \n",
+ "54a43ad16529d92b2c019fc3 0.0 0.0 0.0 0.0 0.0 0.0 \n",
+ "55d4e08063b1ba1b5534b198 0.0 0.0 0.0 0.0 0.0 0.0 \n",
+ "\n",
+ " 1teaspoon 200g 250g 2cup ... árbol seed \\\n",
+ "id ... \n",
+ "54a40caa19925f464b374017 0.0 0.0 0.0 0.0 ... 0.0 \n",
+ "54a43ad16529d92b2c019fc3 0.0 0.0 0.0 0.0 ... 0.0 \n",
+ "55d4e08063b1ba1b5534b198 0.0 0.0 0.0 0.0 ... 0.0 \n",
+ "\n",
+ " árbol seed remove árbol stem árbol teaspoon \\\n",
+ "id \n",
+ "54a40caa19925f464b374017 0.0 0.0 0.0 \n",
+ "54a43ad16529d92b2c019fc3 0.0 0.0 0.0 \n",
+ "55d4e08063b1ba1b5534b198 0.0 0.0 0.0 \n",
+ "\n",
+ " árbol teaspoon crush árbol teaspoon crush red \\\n",
+ "id \n",
+ "54a40caa19925f464b374017 0.0 0.0 \n",
+ "54a43ad16529d92b2c019fc3 0.0 0.0 \n",
+ "55d4e08063b1ba1b5534b198 0.0 0.0 \n",
+ "\n",
+ " árbol wipe árbol wipe clean épice \\\n",
+ "id \n",
+ "54a40caa19925f464b374017 0.0 0.0 0.0 \n",
+ "54a43ad16529d92b2c019fc3 0.0 0.0 0.0 \n",
+ "55d4e08063b1ba1b5534b198 0.0 0.0 0.0 \n",
+ "\n",
+ " ingredients_lemmafied \n",
+ "id \n",
+ "54a40caa19925f464b374017 boneless muscovy duck breast half pound total ... \n",
+ "54a43ad16529d92b2c019fc3 cup basmati rice ounce brk cup sweeten flake c... \n",
+ "55d4e08063b1ba1b5534b198 tablespoon white wine vinegar brk teaspoon sug... \n",
+ "\n",
+ "[3 rows x 78379 columns]\n"
+ ]
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "8c78d8c010124a2b81119f07b34a3614",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "Downloading artifacts: 0%| | 0/1 [00:00, ?it/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "7bdf0865e2bb4029b353a562f819e139",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "Downloading artifacts: 0%| | 0/1 [00:00, ?it/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "b6933fbc97f64a1987293ba6851561fa",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "Downloading artifacts: 0%| | 0/1 [00:00, ?it/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "2024/05/08 19:14:05 WARNING mlflow.utils.environment: Encountered an unexpected error while inferring pip requirements (model URI: /tmp/tmpazhxbb7a/model, flavor: python_function), fall back to return ['cloudpickle==2.2.1']. Set logging level to DEBUG to see the full traceback.\n",
+ "/home/awchen/Repos/Projects/MeaLeon/.venv/lib/python3.10/site-packages/_distutils_hack/__init__.py:33: UserWarning: Setuptools is replacing distutils.\n",
+ " warnings.warn(\"Setuptools is replacing distutils.\")\n",
+ "2024/05/08 19:14:41 WARNING mlflow.models.model: Logging model metadata to the tracking server has failed. The model artifacts have been logged successfully under mlflow-artifacts:/284d65dcc09149b8b4279793753b69f9/bd4ea6fe14cc4964bc56b6a4e41ddf71/artifacts. Set logging level to DEBUG via `logging.getLogger(\"mlflow\").setLevel(logging.DEBUG)` to see the full traceback.\n"
+ ]
+ }
+ ],
+ "source": [
+ "# load from MLflow\n",
+ "mlflow_client = mlflow.tracking.MlflowClient(\n",
+ " tracking_uri=f'https://dagshub.com/{DAGSHUB_USER_NAME}/MeaLeon.mlflow')\n",
+ "\n",
+ "# cv_params are parameters for the sklearn CountVectorizer or TFIDFVectorizer\n",
+ "sklearn_transformer_params = { \n",
+ " 'analyzer': CustomSKLearnAnalyzer().ngram_maker(\n",
+ " min_ngram_length=1,\n",
+ " max_ngram_length=4,\n",
+ " ),\n",
+ " 'min_df':3,\n",
+ " 'binary':False\n",
+ "}\n",
+ "\n",
+ "# pipeline_params are parameters that will be logged in MLFlow and are a superset of library parameters\n",
+ "pipeline_params = {\n",
+ " 'stanza_model': 'en',\n",
+ " 'sklearn-transformer': 'TFIDF'\n",
+ "}\n",
+ "\n",
+ "# update the pipeline parameters with the library-specific ones so that they show up in MLflow Tracking\n",
+ "pipeline_params.update(sklearn_transformer_params)\n",
+ "\n",
+ "with mlflow.start_run(experiment_id=mlflow_exp_id): \n",
+ " # LOG PARAMETERS\n",
+ " mlflow.log_params(pipeline_params)\n",
+ "\n",
+ " # LOG INPUTS (QUERIES) AND OUTPUTS\n",
+ " # MLflow example uses a list of strings or a list of str->str dicts\n",
+ " # Will be useful in STAGING/Evaluation\n",
+ " \n",
+ " # LOG MODEL\n",
+ " # Instantiate sklearn TFIDFVectorizer\n",
+ " sklearn_transformer = TfidfVectorizer(**sklearn_transformer_params)\n",
+ "\n",
+ " print('\\n')\n",
+ " print('-' * 80)\n",
+ " print('sklearn fit transform on ingredients:', end='\\n')\n",
+ "\n",
+ " model_input = whole_nlp_df['ingredients_lemmafied']\n",
+ "\n",
+ " print('\\n')\n",
+ " print('-' * 80)\n",
+ " print('Input Data: ', end='\\n')\n",
+ " print(model_input)\n",
+ "\n",
+ " print('\\n')\n",
+ " print('-' * 80)\n",
+ " print('Input Data Shape: ', end='\\n')\n",
+ " print(model_input.shape)\n",
+ "\n",
+ " random_sample = model_input.sample(3, random_state=200)\n",
+ "\n",
+ " print('\\n')\n",
+ " print('-' * 80)\n",
+ " print('Random 3 Records from Input Data: ', end='\\n')\n",
+ " print(random_sample)\n",
+ "\n",
+ " # Do fit transform on data\n",
+ " response = sklearn_transformer.fit_transform(tqdm(model_input)) \n",
+ " \n",
+ " transformed_recipe = pd.DataFrame(\n",
+ " response.toarray(),\n",
+ " columns=sklearn_transformer.get_feature_names_out(),\n",
+ " index=model_input.index\n",
+ " )\n",
+ "\n",
+ " signature = infer_signature(model_input=model_input,\n",
+ " model_output=transformed_recipe\n",
+ " )\n",
+ "\n",
+ " print('\\n')\n",
+ " print('-' * 80)\n",
+ " print('Transformed Data:', end='\\n')\n",
+ " print(transformed_recipe.head())\n",
+ " \n",
+ " combined_df = transformed_recipe.join(random_sample, how='inner')\n",
+ "\n",
+ " print('\\n')\n",
+ " print('-' * 80)\n",
+ " print('Random Sample of Combined Data:', end='\\n')\n",
+ " print(combined_df.head())\n",
+ "\n",
+ " with open(sklearn_transformer_path, \"wb\") as fo:\n",
+ " pickle.dump(sklearn_transformer, fo)\n",
+ " \n",
+ " with open(transformed_recipes_path, \"wb\") as fo:\n",
+ " pickle.dump(transformed_recipe, fo)\n",
+ " \n",
+ " with open(combined_df_path, 'wb') as fo:\n",
+ " pickle.dump(combined_df, fo)\n",
+ "\n",
+ "\n",
+    "    model_info = mlflow.pyfunc.log_model(\n",
+    "        code_path=[\"../src/\"],\n",
+    "        python_model=CustomSKLearnWrapper(),\n",
+    "        # .iloc[0]: the Series is indexed by recipe-id strings, so positional access\n",
+    "        # must use .iloc — plain `[0]` is the deprecated positional fallback and\n",
+    "        # raises KeyError in pandas >= 3.\n",
+    "        input_example=whole_nlp_df['ingredients_lemmafied'].iloc[0],\n",
+    "        signature=signature,\n",
+    "        artifact_path=\"sklearn_model\",\n",
+    "        artifacts=artifacts\n",
+    "    )\n",
+ "\n",
+ " # since this uses a custom Stanza analyzer, we have to use a custom mlflow.Pyfunc.PythonModel\n",
+ " "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "python3",
+ "language": "python",
+ "name": "python3"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}