diff --git a/nbs/11_sklearn_mlfow_model_testing.ipynb b/nbs/11_sklearn_mlfow_model_testing.ipynb new file mode 100644 index 0000000..d26e705 --- /dev/null +++ b/nbs/11_sklearn_mlfow_model_testing.ipynb @@ -0,0 +1,1637 @@ +{ + "cells": [ + { + "cell_type": "raw", + "metadata": {}, + "source": [ + "---\n", + "description: test\n", + "output-file: template.html\n", + "title: Template\n", + "\n", + "---\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# | default_exp core" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# | hide\n", + "from bertopic import BERTopic\n", + "from bertopic.vectorizers import OnlineCountVectorizer\n", + "import dagshub\n", + "from datetime import datetime\n", + "import dill as pickle\n", + "import dvc.api\n", + "from hdbscan import HDBSCAN\n", + "from itertools import tee, islice\n", + "import mlflow\n", + "from mlflow.models import infer_signature\n", + "import nbdev\n", + "from nbdev.showdoc import *\n", + "import pandas as pd\n", + "import re\n", + "from sentence_transformers import SentenceTransformer\n", + "from sklearn.feature_extraction.text import (\n", + " CountVectorizer\n", + " , TfidfTransformer\n", + " , TfidfVectorizer\n", + ")\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.pipeline import make_pipeline\n", + "from src.custom_stanza_mlflow import StanzaWrapper\n", + "import src.dataframe_preprocessor as dfpp\n", + "import stanza\n", + "from tqdm import tqdm\n", + "from umap import UMAP" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# use the %env magic so the setting persists for this kernel; a !export only affects a throwaway subshell\n", + "%env PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:128" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# | export\n", + "def foo():\n", + " pass" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# | hide\n", + "# this function allows us to get the experiment ID from an experiment name\n", + "def get_experiment_id(name):\n", + " exp = mlflow.get_experiment_by_name(name)\n", + " if exp is None:\n", + " exp_id = mlflow.create_experiment(name)\n", + " return exp_id\n", + " return exp.experiment_id" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def custom_analyzer(step_list, stanza_pipeline, minNgramLength, maxNgramLength, lemmatize=True):\n", + " lowered = \" brk \".join(map(str, [step for step in step_list if step is not None])).lower()\n", + "\n", + " preproc = stanza_pipeline(lowered)\n", + " \n", + " if lemmatize:\n", + " lemmad = \" \".join(map(str,\n", + " [word.lemma\n", + " for sent in preproc.sentences \n", + " for word in sent.words if (\n", + " word.upos not in [\"NUM\", \"DET\", \"ADV\", \"CCONJ\", \"ADP\", \"SCONJ\", \"PUNCT\"]\n", + " and word is not None\n", + " )]\n", + " )\n", + " )\n", + " else:\n", + " lemmad = \" \".join(map(str,\n", + " [word.text\n", + " for sent in preproc.sentences \n", + " for word in sent.words if (\n", + " word is not None\n", + " )]\n", + " )\n", + " )\n", + " # collect the ngrams generated from every line\n", + " ngrams = []\n", + "\n", + " # analyze each line of the input string separately\n", + " for ln in lemmad.split(' brk '):\n", + " # tokenize the input string with re.findall, which keeps the word tokens;\n", + " # re.split would return the text between them (customize the regex as desired)\n", + " at_least_two_english_characters_whole_words = r\"(?u)\\b[a-zA-Z]{2,}\\b\"\n", + " terms = re.findall(at_least_two_english_characters_whole_words, ln)\n", + "\n", + " # loop ngram creation 
for every number between min and max ngram length\n", + " for ngramLength in range(minNgramLength, maxNgramLength+1):\n", + "\n", + " # find and collect all ngrams\n", + " # for ngram in zip(*[terms[i:] for i in range(3)]): \n", + " # <-- solution without a generator (works the same but has higher memory usage)\n", + " for ngram in zip(*[islice(seq, i, len(terms)) for i, seq in enumerate(tee(terms, ngramLength))]): # <-- solution using a generator\n", + " ngrams.append(' '.join(map(str, ngram)))\n", + "\n", + " # return every collected ngram as one space-joined document string\n", + " return ' '.join(ngrams)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# | hide\n", + "nbdev.nbdev_export()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# | Below this are blocks to use DagsHub with MLflow" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
Repository initialized!\n", + "\n" + ], + "text/plain": [ + "Repository initialized!\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "#@markdown Enter the username of your DAGsHub account:\n", + "DAGSHUB_USER_NAME = \"AaronWChen\" #@param {type:\"string\"}\n", + "\n", + "#@markdown Enter the email for your DAGsHub account:\n", + "DAGSHUB_EMAIL = \"awc33@cornell.edu\" #@param {type:\"string\"}\n", + "\n", + "#@markdown Enter the repo name \n", + "DAGSHUB_REPO_NAME = \"MeaLeon\"\n", + "\n", + "#@markdown Enter the name of the branch you are working on \n", + "BRANCH = \"MLF-1/start-custom-sklearn-mlflow-model\"\n", + "dagshub.init(repo_name=DAGSHUB_REPO_NAME\n", + " , repo_owner=DAGSHUB_USER_NAME)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Starting DEV stage for One Hot Encoded model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "mlflow.set_tracking_uri(f'https://dagshub.com/{DAGSHUB_USER_NAME}/MeaLeon.mlflow')\n", + "\n", + "# starter idea for making an experiment name can be the git branch, but need more specificity\n", + "experiment_name = f\"{DAGSHUB_EMAIL}/one-hot-encode\"\n", + "mlflow_exp_id = get_experiment_id(experiment_name)\n", + "\n", + "# define model location\n", + "# model_directory = \"/tmp/sklearn_model\"\n", + "model_directory = \"../models/\"\n", + "\n", + "# Define the required artifacts associated with the saved custom pyfunc\n", + "sklearn_model_path = model_directory + \"sklearn_model\"\n", + "artifacts = {'sklearn_model': \"python_model.pkl\"}\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Data Preparation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "c64e8d5738864aa0bf3aeafe5237248b", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.6.0.json: 0%| …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024-01-24 16:08:50 INFO: Downloading default packages for language: en (English) ...\n", + "2024-01-24 16:08:51 INFO: File exists: /home/awchen/stanza_resources/en/default.zip\n", + "2024-01-24 16:08:54 INFO: Finished downloading models and saved to /home/awchen/stanza_resources.\n", + "2024-01-24 16:08:54 INFO: Checking for updates to resources.json in case models have been updated. 
Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "a203edef26a14feeab3b02652894dce5", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.6.0.json: 0%| …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024-01-24 16:08:55 INFO: Loading these models for language: en (English):\n", + "======================================\n", + "| Processor | Package |\n", + "--------------------------------------\n", + "| tokenize | combined |\n", + "| pos | combined_charlm |\n", + "| lemma | combined_nocharlm |\n", + "| constituency | ptb3-revised_charlm |\n", + "| depparse | combined_charlm |\n", + "| sentiment | sstplus |\n", + "| ner | ontonotes_charlm |\n", + "======================================\n", + "\n", + "2024-01-24 16:08:55 INFO: Using device: cpu\n", + "2024-01-24 16:08:55 INFO: Loading: tokenize\n", + "2024-01-24 16:08:55 INFO: Loading: pos\n", + "2024-01-24 16:08:55 INFO: Loading: lemma\n", + "2024-01-24 16:08:55 INFO: Loading: constituency\n", + "2024-01-24 16:08:55 INFO: Loading: depparse\n", + "2024-01-24 16:08:56 INFO: Loading: sentiment\n", + "2024-01-24 16:08:56 INFO: Loading: ner\n", + "2024-01-24 16:08:57 INFO: Done loading processors!\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n", + "--------------\n", + "Raw Dataframe:\n", + " id \\\n", + "0 54a2b6b019925f464b373351 \n", + "1 54a408a019925f464b3733bc \n", + "2 54a408a26529d92b2c003631 \n", + "3 54a408a66529d92b2c003638 \n", + "4 54a408a719925f464b3733cc \n", + "\n", + " dek \\\n", + "0 How does fried chicken achieve No. 1 status? B... \n", + "1 Spinaci all'Ebraica \n", + "2 This majestic, moist, and richly spiced honey ... \n", + "3 The idea for this sandwich came to me when my ... \n", + "4 In 1930, Simon Agranat, the chief justice of t... \n", + "\n", + " hed pubDate \\\n", + "0 Pickle-Brined Fried Chicken 2014-08-19T04:00:00.000Z \n", + "1 Spinach Jewish Style 2008-09-09T04:00:00.000Z \n", + "2 New Year’s Honey Cake 2008-09-10T04:00:00.000Z \n", + "3 The B.L.A.Bagel with Lox and Avocado 2008-09-08T04:00:00.000Z \n", + "4 Shakshuka a la Doktor Shakshuka 2008-09-09T04:00:00.000Z \n", + "\n", + " author type \\\n", + "0 [] recipe \n", + "1 [{'name': 'Edda Servi Machlin'}] recipe \n", + "2 [{'name': 'Marcy Goldman'}] recipe \n", + "3 [{'name': 'Faye Levy'}] recipe \n", + "4 [{'name': 'Joan Nathan'}] recipe \n", + "\n", + " url \\\n", + "0 /recipes/food/views/pickle-brined-fried-chicke... \n", + "1 /recipes/food/views/spinach-jewish-style-350152 \n", + "2 /recipes/food/views/majestic-and-moist-new-yea... \n", + "3 /recipes/food/views/the-b-l-a-bagel-with-lox-a... \n", + "4 /recipes/food/views/shakshuka-a-la-doktor-shak... \n", + "\n", + " photoData \\\n", + "0 {'id': '54a2b64a6529d92b2c003409', 'filename':... \n", + "1 {'id': '56746182accb4c9831e45e0a', 'filename':... \n", + "2 {'id': '55e85ba4cf90d6663f728014', 'filename':... \n", + "3 {'id': '5674617e47d1a28026045e4f', 'filename':... \n", + "4 {'id': '56746183b47c050a284a4e15', 'filename':... \n", + "\n", + " tag aggregateRating \\\n", + "0 {'category': 'ingredient', 'name': 'Chicken', ... 3.11 \n", + "1 {'category': 'cuisine', 'name': 'Italian', 'ur... 
3.22 \n", + "2 {'category': 'cuisine', 'name': 'Jewish', 'url... 3.62 \n", + "3 {'category': 'cuisine', 'name': 'Jewish', 'url... 4.00 \n", + "4 {'category': 'cuisine', 'name': 'Jewish', 'url... 2.71 \n", + "\n", + " ingredients \\\n", + "0 [1 tablespoons yellow mustard seeds, 1 tablesp... \n", + "1 [3 pounds small-leaved bulk spinach, Salt, 1/2... \n", + "2 [3 1/2 cups all-purpose flour, 1 tablespoon ba... \n", + "3 [1 small ripe avocado, preferably Hass (see No... \n", + "4 [2 pounds fresh tomatoes, unpeeled and cut in ... \n", + "\n", + " prepSteps reviewsCount \\\n", + "0 [Toast mustard and coriander seeds in a dry me... 7 \n", + "1 [Remove the stems and roots from the spinach. ... 5 \n", + "2 [I like this cake best baked in a 9-inch angel... 105 \n", + "3 [A short time before serving, mash avocado and... 7 \n", + "4 [1. Place the tomatoes, garlic, salt, paprika,... 7 \n", + "\n", + " willMakeAgainPct dateCrawled \n", + "0 100 1498547035 \n", + "1 80 1498547740 \n", + "2 88 1498547738 \n", + "3 100 1498547740 \n", + "4 83 1498547740 \n", + "(34756, 15)\n" + ] + } + ], + "source": [ + "# instantiate stanza pipeline\n", + "stanza.download('en')\n", + "nlp = stanza.Pipeline('en', \n", + " depparse_batch_size=50, \n", + " depparse_min_length_to_batch_separately=50,\n", + " verbose=True,\n", + " use_gpu=False, # set to true when on cloud/not on streaming computer\n", + " batch_size=100\n", + " )\n", + "\n", + "# load raw data and preprocess/clean\n", + "data = dvc.api.read(\n", + " path='../data/recipes-en-201706/epicurious-recipes_m2.json'\n", + " , mode='r')\n", + "raw_df = pd.read_json(data)\n", + "print('\\n')\n", + "print('--------------')\n", + "print('Raw Dataframe:', end='\\n')\n", + "print(raw_df.head())\n", + "print(raw_df.shape)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n", + "--------------\n", + "Preprocessed Dataframe:\n", + " dek \\\n", + "id \n", + "54a4345019925f464b386748 Goes great with: Couscous flavored with choppe... \n", + "54a455036529d92b2c021482 This recipe can be prepared in 45 minutes or l... \n", + "54a462bb19925f464b3958d4 \n", + "54a43a4a6529d92b2c019d30 Heat the pizza stone and prepare the eggplant ... \n", + "54a423ab19925f464b3799f2 Avocado shells make handy vessels for a bright... \n", + "\n", + " hed \\\n", + "id \n", + "54a4345019925f464b386748 Moroccan Slow-Cooked Lamb \n", + "54a455036529d92b2c021482 Asian-Style Crab and Shrimp Cakes \n", + "54a462bb19925f464b3958d4 Black-Eyed Pea, Pineapple and Red Pepper Salad \n", + "54a43a4a6529d92b2c019d30 Eggplant, Tomato, and Fontina Pizza \n", + "54a423ab19925f464b3799f2 Avocado Salad with Bell Pepper and Tomatoes \n", + "\n", + " aggregateRating \\\n", + "id \n", + "54a4345019925f464b386748 3.80 \n", + "54a455036529d92b2c021482 3.12 \n", + "54a462bb19925f464b3958d4 3.28 \n", + "54a43a4a6529d92b2c019d30 3.50 \n", + "54a423ab19925f464b3799f2 4.00 \n", + "\n", + " ingredients \\\n", + "id \n", + "54a4345019925f464b386748 [1 tablespoon ground cumin, 2 teaspoons ground... \n", + "54a455036529d92b2c021482 [1/4 cup mayonnaise, 2 tablespoons chopped fre... \n", + "54a462bb19925f464b3958d4 [4 15-ounce cans black-eyed peas, rinsed, well... \n", + "54a43a4a6529d92b2c019d30 [1 (1 1/2-pound) eggplant, cut crosswise into ... \n", + "54a423ab19925f464b3799f2 [1 teaspoon extra-virgin olive oil, Juice of 1... 
\n", + "\n", + " prepSteps \\\n", + "id \n", + "54a4345019925f464b386748 [Mix first 6 ingredients in large bowl. Add la... \n", + "54a455036529d92b2c021482 [Blend first 4 ingredients in medium bowl. Mix... \n", + "54a462bb19925f464b3958d4 [Combine first 7 ingredients in large bowl. Wh... \n", + "54a43a4a6529d92b2c019d30 [Sprinkle eggplant with 1 1/2 teaspoons salt i... \n", + "54a423ab19925f464b3799f2 [1. In a small bowl, whisk together olive oil,... \n", + "\n", + " reviewsCount willMakeAgainPct cuisine_name \\\n", + "id \n", + "54a4345019925f464b386748 182 96 African \n", + "54a455036529d92b2c021482 25 89 Thai \n", + "54a462bb19925f464b3958d4 16 81 Missing Cuisine \n", + "54a43a4a6529d92b2c019d30 4 100 Missing Cuisine \n", + "54a423ab19925f464b3799f2 8 100 Missing Cuisine \n", + "\n", + " photo_filename \\\n", + "id \n", + "54a4345019925f464b386748 231597.jpg \n", + "54a455036529d92b2c021482 EP_12162015_placeholders_bright.jpg \n", + "54a462bb19925f464b3958d4 EP_12162015_placeholders_rustic.jpg \n", + "54a43a4a6529d92b2c019d30 230755.jpg \n", + "54a423ab19925f464b3799f2 51190610_avocado-pepper-salad_1x1.jpg \n", + "\n", + " photo_credit \\\n", + "id \n", + "54a4345019925f464b386748 Brian Leatart \n", + "54a455036529d92b2c021482 Photo by Chelsea Kyle, Prop Styling by Anna St... \n", + "54a462bb19925f464b3958d4 Photo by Chelsea Kyle, Prop Styling by Anna St... \n", + "54a43a4a6529d92b2c019d30 Romulo Yanes \n", + "54a423ab19925f464b3799f2 Bryan Gardner \n", + "\n", + " author_name date_published \\\n", + "id \n", + "54a4345019925f464b386748 Missing Author Name 2005-01-28 21:19:07+00:00 \n", + "54a455036529d92b2c021482 Missing Author Name 2004-08-20 04:00:00+00:00 \n", + "54a462bb19925f464b3958d4 Missing Author Name 2004-08-20 04:00:00+00:00 \n", + "54a43a4a6529d92b2c019d30 Missing Author Name 2006-05-16 20:12:06+00:00 \n", + "54a423ab19925f464b3799f2 Missing Author Name 2013-08-26 04:00:00+00:00 \n", + "\n", + " recipe_url \n", + "id \n", + "54a4345019925f464b386748 https://www.epicurious.com/recipes/food/views/... \n", + "54a455036529d92b2c021482 https://www.epicurious.com/recipes/food/views/... \n", + "54a462bb19925f464b3958d4 https://www.epicurious.com/recipes/food/views/... \n", + "54a43a4a6529d92b2c019d30 https://www.epicurious.com/recipes/food/views/... \n", + "54a423ab19925f464b3799f2 https://www.epicurious.com/recipes/food/views/... \n", + "(150, 13)\n", + "\n", + "\n", + "--------------------------------------------------------------------------------\n", + "Subset Dataframe:\n", + " dek \\\n", + "id \n", + "54a4345019925f464b386748 Goes great with: Couscous flavored with choppe... \n", + "54a455036529d92b2c021482 This recipe can be prepared in 45 minutes or l... \n", + "54a462bb19925f464b3958d4 \n", + "54a43a4a6529d92b2c019d30 Heat the pizza stone and prepare the eggplant ... \n", + "54a423ab19925f464b3799f2 Avocado shells make handy vessels for a bright... 
\n", + "\n", + " hed \\\n", + "id \n", + "54a4345019925f464b386748 Moroccan Slow-Cooked Lamb \n", + "54a455036529d92b2c021482 Asian-Style Crab and Shrimp Cakes \n", + "54a462bb19925f464b3958d4 Black-Eyed Pea, Pineapple and Red Pepper Salad \n", + "54a43a4a6529d92b2c019d30 Eggplant, Tomato, and Fontina Pizza \n", + "54a423ab19925f464b3799f2 Avocado Salad with Bell Pepper and Tomatoes \n", + "\n", + " aggregateRating \\\n", + "id \n", + "54a4345019925f464b386748 3.80 \n", + "54a455036529d92b2c021482 3.12 \n", + "54a462bb19925f464b3958d4 3.28 \n", + "54a43a4a6529d92b2c019d30 3.50 \n", + "54a423ab19925f464b3799f2 4.00 \n", + "\n", + " ingredients \\\n", + "id \n", + "54a4345019925f464b386748 [1 tablespoon ground cumin, 2 teaspoons ground... \n", + "54a455036529d92b2c021482 [1/4 cup mayonnaise, 2 tablespoons chopped fre... \n", + "54a462bb19925f464b3958d4 [4 15-ounce cans black-eyed peas, rinsed, well... \n", + "54a43a4a6529d92b2c019d30 [1 (1 1/2-pound) eggplant, cut crosswise into ... \n", + "54a423ab19925f464b3799f2 [1 teaspoon extra-virgin olive oil, Juice of 1... \n", + "\n", + " prepSteps \\\n", + "id \n", + "54a4345019925f464b386748 [Mix first 6 ingredients in large bowl. Add la... \n", + "54a455036529d92b2c021482 [Blend first 4 ingredients in medium bowl. Mix... \n", + "54a462bb19925f464b3958d4 [Combine first 7 ingredients in large bowl. Wh... \n", + "54a43a4a6529d92b2c019d30 [Sprinkle eggplant with 1 1/2 teaspoons salt i... \n", + "54a423ab19925f464b3799f2 [1. In a small bowl, whisk together olive oil,... \n", + "\n", + " reviewsCount willMakeAgainPct cuisine_name \\\n", + "id \n", + "54a4345019925f464b386748 182 96 African \n", + "54a455036529d92b2c021482 25 89 Thai \n", + "54a462bb19925f464b3958d4 16 81 Missing Cuisine \n", + "54a43a4a6529d92b2c019d30 4 100 Missing Cuisine \n", + "54a423ab19925f464b3799f2 8 100 Missing Cuisine \n", + "\n", + " photo_filename \\\n", + "id \n", + "54a4345019925f464b386748 231597.jpg \n", + "54a455036529d92b2c021482 EP_12162015_placeholders_bright.jpg \n", + "54a462bb19925f464b3958d4 EP_12162015_placeholders_rustic.jpg \n", + "54a43a4a6529d92b2c019d30 230755.jpg \n", + "54a423ab19925f464b3799f2 51190610_avocado-pepper-salad_1x1.jpg \n", + "\n", + " photo_credit \\\n", + "id \n", + "54a4345019925f464b386748 Brian Leatart \n", + "54a455036529d92b2c021482 Photo by Chelsea Kyle, Prop Styling by Anna St... \n", + "54a462bb19925f464b3958d4 Photo by Chelsea Kyle, Prop Styling by Anna St... \n", + "54a43a4a6529d92b2c019d30 Romulo Yanes \n", + "54a423ab19925f464b3799f2 Bryan Gardner \n", + "\n", + " author_name date_published \\\n", + "id \n", + "54a4345019925f464b386748 Missing Author Name 2005-01-28 21:19:07+00:00 \n", + "54a455036529d92b2c021482 Missing Author Name 2004-08-20 04:00:00+00:00 \n", + "54a462bb19925f464b3958d4 Missing Author Name 2004-08-20 04:00:00+00:00 \n", + "54a43a4a6529d92b2c019d30 Missing Author Name 2006-05-16 20:12:06+00:00 \n", + "54a423ab19925f464b3799f2 Missing Author Name 2013-08-26 04:00:00+00:00 \n", + "\n", + " recipe_url \n", + "id \n", + "54a4345019925f464b386748 https://www.epicurious.com/recipes/food/views/... \n", + "54a455036529d92b2c021482 https://www.epicurious.com/recipes/food/views/... \n", + "54a462bb19925f464b3958d4 https://www.epicurious.com/recipes/food/views/... \n", + "54a43a4a6529d92b2c019d30 https://www.epicurious.com/recipes/food/views/... \n", + "54a423ab19925f464b3799f2 https://www.epicurious.com/recipes/food/views/... 
\n", + "(150, 13)\n" + ] + } + ], + "source": [ + "# take sample and train/test split \n", + "subset_df = raw_df.sample(n=300, random_state=45)\n", + "train_df, test_df = train_test_split(subset_df,test_size=0.5, random_state=45)\n", + "\n", + "# pre_proc_df is cleaned dataframe\n", + "pre_proc_df = dfpp.preprocess_dataframe(train_df)\n", + "print('\\n')\n", + "print('--------------')\n", + "print('Preprocessed Dataframe:', end='\\n')\n", + "print(pre_proc_df.head())\n", + "print(pre_proc_df.shape)\n", + "\n", + "# create subset for dev purposes\n", + "to_nlp_df = pre_proc_df\n", + "print('\\n')\n", + "print('-' * 80)\n", + "print('Subset Dataframe:', end='\\n')\n", + "print(to_nlp_df.head())\n", + "print(to_nlp_df.shape)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "ename": "SyntaxError", + "evalue": "invalid syntax. Perhaps you forgot a comma? (4439283.py, line 76)", + "output_type": "error", + "traceback": [ + "\u001b[0;36m Cell \u001b[0;32mIn[10], line 76\u001b[0;36m\u001b[0m\n\u001b[0;31m code_path=[\"../src/\"]#, \"../models/\"],\u001b[0m\n\u001b[0m ^\u001b[0m\n\u001b[0;31mSyntaxError\u001b[0m\u001b[0;31m:\u001b[0m invalid syntax. Perhaps you forgot a comma?\n" + ] + } + ], + "source": [ + "# load from MLflow\n", + "mlflow_client = mlflow.tracking.MlflowClient(\n", + " tracking_uri=f'https://dagshub.com/{DAGSHUB_USER_NAME}/MeaLeon.mlflow')\n", + "\n", + "# cv_params are parameters for the sklearn CountVectorizer or TFIDFVectorizer\n", + "sklearn_transformer_params = {\n", + " 'strip_accents':\"unicode\",\n", + " 'lowercase':True,\n", + " 'analyzer': StanzaWrapper().stanza_analyzer(stanza_pipeline=nlp, minNgramLength=1, maxNgramLength=4),\n", + " 'min_df':3,\n", + " 'binary':True\n", + "}\n", + "\n", + "# bertopic_params are a superset of cv_params\n", + "bertopic_params = {\n", + " 'top_n_words':20,\n", + " 'min_topic_size':5,\n", + " 'nr_topics':'auto',\n", + " 'verbose':True,\n", + " 'low_memory':True,\n", + " 'calculate_probabilities':True\n", + "}\n", + "\n", + "# update bertopic_params to include cv_params\n", + "# bertopic_params.update(cv_params)\n", + "\n", + "# pipeline_params are parameters that will be logged in MLFlow and are a superset of library parameters\n", + "pipeline_params = {\n", + " 'stanza_model': 'en',\n", + " 'sklearn-transformer': 'OneHotEncoder'\n", + "}\n", + "\n", + "# update the pipeline parameters with the library-specific ones so that they show up in MLflow Tracking\n", + "pipeline_params.update(sklearn_transformer_params)\n", + "pipeline_params.update(bertopic_params)\n", + "\n", + "signature = infer_signature(to_nlp_df['ingredients'])\n", + "\n", + "with mlflow.start_run(experiment_id=mlflow_exp_id): \n", + " # LOG PARAMETERS\n", + " mlflow.log_params(pipeline_params)\n", + "\n", + " # LOG INPUTS (QUERIES) AND OUTPUTS\n", + " # MLflow example uses a list of strings or a list of str->str dicts\n", + " # Will be useful in STAGING/Evaluation\n", + " \n", + " # LOG MODEL\n", + " # Instantiate sklearn OneHotEncoder\n", + " ohe = CountVectorizer(**sklearn_transformer_params)\n", + "\n", + " print('\\n')\n", + " print('-' * 80)\n", + " print('sklearn fit transform on ingredients:', end='\\n')\n", + "\n", + " # Do fit transform on data\n", + " response = ohe.fit_transform(tqdm(to_nlp_df['ingredients']))\n", + " transformed_recipe = pd.DataFrame(\n", + " response.toarray(),\n", + " columns=ohe.get_feature_names_out(),\n", + " index=to_nlp_df.index\n", + " )\n", + "\n", + " print('\\n')\n", + " 
print('-' * 80)\n", + " print('Transformed Data:', end='\\n')\n", + " print(transformed_recipe)\n", + "\n", + " # mlflow.pyfunc.save_model(\n", + " # path=sklearn_model_path,\n", + " # code_path=[\"../src/\"],\n", + " # python_model=StanzaWrapper(),\n", + " # input_example=to_nlp_df['ingredients'][0],\n", + " # )\n", + "\n", + " model_info = mlflow.pyfunc.log_model(\n", + " code_path=[\"../src/\"],# \"../models/\"],\n", + " python_model=StanzaWrapper(),\n", + " input_example=to_nlp_df['ingredients'][0],\n", + " signature=signature, \n", + " artifact_path=\"sklearn_model\",\n", + " # artifacts=artifacts\n", + " ) \n", + "\n", + " # since this uses a custom Stanza analyzer, we have to use a custom mlflow.Pyfunc.PythonModel\n", + " \n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "response" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "test_predict = mlflow.pyfunc.load_model(model_uri=model_info.model_uri)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# load from MLflow\n", + "mlflow_client = mlflow.tracking.MlflowClient(\n", + " tracking_uri=f'https://dagshub.com/{DAGSHUB_USER_NAME}/MeaLeon.mlflow')\n", + "\n", + "# load dataframes from artifacts\n", + "# mlflow.artifacts.download_artifacts(\n", + "# run_id=mlflow_run_id\n", + "# )\n", + "\n", + "# cv_params are parameters for the sklearn CountVectorizer or TFIDFVectorizer\n", + "cv_params = {\n", + " 'strip_accents':\"unicode\",\n", + " 'lowercase':True,\n", + " 'analyzer': StanzaWrapper().stanza_analyzer(stanza_pipeline=nlp, minNgramLength=1, maxNgramLength=4),\n", + " 'min_df':10,\n", + "}\n", + "\n", + "# bertopic_params are a superset of cv_params\n", + "bertopic_params = {\n", + " 'top_n_words':20,\n", + " 'min_topic_size':10,\n", + " 'nr_topics':'auto',\n", + " 'verbose':True,\n", + " 'low_memory':True,\n", + " 'calculate_probabilities':True\n", + "}\n", + "\n", + "# update bertopic_params to include cv_params\n", + "# bertopic_params.update(cv_params)\n", + "\n", + "# pipeline_params are parameters that will be logged in MLFlow and are a superset of library parameters\n", + "pipeline_params = {\n", + " 'stanza_model': 'en',\n", + " 'sklearn-transformer': 'TfidfVectorizer'\n", + "}\n", + "\n", + "# update the pipeline parameters with the library-specific ones so that they show up in MLflow Tracking\n", + "pipeline_params.update(cv_params)\n", + "pipeline_params.update(bertopic_params)\n", + "\n", + "with mlflow.start_run(experiment_id=get_experiment_id(f\"{DAGSHUB_EMAIL}/bertopic_stanza_ingreds_full_set_v1\")): \n", + " # LOG PARAMETERS\n", + " mlflow.log_params(pipeline_params)\n", + "\n", + " # LOG INPUTS (QUERIES) AND OUTPUTS\n", + " # MLflow example uses a list of strings or a list of str->str dicts\n", + " \n", + " # load raw data and preprocess/clean\n", + " data = dvc.api.read(\n", + " path='../data/recipes-en-201706/epicurious-recipes_m2.json'\n", + " , mode='r')\n", + " raw_df = pd.read_json(data)\n", + " print('\\n')\n", + " print('--------------')\n", + " print('Raw Dataframe:', end='\\n')\n", + " print(raw_df.head())\n", + " print(raw_df.shape)\n", + "\n", + " # pre_proc_df is cleaned dataframe\n", + " pre_proc_df = dfpp.preprocess_dataframe(raw_df)\n", + " print('\\n')\n", + " print('--------------')\n", + " print('Preprocessed Dataframe:', end='\\n')\n", + " print(pre_proc_df.head())\n", + " 
print(pre_proc_df.shape)\n", + "\n", + "\n", + " # pre_proc_df = pd.read_json(\n", + " # mlflow.artifacts.download_artifacts(\n", + " # run_id=mlflow_run_id,\n", + " # artifact_path='artifacts/preprocessed_dataframes/preprocessed_dataframe.json',\n", + " # # tracking_uri=f'https://dagshub.com/{DAGSHUB_USER_NAME}/MeaLeon.mlflow'\n", + " # )\n", + " # )\n", + " # print('\\n')\n", + " # print('-' * 80)\n", + " # print('Preprocessed Dataframe:', end='\\n')\n", + " # print(pre_proc_df.head())\n", + " # print(pre_proc_df.shape)\n", + "\n", + " # create subset for dev purposes\n", + " # to_nlp_df = pre_proc_df[0:50]\n", + " # print('\\n')\n", + " # print('-' * 80)\n", + " # print('Subset Dataframe:', end='\\n')\n", + " # print(to_nlp_df.head())\n", + " # print(to_nlp_df.shape)\n", + "\n", + " # LOG MODEL\n", + " # Instantiate BERTopic\n", + " topic_model = BERTopic(\n", + " **bertopic_params,\n", + " )\n", + "\n", + " def custom_analyzer(step_list, stanza_pipeline, minNgramLength, maxNgramLength):\n", + " lowered = \" brk \".join(map(str, [step for step in step_list if step is not None])).lower()\n", + "\n", + " preproc = stanza_pipeline(lowered)\n", + " \n", + " lemmad = \" \".join(map(str,\n", + " [word.text\n", + " for sent in preproc.sentences \n", + " for word in sent.words if (\n", + " word is not None\n", + " )]\n", + " )\n", + " )\n", + " \n", + " # collect the ngrams generated from every line\n", + " ngrams = []\n", + "\n", + " # analyze each line of the input string separately\n", + " for ln in lemmad.split(' brk '):\n", + " \n", + " # tokenize the input string with re.findall, which keeps the word tokens (customize the regex as desired)\n", + " at_least_two_english_characters_whole_words = r\"(?u)\\b[a-zA-Z]{2,}\\b\"\n", + " terms = re.findall(at_least_two_english_characters_whole_words, ln)\n", + "\n", + " # loop ngram creation for every number between min and max ngram length\n", + " for ngramLength in range(minNgramLength, maxNgramLength+1):\n", + "\n", + " # find and collect all ngrams\n", + " # for ngram in zip(*[terms[i:] for i in range(3)]): \n", + " # <-- solution without a generator (works the same but has higher memory usage)\n", + " for ngram in zip(*[islice(seq, i, len(terms)) for i, seq in enumerate(tee(terms, ngramLength))]): # <-- solution using a generator\n", + " ngrams.append(' '.join(map(str, ngram)))\n", + "\n", + " # return every collected ngram as one space-joined document string\n", + " return ' '.join(ngrams)\n", + "\n", + " analyzer_kwargs = {'stanza_pipeline': nlp\n", + " , 'minNgramLength': 1\n", + " , 'maxNgramLength': 4}\n", + " \n", + " recipe_ingreds = pre_proc_df[\"ingredients\"].apply(custom_analyzer, **analyzer_kwargs)\n", + "\n", + " # recipe_steps = \"\".join(str(to_nlp_df[\"prepSteps\"].apply(StanzaWrapper().stanza_analyzer(stanza_pipeline=nlp, minNgramLength=1, maxNgramLength=4))))\n", + " print('\\n')\n", + " print('-' * 80)\n", + " print('Recipe ingredients:', end='\\n')\n", + " print(recipe_ingreds)\n", + "\n", + " # train on the recipes' ingredients\n", + " topics, probs = topic_model.fit_transform(recipe_ingreds)\n", + "\n", + " # since this uses a custom Stanza analyzer, we have to use a custom mlflow.pyfunc.PythonModel\n", + " # Instantiate sklearn CountVectorizer\n", + " # steps_vectorizer_model = CountVectorizer(**cv_params)\n", + "\n", + " # May need to use BERTopic's OnlineCountVectorizer\n", + " steps_vectorizer_model = OnlineCountVectorizer(**cv_params)\n", + "\n", + " # Do fit transform on data\n", + " # steps_test_tfidf_transform = steps_tfidf_vectorizer_model.fit_transform(tqdm(to_nlp_df[\"steps\"]))\n", + " topic_model.update_topics(\n", + " recipe_ingreds\n", + " , vectorizer_model=steps_vectorizer_model\n", + " )\n", + "\n", + " 
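# illustrative sketch (commented out, not part of the logged run): new recipes\n", + " # could be mapped onto the fitted topics after running them through the same\n", + " # custom_analyzer used for training:\n", + " # new_doc = custom_analyzer(['2 cups flour', '1 teaspoon salt'], **analyzer_kwargs)\n", + " # new_topics, new_probs = topic_model.transform([new_doc])\n", + "\n", + " 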
# Display topic model results\n", + " print('\\n')\n", + " print('-' * 80)\n", + " print('BERTopic Model Dataframe:', end='\\n')\n", + " print(topic_model.get_topic_info())\n", + "\n", + " print('\\n')\n", + " print('-' * 80)\n", + " print('BERTopic Model Representations:', end='\\n')\n", + " print(topic_model.get_topic_info()['Representation'])\n", + "\n", + " print('\\n')\n", + " print('-' * 80)\n", + " print('BERTopic Model Representations:', end='\\n')\n", + " print(topic_model.get_topic_info()['Representative_Docs'])\n", + "\n", + " # Save and log the topic model dataframe\n", + " topic_model.get_topic_info().to_json('../data/processed/bertopic_model_ingreds_full_set_df.json')\n", + " mlflow.log_artifact('../data/processed/bertopic_model_ingreds_full_set_df.json',\n", + " artifact_path='bertopic_models')\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Attempt run with lighter weight configuration\n", + "#### This attempt will still use Stanza processing on the ingredients " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# load from MLflow\n", + "mlflow_client = mlflow.tracking.MlflowClient(\n", + " tracking_uri=f'https://dagshub.com/{DAGSHUB_USER_NAME}/MeaLeon.mlflow')\n", + "\n", + "# cv_params are parameters for the sklearn CountVectorizer or TFIDFVectorizer\n", + "sklearn_nlp_params = {\n", + " 'strip_accents':\"unicode\",\n", + " 'lowercase':True,\n", + " 'analyzer': StanzaWrapper().stanza_analyzer(stanza_pipeline=nlp, minNgramLength=1, maxNgramLength=4),\n", + " 'min_df':10,\n", + "}\n", + "\n", + "# create sklearn pipeline as in BERTopic lightweight configuration\n", + "# pipe = make_pipeline(\n", + "# TfidfVectorizer(**sklearn_nlp_params),\n", + "# TruncatedSVD(100)\n", + "# )\n", + "\n", + "# bertopic_params are a superset of cv_params\n", + "bertopic_params = {\n", + " # 'embedding_model': TfidfVectorizer(**sklearn_nlp_params),\n", + " 'top_n_words':20,\n", + " 'min_topic_size':10,\n", + " 'nr_topics':50,\n", + " 'verbose':True,\n", + " 'low_memory':True,\n", + " 'calculate_probabilities':True,\n", + " # 'min_cluster_size': 10 # Possibly only works if modifying individual HDBSCAN component of BERTopic\n", + "}\n", + "\n", + "# update bertopic_params to include cv_params\n", + "# bertopic_params.update(cv_params)\n", + "\n", + "# pipeline_params are parameters that will be logged in MLFlow and are a superset of library parameters\n", + "pipeline_params = {\n", + " 'stanza_model': 'en',\n", + " 'sklearn-transformer': 'TfidfVectorizer'\n", + "}\n", + "\n", + "# update the pipeline parameters with the library-specific ones so that they show up in MLflow Tracking\n", + "pipeline_params.update(sklearn_nlp_params)\n", + "pipeline_params.update(bertopic_params)\n", + "\n", + "with mlflow.start_run(experiment_id=get_experiment_id(f\"{DAGSHUB_EMAIL}/bertopic_lightweight_stanza_ingreds_small_set_v1\")): \n", + " # LOG PARAMETERS\n", + " mlflow.log_params(pipeline_params)\n", + "\n", + " # LOG INPUTS (QUERIES) AND OUTPUTS\n", + " # MLflow example uses a list of strings or a list of str->str dicts\n", + " \n", + " # load raw data and preprocess/clean\n", + " data = dvc.api.read(\n", + " path='../data/recipes-en-201706/epicurious-recipes_m2.json'\n", + " , mode='r')\n", + " raw_df = pd.read_json(data)\n", + " print('\\n')\n", + " print('--------------')\n", + " print(f'{datetime.now()}, Raw Dataframe: ', end='\\n')\n", + " print(raw_df.head())\n", + " print(raw_df.shape)\n", + "\n", + " # 
pre_proc_df is the cleaned dataframe\n", + " pre_proc_df = dfpp.preprocess_dataframe(raw_df)\n", + " print('\\n')\n", + " print('--------------')\n", + " print(f'{datetime.now()}, Preprocessed Dataframe:', end='\\n')\n", + " print(pre_proc_df.head())\n", + " print(pre_proc_df.shape)\n", + "\n", + "\n", + " # pre_proc_df = pd.read_json(\n", + " # mlflow.artifacts.download_artifacts(\n", + " # run_id=mlflow_run_id,\n", + " # artifact_path='artifacts/preprocessed_dataframes/preprocessed_dataframe.json',\n", + " # # tracking_uri=f'https://dagshub.com/{DAGSHUB_USER_NAME}/MeaLeon.mlflow'\n", + " # )\n", + " # )\n", + " # print('\\n')\n", + " # print('-' * 80)\n", + " # print('Preprocessed Dataframe:', end='\\n')\n", + " # print(pre_proc_df.head())\n", + " # print(pre_proc_df.shape)\n", + "\n", + " # create subset for dev purposes\n", + " to_nlp_df = pre_proc_df[0:100]\n", + " print('\\n')\n", + " print('-' * 80)\n", + " print(f'{datetime.now()}, Subset Dataframe:', end='\\n')\n", + " print(to_nlp_df.head())\n", + " print(to_nlp_df.shape)\n", + "\n", + " # LOG MODEL\n", + " # Instantiate BERTopic\n", + " topic_model = BERTopic(\n", + " **bertopic_params\n", + " )\n", + " \n", + " analyzer_kwargs = {'stanza_pipeline': nlp\n", + " , 'minNgramLength': 1\n", + " , 'maxNgramLength': 4}\n", + " \n", + " recipe_ingreds = to_nlp_df[\"ingredients\"].apply(custom_analyzer, **analyzer_kwargs)\n", + "\n", + " # Create TF-IDF embeddings\n", + " vectorizer = TfidfVectorizer(**sklearn_nlp_params)\n", + " embeddings = vectorizer.fit_transform(recipe_ingreds)\n", + "\n", + " # recipe_steps = \"\".join(str(to_nlp_df[\"prepSteps\"].apply(StanzaWrapper().stanza_analyzer(stanza_pipeline=nlp, minNgramLength=1, maxNgramLength=4))))\n", + " print('\\n')\n", + " print('-' * 80)\n", + " print(f'{datetime.now()}, Recipe ingredients:', end='\\n')\n", + " print(recipe_ingreds)\n", + "\n", + " # train on the recipes' ingredients\n", + " topics, probs = topic_model.fit_transform(recipe_ingreds, embeddings)\n", + "\n", + " # since this uses a custom Stanza analyzer, we have to use a custom mlflow.pyfunc.PythonModel\n", + " # Instantiate sklearn CountVectorizer\n", + " sklearn_cv_params = {\n", + " 'strip_accents':\"unicode\",\n", + " 'lowercase':True,\n", + " 'analyzer': StanzaWrapper().stanza_analyzer(stanza_pipeline=nlp, minNgramLength=1, maxNgramLength=4),\n", + " # 'min_df':10,\n", + " }\n", + " steps_vectorizer_model = CountVectorizer(**sklearn_cv_params)\n", + "\n", + " # May need to use BERTopic's OnlineCountVectorizer\n", + " # steps_vectorizer_model = OnlineCountVectorizer(**sklearn_nlp_params)\n", + "\n", + " # Do fit transform on data\n", + " # steps_test_tfidf_transform = steps_tfidf_vectorizer_model.fit_transform(tqdm(to_nlp_df[\"steps\"]))\n", + " topic_model.update_topics(\n", + " recipe_ingreds\n", + " , vectorizer_model=steps_vectorizer_model\n", + " )\n", + "\n", + " # Display topic model results\n", + " print('\\n')\n", + " print('-' * 80)\n", + " print(f'{datetime.now()}, BERTopic Model Dataframe:', end='\\n')\n", + " print(topic_model.get_topic_info())\n", + "\n", + " print('\\n')\n", + " print('-' * 80)\n", + " print(f'{datetime.now()}, BERTopic Model Representations:', end='\\n')\n", + " print(topic_model.get_topic_info()['Representation'])\n", + "\n", + " print('\\n')\n", + " print('-' * 80)\n", + " print(f'{datetime.now()}, BERTopic Model Representative Docs:', end='\\n')\n", + " print(topic_model.get_topic_info()['Representative_Docs'])\n", + "\n", + " # Save and log the topic model dataframe\n", + " 
topic_model.get_topic_info().to_json('../data/processed/bertopic_model_ingreds_full_set_df.json')\n", + " mlflow.log_artifact('../data/processed/bertopic_model_ingreds_full_set_df.json',\n", + " artifact_path='bertopic_models')\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "topic_model.get_topic_info()['Representation']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "topic_model.get_topic_info()['Representation'][0]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# load from MLflow\n", + "mlflow_client = mlflow.tracking.MlflowClient(\n", + " tracking_uri=f'https://dagshub.com/{DAGSHUB_USER_NAME}/MeaLeon.mlflow')\n", + "\n", + "# cv_params are parameters for the sklearn CountVectorizer or TFIDFVectorizer\n", + "sklearn_nlp_params = {\n", + " 'strip_accents':\"unicode\",\n", + " 'lowercase':True,\n", + " 'analyzer': StanzaWrapper().stanza_analyzer(stanza_pipeline=nlp, minNgramLength=1, maxNgramLength=4),\n", + " 'min_df':10,\n", + "}\n", + "\n", + "# create sklearn pipeline as in BERTopic lightweight configuration\n", + "# pipe = make_pipeline(\n", + "# TfidfVectorizer(**sklearn_nlp_params),\n", + "# TruncatedSVD(100)\n", + "# )\n", + "\n", + "# bertopic_params are a superset of cv_params\n", + "bertopic_params = {\n", + " # 'embedding_model': TfidfVectorizer(**sklearn_nlp_params),\n", + " 'top_n_words':20,\n", + " 'min_topic_size':10,\n", + " 'nr_topics':50,\n", + " 'verbose':True,\n", + " 'low_memory':True,\n", + " 'calculate_probabilities':True,\n", + " # 'min_cluster_size': 10 # Possibly only works if modifying individual HDBSCAN component of BERTopic\n", + "}\n", + "\n", + "# update bertopic_params to include cv_params\n", + "# bertopic_params.update(cv_params)\n", + "\n", + "# pipeline_params are parameters that will be logged in MLFlow and are a superset of library parameters\n", + "pipeline_params = {\n", + " 'stanza_model': 'en',\n", + " 'sklearn-transformer': 'TfidfVectorizer'\n", + "}\n", + "\n", + "# update the pipeline parameters with the library-specific ones so that they show up in MLflow Tracking\n", + "pipeline_params.update(sklearn_nlp_params)\n", + "pipeline_params.update(bertopic_params)\n", + "\n", + "with mlflow.start_run(experiment_id=get_experiment_id(f\"{DAGSHUB_EMAIL}/bertopic_lightweight_stanza_ingreds_small_set_v1\")): \n", + " # LOG PARAMETERS\n", + " mlflow.log_params(pipeline_params)\n", + "\n", + " # LOG INPUTS (QUERIES) AND OUTPUTS\n", + " # MLflow example uses a list of strings or a list of str->str dicts\n", + " \n", + " # load raw data and preprocess/clean\n", + " data = dvc.api.read(\n", + " path='../data/recipes-en-201706/epicurious-recipes_m2.json'\n", + " , mode='r')\n", + " raw_df = pd.read_json(data)\n", + " print('\\n')\n", + " print('--------------')\n", + " print(f'{datetime.now()}, Raw Dataframe: ', end='\\n')\n", + " print(raw_df.head())\n", + " print(raw_df.shape)\n", + "\n", + " # pre_proc_df is cleaned dataframe\n", + " pre_proc_df = dfpp.preprocess_dataframe(raw_df)\n", + " print('\\n')\n", + " print('--------------')\n", + " print(f'{datetime.now()}, Preprocessed Dataframe:', end='\\n')\n", + " print(pre_proc_df.head())\n", + " print(pre_proc_df.shape)\n", + "\n", + "\n", + " # pre_proc_df = pd.read_json(\n", + " # mlflow.artifacts.download_artifacts(\n", + " # run_id=mlflow_run_id,\n", + " # 
artifact_path='artifacts/preprocessed_dataframes/preprocessed_dataframe.json',\n", + " # # tracking_uri=f'https://dagshub.com/{DAGSHUB_USER_NAME}/MeaLeon.mlflow'\n", + " # )\n", + " # )\n", + " # print('\\n')\n", + " # print('-' * 80)\n", + " # print('Preprocessed Dataframe:', end='\\n')\n", + " # print(pre_proc_df.head())\n", + " # print(pre_proc_df.shape)\n", + "\n", + " # create subset for dev purposes\n", + " to_nlp_df = pre_proc_df[0:100]\n", + " print('\\n')\n", + " print('-' * 80)\n", + " print(f'{datetime.now()}, Subset Dataframe:', end='\\n')\n", + " print(to_nlp_df.head())\n", + " print(to_nlp_df.shape)\n", + "\n", + " # LOG MODEL\n", + " # Instantiate BERTopic\n", + " topic_model = BERTopic(\n", + " **bertopic_params\n", + " )\n", + " \n", + " analyzer_kwargs = {'stanza_pipeline': nlp\n", + " , 'minNgramLength': 1\n", + " , 'maxNgramLength': 4}\n", + " \n", + " recipe_ingreds = to_nlp_df[\"ingredients\"].apply(custom_analyzer, **analyzer_kwargs)\n", + "\n", + " # Create TF-IDF embeddings\n", + " vectorizer = TfidfVectorizer(**sklearn_nlp_params)\n", + " embeddings = vectorizer.fit_transform(recipe_ingreds)\n", + "\n", + " # recipe_steps = \"\".join(str(to_nlp_df[\"prepSteps\"].apply(StanzaWrapper().stanza_analyzer(stanza_pipeline=nlp, minNgramLength=1, maxNgramLength=4))))\n", + " print('\\n')\n", + " print('-' * 80)\n", + " print(f'{datetime.now()}, Recipe ingredients:', end='\\n')\n", + " print(recipe_ingreds)\n", + "\n", + " # train on the recipes' ingredients\n", + " topics, probs = topic_model.fit_transform(recipe_ingreds, embeddings)\n", + "\n", + " # since this uses a custom Stanza analyzer, we have to use a custom mlflow.pyfunc.PythonModel\n", + " # Instantiate sklearn CountVectorizer\n", + " sklearn_cv_params = {\n", + " 'strip_accents':\"unicode\",\n", + " 'lowercase':True,\n", + " # 'analyzer': StanzaWrapper().stanza_analyzer(stanza_pipeline=nlp, minNgramLength=1, maxNgramLength=4),\n", + " # 'min_df':10,\n", + " }\n", + " steps_vectorizer_model = CountVectorizer(**sklearn_cv_params)\n", + "\n", + " # May need to use BERTopic's OnlineCountVectorizer\n", + " # steps_vectorizer_model = OnlineCountVectorizer(**sklearn_nlp_params)\n", + "\n", + " # Do fit transform on data\n", + " # steps_test_tfidf_transform = steps_tfidf_vectorizer_model.fit_transform(tqdm(to_nlp_df[\"steps\"]))\n", + " topic_model.update_topics(\n", + " recipe_ingreds\n", + " , vectorizer_model=steps_vectorizer_model\n", + " )\n", + "\n", + " # Display topic model results\n", + " print('\\n')\n", + " print('-' * 80)\n", + " print(f'{datetime.now()}, BERTopic Model Dataframe:', end='\\n')\n", + " print(topic_model.get_topic_info())\n", + "\n", + " print('\\n')\n", + " print('-' * 80)\n", + " print(f'{datetime.now()}, BERTopic Model Representations:', end='\\n')\n", + " print(topic_model.get_topic_info()['Representation'])\n", + "\n", + " print('\\n')\n", + " print('-' * 80)\n", + " print(f'{datetime.now()}, BERTopic Model Representative Docs:', end='\\n')\n", + " print(topic_model.get_topic_info()['Representative_Docs'])\n", + "\n", + " # Save and log the topic model dataframe\n", + " topic_model.get_topic_info().to_json('../data/processed/bertopic_model_ingreds_full_set_df.json')\n", + " mlflow.log_artifact('../data/processed/bertopic_model_ingreds_full_set_df.json',\n", + " artifact_path='bertopic_models')\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# load from MLflow\n", + "mlflow_client = mlflow.tracking.MlflowClient(\n", + " 
tracking_uri=f'https://dagshub.com/{DAGSHUB_USER_NAME}/MeaLeon.mlflow')\n", + "\n", + "# cv_params are parameters for the sklearn CountVectorizer or TFIDFVectorizer\n", + "sklearn_nlp_params = {\n", + " 'strip_accents':\"unicode\",\n", + " 'lowercase':True,\n", + " 'analyzer': StanzaWrapper().stanza_analyzer(stanza_pipeline=nlp, minNgramLength=1, maxNgramLength=4),\n", + " 'min_df':10,\n", + "}\n", + "\n", + "# create sklearn pipeline as in BERTopic lightweight configuration\n", + "# pipe = make_pipeline(\n", + "# TfidfVectorizer(**sklearn_nlp_params),\n", + "# TruncatedSVD(100)\n", + "# )\n", + "\n", + "# bertopic_params are a superset of cv_params\n", + "bertopic_params = {\n", + " # 'embedding_model': TfidfVectorizer(**sklearn_nlp_params),\n", + " 'top_n_words':20,\n", + " 'min_topic_size':10,\n", + " 'nr_topics':50,\n", + " 'verbose':True,\n", + " 'low_memory':True,\n", + " 'calculate_probabilities':True,\n", + " # 'min_cluster_size': 10 # Possibly only works if modifying individual HDBSCAN component of BERTopic\n", + "}\n", + "\n", + "# update bertopic_params to include cv_params\n", + "# bertopic_params.update(cv_params)\n", + "\n", + "# pipeline_params are parameters that will be logged in MLFlow and are a superset of library parameters\n", + "pipeline_params = {\n", + " 'stanza_model': 'en',\n", + " 'sklearn-transformer': 'TfidfVectorizer'\n", + "}\n", + "\n", + "# update the pipeline parameters with the library-specific ones so that they show up in MLflow Tracking\n", + "pipeline_params.update(sklearn_nlp_params)\n", + "pipeline_params.update(bertopic_params)\n", + "\n", + "with mlflow.start_run(experiment_id=get_experiment_id(f\"{DAGSHUB_EMAIL}/bertopic_lightweight_stanza_ingreds_small_set_v1.01\")): \n", + " # LOG PARAMETERS\n", + " mlflow.log_params(pipeline_params)\n", + "\n", + " # LOG INPUTS (QUERIES) AND OUTPUTS\n", + " # MLflow example uses a list of strings or a list of str->str dicts\n", + " \n", + " # load raw data and preprocess/clean\n", + " data = dvc.api.read(\n", + " path='../data/recipes-en-201706/epicurious-recipes_m2.json'\n", + " , mode='r')\n", + " raw_df = pd.read_json(data)\n", + " print('\\n')\n", + " print('--------------')\n", + " print(f'{datetime.now()}, Raw Dataframe: ', end='\\n')\n", + " print(raw_df.head())\n", + " print(raw_df.shape)\n", + "\n", + " # pre_proc_df is cleaned dataframe\n", + " pre_proc_df = dfpp.preprocess_dataframe(raw_df)\n", + " print('\\n')\n", + " print('--------------')\n", + " print(f'{datetime.now()}, Preprocessed Dataframe:', end='\\n')\n", + " print(pre_proc_df.head())\n", + " print(pre_proc_df.shape)\n", + "\n", + "\n", + " # pre_proc_df = pd.read_json(\n", + " # mlflow.artifacts.download_artifacts(\n", + " # run_id=mlflow_run_id,\n", + " # artifact_path='artifacts/preprocessed_dataframes/preprocessed_dataframe.json',\n", + " # # tracking_uri=f'https://dagshub.com/{DAGSHUB_USER_NAME}/MeaLeon.mlflow'\n", + " # )\n", + " # )\n", + " # print('\\n')\n", + " # print('-' * 80)\n", + " # print('Preprocessed Dataframe:', end='\\n')\n", + " # print(pre_proc_df.head())\n", + " # print(pre_proc_df.shape)\n", + "\n", + " # create subset for dev purposes\n", + " to_nlp_df = pre_proc_df[0:100]\n", + " print('\\n')\n", + " print('-' * 80)\n", + " print(f'{datetime.now()}, Subset Dataframe:', end='\\n')\n", + " print(to_nlp_df.head())\n", + " print(to_nlp_df.shape)\n", + "\n", + " # LOG MODEL\n", + " # Instantiate BERTopic\n", + " topic_model = BERTopic(\n", + " **bertopic_params\n", + " )\n", + " \n", + " analyzer_kwargs = 
{'stanza_pipeline': nlp\n", + " , 'minNgramLength': 1\n", + " , 'maxNgramLength': 4\n", + " , 'lemmatize': True}\n", + " \n", + " # recipe_steps = \"\".join(str(to_nlp_df[\"prepSteps\"].apply(StanzaWrapper().stanza_analyzer(stanza_pipeline=nlp, minNgramLength=1, maxNgramLength=4))))\n", + " recipe_ingreds = to_nlp_df[\"ingredients\"].apply(custom_analyzer, **analyzer_kwargs)\n", + "\n", + " print('\\n')\n", + " print('-' * 80)\n", + " print(f'{datetime.now()}, Recipe ingredients:', end='\\n')\n", + " print([ingred for ingred in recipe_ingreds])\n", + "\n", + " # Create TF-IDF embeddings\n", + " vectorizer = TfidfVectorizer(**sklearn_nlp_params)\n", + " embeddings = vectorizer.fit_transform(tqdm(recipe_ingreds))\n", + "\n", + " # train on the recipes' ingredients\n", + " topics, probs = topic_model.fit_transform(recipe_ingreds, embeddings)\n", + "\n", + " # since this uses a custom Stanza analyzer, we have to use a custom mlflow.pyfunc.PythonModel\n", + " # Instantiate sklearn CountVectorizer\n", + " sklearn_cv_params = {\n", + " # 'strip_accents':\"unicode\",\n", + " # 'lowercase':True,\n", + " # 'analyzer': StanzaWrapper().stanza_analyzer(stanza_pipeline=nlp, minNgramLength=1, maxNgramLength=4),\n", + " # 'min_df':10,\n", + " 'token_pattern': r\"(?u)\\b[a-zA-Z]{2,}\\b\"\n", + " }\n", + " ingreds_vectorizer_model = CountVectorizer(**sklearn_cv_params)\n", + "\n", + " # May need to use BERTopic's OnlineCountVectorizer\n", + " # steps_vectorizer_model = OnlineCountVectorizer(**sklearn_nlp_params)\n", + "\n", + " # Do fit transform on data\n", + " # steps_test_tfidf_transform = steps_tfidf_vectorizer_model.fit_transform(tqdm(to_nlp_df[\"steps\"]))\n", + " topic_model.update_topics(\n", + " recipe_ingreds\n", + " , vectorizer_model=ingreds_vectorizer_model\n", + " )\n", + "\n", + " # Display topic model results\n", + " print('\\n')\n", + " print('-' * 80)\n", + " print(f'{datetime.now()}, BERTopic Model Dataframe:', end='\\n')\n", + " print(topic_model.get_topic_info())\n", + "\n", + " print('\\n')\n", + " print('-' * 80)\n", + " print(f'{datetime.now()}, BERTopic Model Representations:', end='\\n')\n", + " print(topic_model.get_topic_info()['Representation'])\n", + "\n", + " print('\\n')\n", + " print('-' * 80)\n", + " print(f'{datetime.now()}, BERTopic Model Representative Docs:', end='\\n')\n", + " print(topic_model.get_topic_info()['Representative_Docs'])\n", + "\n", + " # Save and log the topic model dataframe\n", + " topic_model.get_topic_info().to_json('../data/processed/bertopic_model_ingreds_small_set_df.json')\n", + " mlflow.log_artifact('../data/processed/bertopic_model_ingreds_small_set_df.json',\n", + " artifact_path='bertopic_models')\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "to_nlp_df['ingredients'][0]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "test_recipe_ingreds = to_nlp_df[\"ingredients\"].apply(custom_analyzer, **analyzer_kwargs)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "I think I should start leaving out units/including stopwords again, since I'm not using Stanza's deep learning." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# load from MLflow\n", + "mlflow_client = mlflow.tracking.MlflowClient(\n", + " tracking_uri=f'https://dagshub.com/{DAGSHUB_USER_NAME}/MeaLeon.mlflow')\n", + "\n", + "# cv_params are parameters for the 
sklearn CountVectorizer or TFIDFVectorizer\n", + "sklearn_nlp_params = {\n", + " 'strip_accents':\"unicode\",\n", + " 'lowercase':True,\n", + " 'analyzer': StanzaWrapper().stanza_analyzer(stanza_pipeline=nlp, minNgramLength=1, maxNgramLength=4),\n", + " 'min_df':10,\n", + "}\n", + "\n", + "# bertopic_params are a superset of cv_params\n", + "bertopic_params = {\n", + " 'top_n_words':20,\n", + " 'min_topic_size':10,\n", + " 'nr_topics':50,\n", + " 'verbose':True,\n", + " 'low_memory':True,\n", + " 'calculate_probabilities':True,\n", + " # 'min_cluster_size': 10 # Possibly only works if modifying individual HDBSCAN component of BERTopic\n", + "}\n", + "\n", + "# update bertopic_params to include cv_params\n", + "# bertopic_params.update(cv_params)\n", + "\n", + "# pipeline_params are parameters that will be logged in MLFlow and are a superset of library parameters\n", + "pipeline_params = {\n", + " 'stanza_model': 'en',\n", + " 'sklearn-transformer': 'TfidfVectorizer'\n", + "}\n", + "\n", + "# update the pipeline parameters with the library-specific ones so that they show up in MLflow Tracking\n", + "pipeline_params.update(sklearn_nlp_params)\n", + "pipeline_params.update(bertopic_params)\n", + "\n", + "with mlflow.start_run(experiment_id=get_experiment_id(f\"{DAGSHUB_EMAIL}/bertopic_lightweight_stanza_ingreds_full_set_v1.00\")): \n", + " # LOG PARAMETERS\n", + " mlflow.log_params(pipeline_params)\n", + "\n", + " # LOG INPUTS (QUERIES) AND OUTPUTS\n", + " # MLflow example uses a list of strings or a list of str->str dicts\n", + " \n", + " # load raw data and preprocess/clean\n", + " data = dvc.api.read(\n", + " path='../data/recipes-en-201706/epicurious-recipes_m2.json'\n", + " , mode='r')\n", + " raw_df = pd.read_json(data)\n", + " print('\\n')\n", + " print('--------------')\n", + " print(f'{datetime.now()}, Raw Dataframe: ', end='\\n')\n", + " print(raw_df.head())\n", + " print(raw_df.shape)\n", + "\n", + " # pre_proc_df is cleaned dataframe\n", + " pre_proc_df = dfpp.preprocess_dataframe(raw_df)\n", + " print('\\n')\n", + " print('--------------')\n", + " print(f'{datetime.now()}, Preprocessed Dataframe:', end='\\n')\n", + " print(pre_proc_df.head())\n", + " print(pre_proc_df.shape)\n", + "\n", + "\n", + " # pre_proc_df = pd.read_json(\n", + " # mlflow.artifacts.download_artifacts(\n", + " # run_id=mlflow_run_id,\n", + " # artifact_path='artifacts/preprocessed_dataframes/preprocessed_dataframe.json',\n", + " # # tracking_uri=f'https://dagshub.com/{DAGSHUB_USER_NAME}/MeaLeon.mlflow'\n", + " # )\n", + " # )\n", + " # print('\\n')\n", + " # print('-' * 80)\n", + " # print('Preprocessed Dataframe:', end='\\n')\n", + " # print(pre_proc_df.head())\n", + " # print(pre_proc_df.shape)\n", + "\n", + " # create subset for dev purposes\n", + " # to_nlp_df = pre_proc_df[0:100]\n", + " # print('\\n')\n", + " # print('-' * 80)\n", + " # print(f'{datetime.now()}, Subset Dataframe:', end='\\n')\n", + " # print(to_nlp_df.head())\n", + " # print(to_nlp_df.shape)\n", + "\n", + " # LOG MODEL\n", + " # Instantiate BERTopic\n", + " topic_model = BERTopic(\n", + " **bertopic_params\n", + " )\n", + " \n", + " analyzer_kwargs = {'stanza_pipeline': nlp\n", + " , 'minNgramLength': 1\n", + " , 'maxNgramLength': 4\n", + " , 'lemmatize': True}\n", + " \n", + " recipe_ingreds = pre_proc_df[\"ingredients\"].apply(custom_analyzer, **analyzer_kwargs)\n", + " \n", + " print('\\n')\n", + " print('-' * 80)\n", + " print(f'{datetime.now()}, Recipe ingredients:', end='\\n')\n", + " print(recipe_ingreds)\n", + "\n", + " # 
Create TF-IDF embeddings\n", + " vectorizer = TfidfVectorizer(**sklearn_nlp_params)\n", + " embeddings = vectorizer.fit_transform(tqdm(recipe_ingreds))\n", + "\n", + " # recipe_steps = \"\".join(str(to_nlp_df[\"prepSteps\"].apply(StanzaWrapper().stanza_analyzer(stanza_pipeline=nlp, minNgramLength=1, maxNgramLength=4))))\n", + " # print('\\n')\n", + " # print('-' * 80)\n", + " # print(f'{datetime.now()}, Recipe ingredients:', end='\\n')\n", + " # print(recipe_ingreds)\n", + "\n", + " # train on the recipes' ingredients\n", + " topics, probs = topic_model.fit_transform(recipe_ingreds, embeddings)\n", + "\n", + " # since this uses a custom Stanza analyzer, we have to use a custom mlflow.pyfunc.PythonModel\n", + " # Instantiate sklearn CountVectorizer\n", + " sklearn_cv_params = {\n", + " # 'strip_accents':\"unicode\",\n", + " # 'lowercase':True,\n", + " 'token_pattern': r\"(?u)\\b[a-zA-Z]{2,}\\b\"\n", + " }\n", + " ingreds_vectorizer_model = CountVectorizer(**sklearn_cv_params)\n", + "\n", + " # Do fit transform on data\n", + " # steps_test_tfidf_transform = steps_tfidf_vectorizer_model.fit_transform(tqdm(to_nlp_df[\"steps\"]))\n", + " topic_model.update_topics(\n", + " recipe_ingreds\n", + " , vectorizer_model=ingreds_vectorizer_model\n", + " )\n", + "\n", + " # Display topic model results\n", + " print('\\n')\n", + " print('-' * 80)\n", + " print(f'{datetime.now()}, BERTopic Model Dataframe:', end='\\n')\n", + " print(topic_model.get_topic_info())\n", + "\n", + " print('\\n')\n", + " print('-' * 80)\n", + " print(f'{datetime.now()}, BERTopic Model Representations:', end='\\n')\n", + " print(topic_model.get_topic_info()['Representation'])\n", + "\n", + " print('\\n')\n", + " print('-' * 80)\n", + " print(f'{datetime.now()}, BERTopic Model Representative Docs:', end='\\n')\n", + " print(topic_model.get_topic_info()['Representative_Docs'])\n", + "\n", + " # Save and log the topic model dataframe\n", + " topic_model.get_topic_info().to_json('../data/processed/bertopic_model_ingreds_full_set_df.json')\n", + " mlflow.log_artifact('../data/processed/bertopic_model_ingreds_full_set_df.json',\n", + " artifact_path='bertopic_models')\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# try splitting work between CPU and GPU; run Stanza on CPU due to its memory usage\n", + "nlp2 = stanza.Pipeline('en', use_gpu=False)\n", + "\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "python3", + "language": "python", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +}