diff --git a/nbs/11_sklearn_mlfow_model_testing.ipynb b/nbs/11_sklearn_mlfow_model_testing.ipynb new file mode 100644 index 0000000..d26e705 --- /dev/null +++ b/nbs/11_sklearn_mlfow_model_testing.ipynb @@ -0,0 +1,1637 @@ +{ + "cells": [ + { + "cell_type": "raw", + "metadata": {}, + "source": [ + "---\n", + "description: test\n", + "output-file: template.html\n", + "title: Template\n", + "\n", + "---\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# | default_exp core" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# | hide\n", + "from bertopic import BERTopic\n", + "from bertopic.vectorizers import OnlineCountVectorizer\n", + "import dagshub\n", + "from datetime import datetime\n", + "import dill as pickle\n", + "import dvc.api\n", + "from hdbscan import HDBSCAN\n", + "from itertools import tee, islice\n", + "import mlflow\n", + "from mlflow.models import infer_signature\n", + "import nbdev\n", + "from nbdev.showdoc import *\n", + "import pandas as pd\n", + "import re\n", + "from sentence_transformers import SentenceTransformer\n", + "from sklearn.feature_extraction.text import (\n", + " CountVectorizer\n", + " , TfidfTransformer\n", + " , TfidfVectorizer\n", + " , \n", + ")\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.pipeline import make_pipeline\n", + "from src.custom_stanza_mlflow import StanzaWrapper\n", + "import src.dataframe_preprocessor as dfpp\n", + "import stanza\n", + "from tqdm import tqdm\n", + "from umap import UMAP" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!export 'PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:128'" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# | export\n", + "def foo():\n", + " pass" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ 
# | hide
def get_experiment_id(name):
    """Return the MLflow experiment ID registered under ``name``.

    The experiment is looked up by name; when the lookup returns ``None``
    the experiment is created and the newly created ID is returned instead.
    """
    exp = mlflow.get_experiment_by_name(name)
    if exp is None:
        return mlflow.create_experiment(name)
    return exp.experiment_id


# Universal POS tags whose words are discarded when lemmatizing.
_EXCLUDED_UPOS = frozenset({"NUM", "DET", "ADV", "CCONJ", "ADP", "SCONJ", "PUNCT"})

# Sentinel token placed between steps so that a single pipeline call can
# process every step while the step boundaries remain recoverable.
_STEP_SEPARATOR = "brk"

# Whole words made of at least two ASCII letters.  The raw string matters:
# in a normal string literal "\b" is a backspace character, not a word boundary.
_TERM_PATTERN = re.compile(r"\b[a-zA-Z]{2,}\b")


def _iter_ngrams(terms, min_length, max_length):
    """Yield every space-joined n-gram of ``terms`` for each length in the inclusive range."""
    for size in range(max(min_length, 1), max_length + 1):
        for start in range(len(terms) - size + 1):
            yield " ".join(terms[start:start + size])


def custom_analyzer(step_list, stanza_pipeline, minNgramLength, maxNgramLength, lemmatize=True):
    """Turn a list of text steps into one string of word n-grams.

    The non-``None`` steps are lower-cased, joined with a ``brk`` sentinel and
    passed through ``stanza_pipeline`` in a single call.  The resulting words
    are regrouped per step, reduced to terms of two or more ASCII letters, and
    expanded into n-grams that never cross a step boundary.

    Args:
        step_list: Iterable of steps (e.g. ingredient lines); ``None`` entries
            are skipped and every other entry is converted with ``str``.
        stanza_pipeline: Callable taking the joined text and returning an
            object with ``.sentences``; each sentence exposes ``.words`` whose
            items carry ``.text``, ``.lemma`` and ``.upos``.
        minNgramLength: Smallest n-gram length to emit (values below 1 are
            treated as 1).
        maxNgramLength: Largest n-gram length to emit (inclusive).
        lemmatize: When true, use each word's lemma and drop words tagged
            NUM, DET, ADV, CCONJ, ADP, SCONJ or PUNCT; when false, keep the
            surface text of every word.

    Returns:
        A single string holding every n-gram separated by one space, ordered
        by step, then by n-gram length, then by position.  An empty string is
        returned when no n-gram can be formed.
    """
    lowered = f" {_STEP_SEPARATOR} ".join(
        str(step) for step in step_list if step is not None
    ).lower()
    doc = stanza_pipeline(lowered)

    # Regroup the analysed words by step.  The sentinel is recognised on the
    # surface text *before* the POS filter runs, so a tagger that labels it
    # with an excluded tag cannot silently merge two steps together.
    lines = [[]]
    for sent in doc.sentences:
        for word in sent.words:
            if word.text == _STEP_SEPARATOR:
                lines.append([])
            elif not lemmatize:
                lines[-1].append(str(word.text))
            elif word.upos not in _EXCLUDED_UPOS and word.lemma is not None:
                # A missing lemma would otherwise become the literal token "None".
                lines[-1].append(str(word.lemma))

    ngrams = []
    for words in lines:
        # findall returns the matching terms themselves (re.split would
        # return the text *between* matches).
        terms = _TERM_PATTERN.findall(" ".join(words))
        ngrams.extend(_iter_ngrams(terms, minNgramLength, maxNgramLength))

    # Every n-gram is returned, not only the first one encountered.
    return " ".join(ngrams)
# --- DagsHub / MLflow configuration -----------------------------------------
#@markdown Enter the username of your DAGsHub account:
DAGSHUB_USER_NAME = "AaronWChen" #@param {type:"string"}

#@markdown Enter the email for your DAGsHub account:
DAGSHUB_EMAIL = "awc33@cornell.edu" #@param {type:"string"}

#@markdown Enter the repo name
DAGSHUB_REPO_NAME = "MeaLeon"

#@markdown Enter the name of the branch you are working on
BRANCH = "MLF-1/start-custom-sklearn-mlflow-model"

# Initialise the DagsHub client for the repository configured above.
dagshub.init(repo_name=DAGSHUB_REPO_NAME, repo_owner=DAGSHUB_USER_NAME)

# --- Starting DEV stage for One Hot Encoded model ---------------------------
# Build the tracking URI from the configured owner *and* repo name so that
# changing DAGSHUB_REPO_NAME is enough to retarget the whole notebook.
mlflow.set_tracking_uri(
    f'https://dagshub.com/{DAGSHUB_USER_NAME}/{DAGSHUB_REPO_NAME}.mlflow'
)

# starter idea for making an experiment name can be the git branch, but need more specificity
experiment_name = f"{DAGSHUB_EMAIL}/one-hot-encode"
mlflow_exp_id = get_experiment_id(experiment_name)

# define model location (alternative considered: "/tmp/sklearn_model")
model_directory = "../models/"

# Define the required artifacts associated with the saved custom pyfunc
sklearn_model_path = model_directory + "sklearn_model"
artifacts = {'sklearn_model': "python_model.pkl"}
from io import StringIO

# --- Data preparation: NLP pipeline ------------------------------------------
# Fetch the English Stanza models, then build the pipeline used by every
# analyzer in this notebook.
stanza.download('en')
nlp = stanza.Pipeline(
    'en',
    depparse_batch_size=50,
    depparse_min_length_to_batch_separately=50,
    verbose=True,
    use_gpu=False,  # set to true when on cloud/not on streaming computer
    batch_size=100,
)

# --- Data preparation: raw recipes --------------------------------------------
# dvc.api.read in text mode returns the file contents as a string.
data = dvc.api.read(
    path='../data/recipes-en-201706/epicurious-recipes_m2.json',
    mode='r',
)
# Wrap the JSON text in a file-like object: passing a literal JSON string to
# pd.read_json is deprecated in recent pandas releases.
raw_df = pd.read_json(StringIO(data))

print('\n')
print('--------------')
print('Raw Dataframe:', end='\n')
print(raw_df.head())
print(raw_df.shape)
# Draw a reproducible 300-row sample and split it evenly into train and test.
SAMPLE_SEED = 45
subset_df = raw_df.sample(n=300, random_state=SAMPLE_SEED)
train_df, test_df = train_test_split(
    subset_df,
    test_size=0.5,
    random_state=SAMPLE_SEED,
)

# Clean the training half; pre_proc_df is the cleaned dataframe.
pre_proc_df = dfpp.preprocess_dataframe(train_df)

# For dev purposes the "subset" is currently the whole preprocessed frame.
to_nlp_df = pre_proc_df

# Report both frames with the same banner layout: blank lines, a rule,
# a title, the first rows and finally the shape.
for rule, title, frame in (
    ('--------------', 'Preprocessed Dataframe:', pre_proc_df),
    ('-' * 80, 'Subset Dataframe:', to_nlp_df),
):
    print('\n')
    print(rule)
    print(title)
    print(frame.head())
    print(frame.shape)
# --- One-hot-encode run --------------------------------------------------------
# Client bound to the DagsHub-hosted MLflow tracking server.
mlflow_client = mlflow.tracking.MlflowClient(
    tracking_uri=f'https://dagshub.com/{DAGSHUB_USER_NAME}/MeaLeon.mlflow')

# Parameters for the sklearn CountVectorizer; binary=True makes it act as a
# one-hot encoder over the n-grams produced by the Stanza analyzer.
sklearn_transformer_params = {
    'strip_accents': "unicode",
    'lowercase': True,
    'analyzer': StanzaWrapper().stanza_analyzer(stanza_pipeline=nlp, minNgramLength=1, maxNgramLength=4),
    'min_df': 3,
    'binary': True,
}

# BERTopic settings.  They are only logged alongside this run; they are NOT
# merged with the vectorizer parameters (that update was never enabled).
bertopic_params = {
    'top_n_words': 20,
    'min_topic_size': 5,
    'nr_topics': 'auto',
    'verbose': True,
    'low_memory': True,
    'calculate_probabilities': True,
}

# pipeline_params are parameters that will be logged in MLFlow and are a superset of library parameters
pipeline_params = {
    'stanza_model': 'en',
    'sklearn-transformer': 'OneHotEncoder',
}

# update the pipeline parameters with the library-specific ones so that they show up in MLflow Tracking
pipeline_params.update(sklearn_transformer_params)
pipeline_params.update(bertopic_params)

signature = infer_signature(to_nlp_df['ingredients'])

with mlflow.start_run(experiment_id=mlflow_exp_id):
    # LOG PARAMETERS
    mlflow.log_params(pipeline_params)

    # LOG INPUTS (QUERIES) AND OUTPUTS
    # MLflow example uses a list of strings or a list of str->str dicts
    # Will be useful in STAGING/Evaluation

    # LOG MODEL
    # Instantiate the binary CountVectorizer used as a one-hot encoder
    ohe = CountVectorizer(**sklearn_transformer_params)

    print('\n')
    print('-' * 80)
    print('sklearn fit transform on ingredients:', end='\n')

    # Do fit transform on data
    response = ohe.fit_transform(tqdm(to_nlp_df['ingredients']))
    transformed_recipe = pd.DataFrame(
        response.toarray(),
        columns=ohe.get_feature_names_out(),
        index=to_nlp_df.index,
    )

    print('\n')
    print('-' * 80)
    print('Transformed Data:', end='\n')
    print(transformed_recipe)

    # since this uses a custom Stanza analyzer, we have to use a custom mlflow.Pyfunc.PythonModel
    model_info = mlflow.pyfunc.log_model(
        code_path=["../src/"],
        python_model=StanzaWrapper(),
        # The frame is indexed by recipe id strings, so take the first row by
        # position; an integer key on a non-integer index relies on a
        # deprecated pandas fallback.
        input_example=to_nlp_df['ingredients'].iloc[0],
        signature=signature,
        artifact_path="sklearn_model",
    )

# Notebook display of the sparse matrix returned by fit_transform.
response

# Reload the model that was just logged to check that it round-trips.
test_predict = mlflow.pyfunc.load_model(model_uri=model_info.model_uri)
from io import StringIO

# --- BERTopic run on the full ingredient set -----------------------------------
# Client bound to the DagsHub-hosted MLflow tracking server.
mlflow_client = mlflow.tracking.MlflowClient(
    tracking_uri=f'https://dagshub.com/{DAGSHUB_USER_NAME}/MeaLeon.mlflow')

# cv_params are parameters for the sklearn CountVectorizer or TFIDFVectorizer
cv_params = {
    'strip_accents': "unicode",
    'lowercase': True,
    'analyzer': StanzaWrapper().stanza_analyzer(stanza_pipeline=nlp, minNgramLength=1, maxNgramLength=4),
    'min_df': 10,
}

# BERTopic constructor arguments (kept separate from cv_params; the two are
# only combined in pipeline_params for logging).
bertopic_params = {
    'top_n_words': 20,
    'min_topic_size': 10,
    'nr_topics': 'auto',
    'verbose': True,
    'low_memory': True,
    'calculate_probabilities': True,
}

# pipeline_params are parameters that will be logged in MLFlow and are a superset of library parameters
pipeline_params = {
    'stanza_model': 'en',
    'sklearn-transformer': 'TfidfVectorizer',
}

# update the pipeline parameters with the library-specific ones so that they show up in MLflow Tracking
pipeline_params.update(cv_params)
pipeline_params.update(bertopic_params)

with mlflow.start_run(experiment_id=get_experiment_id(f"{DAGSHUB_EMAIL}/bertopic_stanza_ingreds_full_set_v1")):
    # LOG PARAMETERS
    mlflow.log_params(pipeline_params)

    # LOG INPUTS (QUERIES) AND OUTPUTS
    # MLflow example uses a list of strings or a list of str->str dicts

    # load raw data and preprocess/clean
    data = dvc.api.read(
        path='../data/recipes-en-201706/epicurious-recipes_m2.json',
        mode='r',
    )
    # Literal JSON strings are deprecated as pd.read_json input; wrap the text.
    raw_df = pd.read_json(StringIO(data))
    print('\n')
    print('--------------')
    print('Raw Dataframe:', end='\n')
    print(raw_df.head())
    print(raw_df.shape)

    # pre_proc_df is cleaned dataframe
    pre_proc_df = dfpp.preprocess_dataframe(raw_df)
    print('\n')
    print('--------------')
    print('Preprocessed Dataframe:', end='\n')
    print(pre_proc_df.head())
    print(pre_proc_df.shape)

    # NOTE: an alternative path that downloads the preprocessed dataframe from
    # MLflow artifacts, and a 50-row dev subset, were sketched here but are
    # not enabled; this run uses the full preprocessed frame.

    # LOG MODEL
    # Instantiate BERTopic
    topic_model = BERTopic(
        **bertopic_params,
    )

    def custom_analyzer(step_list, stanza_pipeline, minNgramLength, maxNgramLength):
        """Return all word n-grams of a recipe's steps as one space-joined string.

        Non-``None`` steps are lower-cased, joined with a ``brk`` sentinel and
        tokenised by ``stanza_pipeline`` in one call (surface text only, no
        lemmatisation).  Words are regrouped per step, filtered to terms of at
        least two ASCII letters, and expanded into n-grams of every length
        from ``minNgramLength`` to ``maxNgramLength`` that stay inside a step.
        An empty string is returned when no n-gram can be formed.
        """
        lowered = " brk ".join(str(step) for step in step_list if step is not None).lower()
        doc = stanza_pipeline(lowered)

        # Regroup the tokens by step using the sentinel token.
        lines = [[]]
        for sent in doc.sentences:
            for word in sent.words:
                if word.text == "brk":
                    lines.append([])
                else:
                    lines[-1].append(str(word.text))

        # Raw string so that \b is a word boundary rather than a backspace.
        term_pattern = re.compile(r"\b[a-zA-Z]{2,}\b")

        ngrams = []
        for words in lines:
            # findall yields the terms themselves; re.split would yield the gaps.
            terms = term_pattern.findall(" ".join(words))
            for size in range(max(minNgramLength, 1), maxNgramLength + 1):
                ngrams.extend(
                    " ".join(terms[start:start + size])
                    for start in range(len(terms) - size + 1)
                )
        # Return every n-gram, not just the first one found.
        return " ".join(ngrams)

    analyzer_kwargs = {
        'stanza_pipeline': nlp,
        'minNgramLength': 1,
        'maxNgramLength': 4,
    }

    # One analysed document string per recipe.
    recipe_ingreds = pre_proc_df["ingredients"].apply(custom_analyzer, **analyzer_kwargs)

    print('\n')
    print('-' * 80)
    print('Recipe ingredients:', end='\n')
    print(recipe_ingreds)

    # train on the recipes' ingredients
    topics, probs = topic_model.fit_transform(recipe_ingreds)

    # since this uses a custom Stanza analyzer, we have to use a custom mlflow.Pyfunc.PythonModel
    # BERTopic's OnlineCountVectorizer is used here in place of sklearn's CountVectorizer
    steps_vectorizer_model = OnlineCountVectorizer(**cv_params)

    # Recompute the topic representations with the custom vectorizer
    topic_model.update_topics(
        recipe_ingreds,
        vectorizer_model=steps_vectorizer_model,
    )

    # Display topic model results
    print('\n')
    print('-' * 80)
    print('BERTopic Model Dataframe:', end='\n')
    print(topic_model.get_topic_info())

    print('\n')
    print('-' * 80)
    print('BERTopic Model Representations:', end='\n')
    print(topic_model.get_topic_info()['Representation'])

    print('\n')
    print('-' * 80)
    # This banner labels the Representative_Docs column (it previously
    # repeated the "Representations" title).
    print('BERTopic Model Representative Docs:', end='\n')
    print(topic_model.get_topic_info()['Representative_Docs'])

    # Save and log the topic model dataframe
    topic_model.get_topic_info().to_json('../data/processed/bertopic_model_ingreds_full_set_df.json')
    mlflow.log_artifact('../data/processed/bertopic_model_ingreds_full_set_df.json',
                        artifact_path='bertopic_models')
'stanza_model': 'en',\n", + " 'sklearn-transformer': 'TfidfVectorizer'\n", + "}\n", + "\n", + "# update the pipeline parameters with the library-specific ones so that they show up in MLflow Tracking\n", + "pipeline_params.update(sklearn_nlp_params)\n", + "pipeline_params.update(bertopic_params)\n", + "\n", + "with mlflow.start_run(experiment_id=get_experiment_id(f\"{DAGSHUB_EMAIL}/bertopic_lightweight_stanza_ingreds_small_set_v1\")): \n", + " # LOG PARAMETERS\n", + " mlflow.log_params(pipeline_params)\n", + "\n", + " # LOG INPUTS (QUERIES) AND OUTPUTS\n", + " # MLflow example uses a list of strings or a list of str->str dicts\n", + " \n", + " # load raw data and preprocess/clean\n", + " data = dvc.api.read(\n", + " path='../data/recipes-en-201706/epicurious-recipes_m2.json'\n", + " , mode='r')\n", + " raw_df = pd.read_json(data)\n", + " print('\\n')\n", + " print('--------------')\n", + " print(f'{datetime.now()}, Raw Dataframe: ', end='\\n')\n", + " print(raw_df.head())\n", + " print(raw_df.shape)\n", + "\n", + " # pre_proc_df is cleaned dataframe\n", + " pre_proc_df = dfpp.preprocess_dataframe(raw_df)\n", + " print('\\n')\n", + " print('--------------')\n", + " print(f'{datetime.now()}, Preprocessed Dataframe:', end='\\n')\n", + " print(pre_proc_df.head())\n", + " print(pre_proc_df.shape)\n", + "\n", + "\n", + " # pre_proc_df = pd.read_json(\n", + " # mlflow.artifacts.download_artifacts(\n", + " # run_id=mlflow_run_id,\n", + " # artifact_path='artifacts/preprocessed_dataframes/preprocessed_dataframe.json',\n", + " # # tracking_uri=f'https://dagshub.com/{DAGSHUB_USER_NAME}/MeaLeon.mlflow'\n", + " # )\n", + " # )\n", + " # print('\\n')\n", + " # print('-' * 80)\n", + " # print('Preprocessed Dataframe:', end='\\n')\n", + " # print(pre_proc_df.head())\n", + " # print(pre_proc_df.shape)\n", + "\n", + " # create subset for dev purposes\n", + " to_nlp_df = pre_proc_df[0:100]\n", + " print('\\n')\n", + " print('-' * 80)\n", + " print(f'{datetime.now()}, Subset 
Dataframe:', end='\\n')\n", + " print(to_nlp_df.head())\n", + " print(to_nlp_df.shape)\n", + "\n", + " # LOG MODEL\n", + " # Instantiate BERTopic\n", + " topic_model = BERTopic(\n", + " **bertopic_params\n", + " )\n", + " \n", + " analyzer_kwargs = {'stanza_pipeline': nlp\n", + " , 'minNgramLength': 1\n", + " , 'maxNgramLength': 4}\n", + " \n", + " recipe_ingreds = to_nlp_df[\"ingredients\"].apply(custom_analyzer, **analyzer_kwargs)\n", + "\n", + " # Create TF-IDF embeddings\n", + " vectorizer = TfidfVectorizer(**sklearn_nlp_params)\n", + " embeddings = vectorizer.fit_transform(recipe_ingreds)\n", + "\n", + " # recipe_steps = \"\".join(str(to_nlp_df[\"prepSteps\"].apply(StanzaWrapper().stanza_analyzer(stanza_pipeline=nlp, minNgramLength=1, maxNgramLength=4))))\n", + " print('\\n')\n", + " print('-' * 80)\n", + " print(f'{datetime.now()}, Recipe ingredients:', end='\\n')\n", + " print(recipe_ingreds)\n", + "\n", + " # train on the recipes' ingredients\n", + " topics, probs = topic_model.fit_transform(recipe_ingreds, embeddings)\n", + "\n", + " # since this uses a custom Stanza analyzer, we have to use a custom mlflow.Pyfunc.PythonModel\n", + " # Instantiate sklearn CountVectorizer\n", + " sklearn_cv_params = {\n", + " 'strip_accents':\"unicode\",\n", + " 'lowercase':True,\n", + " 'analyzer': StanzaWrapper().stanza_analyzer(stanza_pipeline=nlp, minNgramLength=1, maxNgramLength=4),\n", + " # 'min_df':10,\n", + " }\n", + " steps_vectorizer_model = CountVectorizer(**sklearn_cv_params)\n", + "\n", + " # May need to use BERTopic's OnlineCountVectorizer\n", + " # steps_vectorizer_model = OnlineCountVectorizer(**sklearn_nlp_params)\n", + "\n", + " # Do fit transform on data\n", + " # steps_test_tfidf_transform = steps_tfidf_vectorizer_model.fit_transform(tqdm(to_nlp_df[\"steps\"]))\n", + " topic_model.update_topics(\n", + " recipe_ingreds\n", + " , vectorizer_model=steps_vectorizer_model\n", + " )\n", + "\n", + " # Display topic model results\n", + " print('\\n')\n", + "
print('-' * 80)\n", + " print(f'{datetime.now()}, BERTopic Model Dataframe:', end='\\n')\n", + " print(topic_model.get_topic_info())\n", + "\n", + " print('\\n')\n", + " print('-' * 80)\n", + " print(f'{datetime.now()}, BERTopic Model Representations:', end='\\n')\n", + " print(topic_model.get_topic_info()['Representation'])\n", + "\n", + " print('\\n')\n", + " print('-' * 80)\n", + " print(f'{datetime.now()}, BERTopic Model Representative Docs:', end='\\n')\n", + " print(topic_model.get_topic_info()['Representative_Docs'])\n", + "\n", + " # Save and log the topic model dataframe\n", + " topic_model.get_topic_info().to_json('../data/processed/bertopic_model_ingreds_full_set_df.json')\n", + " mlflow.log_artifact('../data/processed/bertopic_model_ingreds_full_set_df.json',\n", + " artifact_path='bertopic_models')\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "topic_model.get_topic_info()['Representation']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "topic_model.get_topic_info()['Representation'][0]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# load from MLflow\n", + "mlflow_client = mlflow.tracking.MlflowClient(\n", + " tracking_uri=f'https://dagshub.com/{DAGSHUB_USER_NAME}/MeaLeon.mlflow')\n", + "\n", + "# cv_params are parameters for the sklearn CountVectorizer or TFIDFVectorizer\n", + "sklearn_nlp_params = {\n", + " 'strip_accents':\"unicode\",\n", + " 'lowercase':True,\n", + " 'analyzer': StanzaWrapper().stanza_analyzer(stanza_pipeline=nlp, minNgramLength=1, maxNgramLength=4),\n", + " 'min_df':10,\n", + "}\n", + "\n", + "# create sklearn pipeline as in BERTopic lightweight configuration\n", + "# pipe = make_pipeline(\n", + "# TfidfVectorizer(**sklearn_nlp_params),\n", + "# TruncatedSVD(100)\n", + "# )\n", + "\n", + "# bertopic_params are a superset of 
cv_params\n", + "bertopic_params = {\n", + " # 'embedding_model': TfidfVectorizer(**sklearn_nlp_params),\n", + " 'top_n_words':20,\n", + " 'min_topic_size':10,\n", + " 'nr_topics':50,\n", + " 'verbose':True,\n", + " 'low_memory':True,\n", + " 'calculate_probabilities':True,\n", + " # 'min_cluster_size': 10 # Possibly only works if modifying individual HDBSCAN component of BERTopic\n", + "}\n", + "\n", + "# update bertopic_params to include cv_params\n", + "# bertopic_params.update(cv_params)\n", + "\n", + "# pipeline_params are parameters that will be logged in MLFlow and are a superset of library parameters\n", + "pipeline_params = {\n", + " 'stanza_model': 'en',\n", + " 'sklearn-transformer': 'TfidfVectorizer'\n", + "}\n", + "\n", + "# update the pipeline parameters with the library-specific ones so that they show up in MLflow Tracking\n", + "pipeline_params.update(sklearn_nlp_params)\n", + "pipeline_params.update(bertopic_params)\n", + "\n", + "with mlflow.start_run(experiment_id=get_experiment_id(f\"{DAGSHUB_EMAIL}/bertopic_lightweight_stanza_ingreds_small_set_v1\")): \n", + " # LOG PARAMETERS\n", + " mlflow.log_params(pipeline_params)\n", + "\n", + " # LOG INPUTS (QUERIES) AND OUTPUTS\n", + " # MLflow example uses a list of strings or a list of str->str dicts\n", + " \n", + " # load raw data and preprocess/clean\n", + " data = dvc.api.read(\n", + " path='../data/recipes-en-201706/epicurious-recipes_m2.json'\n", + " , mode='r')\n", + " raw_df = pd.read_json(data)\n", + " print('\\n')\n", + " print('--------------')\n", + " print(f'{datetime.now()}, Raw Dataframe: ', end='\\n')\n", + " print(raw_df.head())\n", + " print(raw_df.shape)\n", + "\n", + " # pre_proc_df is cleaned dataframe\n", + " pre_proc_df = dfpp.preprocess_dataframe(raw_df)\n", + " print('\\n')\n", + " print('--------------')\n", + " print(f'{datetime.now()}, Preprocessed Dataframe:', end='\\n')\n", + " print(pre_proc_df.head())\n", + " print(pre_proc_df.shape)\n", + "\n", + "\n", + " # 
pre_proc_df = pd.read_json(\n", + " # mlflow.artifacts.download_artifacts(\n", + " # run_id=mlflow_run_id,\n", + " # artifact_path='artifacts/preprocessed_dataframes/preprocessed_dataframe.json',\n", + " # # tracking_uri=f'https://dagshub.com/{DAGSHUB_USER_NAME}/MeaLeon.mlflow'\n", + " # )\n", + " # )\n", + " # print('\\n')\n", + " # print('-' * 80)\n", + " # print('Preprocessed Dataframe:', end='\\n')\n", + " # print(pre_proc_df.head())\n", + " # print(pre_proc_df.shape)\n", + "\n", + " # create subset for dev purposes\n", + " to_nlp_df = pre_proc_df[0:100]\n", + " print('\\n')\n", + " print('-' * 80)\n", + " print(f'{datetime.now()}, Subset Dataframe:', end='\\n')\n", + " print(to_nlp_df.head())\n", + " print(to_nlp_df.shape)\n", + "\n", + " # LOG MODEL\n", + " # Instantiate BERTopic\n", + " topic_model = BERTopic(\n", + " **bertopic_params\n", + " )\n", + " \n", + " analyzer_kwargs = {'stanza_pipeline': nlp\n", + " , 'minNgramLength': 1\n", + " , 'maxNgramLength': 4}\n", + " \n", + " recipe_ingreds = to_nlp_df[\"ingredients\"].apply(custom_analyzer, **analyzer_kwargs)\n", + "\n", + " # Create TF-IDF embeddings\n", + " vectorizer = TfidfVectorizer(**sklearn_nlp_params)\n", + " embeddings = vectorizer.fit_transform(recipe_ingreds)\n", + "\n", + " # recipe_steps = \"\".join(str(to_nlp_df[\"prepSteps\"].apply(StanzaWrapper().stanza_analyzer(stanza_pipeline=nlp, minNgramLength=1, maxNgramLength=4))))\n", + " print('\\n')\n", + " print('-' * 80)\n", + " print(f'{datetime.now()}, Recipe ingredients:', end='\\n')\n", + " print(recipe_ingreds)\n", + "\n", + " # train on the recipes' ingredients\n", + " topics, probs = topic_model.fit_transform(recipe_ingreds, embeddings)\n", + "\n", + " # since this uses a custom Stanza analyzer, we have to use a custom mlflow.Pyfunc.PythonModel\n", + " # Instantiate sklearn CountVectorizer\n", + " sklearn_cv_params = {\n", + " 'strip_accents':\"unicode\",\n", + " 'lowercase':True,\n", + " # 'analyzer':
StanzaWrapper().stanza_analyzer(stanza_pipeline=nlp, minNgramLength=1, maxNgramLength=4),\n", + " # 'min_df':10,\n", + " }\n", + " steps_vectorizer_model = CountVectorizer(**sklearn_cv_params)\n", + "\n", + " # May need to use BERTopic's OnlineCountVectorizer\n", + " # steps_vectorizer_model = OnlineCountVectorizer(**sklearn_nlp_params)\n", + "\n", + " # Do fit transform on data\n", + " # steps_test_tfidf_transform = steps_tfidf_vectorizer_model.fit_transform(tqdm(to_nlp_df[\"steps\"]))\n", + " topic_model.update_topics(\n", + " recipe_ingreds\n", + " , vectorizer_model=steps_vectorizer_model\n", + " )\n", + "\n", + " # Display topic model results\n", + " print('\\n')\n", + " print('-' * 80)\n", + " print(f'{datetime.now()}, BERTopic Model Dataframe:', end='\\n')\n", + " print(topic_model.get_topic_info())\n", + "\n", + " print('\\n')\n", + " print('-' * 80)\n", + " print(f'{datetime.now()}, BERTopic Model Representations:', end='\\n')\n", + " print(topic_model.get_topic_info()['Representation'])\n", + "\n", + " print('\\n')\n", + " print('-' * 80)\n", + " print(f'{datetime.now()}, BERTopic Model Representative Docs:', end='\\n')\n", + " print(topic_model.get_topic_info()['Representative_Docs'])\n", + "\n", + " # Save and log the topic model dataframe\n", + " topic_model.get_topic_info().to_json('../data/processed/bertopic_model_ingreds_full_set_df.json')\n", + " mlflow.log_artifact('../data/processed/bertopic_model_ingreds_full_set_df.json',\n", + " artifact_path='bertopic_models')\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# load from MLflow\n", + "mlflow_client = mlflow.tracking.MlflowClient(\n", + " tracking_uri=f'https://dagshub.com/{DAGSHUB_USER_NAME}/MeaLeon.mlflow')\n", + "\n", + "# cv_params are parameters for the sklearn CountVectorizer or TFIDFVectorizer\n", + "sklearn_nlp_params = {\n", + " 'strip_accents':\"unicode\",\n", + " 'lowercase':True,\n", + " 'analyzer': 
StanzaWrapper().stanza_analyzer(stanza_pipeline=nlp, minNgramLength=1, maxNgramLength=4),\n", + " 'min_df':10,\n", + "}\n", + "\n", + "# create sklearn pipeline as in BERTopic lightweight configuration\n", + "# pipe = make_pipeline(\n", + "# TfidfVectorizer(**sklearn_nlp_params),\n", + "# TruncatedSVD(100)\n", + "# )\n", + "\n", + "# bertopic_params are a superset of cv_params\n", + "bertopic_params = {\n", + " # 'embedding_model': TfidfVectorizer(**sklearn_nlp_params),\n", + " 'top_n_words':20,\n", + " 'min_topic_size':10,\n", + " 'nr_topics':50,\n", + " 'verbose':True,\n", + " 'low_memory':True,\n", + " 'calculate_probabilities':True,\n", + " # 'min_cluster_size': 10 # Possibly only works if modifying individual HDBSCAN component of BERTopic\n", + "}\n", + "\n", + "# update bertopic_params to include cv_params\n", + "# bertopic_params.update(cv_params)\n", + "\n", + "# pipeline_params are parameters that will be logged in MLFlow and are a superset of library parameters\n", + "pipeline_params = {\n", + " 'stanza_model': 'en',\n", + " 'sklearn-transformer': 'TfidfVectorizer'\n", + "}\n", + "\n", + "# update the pipeline parameters with the library-specific ones so that they show up in MLflow Tracking\n", + "pipeline_params.update(sklearn_nlp_params)\n", + "pipeline_params.update(bertopic_params)\n", + "\n", + "with mlflow.start_run(experiment_id=get_experiment_id(f\"{DAGSHUB_EMAIL}/bertopic_lightweight_stanza_ingreds_small_set_v1.01\")): \n", + " # LOG PARAMETERS\n", + " mlflow.log_params(pipeline_params)\n", + "\n", + " # LOG INPUTS (QUERIES) AND OUTPUTS\n", + " # MLflow example uses a list of strings or a list of str->str dicts\n", + " \n", + " # load raw data and preprocess/clean\n", + " data = dvc.api.read(\n", + " path='../data/recipes-en-201706/epicurious-recipes_m2.json'\n", + " , mode='r')\n", + " raw_df = pd.read_json(data)\n", + " print('\\n')\n", + " print('--------------')\n", + " print(f'{datetime.now()}, Raw Dataframe: ', end='\\n')\n", + " 
print(raw_df.head())\n", + " print(raw_df.shape)\n", + "\n", + " # pre_proc_df is cleaned dataframe\n", + " pre_proc_df = dfpp.preprocess_dataframe(raw_df)\n", + " print('\\n')\n", + " print('--------------')\n", + " print(f'{datetime.now()}, Preprocessed Dataframe:', end='\\n')\n", + " print(pre_proc_df.head())\n", + " print(pre_proc_df.shape)\n", + "\n", + "\n", + " # pre_proc_df = pd.read_json(\n", + " # mlflow.artifacts.download_artifacts(\n", + " # run_id=mlflow_run_id,\n", + " # artifact_path='artifacts/preprocessed_dataframes/preprocessed_dataframe.json',\n", + " # # tracking_uri=f'https://dagshub.com/{DAGSHUB_USER_NAME}/MeaLeon.mlflow'\n", + " # )\n", + " # )\n", + " # print('\\n')\n", + " # print('-' * 80)\n", + " # print('Preprocessed Dataframe:', end='\\n')\n", + " # print(pre_proc_df.head())\n", + " # print(pre_proc_df.shape)\n", + "\n", + " # create subset for dev purposes\n", + " to_nlp_df = pre_proc_df[0:100]\n", + " print('\\n')\n", + " print('-' * 80)\n", + " print(f'{datetime.now()}, Subset Dataframe:', end='\\n')\n", + " print(to_nlp_df.head())\n", + " print(to_nlp_df.shape)\n", + "\n", + " # LOG MODEL\n", + " # Instantiate BERTopic\n", + " topic_model = BERTopic(\n", + " **bertopic_params\n", + " )\n", + " \n", + " analyzer_kwargs = {'stanza_pipeline': nlp\n", + " , 'minNgramLength': 1\n", + " , 'maxNgramLength': 4\n", + " , 'lemmatize': True}\n", + " \n", + " # recipe_steps = \"\".join(str(to_nlp_df[\"prepSteps\"].apply(StanzaWrapper().stanza_analyzer(stanza_pipeline=nlp, minNgramLength=1, maxNgramLength=4))))\n", + " recipe_ingreds = to_nlp_df[\"ingredients\"].apply(custom_analyzer, **analyzer_kwargs)\n", + "\n", + " print('\\n')\n", + " print('-' * 80)\n", + " print(f'{datetime.now()}, Recipe ingredients:', end='\\n')\n", + " print([ingred for ingred in recipe_ingreds])\n", + "\n", + " # Create TF-IDF embeddings\n", + " vectorizer = TfidfVectorizer(**sklearn_nlp_params)\n", + " embeddings = vectorizer.fit_transform(tqdm(recipe_ingreds))\n", + 
"\n", + " # train on the recipes' ingredientss\n", + " topics, probs = topic_model.fit_transform(recipe_ingreds, embeddings)\n", + "\n", + " # since this uses a custom Stanza analyzer, we have to use a custom mlflow.Pyfunc.PythonModel\n", + " # Instantiate sklearn CountVectorizer\n", + " sklearn_cv_params = {\n", + " # 'strip_accents':\"unicode\",\n", + " # 'lowercase':True,\n", + " # 'analyzer': StanzaWrapper().stanza_analyzer(stanza_pipeline=nlp, minNgramLength=1, maxNgramLength=4),\n", + " # 'min_df':10,\n", + " 'token_pattern': r\"(?u)\\b[a-zA-Z]{2,}\\b\"\n", + " }\n", + " ingreds_vectorizer_model = CountVectorizer(**sklearn_cv_params)\n", + "\n", + " # May need to use BERTopic's OnlineCountVectorizer\n", + " # steps_vectorizer_model = OnlineCountVectorizer(**sklearn_nlp_params)\n", + "\n", + " # Do fit transform on data\n", + " # steps_test_tfidf_transform = steps_tfidf_vectorizer_model.fit_transform(tqdm(to_nlp_df[\"steps\"]))\n", + " topic_model.update_topics(\n", + " recipe_ingreds\n", + " , vectorizer_model=ingreds_vectorizer_model\n", + " )\n", + "\n", + " # Display topic model results\n", + " print('\\n')\n", + " print('-' * 80)\n", + " print(f'{datetime.now()}, BERTopic Model Dataframe:', end='\\n')\n", + " print(topic_model.get_topic_info())\n", + "\n", + " print('\\n')\n", + " print('-' * 80)\n", + " print(f'{datetime.now()}, BERTopic Model Representations:', end='\\n')\n", + " print(topic_model.get_topic_info()['Representation'])\n", + "\n", + " print('\\n')\n", + " print('-' * 80)\n", + " print(f'{datetime.now()}, BERTopic Model Representative Docs:', end='\\n')\n", + " print(topic_model.get_topic_info()['Representative_Docs'])\n", + "\n", + " # Save and log the topic model dataframe\n", + " topic_model.get_topic_info().to_json('../data/processed/bertopic_model_ingreds_small_set_df.json')\n", + " mlflow.log_artifact('../data/processed/bertopic_model_ingreds_small_set_df.json',\n", + " artifact_path='bertopic_models')\n" + ] + }, + { + "cell_type": 
"code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "to_nlp_df['ingredients'][0]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "test_recipe_ingreds = to_nlp_df[\"ingredients\"].apply(custom_analyzer, **analyzer_kwargs)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "i think i should start leaving out units/including stopwords again since i'm not using Stanza's deep learning" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# load from MLflow\n", + "mlflow_client = mlflow.tracking.MlflowClient(\n", + " tracking_uri=f'https://dagshub.com/{DAGSHUB_USER_NAME}/MeaLeon.mlflow')\n", + "\n", + "# cv_params are parameters for the sklearn CountVectorizer or TFIDFVectorizer\n", + "sklearn_nlp_params = {\n", + " 'strip_accents':\"unicode\",\n", + " 'lowercase':True,\n", + " 'analyzer': StanzaWrapper().stanza_analyzer(stanza_pipeline=nlp, minNgramLength=1, maxNgramLength=4),\n", + " 'min_df':10,\n", + "}\n", + "\n", + "# bertopic_params are a superset of cv_params\n", + "bertopic_params = {\n", + " 'top_n_words':20,\n", + " 'min_topic_size':10,\n", + " 'nr_topics':50,\n", + " 'verbose':True,\n", + " 'low_memory':True,\n", + " 'calculate_probabilities':True,\n", + " # 'min_cluster_size': 10 # Possibly only works if modifying individual HDBSCAN component of BERTopic\n", + "}\n", + "\n", + "# update bertopic_params to include cv_params\n", + "# bertopic_params.update(cv_params)\n", + "\n", + "# pipeline_params are parameters that will be logged in MLFlow and are a superset of library parameters\n", + "pipeline_params = {\n", + " 'stanza_model': 'en',\n", + " 'sklearn-transformer': 'TfidfVectorizer'\n", + "}\n", + "\n", + "# update the pipeline parameters with the library-specific ones so that they show up in MLflow Tracking\n", + "pipeline_params.update(sklearn_nlp_params)\n", + 
"pipeline_params.update(bertopic_params)\n", + "\n", + "with mlflow.start_run(experiment_id=get_experiment_id(f\"{DAGSHUB_EMAIL}/bertopic_lightweight_stanza_ingreds_full_set_v1.00\")): \n", + " # LOG PARAMETERS\n", + " mlflow.log_params(pipeline_params)\n", + "\n", + " # LOG INPUTS (QUERIES) AND OUTPUTS\n", + " # MLflow example uses a list of strings or a list of str->str dicts\n", + " \n", + " # load raw data and preprocess/clean\n", + " data = dvc.api.read(\n", + " path='../data/recipes-en-201706/epicurious-recipes_m2.json'\n", + " , mode='r')\n", + " raw_df = pd.read_json(data)\n", + " print('\\n')\n", + " print('--------------')\n", + " print(f'{datetime.now()}, Raw Dataframe: ', end='\\n')\n", + " print(raw_df.head())\n", + " print(raw_df.shape)\n", + "\n", + " # pre_proc_df is cleaned dataframe\n", + " pre_proc_df = dfpp.preprocess_dataframe(raw_df)\n", + " print('\\n')\n", + " print('--------------')\n", + " print(f'{datetime.now()}, Preprocessed Dataframe:', end='\\n')\n", + " print(pre_proc_df.head())\n", + " print(pre_proc_df.shape)\n", + "\n", + "\n", + " # pre_proc_df = pd.read_json(\n", + " # mlflow.artifacts.download_artifacts(\n", + " # run_id=mlflow_run_id,\n", + " # artifact_path='artifacts/preprocessed_dataframes/preprocessed_dataframe.json',\n", + " # # tracking_uri=f'https://dagshub.com/{DAGSHUB_USER_NAME}/MeaLeon.mlflow'\n", + " # )\n", + " # )\n", + " # print('\\n')\n", + " # print('-' * 80)\n", + " # print('Preprocessed Dataframe:', end='\\n')\n", + " # print(pre_proc_df.head())\n", + " # print(pre_proc_df.shape)\n", + "\n", + " # create subset for dev purposes\n", + " # to_nlp_df = pre_proc_df[0:100]\n", + " # print('\\n')\n", + " # print('-' * 80)\n", + " # print(f'{datetime.now()}, Subset Dataframe:', end='\\n')\n", + " # print(to_nlp_df.head())\n", + " # print(to_nlp_df.shape)\n", + "\n", + " # LOG MODEL\n", + " # Instantiate BERTopic\n", + " topic_model = BERTopic(\n", + " **bertopic_params\n", + " )\n", + " \n", + " analyzer_kwargs = 
{'stanza_pipeline': nlp\n", + " , 'minNgramLength': 1\n", + " , 'maxNgramLength': 4\n", + " , 'lemmatize': True}\n", + " \n", + " recipe_ingreds = pre_proc_df[\"ingredients\"].apply(custom_analyzer, **analyzer_kwargs)\n", + " \n", + " print('\\n')\n", + " print('-' * 80)\n", + " print(f'{datetime.now()}, Recipe ingredients:', end='\\n')\n", + " print(recipe_ingreds)\n", + "\n", + " # Create TF-IDF embeddings\n", + " vectorizer = TfidfVectorizer(**sklearn_nlp_params)\n", + " embeddings = vectorizer.fit_transform(tqdm(recipe_ingreds))\n", + "\n", + " # recipe_steps = \"\".join(str(to_nlp_df[\"prepSteps\"].apply(StanzaWrapper().stanza_analyzer(stanza_pipeline=nlp, minNgramLength=1, maxNgramLength=4))))\n", + " # print('\\n')\n", + " # print('-' * 80)\n", + " # print(f'{datetime.now()}, Recipe ingredients:', end='\\n')\n", + " # print(recipe_ingreds)\n", + "\n", + " # train on the recipes' ingredients\n", + " topics, probs = topic_model.fit_transform(recipe_ingreds, embeddings)\n", + "\n", + " # since this uses a custom Stanza analyzer, we have to use a custom mlflow.Pyfunc.PythonModel\n", + " # Instantiate sklearn CountVectorizer\n", + " sklearn_cv_params = {\n", + " # 'strip_accents':\"unicode\",\n", + " # 'lowercase':True,\n", + " 'token_pattern': r\"(?u)\\b[a-zA-Z]{2,}\\b\"\n", + " }\n", + " ingreds_vectorizer_model = CountVectorizer(**sklearn_cv_params)\n", + "\n", + " # Do fit transform on data\n", + " # steps_test_tfidf_transform = steps_tfidf_vectorizer_model.fit_transform(tqdm(to_nlp_df[\"steps\"]))\n", + " topic_model.update_topics(\n", + " recipe_ingreds\n", + " , vectorizer_model=ingreds_vectorizer_model\n", + " )\n", + "\n", + " # Display topic model results\n", + " print('\\n')\n", + " print('-' * 80)\n", + " print(f'{datetime.now()}, BERTopic Model Dataframe:', end='\\n')\n", + " print(topic_model.get_topic_info())\n", + "\n", + " print('\\n')\n", + " print('-' * 80)\n", + " print(f'{datetime.now()}, BERTopic Model Representations:', end='\\n')\n", + "
print(topic_model.get_topic_info()['Representation'])\n", + "\n", + " print('\\n')\n", + " print('-' * 80)\n", + " print(f'{datetime.now()}, BERTopic Model Representative Docs:', end='\\n')\n", + " print(topic_model.get_topic_info()['Representative_Docs'])\n", + "\n", + " # Save and log the topic model dataframe\n", + " topic_model.get_topic_info().to_json('../data/processed/bertopic_model_ingreds_full_set_df.json')\n", + " mlflow.log_artifact('../data/processed/bertopic_model_ingreds_full_set_df.json',\n", + " artifact_path='bertopic_models')\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# try splitting among CPU and GPU. Try Stanza on CPU due to its memory usage\n", + "nlp2 = stanza.Pipeline('en', use_gpu=False)\n", + "\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "python3", + "language": "python", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +}