From 9204aaf7a7f39aecc8fa7e8ce089ca6f958d75e9 Mon Sep 17 00:00:00 2001 From: Aaron W Chen Date: Wed, 12 Jun 2024 17:21:43 -0700 Subject: [PATCH] Upload steps for creating vector DB/embedding --- nbs/15_new_preproc_test_combined_df.ipynb | 978 ++++++++++++++++++++++ 1 file changed, 978 insertions(+) create mode 100644 nbs/15_new_preproc_test_combined_df.ipynb diff --git a/nbs/15_new_preproc_test_combined_df.ipynb b/nbs/15_new_preproc_test_combined_df.ipynb new file mode 100644 index 0000000..9e95074 --- /dev/null +++ b/nbs/15_new_preproc_test_combined_df.ipynb @@ -0,0 +1,978 @@ +{ + "cells": [ + { + "cell_type": "raw", + "metadata": {}, + "source": [ + "---\n", + "description: test\n", + "output-file: template.html\n", + "title: Template\n", + "\n", + "---\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# | default_exp core" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "087d6d4ced3c49c88ec00adb20295872", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.6.0.json: 0%| …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024-05-08 19:00:54 INFO: Downloading default packages for language: en (English) ...\n", + "2024-05-08 19:00:55 INFO: File exists: /home/awchen/stanza_resources/en/default.zip\n", + "2024-05-08 19:00:58 INFO: Finished downloading models and saved to /home/awchen/stanza_resources.\n", + "2024-05-08 19:00:58 INFO: Checking for updates to resources.json in case models have been updated. Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "8af294b5fac641219a3a46629cf99fba", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.6.0.json: 0%| …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024-05-08 19:00:59 INFO: Loading these models for language: en (English):\n", + "======================================\n", + "| Processor | Package |\n", + "--------------------------------------\n", + "| tokenize | combined |\n", + "| pos | combined_charlm |\n", + "| lemma | combined_nocharlm |\n", + "| constituency | ptb3-revised_charlm |\n", + "| depparse | combined_charlm |\n", + "| sentiment | sstplus |\n", + "| ner | ontonotes_charlm |\n", + "======================================\n", + "\n", + "2024-05-08 19:00:59 INFO: Using device: cpu\n", + "2024-05-08 19:00:59 INFO: Loading: tokenize\n", + "2024-05-08 19:00:59 INFO: Loading: pos\n", + "2024-05-08 19:01:00 INFO: Loading: lemma\n", + "2024-05-08 19:01:00 INFO: Loading: constituency\n", + "2024-05-08 19:01:00 INFO: Loading: depparse\n", + "2024-05-08 19:01:00 INFO: Loading: sentiment\n", + "2024-05-08 19:01:00 INFO: Loading: ner\n", + "2024-05-08 19:01:01 INFO: Done loading processors!\n" + ] + } + ], + "source": [ + "# | hide\n", + "# from bertopic import BERTopic\n", + "# from bertopic.vectorizers import OnlineCountVectorizer\n", + "import dagshub\n", + "from datetime import datetime\n", + "import dill as pickle\n", + "import dvc.api\n", + "# from hdbscan import HDBSCAN\n", + "from itertools import tee, islice, product\n", + "import joblib\n", + "import nbdev\n", + "from nbdev.showdoc import *\n", + "import pandas as pd\n", + "import re\n", + "from sentence_transformers import SentenceTransformer\n", + "from sklearn.feature_extraction.text import (\n", + " CountVectorizer\n", + " , TfidfTransformer\n", + " , TfidfVectorizer\n", + " , \n", + ")\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.pipeline import make_pipeline\n", + "from src.custom_sklearn_text_transformer_mlflow import CustomSKLearnAnalyzer\n", + "import src.dataframe_preprocessor as dfpp\n", + "import stanza\n", + "from tqdm import tqdm\n", + "# from umap import UMAP" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!export 'PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:128'" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# | export" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# | hide\n", + "# nbdev.nbdev_export()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Data Preparation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "9b4405a6faa044f185efdb8e5359b8e5", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.6.0.json: 0%| …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024-05-08 19:01:01 INFO: Downloading default packages for language: en (English) ...\n", + "2024-05-08 19:01:02 INFO: File exists: /home/awchen/stanza_resources/en/default.zip\n", + "2024-05-08 19:01:05 INFO: Finished downloading models and saved to /home/awchen/stanza_resources.\n", + "2024-05-08 19:01:05 INFO: Checking for updates to resources.json in case models have been updated. Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "6f43e74e5a7940a1b60662d5884ab4a2", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.6.0.json: 0%| …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024-05-08 19:01:06 INFO: Loading these models for language: en (English):\n", + "======================================\n", + "| Processor | Package |\n", + "--------------------------------------\n", + "| tokenize | combined |\n", + "| pos | combined_charlm |\n", + "| lemma | combined_nocharlm |\n", + "| constituency | ptb3-revised_charlm |\n", + "| depparse | combined_charlm |\n", + "| sentiment | sstplus |\n", + "| ner | ontonotes_charlm |\n", + "======================================\n", + "\n", + "2024-05-08 19:01:06 INFO: Using device: cuda\n", + "2024-05-08 19:01:06 INFO: Loading: tokenize\n", + "2024-05-08 19:01:10 INFO: Loading: pos\n", + "2024-05-08 19:01:10 INFO: Loading: lemma\n", + "2024-05-08 19:01:10 INFO: Loading: constituency\n", + "2024-05-08 19:01:11 INFO: Loading: depparse\n", + "2024-05-08 19:01:11 INFO: Loading: sentiment\n", + "2024-05-08 19:01:13 INFO: Loading: ner\n", + "2024-05-08 19:01:14 INFO: Done loading processors!\n" + ] + } + ], + "source": [ + "# instantiate stanza pipeline\n", + "stanza.download('en')\n", + "nlp = stanza.Pipeline('en', \n", + " depparse_batch_size=50, \n", + " depparse_min_length_to_batch_separately=50,\n", + " verbose=True,\n", + " use_gpu=True, # set to true when on cloud/not on streaming computer\n", + " batch_size=100\n", + " )\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Prepare whole dataframe for new processing\n", + "import mlflow\n", + "from mlflow.models import infer_signature\n", + "from src.custom_stanza_mlflow import CustomSKLearnWrapper" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# this function allows us to get the experiment ID from an experiment name\n", + "def get_experiment_id(name):\n", + " exp = mlflow.get_experiment_by_name(name)\n", + " if exp is None:\n", + " exp_id = mlflow.create_experiment(name)\n", + " return exp_id\n", + " return exp.experiment_id" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
                                       ❗❗❗ AUTHORIZATION REQUIRED ❗❗❗                                        \n",
+       "
\n" + ], + "text/plain": [ + " \u001b[1m❗❗❗ AUTHORIZATION REQUIRED ❗❗❗\u001b[0m \n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n", + "Open the following link in your browser to authorize the client:\n", + "https://dagshub.com/login/oauth/authorize?state=2a72caa0-4d17-4133-b792-04bc75d86098&client_id=32b60ba385aa7cecf24046d8195a71c07dd345d9657977863b52e7748e0f0f28&middleman_request_id=9f4396584299dc580a77cbeee10d45564a42b8b6598f116f383828cec1dc79d7\n", + "\n", + "\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "a37f40520504442f9d3ed6e408a7c309", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Output()" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n"
+      ],
+      "text/plain": []
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "
Repository initialized!\n",
+       "
\n" + ], + "text/plain": [ + "Repository initialized!\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "#@markdown Enter the username of your DAGsHub account:\n", + "DAGSHUB_USER_NAME = \"AaronWChen\" #@param {type:\"string\"}\n", + "\n", + "#@markdown Enter the email for your DAGsHub account:\n", + "DAGSHUB_EMAIL = \"awc33@cornell.edu\" #@param {type:\"string\"}\n", + "\n", + "#@markdown Enter the repo name \n", + "DAGSHUB_REPO_NAME = \"MeaLeon\"\n", + "\n", + "#@markdown Enter the name of the branch you are working on \n", + "BRANCH = \"NGRAM-2/trying-sklearn-object-upload\"\n", + "dagshub.init(repo_name=DAGSHUB_REPO_NAME\n", + " , repo_owner=DAGSHUB_USER_NAME)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Starting DEV stage for TFIDF Encoded model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "mlflow.set_tracking_uri(f'https://dagshub.com/{DAGSHUB_USER_NAME}/MeaLeon.mlflow')\n", + "\n", + "# starter idea for making an experiment name can be the git branch, but need more specificity\n", + "experiment_name = f\"{DAGSHUB_EMAIL}/TFIDF_up_to_quadgrams_small_sample_upload_test\"\n", + "mlflow_exp_id = get_experiment_id(experiment_name)\n", + "\n", + "# define model location\n", + "# model_directory = \"/tmp/sklearn_model\"\n", + "model_directory = \"../models/sklearn_model\"\n", + "\n", + "# Define the required artifacts associated with the saved custom pyfunc\n", + "# sklearn_path = model_directory + \"\"\n", + "sklearn_model_path = model_directory + \"/python_model.pkl\"\n", + "sklearn_transformer_path = model_directory + \"/sklearn_transformer.pkl\"\n", + "transformed_recipes_path = model_directory + \"/transformed_recipes.pkl\"\n", + "combined_df_path = model_directory + \"/combined_df.pkl\"\n", + "\n", + "artifacts = {'sklearn_model': sklearn_model_path,\n", + " 'sklearn_transformer': sklearn_transformer_path,\n", + " # 'transformed_recipes': transformed_recipes_path,\n", + " 'combined_data': combined_df_path\n", + " }\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
dekhedaggregateRatingingredientsprepStepsreviewsCountwillMakeAgainPctingredients_lemmafiedcuisine_namephoto_filenamephoto_creditauthor_namedate_publishedrecipe_url
id
54a2b6b019925f464b373351How does fried chicken achieve No. 1 status? B...Pickle-Brined Fried Chicken3.11[1 tablespoons yellow mustard seeds, 1 tablesp...[Toast mustard and coriander seeds in a dry me...7100tablespoon yellow mustard seed brk tablespoon ...Missing Cuisine51247610_fried-chicken_1x1.jpgMichael Graydon and Nikole HerriottMissing Author Name2014-08-19 04:00:00+00:00https://www.epicurious.com/recipes/food/views/...
54a408a019925f464b3733bcSpinaci all'EbraicaSpinach Jewish Style3.22[3 pounds small-leaved bulk spinach, Salt, 1/2...[Remove the stems and roots from the spinach. ...580pound small leave bulk spinach brk salt brk cu...ItalianEP_12162015_placeholders_rustic.jpgPhoto by Chelsea Kyle, Prop Styling by Anna St...Edda Servi Machlin2008-09-09 04:00:00+00:00https://www.epicurious.com/recipes/food/views/...
54a408a26529d92b2c003631This majestic, moist, and richly spiced honey ...New Year’s Honey Cake3.62[3 1/2 cups all-purpose flour, 1 tablespoon ba...[I like this cake best baked in a 9-inch angel...10588cup purpose flour brk tablespoon baking powder...KosherEP_09022015_honeycake-2.jpgPhoto by Chelsea Kyle, Food Styling by Anna St...Marcy Goldman2008-09-10 04:00:00+00:00https://www.epicurious.com/recipes/food/views/...
54a408a66529d92b2c003638The idea for this sandwich came to me when my ...The B.L.A.—Bagel with Lox and Avocado4.00[1 small ripe avocado, preferably Hass (see No...[A short time before serving, mash avocado and...7100small ripe avocado hass see note brk teaspoon ...KosherEP_12162015_placeholders_casual.jpgPhoto by Chelsea Kyle, Prop Styling by Rhoda B...Faye Levy2008-09-08 04:00:00+00:00https://www.epicurious.com/recipes/food/views/...
54a408a719925f464b3733ccIn 1930, Simon Agranat, the chief justice of t...Shakshuka a la Doktor Shakshuka2.71[2 pounds fresh tomatoes, unpeeled and cut in ...[1. Place the tomatoes, garlic, salt, paprika,...783pound fresh tomato unpeeled cut quarter ounce ...KosherEP_12162015_placeholders_formal.jpgPhoto by Chelsea Kyle, Prop Styling by Rhoda B...Joan Nathan2008-09-09 04:00:00+00:00https://www.epicurious.com/recipes/food/views/...
\n", + "
" + ], + "text/plain": [ + " dek \\\n", + "id \n", + "54a2b6b019925f464b373351 How does fried chicken achieve No. 1 status? B... \n", + "54a408a019925f464b3733bc Spinaci all'Ebraica \n", + "54a408a26529d92b2c003631 This majestic, moist, and richly spiced honey ... \n", + "54a408a66529d92b2c003638 The idea for this sandwich came to me when my ... \n", + "54a408a719925f464b3733cc In 1930, Simon Agranat, the chief justice of t... \n", + "\n", + " hed \\\n", + "id \n", + "54a2b6b019925f464b373351 Pickle-Brined Fried Chicken \n", + "54a408a019925f464b3733bc Spinach Jewish Style \n", + "54a408a26529d92b2c003631 New Year’s Honey Cake \n", + "54a408a66529d92b2c003638 The B.L.A.—Bagel with Lox and Avocado \n", + "54a408a719925f464b3733cc Shakshuka a la Doktor Shakshuka \n", + "\n", + " aggregateRating \\\n", + "id \n", + "54a2b6b019925f464b373351 3.11 \n", + "54a408a019925f464b3733bc 3.22 \n", + "54a408a26529d92b2c003631 3.62 \n", + "54a408a66529d92b2c003638 4.00 \n", + "54a408a719925f464b3733cc 2.71 \n", + "\n", + " ingredients \\\n", + "id \n", + "54a2b6b019925f464b373351 [1 tablespoons yellow mustard seeds, 1 tablesp... \n", + "54a408a019925f464b3733bc [3 pounds small-leaved bulk spinach, Salt, 1/2... \n", + "54a408a26529d92b2c003631 [3 1/2 cups all-purpose flour, 1 tablespoon ba... \n", + "54a408a66529d92b2c003638 [1 small ripe avocado, preferably Hass (see No... \n", + "54a408a719925f464b3733cc [2 pounds fresh tomatoes, unpeeled and cut in ... \n", + "\n", + " prepSteps \\\n", + "id \n", + "54a2b6b019925f464b373351 [Toast mustard and coriander seeds in a dry me... \n", + "54a408a019925f464b3733bc [Remove the stems and roots from the spinach. ... \n", + "54a408a26529d92b2c003631 [I like this cake best baked in a 9-inch angel... \n", + "54a408a66529d92b2c003638 [A short time before serving, mash avocado and... \n", + "54a408a719925f464b3733cc [1. Place the tomatoes, garlic, salt, paprika,... \n", + "\n", + " reviewsCount willMakeAgainPct \\\n", + "id \n", + "54a2b6b019925f464b373351 7 100 \n", + "54a408a019925f464b3733bc 5 80 \n", + "54a408a26529d92b2c003631 105 88 \n", + "54a408a66529d92b2c003638 7 100 \n", + "54a408a719925f464b3733cc 7 83 \n", + "\n", + " ingredients_lemmafied \\\n", + "id \n", + "54a2b6b019925f464b373351 tablespoon yellow mustard seed brk tablespoon ... \n", + "54a408a019925f464b3733bc pound small leave bulk spinach brk salt brk cu... \n", + "54a408a26529d92b2c003631 cup purpose flour brk tablespoon baking powder... \n", + "54a408a66529d92b2c003638 small ripe avocado hass see note brk teaspoon ... \n", + "54a408a719925f464b3733cc pound fresh tomato unpeeled cut quarter ounce ... \n", + "\n", + " cuisine_name \\\n", + "id \n", + "54a2b6b019925f464b373351 Missing Cuisine \n", + "54a408a019925f464b3733bc Italian \n", + "54a408a26529d92b2c003631 Kosher \n", + "54a408a66529d92b2c003638 Kosher \n", + "54a408a719925f464b3733cc Kosher \n", + "\n", + " photo_filename \\\n", + "id \n", + "54a2b6b019925f464b373351 51247610_fried-chicken_1x1.jpg \n", + "54a408a019925f464b3733bc EP_12162015_placeholders_rustic.jpg \n", + "54a408a26529d92b2c003631 EP_09022015_honeycake-2.jpg \n", + "54a408a66529d92b2c003638 EP_12162015_placeholders_casual.jpg \n", + "54a408a719925f464b3733cc EP_12162015_placeholders_formal.jpg \n", + "\n", + " photo_credit \\\n", + "id \n", + "54a2b6b019925f464b373351 Michael Graydon and Nikole Herriott \n", + "54a408a019925f464b3733bc Photo by Chelsea Kyle, Prop Styling by Anna St... \n", + "54a408a26529d92b2c003631 Photo by Chelsea Kyle, Food Styling by Anna St... \n", + "54a408a66529d92b2c003638 Photo by Chelsea Kyle, Prop Styling by Rhoda B... \n", + "54a408a719925f464b3733cc Photo by Chelsea Kyle, Prop Styling by Rhoda B... \n", + "\n", + " author_name date_published \\\n", + "id \n", + "54a2b6b019925f464b373351 Missing Author Name 2014-08-19 04:00:00+00:00 \n", + "54a408a019925f464b3733bc Edda Servi Machlin 2008-09-09 04:00:00+00:00 \n", + "54a408a26529d92b2c003631 Marcy Goldman 2008-09-10 04:00:00+00:00 \n", + "54a408a66529d92b2c003638 Faye Levy 2008-09-08 04:00:00+00:00 \n", + "54a408a719925f464b3733cc Joan Nathan 2008-09-09 04:00:00+00:00 \n", + "\n", + " recipe_url \n", + "id \n", + "54a2b6b019925f464b373351 https://www.epicurious.com/recipes/food/views/... \n", + "54a408a019925f464b3733bc https://www.epicurious.com/recipes/food/views/... \n", + "54a408a26529d92b2c003631 https://www.epicurious.com/recipes/food/views/... \n", + "54a408a66529d92b2c003638 https://www.epicurious.com/recipes/food/views/... \n", + "54a408a719925f464b3733cc https://www.epicurious.com/recipes/food/views/... " + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "whole_nlp_df = pd.read_parquet('../joblib/2024.03.19/pre_proc_df.parquet.gzip')\n", + "whole_nlp_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n", + "--------------------------------------------------------------------------------\n", + "sklearn fit transform on ingredients:\n", + "\n", + "\n", + "--------------------------------------------------------------------------------\n", + "Input Data: \n", + "id\n", + "54a2b6b019925f464b373351 tablespoon yellow mustard seed brk tablespoon ...\n", + "54a408a019925f464b3733bc pound small leave bulk spinach brk salt brk cu...\n", + "54a408a26529d92b2c003631 cup purpose flour brk tablespoon baking powder...\n", + "54a408a66529d92b2c003638 small ripe avocado hass see note brk teaspoon ...\n", + "54a408a719925f464b3733cc pound fresh tomato unpeeled cut quarter ounce ...\n", + " ... \n", + "59541a31bff3052847ae2107 tablespoon unsalt butter room temperature brk ...\n", + "5954233ad52ca90dc28200e7 tablespoon stick salt butter room temperature ...\n", + "595424c2109c972493636f83 tablespoon unsalted butter more greasing pan b...\n", + "5956638625dc3d1d829b7166 coarse salt brk lime wedge brk ounce tomato ju...\n", + "59566daa25dc3d1d829b7169 bottle millileter sour beer such almanac citra...\n", + "Name: ingredients_lemmafied, Length: 34756, dtype: object\n", + "\n", + "\n", + "--------------------------------------------------------------------------------\n", + "Input Data Shape: \n", + "(34756,)\n", + "\n", + "\n", + "--------------------------------------------------------------------------------\n", + "Random 3 Records from Input Data: \n", + "id\n", + "54a40caa19925f464b374017 boneless muscovy duck breast half pound total ...\n", + "55d4e08063b1ba1b5534b198 tablespoon white wine vinegar brk teaspoon sug...\n", + "54a43ad16529d92b2c019fc3 cup basmati rice ounce brk cup sweeten flake c...\n", + "Name: ingredients_lemmafied, dtype: object\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 34756/34756 [00:03<00:00, 10261.04it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n", + "--------------------------------------------------------------------------------\n", + "Transformed Data:\n", + " 100g 125g 13x9x2 150g 1pound 1tablespoon \\\n", + "id \n", + "54a2b6b019925f464b373351 0.0 0.0 0.0 0.0 0.0 0.0 \n", + "54a408a019925f464b3733bc 0.0 0.0 0.0 0.0 0.0 0.0 \n", + "54a408a26529d92b2c003631 0.0 0.0 0.0 0.0 0.0 0.0 \n", + "54a408a66529d92b2c003638 0.0 0.0 0.0 0.0 0.0 0.0 \n", + "54a408a719925f464b3733cc 0.0 0.0 0.0 0.0 0.0 0.0 \n", + "\n", + " 1teaspoon 200g 250g 2cup ... árbol divide \\\n", + "id ... \n", + "54a2b6b019925f464b373351 0.0 0.0 0.0 0.0 ... 0.0 \n", + "54a408a019925f464b3733bc 0.0 0.0 0.0 0.0 ... 0.0 \n", + "54a408a26529d92b2c003631 0.0 0.0 0.0 0.0 ... 0.0 \n", + "54a408a66529d92b2c003638 0.0 0.0 0.0 0.0 ... 0.0 \n", + "54a408a719925f464b3733cc 0.0 0.0 0.0 0.0 ... 0.0 \n", + "\n", + " árbol seed árbol seed remove árbol stem \\\n", + "id \n", + "54a2b6b019925f464b373351 0.0 0.0 0.0 \n", + "54a408a019925f464b3733bc 0.0 0.0 0.0 \n", + "54a408a26529d92b2c003631 0.0 0.0 0.0 \n", + "54a408a66529d92b2c003638 0.0 0.0 0.0 \n", + "54a408a719925f464b3733cc 0.0 0.0 0.0 \n", + "\n", + " árbol teaspoon árbol teaspoon crush \\\n", + "id \n", + "54a2b6b019925f464b373351 0.0 0.0 \n", + "54a408a019925f464b3733bc 0.0 0.0 \n", + "54a408a26529d92b2c003631 0.0 0.0 \n", + "54a408a66529d92b2c003638 0.0 0.0 \n", + "54a408a719925f464b3733cc 0.0 0.0 \n", + "\n", + " árbol teaspoon crush red árbol wipe \\\n", + "id \n", + "54a2b6b019925f464b373351 0.0 0.0 \n", + "54a408a019925f464b3733bc 0.0 0.0 \n", + "54a408a26529d92b2c003631 0.0 0.0 \n", + "54a408a66529d92b2c003638 0.0 0.0 \n", + "54a408a719925f464b3733cc 0.0 0.0 \n", + "\n", + " árbol wipe clean épice \n", + "id \n", + "54a2b6b019925f464b373351 0.0 0.0 \n", + "54a408a019925f464b3733bc 0.0 0.0 \n", + "54a408a26529d92b2c003631 0.0 0.0 \n", + "54a408a66529d92b2c003638 0.0 0.0 \n", + "54a408a719925f464b3733cc 0.0 0.0 \n", + "\n", + "[5 rows x 78378 columns]\n", + "\n", + "\n", + "--------------------------------------------------------------------------------\n", + "Random Sample of Combined Data:\n", + " 100g 125g 13x9x2 150g 1pound 1tablespoon \\\n", + "id \n", + "54a40caa19925f464b374017 0.0 0.0 0.0 0.0 0.0 0.0 \n", + "54a43ad16529d92b2c019fc3 0.0 0.0 0.0 0.0 0.0 0.0 \n", + "55d4e08063b1ba1b5534b198 0.0 0.0 0.0 0.0 0.0 0.0 \n", + "\n", + " 1teaspoon 200g 250g 2cup ... árbol seed \\\n", + "id ... \n", + "54a40caa19925f464b374017 0.0 0.0 0.0 0.0 ... 0.0 \n", + "54a43ad16529d92b2c019fc3 0.0 0.0 0.0 0.0 ... 0.0 \n", + "55d4e08063b1ba1b5534b198 0.0 0.0 0.0 0.0 ... 0.0 \n", + "\n", + " árbol seed remove árbol stem árbol teaspoon \\\n", + "id \n", + "54a40caa19925f464b374017 0.0 0.0 0.0 \n", + "54a43ad16529d92b2c019fc3 0.0 0.0 0.0 \n", + "55d4e08063b1ba1b5534b198 0.0 0.0 0.0 \n", + "\n", + " árbol teaspoon crush árbol teaspoon crush red \\\n", + "id \n", + "54a40caa19925f464b374017 0.0 0.0 \n", + "54a43ad16529d92b2c019fc3 0.0 0.0 \n", + "55d4e08063b1ba1b5534b198 0.0 0.0 \n", + "\n", + " árbol wipe árbol wipe clean épice \\\n", + "id \n", + "54a40caa19925f464b374017 0.0 0.0 0.0 \n", + "54a43ad16529d92b2c019fc3 0.0 0.0 0.0 \n", + "55d4e08063b1ba1b5534b198 0.0 0.0 0.0 \n", + "\n", + " ingredients_lemmafied \n", + "id \n", + "54a40caa19925f464b374017 boneless muscovy duck breast half pound total ... \n", + "54a43ad16529d92b2c019fc3 cup basmati rice ounce brk cup sweeten flake c... \n", + "55d4e08063b1ba1b5534b198 tablespoon white wine vinegar brk teaspoon sug... \n", + "\n", + "[3 rows x 78379 columns]\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "8c78d8c010124a2b81119f07b34a3614", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Downloading artifacts: 0%| | 0/1 [00:00str dicts\n", + " # Will be useful in STAGING/Evaluation\n", + " \n", + " # LOG MODEL\n", + " # Instantiate sklearn TFIDFVectorizer\n", + " sklearn_transformer = TfidfVectorizer(**sklearn_transformer_params)\n", + "\n", + " print('\\n')\n", + " print('-' * 80)\n", + " print('sklearn fit transform on ingredients:', end='\\n')\n", + "\n", + " model_input = whole_nlp_df['ingredients_lemmafied']\n", + "\n", + " print('\\n')\n", + " print('-' * 80)\n", + " print('Input Data: ', end='\\n')\n", + " print(model_input)\n", + "\n", + " print('\\n')\n", + " print('-' * 80)\n", + " print('Input Data Shape: ', end='\\n')\n", + " print(model_input.shape)\n", + "\n", + " random_sample = model_input.sample(3, random_state=200)\n", + "\n", + " print('\\n')\n", + " print('-' * 80)\n", + " print('Random 3 Records from Input Data: ', end='\\n')\n", + " print(random_sample)\n", + "\n", + " # Do fit transform on data\n", + " response = sklearn_transformer.fit_transform(tqdm(model_input)) \n", + " \n", + " transformed_recipe = pd.DataFrame(\n", + " response.toarray(),\n", + " columns=sklearn_transformer.get_feature_names_out(),\n", + " index=model_input.index\n", + " )\n", + "\n", + " signature = infer_signature(model_input=model_input,\n", + " model_output=transformed_recipe\n", + " )\n", + "\n", + " print('\\n')\n", + " print('-' * 80)\n", + " print('Transformed Data:', end='\\n')\n", + " print(transformed_recipe.head())\n", + " \n", + " combined_df = transformed_recipe.join(random_sample, how='inner')\n", + "\n", + " print('\\n')\n", + " print('-' * 80)\n", + " print('Random Sample of Combined Data:', end='\\n')\n", + " print(combined_df.head())\n", + "\n", + " with open(sklearn_transformer_path, \"wb\") as fo:\n", + " pickle.dump(sklearn_transformer, fo)\n", + " \n", + " with open(transformed_recipes_path, \"wb\") as fo:\n", + " pickle.dump(transformed_recipe, fo)\n", + " \n", + " with open(combined_df_path, 'wb') as fo:\n", + " pickle.dump(combined_df, fo)\n", + "\n", + "\n", + " model_info = mlflow.pyfunc.log_model( \n", + " code_path=[\"../src/\"],\n", + " python_model=CustomSKLearnWrapper(),\n", + " input_example=whole_nlp_df['ingredients_lemmafied'][0],\n", + " signature=signature, \n", + " artifact_path=\"sklearn_model\",\n", + " artifacts=artifacts\n", + " ) \n", + "\n", + " # since this uses a custom Stanza analyzer, we have to use a custom mlflow.Pyfunc.PythonModel\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "python3", + "language": "python", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +}