diff --git a/nbs/15_new_preproc_test_combined_df.ipynb b/nbs/15_new_preproc_test_combined_df.ipynb
new file mode 100644
index 0000000..9e95074
--- /dev/null
+++ b/nbs/15_new_preproc_test_combined_df.ipynb
@@ -0,0 +1,978 @@
+{
+ "cells": [
+ {
+ "cell_type": "raw",
+ "metadata": {},
+ "source": [
+ "---\n",
+ "description: test\n",
+ "output-file: template.html\n",
+ "title: Template\n",
+ "\n",
+ "---\n",
+ "\n"
+ ]
+ },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# | default_exp core"
+   ]
+  },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "087d6d4ced3c49c88ec00adb20295872",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.6.0.json: 0%| …"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "2024-05-08 19:00:54 INFO: Downloading default packages for language: en (English) ...\n",
+ "2024-05-08 19:00:55 INFO: File exists: /home/awchen/stanza_resources/en/default.zip\n",
+ "2024-05-08 19:00:58 INFO: Finished downloading models and saved to /home/awchen/stanza_resources.\n",
+ "2024-05-08 19:00:58 INFO: Checking for updates to resources.json in case models have been updated. Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES\n"
+ ]
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "8af294b5fac641219a3a46629cf99fba",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.6.0.json: 0%| …"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "2024-05-08 19:00:59 INFO: Loading these models for language: en (English):\n",
+ "======================================\n",
+ "| Processor | Package |\n",
+ "--------------------------------------\n",
+ "| tokenize | combined |\n",
+ "| pos | combined_charlm |\n",
+ "| lemma | combined_nocharlm |\n",
+ "| constituency | ptb3-revised_charlm |\n",
+ "| depparse | combined_charlm |\n",
+ "| sentiment | sstplus |\n",
+ "| ner | ontonotes_charlm |\n",
+ "======================================\n",
+ "\n",
+ "2024-05-08 19:00:59 INFO: Using device: cpu\n",
+ "2024-05-08 19:00:59 INFO: Loading: tokenize\n",
+ "2024-05-08 19:00:59 INFO: Loading: pos\n",
+ "2024-05-08 19:01:00 INFO: Loading: lemma\n",
+ "2024-05-08 19:01:00 INFO: Loading: constituency\n",
+ "2024-05-08 19:01:00 INFO: Loading: depparse\n",
+ "2024-05-08 19:01:00 INFO: Loading: sentiment\n",
+ "2024-05-08 19:01:00 INFO: Loading: ner\n",
+ "2024-05-08 19:01:01 INFO: Done loading processors!\n"
+ ]
+ }
+ ],
+ "source": [
+ "# | hide\n",
+ "# from bertopic import BERTopic\n",
+ "# from bertopic.vectorizers import OnlineCountVectorizer\n",
+ "import dagshub\n",
+ "from datetime import datetime\n",
+ "import dill as pickle\n",
+ "import dvc.api\n",
+ "# from hdbscan import HDBSCAN\n",
+ "from itertools import tee, islice, product\n",
+ "import joblib\n",
+ "import nbdev\n",
+ "from nbdev.showdoc import *\n",
+ "import pandas as pd\n",
+ "import re\n",
+ "from sentence_transformers import SentenceTransformer\n",
+ "from sklearn.feature_extraction.text import (\n",
+ " CountVectorizer\n",
+ " , TfidfTransformer\n",
+ " , TfidfVectorizer\n",
+ " , \n",
+ ")\n",
+ "from sklearn.model_selection import train_test_split\n",
+ "from sklearn.pipeline import make_pipeline\n",
+ "from src.custom_sklearn_text_transformer_mlflow import CustomSKLearnAnalyzer\n",
+ "import src.dataframe_preprocessor as dfpp\n",
+ "import stanza\n",
+ "from tqdm import tqdm\n",
+ "# from umap import UMAP"
+ ]
+ },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Set the allocator config with %env so it applies to this kernel's own process.\n",
+    "# `!export VAR=...` runs in a throwaway subshell and has no effect on the kernel.\n",
+    "%env PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:128"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# | export"
+   ]
+  },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# | hide\n",
+ "# nbdev.nbdev_export()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Data Preparation"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "9b4405a6faa044f185efdb8e5359b8e5",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.6.0.json: 0%| …"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "2024-05-08 19:01:01 INFO: Downloading default packages for language: en (English) ...\n",
+ "2024-05-08 19:01:02 INFO: File exists: /home/awchen/stanza_resources/en/default.zip\n",
+ "2024-05-08 19:01:05 INFO: Finished downloading models and saved to /home/awchen/stanza_resources.\n",
+ "2024-05-08 19:01:05 INFO: Checking for updates to resources.json in case models have been updated. Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES\n"
+ ]
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "6f43e74e5a7940a1b60662d5884ab4a2",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.6.0.json: 0%| …"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "2024-05-08 19:01:06 INFO: Loading these models for language: en (English):\n",
+ "======================================\n",
+ "| Processor | Package |\n",
+ "--------------------------------------\n",
+ "| tokenize | combined |\n",
+ "| pos | combined_charlm |\n",
+ "| lemma | combined_nocharlm |\n",
+ "| constituency | ptb3-revised_charlm |\n",
+ "| depparse | combined_charlm |\n",
+ "| sentiment | sstplus |\n",
+ "| ner | ontonotes_charlm |\n",
+ "======================================\n",
+ "\n",
+ "2024-05-08 19:01:06 INFO: Using device: cuda\n",
+ "2024-05-08 19:01:06 INFO: Loading: tokenize\n",
+ "2024-05-08 19:01:10 INFO: Loading: pos\n",
+ "2024-05-08 19:01:10 INFO: Loading: lemma\n",
+ "2024-05-08 19:01:10 INFO: Loading: constituency\n",
+ "2024-05-08 19:01:11 INFO: Loading: depparse\n",
+ "2024-05-08 19:01:11 INFO: Loading: sentiment\n",
+ "2024-05-08 19:01:13 INFO: Loading: ner\n",
+ "2024-05-08 19:01:14 INFO: Done loading processors!\n"
+ ]
+ }
+ ],
+ "source": [
+ "# instantiate stanza pipeline\n",
+ "stanza.download('en')\n",
+ "nlp = stanza.Pipeline('en', \n",
+ " depparse_batch_size=50, \n",
+ " depparse_min_length_to_batch_separately=50,\n",
+ " verbose=True,\n",
+ " use_gpu=True, # set to true when on cloud/not on streaming computer\n",
+ " batch_size=100\n",
+ " )\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Prepare whole dataframe for new processing\n",
+ "import mlflow\n",
+ "from mlflow.models import infer_signature\n",
+ "from src.custom_stanza_mlflow import CustomSKLearnWrapper"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# this function allows us to get the experiment ID from an experiment name\n",
+    "def get_experiment_id(name):\n",
+    "    \"\"\"Return the MLflow experiment ID for `name`, creating the experiment if it does not exist yet.\"\"\"\n",
+    "    exp = mlflow.get_experiment_by_name(name)\n",
+    "    if exp is None:\n",
+    "        # No experiment with this name on the tracking server -> create it and return the new ID\n",
+    "        exp_id = mlflow.create_experiment(name)\n",
+    "        return exp_id\n",
+    "    return exp.experiment_id"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+       "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal\"><span style=\"font-weight: bold\">❗❗❗ AUTHORIZATION REQUIRED ❗❗❗</span> \n",
+       "</pre>\n"
+ ],
+ "text/plain": [
+ " \u001b[1m❗❗❗ AUTHORIZATION REQUIRED ❗❗❗\u001b[0m \n"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "\n",
+ "Open the following link in your browser to authorize the client:\n",
+ "https://dagshub.com/login/oauth/authorize?state=2a72caa0-4d17-4133-b792-04bc75d86098&client_id=32b60ba385aa7cecf24046d8195a71c07dd345d9657977863b52e7748e0f0f28&middleman_request_id=9f4396584299dc580a77cbeee10d45564a42b8b6598f116f383828cec1dc79d7\n",
+ "\n",
+ "\n"
+ ]
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "a37f40520504442f9d3ed6e408a7c309",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "Output()"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n"
+ ],
+ "text/plain": []
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+       "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal\">Repository initialized!\n",
+       "</pre>\n"
+ ],
+ "text/plain": [
+ "Repository initialized!\n"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "#@markdown Enter the username of your DAGsHub account:\n",
+ "DAGSHUB_USER_NAME = \"AaronWChen\" #@param {type:\"string\"}\n",
+ "\n",
+ "#@markdown Enter the email for your DAGsHub account:\n",
+ "DAGSHUB_EMAIL = \"awc33@cornell.edu\" #@param {type:\"string\"}\n",
+ "\n",
+ "#@markdown Enter the repo name \n",
+ "DAGSHUB_REPO_NAME = \"MeaLeon\"\n",
+ "\n",
+ "#@markdown Enter the name of the branch you are working on \n",
+ "BRANCH = \"NGRAM-2/trying-sklearn-object-upload\"\n",
+ "dagshub.init(repo_name=DAGSHUB_REPO_NAME\n",
+ " , repo_owner=DAGSHUB_USER_NAME)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Starting DEV stage for TFIDF Encoded model"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "mlflow.set_tracking_uri(f'https://dagshub.com/{DAGSHUB_USER_NAME}/MeaLeon.mlflow')\n",
+ "\n",
+ "# starter idea for making an experiment name can be the git branch, but need more specificity\n",
+ "experiment_name = f\"{DAGSHUB_EMAIL}/TFIDF_up_to_quadgrams_small_sample_upload_test\"\n",
+ "mlflow_exp_id = get_experiment_id(experiment_name)\n",
+ "\n",
+ "# define model location\n",
+ "# model_directory = \"/tmp/sklearn_model\"\n",
+ "model_directory = \"../models/sklearn_model\"\n",
+ "\n",
+ "# Define the required artifacts associated with the saved custom pyfunc\n",
+ "# sklearn_path = model_directory + \"\"\n",
+ "sklearn_model_path = model_directory + \"/python_model.pkl\"\n",
+ "sklearn_transformer_path = model_directory + \"/sklearn_transformer.pkl\"\n",
+ "transformed_recipes_path = model_directory + \"/transformed_recipes.pkl\"\n",
+ "combined_df_path = model_directory + \"/combined_df.pkl\"\n",
+ "\n",
+ "artifacts = {'sklearn_model': sklearn_model_path,\n",
+ " 'sklearn_transformer': sklearn_transformer_path,\n",
+ " # 'transformed_recipes': transformed_recipes_path,\n",
+ " 'combined_data': combined_df_path\n",
+ " }\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+       "<div>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>dek</th>\n",
+       "      <th>hed</th>\n",
+       "      <th>aggregateRating</th>\n",
+       "      <th>ingredients</th>\n",
+       "      <th>prepSteps</th>\n",
+       "      <th>reviewsCount</th>\n",
+       "      <th>willMakeAgainPct</th>\n",
+       "      <th>ingredients_lemmafied</th>\n",
+       "      <th>cuisine_name</th>\n",
+       "      <th>photo_filename</th>\n",
+       "      <th>photo_credit</th>\n",
+       "      <th>author_name</th>\n",
+       "      <th>date_published</th>\n",
+       "      <th>recipe_url</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>id</th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>54a2b6b019925f464b373351</th>\n",
+       "      <td>How does fried chicken achieve No. 1 status? B...</td>\n",
+       "      <td>Pickle-Brined Fried Chicken</td>\n",
+       "      <td>3.11</td>\n",
+       "      <td>[1 tablespoons yellow mustard seeds, 1 tablesp...</td>\n",
+       "      <td>[Toast mustard and coriander seeds in a dry me...</td>\n",
+       "      <td>7</td>\n",
+       "      <td>100</td>\n",
+       "      <td>tablespoon yellow mustard seed brk tablespoon ...</td>\n",
+       "      <td>Missing Cuisine</td>\n",
+       "      <td>51247610_fried-chicken_1x1.jpg</td>\n",
+       "      <td>Michael Graydon and Nikole Herriott</td>\n",
+       "      <td>Missing Author Name</td>\n",
+       "      <td>2014-08-19 04:00:00+00:00</td>\n",
+       "      <td>https://www.epicurious.com/recipes/food/views/...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>54a408a019925f464b3733bc</th>\n",
+       "      <td>Spinaci all'Ebraica</td>\n",
+       "      <td>Spinach Jewish Style</td>\n",
+       "      <td>3.22</td>\n",
+       "      <td>[3 pounds small-leaved bulk spinach, Salt, 1/2...</td>\n",
+       "      <td>[Remove the stems and roots from the spinach. ...</td>\n",
+       "      <td>5</td>\n",
+       "      <td>80</td>\n",
+       "      <td>pound small leave bulk spinach brk salt brk cu...</td>\n",
+       "      <td>Italian</td>\n",
+       "      <td>EP_12162015_placeholders_rustic.jpg</td>\n",
+       "      <td>Photo by Chelsea Kyle, Prop Styling by Anna St...</td>\n",
+       "      <td>Edda Servi Machlin</td>\n",
+       "      <td>2008-09-09 04:00:00+00:00</td>\n",
+       "      <td>https://www.epicurious.com/recipes/food/views/...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>54a408a26529d92b2c003631</th>\n",
+       "      <td>This majestic, moist, and richly spiced honey ...</td>\n",
+       "      <td>New Year’s Honey Cake</td>\n",
+       "      <td>3.62</td>\n",
+       "      <td>[3 1/2 cups all-purpose flour, 1 tablespoon ba...</td>\n",
+       "      <td>[I like this cake best baked in a 9-inch angel...</td>\n",
+       "      <td>105</td>\n",
+       "      <td>88</td>\n",
+       "      <td>cup purpose flour brk tablespoon baking powder...</td>\n",
+       "      <td>Kosher</td>\n",
+       "      <td>EP_09022015_honeycake-2.jpg</td>\n",
+       "      <td>Photo by Chelsea Kyle, Food Styling by Anna St...</td>\n",
+       "      <td>Marcy Goldman</td>\n",
+       "      <td>2008-09-10 04:00:00+00:00</td>\n",
+       "      <td>https://www.epicurious.com/recipes/food/views/...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>54a408a66529d92b2c003638</th>\n",
+       "      <td>The idea for this sandwich came to me when my ...</td>\n",
+       "      <td>The B.L.A.Bagel with Lox and Avocado</td>\n",
+       "      <td>4.00</td>\n",
+       "      <td>[1 small ripe avocado, preferably Hass (see No...</td>\n",
+       "      <td>[A short time before serving, mash avocado and...</td>\n",
+       "      <td>7</td>\n",
+       "      <td>100</td>\n",
+       "      <td>small ripe avocado hass see note brk teaspoon ...</td>\n",
+       "      <td>Kosher</td>\n",
+       "      <td>EP_12162015_placeholders_casual.jpg</td>\n",
+       "      <td>Photo by Chelsea Kyle, Prop Styling by Rhoda B...</td>\n",
+       "      <td>Faye Levy</td>\n",
+       "      <td>2008-09-08 04:00:00+00:00</td>\n",
+       "      <td>https://www.epicurious.com/recipes/food/views/...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>54a408a719925f464b3733cc</th>\n",
+       "      <td>In 1930, Simon Agranat, the chief justice of t...</td>\n",
+       "      <td>Shakshuka a la Doktor Shakshuka</td>\n",
+       "      <td>2.71</td>\n",
+       "      <td>[2 pounds fresh tomatoes, unpeeled and cut in ...</td>\n",
+       "      <td>[1. Place the tomatoes, garlic, salt, paprika,...</td>\n",
+       "      <td>7</td>\n",
+       "      <td>83</td>\n",
+       "      <td>pound fresh tomato unpeeled cut quarter ounce ...</td>\n",
+       "      <td>Kosher</td>\n",
+       "      <td>EP_12162015_placeholders_formal.jpg</td>\n",
+       "      <td>Photo by Chelsea Kyle, Prop Styling by Rhoda B...</td>\n",
+       "      <td>Joan Nathan</td>\n",
+       "      <td>2008-09-09 04:00:00+00:00</td>\n",
+       "      <td>https://www.epicurious.com/recipes/food/views/...</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+ ],
+ "text/plain": [
+ " dek \\\n",
+ "id \n",
+ "54a2b6b019925f464b373351 How does fried chicken achieve No. 1 status? B... \n",
+ "54a408a019925f464b3733bc Spinaci all'Ebraica \n",
+ "54a408a26529d92b2c003631 This majestic, moist, and richly spiced honey ... \n",
+ "54a408a66529d92b2c003638 The idea for this sandwich came to me when my ... \n",
+ "54a408a719925f464b3733cc In 1930, Simon Agranat, the chief justice of t... \n",
+ "\n",
+ " hed \\\n",
+ "id \n",
+ "54a2b6b019925f464b373351 Pickle-Brined Fried Chicken \n",
+ "54a408a019925f464b3733bc Spinach Jewish Style \n",
+ "54a408a26529d92b2c003631 New Year’s Honey Cake \n",
+ "54a408a66529d92b2c003638 The B.L.A.Bagel with Lox and Avocado \n",
+ "54a408a719925f464b3733cc Shakshuka a la Doktor Shakshuka \n",
+ "\n",
+ " aggregateRating \\\n",
+ "id \n",
+ "54a2b6b019925f464b373351 3.11 \n",
+ "54a408a019925f464b3733bc 3.22 \n",
+ "54a408a26529d92b2c003631 3.62 \n",
+ "54a408a66529d92b2c003638 4.00 \n",
+ "54a408a719925f464b3733cc 2.71 \n",
+ "\n",
+ " ingredients \\\n",
+ "id \n",
+ "54a2b6b019925f464b373351 [1 tablespoons yellow mustard seeds, 1 tablesp... \n",
+ "54a408a019925f464b3733bc [3 pounds small-leaved bulk spinach, Salt, 1/2... \n",
+ "54a408a26529d92b2c003631 [3 1/2 cups all-purpose flour, 1 tablespoon ba... \n",
+ "54a408a66529d92b2c003638 [1 small ripe avocado, preferably Hass (see No... \n",
+ "54a408a719925f464b3733cc [2 pounds fresh tomatoes, unpeeled and cut in ... \n",
+ "\n",
+ " prepSteps \\\n",
+ "id \n",
+ "54a2b6b019925f464b373351 [Toast mustard and coriander seeds in a dry me... \n",
+ "54a408a019925f464b3733bc [Remove the stems and roots from the spinach. ... \n",
+ "54a408a26529d92b2c003631 [I like this cake best baked in a 9-inch angel... \n",
+ "54a408a66529d92b2c003638 [A short time before serving, mash avocado and... \n",
+ "54a408a719925f464b3733cc [1. Place the tomatoes, garlic, salt, paprika,... \n",
+ "\n",
+ " reviewsCount willMakeAgainPct \\\n",
+ "id \n",
+ "54a2b6b019925f464b373351 7 100 \n",
+ "54a408a019925f464b3733bc 5 80 \n",
+ "54a408a26529d92b2c003631 105 88 \n",
+ "54a408a66529d92b2c003638 7 100 \n",
+ "54a408a719925f464b3733cc 7 83 \n",
+ "\n",
+ " ingredients_lemmafied \\\n",
+ "id \n",
+ "54a2b6b019925f464b373351 tablespoon yellow mustard seed brk tablespoon ... \n",
+ "54a408a019925f464b3733bc pound small leave bulk spinach brk salt brk cu... \n",
+ "54a408a26529d92b2c003631 cup purpose flour brk tablespoon baking powder... \n",
+ "54a408a66529d92b2c003638 small ripe avocado hass see note brk teaspoon ... \n",
+ "54a408a719925f464b3733cc pound fresh tomato unpeeled cut quarter ounce ... \n",
+ "\n",
+ " cuisine_name \\\n",
+ "id \n",
+ "54a2b6b019925f464b373351 Missing Cuisine \n",
+ "54a408a019925f464b3733bc Italian \n",
+ "54a408a26529d92b2c003631 Kosher \n",
+ "54a408a66529d92b2c003638 Kosher \n",
+ "54a408a719925f464b3733cc Kosher \n",
+ "\n",
+ " photo_filename \\\n",
+ "id \n",
+ "54a2b6b019925f464b373351 51247610_fried-chicken_1x1.jpg \n",
+ "54a408a019925f464b3733bc EP_12162015_placeholders_rustic.jpg \n",
+ "54a408a26529d92b2c003631 EP_09022015_honeycake-2.jpg \n",
+ "54a408a66529d92b2c003638 EP_12162015_placeholders_casual.jpg \n",
+ "54a408a719925f464b3733cc EP_12162015_placeholders_formal.jpg \n",
+ "\n",
+ " photo_credit \\\n",
+ "id \n",
+ "54a2b6b019925f464b373351 Michael Graydon and Nikole Herriott \n",
+ "54a408a019925f464b3733bc Photo by Chelsea Kyle, Prop Styling by Anna St... \n",
+ "54a408a26529d92b2c003631 Photo by Chelsea Kyle, Food Styling by Anna St... \n",
+ "54a408a66529d92b2c003638 Photo by Chelsea Kyle, Prop Styling by Rhoda B... \n",
+ "54a408a719925f464b3733cc Photo by Chelsea Kyle, Prop Styling by Rhoda B... \n",
+ "\n",
+ " author_name date_published \\\n",
+ "id \n",
+ "54a2b6b019925f464b373351 Missing Author Name 2014-08-19 04:00:00+00:00 \n",
+ "54a408a019925f464b3733bc Edda Servi Machlin 2008-09-09 04:00:00+00:00 \n",
+ "54a408a26529d92b2c003631 Marcy Goldman 2008-09-10 04:00:00+00:00 \n",
+ "54a408a66529d92b2c003638 Faye Levy 2008-09-08 04:00:00+00:00 \n",
+ "54a408a719925f464b3733cc Joan Nathan 2008-09-09 04:00:00+00:00 \n",
+ "\n",
+ " recipe_url \n",
+ "id \n",
+ "54a2b6b019925f464b373351 https://www.epicurious.com/recipes/food/views/... \n",
+ "54a408a019925f464b3733bc https://www.epicurious.com/recipes/food/views/... \n",
+ "54a408a26529d92b2c003631 https://www.epicurious.com/recipes/food/views/... \n",
+ "54a408a66529d92b2c003638 https://www.epicurious.com/recipes/food/views/... \n",
+ "54a408a719925f464b3733cc https://www.epicurious.com/recipes/food/views/... "
+ ]
+ },
+ "execution_count": null,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "whole_nlp_df = pd.read_parquet('../joblib/2024.03.19/pre_proc_df.parquet.gzip')\n",
+ "whole_nlp_df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "\n",
+ "--------------------------------------------------------------------------------\n",
+ "sklearn fit transform on ingredients:\n",
+ "\n",
+ "\n",
+ "--------------------------------------------------------------------------------\n",
+ "Input Data: \n",
+ "id\n",
+ "54a2b6b019925f464b373351 tablespoon yellow mustard seed brk tablespoon ...\n",
+ "54a408a019925f464b3733bc pound small leave bulk spinach brk salt brk cu...\n",
+ "54a408a26529d92b2c003631 cup purpose flour brk tablespoon baking powder...\n",
+ "54a408a66529d92b2c003638 small ripe avocado hass see note brk teaspoon ...\n",
+ "54a408a719925f464b3733cc pound fresh tomato unpeeled cut quarter ounce ...\n",
+ " ... \n",
+ "59541a31bff3052847ae2107 tablespoon unsalt butter room temperature brk ...\n",
+ "5954233ad52ca90dc28200e7 tablespoon stick salt butter room temperature ...\n",
+ "595424c2109c972493636f83 tablespoon unsalted butter more greasing pan b...\n",
+ "5956638625dc3d1d829b7166 coarse salt brk lime wedge brk ounce tomato ju...\n",
+ "59566daa25dc3d1d829b7169 bottle millileter sour beer such almanac citra...\n",
+ "Name: ingredients_lemmafied, Length: 34756, dtype: object\n",
+ "\n",
+ "\n",
+ "--------------------------------------------------------------------------------\n",
+ "Input Data Shape: \n",
+ "(34756,)\n",
+ "\n",
+ "\n",
+ "--------------------------------------------------------------------------------\n",
+ "Random 3 Records from Input Data: \n",
+ "id\n",
+ "54a40caa19925f464b374017 boneless muscovy duck breast half pound total ...\n",
+ "55d4e08063b1ba1b5534b198 tablespoon white wine vinegar brk teaspoon sug...\n",
+ "54a43ad16529d92b2c019fc3 cup basmati rice ounce brk cup sweeten flake c...\n",
+ "Name: ingredients_lemmafied, dtype: object\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "100%|██████████| 34756/34756 [00:03<00:00, 10261.04it/s]\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "\n",
+ "--------------------------------------------------------------------------------\n",
+ "Transformed Data:\n",
+ " 100g 125g 13x9x2 150g 1pound 1tablespoon \\\n",
+ "id \n",
+ "54a2b6b019925f464b373351 0.0 0.0 0.0 0.0 0.0 0.0 \n",
+ "54a408a019925f464b3733bc 0.0 0.0 0.0 0.0 0.0 0.0 \n",
+ "54a408a26529d92b2c003631 0.0 0.0 0.0 0.0 0.0 0.0 \n",
+ "54a408a66529d92b2c003638 0.0 0.0 0.0 0.0 0.0 0.0 \n",
+ "54a408a719925f464b3733cc 0.0 0.0 0.0 0.0 0.0 0.0 \n",
+ "\n",
+ " 1teaspoon 200g 250g 2cup ... árbol divide \\\n",
+ "id ... \n",
+ "54a2b6b019925f464b373351 0.0 0.0 0.0 0.0 ... 0.0 \n",
+ "54a408a019925f464b3733bc 0.0 0.0 0.0 0.0 ... 0.0 \n",
+ "54a408a26529d92b2c003631 0.0 0.0 0.0 0.0 ... 0.0 \n",
+ "54a408a66529d92b2c003638 0.0 0.0 0.0 0.0 ... 0.0 \n",
+ "54a408a719925f464b3733cc 0.0 0.0 0.0 0.0 ... 0.0 \n",
+ "\n",
+ " árbol seed árbol seed remove árbol stem \\\n",
+ "id \n",
+ "54a2b6b019925f464b373351 0.0 0.0 0.0 \n",
+ "54a408a019925f464b3733bc 0.0 0.0 0.0 \n",
+ "54a408a26529d92b2c003631 0.0 0.0 0.0 \n",
+ "54a408a66529d92b2c003638 0.0 0.0 0.0 \n",
+ "54a408a719925f464b3733cc 0.0 0.0 0.0 \n",
+ "\n",
+ " árbol teaspoon árbol teaspoon crush \\\n",
+ "id \n",
+ "54a2b6b019925f464b373351 0.0 0.0 \n",
+ "54a408a019925f464b3733bc 0.0 0.0 \n",
+ "54a408a26529d92b2c003631 0.0 0.0 \n",
+ "54a408a66529d92b2c003638 0.0 0.0 \n",
+ "54a408a719925f464b3733cc 0.0 0.0 \n",
+ "\n",
+ " árbol teaspoon crush red árbol wipe \\\n",
+ "id \n",
+ "54a2b6b019925f464b373351 0.0 0.0 \n",
+ "54a408a019925f464b3733bc 0.0 0.0 \n",
+ "54a408a26529d92b2c003631 0.0 0.0 \n",
+ "54a408a66529d92b2c003638 0.0 0.0 \n",
+ "54a408a719925f464b3733cc 0.0 0.0 \n",
+ "\n",
+ " árbol wipe clean épice \n",
+ "id \n",
+ "54a2b6b019925f464b373351 0.0 0.0 \n",
+ "54a408a019925f464b3733bc 0.0 0.0 \n",
+ "54a408a26529d92b2c003631 0.0 0.0 \n",
+ "54a408a66529d92b2c003638 0.0 0.0 \n",
+ "54a408a719925f464b3733cc 0.0 0.0 \n",
+ "\n",
+ "[5 rows x 78378 columns]\n",
+ "\n",
+ "\n",
+ "--------------------------------------------------------------------------------\n",
+ "Random Sample of Combined Data:\n",
+ " 100g 125g 13x9x2 150g 1pound 1tablespoon \\\n",
+ "id \n",
+ "54a40caa19925f464b374017 0.0 0.0 0.0 0.0 0.0 0.0 \n",
+ "54a43ad16529d92b2c019fc3 0.0 0.0 0.0 0.0 0.0 0.0 \n",
+ "55d4e08063b1ba1b5534b198 0.0 0.0 0.0 0.0 0.0 0.0 \n",
+ "\n",
+ " 1teaspoon 200g 250g 2cup ... árbol seed \\\n",
+ "id ... \n",
+ "54a40caa19925f464b374017 0.0 0.0 0.0 0.0 ... 0.0 \n",
+ "54a43ad16529d92b2c019fc3 0.0 0.0 0.0 0.0 ... 0.0 \n",
+ "55d4e08063b1ba1b5534b198 0.0 0.0 0.0 0.0 ... 0.0 \n",
+ "\n",
+ " árbol seed remove árbol stem árbol teaspoon \\\n",
+ "id \n",
+ "54a40caa19925f464b374017 0.0 0.0 0.0 \n",
+ "54a43ad16529d92b2c019fc3 0.0 0.0 0.0 \n",
+ "55d4e08063b1ba1b5534b198 0.0 0.0 0.0 \n",
+ "\n",
+ " árbol teaspoon crush árbol teaspoon crush red \\\n",
+ "id \n",
+ "54a40caa19925f464b374017 0.0 0.0 \n",
+ "54a43ad16529d92b2c019fc3 0.0 0.0 \n",
+ "55d4e08063b1ba1b5534b198 0.0 0.0 \n",
+ "\n",
+ " árbol wipe árbol wipe clean épice \\\n",
+ "id \n",
+ "54a40caa19925f464b374017 0.0 0.0 0.0 \n",
+ "54a43ad16529d92b2c019fc3 0.0 0.0 0.0 \n",
+ "55d4e08063b1ba1b5534b198 0.0 0.0 0.0 \n",
+ "\n",
+ " ingredients_lemmafied \n",
+ "id \n",
+ "54a40caa19925f464b374017 boneless muscovy duck breast half pound total ... \n",
+ "54a43ad16529d92b2c019fc3 cup basmati rice ounce brk cup sweeten flake c... \n",
+ "55d4e08063b1ba1b5534b198 tablespoon white wine vinegar brk teaspoon sug... \n",
+ "\n",
+ "[3 rows x 78379 columns]\n"
+ ]
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "8c78d8c010124a2b81119f07b34a3614",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "Downloading artifacts: 0%| | 0/1 [00:00, ?it/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "7bdf0865e2bb4029b353a562f819e139",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "Downloading artifacts: 0%| | 0/1 [00:00, ?it/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "b6933fbc97f64a1987293ba6851561fa",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "Downloading artifacts: 0%| | 0/1 [00:00, ?it/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "2024/05/08 19:14:05 WARNING mlflow.utils.environment: Encountered an unexpected error while inferring pip requirements (model URI: /tmp/tmpazhxbb7a/model, flavor: python_function), fall back to return ['cloudpickle==2.2.1']. Set logging level to DEBUG to see the full traceback.\n",
+ "/home/awchen/Repos/Projects/MeaLeon/.venv/lib/python3.10/site-packages/_distutils_hack/__init__.py:33: UserWarning: Setuptools is replacing distutils.\n",
+ " warnings.warn(\"Setuptools is replacing distutils.\")\n",
+ "2024/05/08 19:14:41 WARNING mlflow.models.model: Logging model metadata to the tracking server has failed. The model artifacts have been logged successfully under mlflow-artifacts:/284d65dcc09149b8b4279793753b69f9/bd4ea6fe14cc4964bc56b6a4e41ddf71/artifacts. Set logging level to DEBUG via `logging.getLogger(\"mlflow\").setLevel(logging.DEBUG)` to see the full traceback.\n"
+ ]
+ }
+ ],
+ "source": [
+ "# load from MLflow\n",
+ "mlflow_client = mlflow.tracking.MlflowClient(\n",
+ " tracking_uri=f'https://dagshub.com/{DAGSHUB_USER_NAME}/MeaLeon.mlflow')\n",
+ "\n",
+ "# cv_params are parameters for the sklearn CountVectorizer or TFIDFVectorizer\n",
+ "sklearn_transformer_params = { \n",
+ " 'analyzer': CustomSKLearnAnalyzer().ngram_maker(\n",
+ " min_ngram_length=1,\n",
+ " max_ngram_length=4,\n",
+ " ),\n",
+ " 'min_df':3,\n",
+ " 'binary':False\n",
+ "}\n",
+ "\n",
+ "# pipeline_params are parameters that will be logged in MLFlow and are a superset of library parameters\n",
+ "pipeline_params = {\n",
+ " 'stanza_model': 'en',\n",
+ " 'sklearn-transformer': 'TFIDF'\n",
+ "}\n",
+ "\n",
+ "# update the pipeline parameters with the library-specific ones so that they show up in MLflow Tracking\n",
+ "pipeline_params.update(sklearn_transformer_params)\n",
+ "\n",
+ "with mlflow.start_run(experiment_id=mlflow_exp_id): \n",
+ " # LOG PARAMETERS\n",
+ " mlflow.log_params(pipeline_params)\n",
+ "\n",
+ " # LOG INPUTS (QUERIES) AND OUTPUTS\n",
+ " # MLflow example uses a list of strings or a list of str->str dicts\n",
+ " # Will be useful in STAGING/Evaluation\n",
+ " \n",
+ " # LOG MODEL\n",
+ " # Instantiate sklearn TFIDFVectorizer\n",
+ " sklearn_transformer = TfidfVectorizer(**sklearn_transformer_params)\n",
+ "\n",
+ " print('\\n')\n",
+ " print('-' * 80)\n",
+ " print('sklearn fit transform on ingredients:', end='\\n')\n",
+ "\n",
+ " model_input = whole_nlp_df['ingredients_lemmafied']\n",
+ "\n",
+ " print('\\n')\n",
+ " print('-' * 80)\n",
+ " print('Input Data: ', end='\\n')\n",
+ " print(model_input)\n",
+ "\n",
+ " print('\\n')\n",
+ " print('-' * 80)\n",
+ " print('Input Data Shape: ', end='\\n')\n",
+ " print(model_input.shape)\n",
+ "\n",
+ " random_sample = model_input.sample(3, random_state=200)\n",
+ "\n",
+ " print('\\n')\n",
+ " print('-' * 80)\n",
+ " print('Random 3 Records from Input Data: ', end='\\n')\n",
+ " print(random_sample)\n",
+ "\n",
+ " # Do fit transform on data\n",
+ " response = sklearn_transformer.fit_transform(tqdm(model_input)) \n",
+ " \n",
+ " transformed_recipe = pd.DataFrame(\n",
+ " response.toarray(),\n",
+ " columns=sklearn_transformer.get_feature_names_out(),\n",
+ " index=model_input.index\n",
+ " )\n",
+ "\n",
+ " signature = infer_signature(model_input=model_input,\n",
+ " model_output=transformed_recipe\n",
+ " )\n",
+ "\n",
+ " print('\\n')\n",
+ " print('-' * 80)\n",
+ " print('Transformed Data:', end='\\n')\n",
+ " print(transformed_recipe.head())\n",
+ " \n",
+ " combined_df = transformed_recipe.join(random_sample, how='inner')\n",
+ "\n",
+ " print('\\n')\n",
+ " print('-' * 80)\n",
+ " print('Random Sample of Combined Data:', end='\\n')\n",
+ " print(combined_df.head())\n",
+ "\n",
+ " with open(sklearn_transformer_path, \"wb\") as fo:\n",
+ " pickle.dump(sklearn_transformer, fo)\n",
+ " \n",
+ " with open(transformed_recipes_path, \"wb\") as fo:\n",
+ " pickle.dump(transformed_recipe, fo)\n",
+ " \n",
+ " with open(combined_df_path, 'wb') as fo:\n",
+ " pickle.dump(combined_df, fo)\n",
+ "\n",
+ "\n",
+    "    model_info = mlflow.pyfunc.log_model(\n",
+    "        code_path=[\"../src/\"],\n",
+    "        python_model=CustomSKLearnWrapper(),\n",
+    "        # .iloc[0]: the Series is indexed by recipe-id strings, so positional access\n",
+    "        # must use .iloc — plain `[0]` is the deprecated positional fallback and\n",
+    "        # raises KeyError in pandas >= 3.\n",
+    "        input_example=whole_nlp_df['ingredients_lemmafied'].iloc[0],\n",
+    "        signature=signature,\n",
+    "        artifact_path=\"sklearn_model\",\n",
+    "        artifacts=artifacts\n",
+    "    )\n",
+ "\n",
+ " # since this uses a custom Stanza analyzer, we have to use a custom mlflow.Pyfunc.PythonModel\n",
+ " "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "python3",
+ "language": "python",
+ "name": "python3"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}