diff --git a/data.dvc b/data.dvc index e5bcd28..9905113 100644 --- a/data.dvc +++ b/data.dvc @@ -1,5 +1,5 @@ outs: -- md5: 3f85b4e2df7b76c01bcb27989e564e36.dir - size: 163733640 - nfiles: 6 +- md5: 2ce6297077793c098f42db8660fb0d0e.dir + size: 656551290 + nfiles: 12 path: data diff --git a/nbs/16_notebook_refactor.ipynb b/nbs/16_notebook_refactor.ipynb new file mode 100644 index 0000000..b88ef0c --- /dev/null +++ b/nbs/16_notebook_refactor.ipynb @@ -0,0 +1,1031 @@ +{ + "cells": [ + { + "cell_type": "raw", + "metadata": {}, + "source": [ + "# Refactor MeaLeon into a notebook-friendly form\n", + "---\n", + "description: Refactoring Flask App requests into a more notebook-friendly iteration\n", + "output-file: template.html\n", + "title: Notebook refactor\n", + "---" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# | default_exp testing" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# | hide\n", + "import dagshub\n", + "import dill as pickle\n", + "import joblib\n", + "import mlflow\n", + "from mlflow.models import infer_signature\n", + "import nbdev #; nbdev.nbdev_export()\n", + "from nbdev.showdoc import *\n", + "import pandas as pd\n", + "import re\n", + "from sklearn.feature_extraction.text import (\n", + " CountVectorizer\n", + " , TfidfTransformer\n", + " , TfidfVectorizer\n", + " , \n", + ")\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.pipeline import make_pipeline\n", + "from src.backend.embedding_creation.apply_stanza import CustomSKLearnAnalyzer\n", + "from src.backend.embedding_creation.sklearn_transformer_as_mlflow_model import CustomSKLearnWrapper\n", + "import src.backend.raw_data_cleaning.raw_data_preprocessor as rdpp\n", + "import stanza\n", + "from tqdm import tqdm" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Need to call DAGsHub to keep track of what we're doing" + ] + }, + { + "cell_type": 
"code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
Repository initialized!\n",
+       "
\n" + ], + "text/plain": [ + "Repository initialized!\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "#@markdown Enter the username of your DAGsHub account:\n", + "DAGSHUB_USER_NAME = \"AaronWChen\" #@param {type:\"string\"}\n", + "\n", + "#@markdown Enter the email for your DAGsHub account:\n", + "DAGSHUB_EMAIL = \"awc33@cornell.edu\" #@param {type:\"string\"}\n", + "\n", + "#@markdown Enter the repo name \n", + "DAGSHUB_REPO_NAME = \"MeaLeon\"\n", + "\n", + "#@markdown Enter the name of the branch you are working on \n", + "BRANCH = \"init_mealeon_to_notebook_refactor\"\n", + "dagshub.init(repo_name=DAGSHUB_REPO_NAME\n", + " , repo_owner=DAGSHUB_USER_NAME)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Things I need to do\n", + "\n", + "1. app.py calls find_similar_dishes, returns a render template\n", + "2. find_similar_dishes needs to call the recipe database, the sklearn model, the model-transformed database (ie, TFIDF word matrix), and the query (which needs to be transformed)\n", + " 1. Little confused by order; why would i need the original database if i can just call the model/vector-transformed version?\n", + " 1. Original database has things like url and ID, which could be needed later\n", + " 2. ~~Future vector data can use the same recipe_id unique key, but only have the ingredient vectors. Use unique key to join original...~~\n", + " 3. Wait, need cuisine filter to improve search results...so vector database should have cuisine and recipe_id\n", + " 4. From that, can call back to original database to get URLs and other metadata\n", + " 1. SQLModel query to join\n", + " 2. Sklearn model (really any model that transforms the query) needs to be loaded from MLflow\n", + " 1. Model will be used to transform query for similarity analysis\n", + " 2. MLflow load\n", + " 3. Vector database needs to be loaded from currently a json, but should switch to Vespa\n", + " 1. 
Wouldn't this need to be linked to the MLflow Model? DVC + Vespa?\n", + " 2. MLflow or DVC load?\n", + " 4. Original recipe database might also be DVC?\n", + " 5. \n", + "3. original query should be formatted and stored into recipe database (CRUD)\n", + "4. this query is then sent to the Edamam API\n", + "5. the Edamam response is currently model-transformed, then cuisine-filtered\n", + " 1. Swap this order so we don't have to process as much text\n", + "6. Vector comparison against filtered data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Data Preparation\n", + "\n", + "This part can be the DVC import for our data\n", + "\n", + "Currently, raw/processed data can be imported from JSON; need to consider how to access data from something like SQL and log a snapshot of this data (and its metadata?) with DVC\n", + "\n", + "- Can I reuse some parts of GitHub Actions?\n", + "\n", + "- DVC can handle data files fine, but SQL pulls are currently only experimentally supported\n", + "- using dvc import-db https://dvc.org/doc/command-reference/import-db\n", + "\n", + "- DVC with generative AI (might be relevant to vector databases): https://youtu.be/aqMXEvWTuVY?si=2lMKrofl9s10BXVx\n", + "\n", + "#### Let's start with local data files\n", + "\n", + "Via automated ETL, DVC could log the raw data, perform the text processing if not an embedding, add the preprocessed data back to DVC, then start MLflow with embedding conversion " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[?25l\u001b[32m⠋\u001b[0m Checking graph \n", + "Adding... \n", + "!\u001b[A\n", + " 0% Checking cache in '/home/awchen/Repos/Projects/MeaLeon/.dvc/cache'| |0/? [0\u001b[A\n", + " \u001b[A\n", + "!\u001b[A\n", + " 0%| |Checking out ../data/raw/201706-epicur0/? 
[00:00\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
dekhedaggregateRatingingredientsprepStepsreviewsCountwillMakeAgainPctingredients_lemmafiedcuisine_namephoto_filenamephoto_creditauthor_namedate_publishedrecipe_url
id
54a2b6b019925f464b373351How does fried chicken achieve No. 1 status? B...Pickle-Brined Fried Chicken3.11[1 tablespoons yellow mustard seeds, 1 tablesp...[Toast mustard and coriander seeds in a dry me...7100tablespoon yellow mustard seed brk tablespoon ...Missing Cuisine51247610_fried-chicken_1x1.jpgMichael Graydon and Nikole HerriottMissing Author Name2014-08-19 04:00:00+00:00https://www.epicurious.com/recipes/food/views/...
54a408a019925f464b3733bcSpinaci all'EbraicaSpinach Jewish Style3.22[3 pounds small-leaved bulk spinach, Salt, 1/2...[Remove the stems and roots from the spinach. ...580pound small leave bulk spinach brk salt brk cu...ItalianEP_12162015_placeholders_rustic.jpgPhoto by Chelsea Kyle, Prop Styling by Anna St...Edda Servi Machlin2008-09-09 04:00:00+00:00https://www.epicurious.com/recipes/food/views/...
54a408a26529d92b2c003631This majestic, moist, and richly spiced honey ...New Year’s Honey Cake3.62[3 1/2 cups all-purpose flour, 1 tablespoon ba...[I like this cake best baked in a 9-inch angel...10588cup purpose flour brk tablespoon baking powder...KosherEP_09022015_honeycake-2.jpgPhoto by Chelsea Kyle, Food Styling by Anna St...Marcy Goldman2008-09-10 04:00:00+00:00https://www.epicurious.com/recipes/food/views/...
54a408a66529d92b2c003638The idea for this sandwich came to me when my ...The B.L.A.—Bagel with Lox and Avocado4.00[1 small ripe avocado, preferably Hass (see No...[A short time before serving, mash avocado and...7100small ripe avocado hass see note brk teaspoon ...KosherEP_12162015_placeholders_casual.jpgPhoto by Chelsea Kyle, Prop Styling by Rhoda B...Faye Levy2008-09-08 04:00:00+00:00https://www.epicurious.com/recipes/food/views/...
54a408a719925f464b3733ccIn 1930, Simon Agranat, the chief justice of t...Shakshuka a la Doktor Shakshuka2.71[2 pounds fresh tomatoes, unpeeled and cut in ...[1. Place the tomatoes, garlic, salt, paprika,...783pound fresh tomato unpeeled cut quarter ounce ...KosherEP_12162015_placeholders_formal.jpgPhoto by Chelsea Kyle, Prop Styling by Rhoda B...Joan Nathan2008-09-09 04:00:00+00:00https://www.epicurious.com/recipes/food/views/...
\n", + "" + ], + "text/plain": [ + " dek \\\n", + "id \n", + "54a2b6b019925f464b373351 How does fried chicken achieve No. 1 status? B... \n", + "54a408a019925f464b3733bc Spinaci all'Ebraica \n", + "54a408a26529d92b2c003631 This majestic, moist, and richly spiced honey ... \n", + "54a408a66529d92b2c003638 The idea for this sandwich came to me when my ... \n", + "54a408a719925f464b3733cc In 1930, Simon Agranat, the chief justice of t... \n", + "\n", + " hed \\\n", + "id \n", + "54a2b6b019925f464b373351 Pickle-Brined Fried Chicken \n", + "54a408a019925f464b3733bc Spinach Jewish Style \n", + "54a408a26529d92b2c003631 New Year’s Honey Cake \n", + "54a408a66529d92b2c003638 The B.L.A.—Bagel with Lox and Avocado \n", + "54a408a719925f464b3733cc Shakshuka a la Doktor Shakshuka \n", + "\n", + " aggregateRating \\\n", + "id \n", + "54a2b6b019925f464b373351 3.11 \n", + "54a408a019925f464b3733bc 3.22 \n", + "54a408a26529d92b2c003631 3.62 \n", + "54a408a66529d92b2c003638 4.00 \n", + "54a408a719925f464b3733cc 2.71 \n", + "\n", + " ingredients \\\n", + "id \n", + "54a2b6b019925f464b373351 [1 tablespoons yellow mustard seeds, 1 tablesp... \n", + "54a408a019925f464b3733bc [3 pounds small-leaved bulk spinach, Salt, 1/2... \n", + "54a408a26529d92b2c003631 [3 1/2 cups all-purpose flour, 1 tablespoon ba... \n", + "54a408a66529d92b2c003638 [1 small ripe avocado, preferably Hass (see No... \n", + "54a408a719925f464b3733cc [2 pounds fresh tomatoes, unpeeled and cut in ... \n", + "\n", + " prepSteps \\\n", + "id \n", + "54a2b6b019925f464b373351 [Toast mustard and coriander seeds in a dry me... \n", + "54a408a019925f464b3733bc [Remove the stems and roots from the spinach. ... \n", + "54a408a26529d92b2c003631 [I like this cake best baked in a 9-inch angel... \n", + "54a408a66529d92b2c003638 [A short time before serving, mash avocado and... \n", + "54a408a719925f464b3733cc [1. Place the tomatoes, garlic, salt, paprika,... 
\n", + "\n", + " reviewsCount willMakeAgainPct \\\n", + "id \n", + "54a2b6b019925f464b373351 7 100 \n", + "54a408a019925f464b3733bc 5 80 \n", + "54a408a26529d92b2c003631 105 88 \n", + "54a408a66529d92b2c003638 7 100 \n", + "54a408a719925f464b3733cc 7 83 \n", + "\n", + " ingredients_lemmafied \\\n", + "id \n", + "54a2b6b019925f464b373351 tablespoon yellow mustard seed brk tablespoon ... \n", + "54a408a019925f464b3733bc pound small leave bulk spinach brk salt brk cu... \n", + "54a408a26529d92b2c003631 cup purpose flour brk tablespoon baking powder... \n", + "54a408a66529d92b2c003638 small ripe avocado hass see note brk teaspoon ... \n", + "54a408a719925f464b3733cc pound fresh tomato unpeeled cut quarter ounce ... \n", + "\n", + " cuisine_name \\\n", + "id \n", + "54a2b6b019925f464b373351 Missing Cuisine \n", + "54a408a019925f464b3733bc Italian \n", + "54a408a26529d92b2c003631 Kosher \n", + "54a408a66529d92b2c003638 Kosher \n", + "54a408a719925f464b3733cc Kosher \n", + "\n", + " photo_filename \\\n", + "id \n", + "54a2b6b019925f464b373351 51247610_fried-chicken_1x1.jpg \n", + "54a408a019925f464b3733bc EP_12162015_placeholders_rustic.jpg \n", + "54a408a26529d92b2c003631 EP_09022015_honeycake-2.jpg \n", + "54a408a66529d92b2c003638 EP_12162015_placeholders_casual.jpg \n", + "54a408a719925f464b3733cc EP_12162015_placeholders_formal.jpg \n", + "\n", + " photo_credit \\\n", + "id \n", + "54a2b6b019925f464b373351 Michael Graydon and Nikole Herriott \n", + "54a408a019925f464b3733bc Photo by Chelsea Kyle, Prop Styling by Anna St... \n", + "54a408a26529d92b2c003631 Photo by Chelsea Kyle, Food Styling by Anna St... \n", + "54a408a66529d92b2c003638 Photo by Chelsea Kyle, Prop Styling by Rhoda B... \n", + "54a408a719925f464b3733cc Photo by Chelsea Kyle, Prop Styling by Rhoda B... 
\n", + "\n", + " author_name date_published \\\n", + "id \n", + "54a2b6b019925f464b373351 Missing Author Name 2014-08-19 04:00:00+00:00 \n", + "54a408a019925f464b3733bc Edda Servi Machlin 2008-09-09 04:00:00+00:00 \n", + "54a408a26529d92b2c003631 Marcy Goldman 2008-09-10 04:00:00+00:00 \n", + "54a408a66529d92b2c003638 Faye Levy 2008-09-08 04:00:00+00:00 \n", + "54a408a719925f464b3733cc Joan Nathan 2008-09-09 04:00:00+00:00 \n", + "\n", + " recipe_url \n", + "id \n", + "54a2b6b019925f464b373351 https://www.epicurious.com/recipes/food/views/... \n", + "54a408a019925f464b3733bc https://www.epicurious.com/recipes/food/views/... \n", + "54a408a26529d92b2c003631 https://www.epicurious.com/recipes/food/views/... \n", + "54a408a66529d92b2c003638 https://www.epicurious.com/recipes/food/views/... \n", + "54a408a719925f464b3733cc https://www.epicurious.com/recipes/food/views/... " + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# this part can be done after a dvc pull\n", + "whole_nlp_df = pd.read_parquet(\"../data/processed/cleaned_df.parquet.gzip\")\n", + "whole_nlp_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n", + "--------------------------------------------------------------------------------\n", + "sklearn fit transform on ingredients:\n", + "\n", + "\n", + "--------------------------------------------------------------------------------\n", + "Input Data: \n", + "id\n", + "54a2b6b019925f464b373351 tablespoon yellow mustard seed brk tablespoon ...\n", + "54a408a019925f464b3733bc pound small leave bulk spinach brk salt brk cu...\n", + "54a408a26529d92b2c003631 cup purpose flour brk tablespoon baking powder...\n", + "54a408a66529d92b2c003638 small ripe avocado hass see note brk teaspoon ...\n", + "54a408a719925f464b3733cc pound fresh tomato unpeeled cut quarter ounce 
...\n", + " ... \n", + "59541a31bff3052847ae2107 tablespoon unsalt butter room temperature brk ...\n", + "5954233ad52ca90dc28200e7 tablespoon stick salt butter room temperature ...\n", + "595424c2109c972493636f83 tablespoon unsalted butter more greasing pan b...\n", + "5956638625dc3d1d829b7166 coarse salt brk lime wedge brk ounce tomato ju...\n", + "59566daa25dc3d1d829b7169 bottle millileter sour beer such almanac citra...\n", + "Name: ingredients_lemmafied, Length: 34756, dtype: object\n", + "\n", + "\n", + "--------------------------------------------------------------------------------\n", + "Input Data Shape: \n", + "(34756,)\n", + "\n", + "\n", + "--------------------------------------------------------------------------------\n", + "Random 3 Records from Input Data: \n", + "id\n", + "54a40caa19925f464b374017 boneless muscovy duck breast half pound total ...\n", + "55d4e08063b1ba1b5534b198 tablespoon white wine vinegar brk teaspoon sug...\n", + "54a43ad16529d92b2c019fc3 cup basmati rice ounce brk cup sweeten flake c...\n", + "Name: ingredients_lemmafied, dtype: object\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 34756/34756 [00:03<00:00, 10450.53it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n", + "--------------------------------------------------------------------------------\n", + "Transformed Data:\n", + " 100g 125g 13x9x2 150g 1pound 1tablespoon \\\n", + "id \n", + "54a2b6b019925f464b373351 0.0 0.0 0.0 0.0 0.0 0.0 \n", + "54a408a019925f464b3733bc 0.0 0.0 0.0 0.0 0.0 0.0 \n", + "54a408a26529d92b2c003631 0.0 0.0 0.0 0.0 0.0 0.0 \n", + "54a408a66529d92b2c003638 0.0 0.0 0.0 0.0 0.0 0.0 \n", + "54a408a719925f464b3733cc 0.0 0.0 0.0 0.0 0.0 0.0 \n", + "\n", + " 1teaspoon 200g 250g 2cup ... árbol divide \\\n", + "id ... \n", + "54a2b6b019925f464b373351 0.0 0.0 0.0 0.0 ... 0.0 \n", + "54a408a019925f464b3733bc 0.0 0.0 0.0 0.0 ... 
0.0 \n", + "54a408a26529d92b2c003631 0.0 0.0 0.0 0.0 ... 0.0 \n", + "54a408a66529d92b2c003638 0.0 0.0 0.0 0.0 ... 0.0 \n", + "54a408a719925f464b3733cc 0.0 0.0 0.0 0.0 ... 0.0 \n", + "\n", + " árbol seed árbol seed remove árbol stem \\\n", + "id \n", + "54a2b6b019925f464b373351 0.0 0.0 0.0 \n", + "54a408a019925f464b3733bc 0.0 0.0 0.0 \n", + "54a408a26529d92b2c003631 0.0 0.0 0.0 \n", + "54a408a66529d92b2c003638 0.0 0.0 0.0 \n", + "54a408a719925f464b3733cc 0.0 0.0 0.0 \n", + "\n", + " árbol teaspoon árbol teaspoon crush \\\n", + "id \n", + "54a2b6b019925f464b373351 0.0 0.0 \n", + "54a408a019925f464b3733bc 0.0 0.0 \n", + "54a408a26529d92b2c003631 0.0 0.0 \n", + "54a408a66529d92b2c003638 0.0 0.0 \n", + "54a408a719925f464b3733cc 0.0 0.0 \n", + "\n", + " árbol teaspoon crush red árbol wipe \\\n", + "id \n", + "54a2b6b019925f464b373351 0.0 0.0 \n", + "54a408a019925f464b3733bc 0.0 0.0 \n", + "54a408a26529d92b2c003631 0.0 0.0 \n", + "54a408a66529d92b2c003638 0.0 0.0 \n", + "54a408a719925f464b3733cc 0.0 0.0 \n", + "\n", + " árbol wipe clean épice \n", + "id \n", + "54a2b6b019925f464b373351 0.0 0.0 \n", + "54a408a019925f464b3733bc 0.0 0.0 \n", + "54a408a26529d92b2c003631 0.0 0.0 \n", + "54a408a66529d92b2c003638 0.0 0.0 \n", + "54a408a719925f464b3733cc 0.0 0.0 \n", + "\n", + "[5 rows x 78381 columns]\n", + "\n", + "\n", + "--------------------------------------------------------------------------------\n", + "Random Sample of Combined Data:\n", + " 100g 125g 13x9x2 150g 1pound 1tablespoon \\\n", + "id \n", + "54a40caa19925f464b374017 0.0 0.0 0.0 0.0 0.0 0.0 \n", + "54a43ad16529d92b2c019fc3 0.0 0.0 0.0 0.0 0.0 0.0 \n", + "55d4e08063b1ba1b5534b198 0.0 0.0 0.0 0.0 0.0 0.0 \n", + "\n", + " 1teaspoon 200g 250g 2cup ... árbol seed \\\n", + "id ... \n", + "54a40caa19925f464b374017 0.0 0.0 0.0 0.0 ... 0.0 \n", + "54a43ad16529d92b2c019fc3 0.0 0.0 0.0 0.0 ... 0.0 \n", + "55d4e08063b1ba1b5534b198 0.0 0.0 0.0 0.0 ... 
0.0 \n", + "\n", + " árbol seed remove árbol stem árbol teaspoon \\\n", + "id \n", + "54a40caa19925f464b374017 0.0 0.0 0.0 \n", + "54a43ad16529d92b2c019fc3 0.0 0.0 0.0 \n", + "55d4e08063b1ba1b5534b198 0.0 0.0 0.0 \n", + "\n", + " árbol teaspoon crush árbol teaspoon crush red \\\n", + "id \n", + "54a40caa19925f464b374017 0.0 0.0 \n", + "54a43ad16529d92b2c019fc3 0.0 0.0 \n", + "55d4e08063b1ba1b5534b198 0.0 0.0 \n", + "\n", + " árbol wipe árbol wipe clean épice \\\n", + "id \n", + "54a40caa19925f464b374017 0.0 0.0 0.0 \n", + "54a43ad16529d92b2c019fc3 0.0 0.0 0.0 \n", + "55d4e08063b1ba1b5534b198 0.0 0.0 0.0 \n", + "\n", + " ingredients_lemmafied \n", + "id \n", + "54a40caa19925f464b374017 boneless muscovy duck breast half pound total ... \n", + "54a43ad16529d92b2c019fc3 cup basmati rice ounce brk cup sweeten flake c... \n", + "55d4e08063b1ba1b5534b198 tablespoon white wine vinegar brk teaspoon sug... \n", + "\n", + "[3 rows x 78382 columns]\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "413513de77ec40e097f0fe537db730da", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Downloading artifacts: 0%| | 0/1 [00:00str dicts\n", + " # Will be useful in STAGING/Evaluation\n", + " \n", + " # LOG MODEL\n", + " # Instantiate sklearn TFIDFVectorizer\n", + " sklearn_transformer = TfidfVectorizer(**sklearn_transformer_params)\n", + "\n", + " print('\\n')\n", + " print('-' * 80)\n", + " print('sklearn fit transform on ingredients:')\n", + "\n", + " model_input = whole_nlp_df['ingredients_lemmafied']\n", + "\n", + " print('\\n')\n", + " print('-' * 80)\n", + " print('Input Data: ')\n", + " print(model_input)\n", + "\n", + " print('\\n')\n", + " print('-' * 80)\n", + " print('Input Data Shape: ')\n", + " print(model_input.shape)\n", + "\n", + " random_sample = model_input.sample(3, random_state=200)\n", + "\n", + " print('\\n')\n", + " print('-' * 80)\n", + " print('Random 3 Records from Input Data: ')\n", + " 
print(random_sample)\n", + "\n", + " # Do fit transform on data\n", + " response = sklearn_transformer.fit_transform(tqdm(model_input)) \n", + " \n", + " transformed_recipe = pd.DataFrame(\n", + " response.toarray(),\n", + " columns=sklearn_transformer.get_feature_names_out(),\n", + " index=model_input.index\n", + " )\n", + "\n", + " signature = infer_signature(model_input=model_input,\n", + " model_output=transformed_recipe\n", + " )\n", + "\n", + " print('\\n')\n", + " print('-' * 80)\n", + " print('Transformed Data:')\n", + " print(transformed_recipe.head())\n", + " \n", + " combined_df = transformed_recipe.join(model_input, how='inner')\n", + " combined_df_sample = transformed_recipe.join(random_sample, how='inner')\n", + "\n", + " print('\\n')\n", + " print('-' * 80)\n", + " print('Random Sample of Combined Data:')\n", + " print(combined_df_sample.head())\n", + "\n", + " with open(sklearn_transformer_path, \"wb\") as fo:\n", + " pickle.dump(sklearn_transformer, fo)\n", + "\n", + " transformed_recipe.to_parquet(path=transformed_recipes_parquet_path, compression=\"gzip\")\n", + "\n", + " combined_df.to_parquet(path=combined_df_path, compression=\"gzip\")\n", + " \n", + " combined_df_sample.to_parquet(path=combined_df_sample_path)\n", + "\n", + " model_info = mlflow.pyfunc.log_model( \n", + " code_path=[\"../src/backend/\"],\n", + " python_model=CustomSKLearnWrapper(),\n", + " input_example=whole_nlp_df['ingredients_lemmafied'][0],\n", + " signature=signature, \n", + " artifact_path=\"sklearn_model\",\n", + " artifacts=artifacts\n", + " ) \n", + "\n", + " # since this uses a custom Stanza analyzer, we have to use a custom mlflow.Pyfunc.PythonModel" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[?25l\u001b[32m⠋\u001b[0m Checking graph \n", + "Adding... 
\n", + "!\u001b[A\n", + " 0% Checking cache in '/home/awchen/Repos/Projects/MeaLeon/.dvc/cache'| |0/? [0\u001b[A\n", + " \u001b[A\n", + "!\u001b[A\n", + " 0%| |Transferring 0/? [00:00 pd.DataFrame: + """This function takes in a pandas DataFrame from pd.read_json and performs some preprocessing by unpacking the nested dictionaries and creating new columns with the simplified structures. It will then drop the original columns that would no longer be needed. + + Args: + pd.DataFrame + + Returns: + pd.DataFrame + """ + + def stanza_filterer(recipe_ingredients: List[str], stanza_pipeline: stanza.Pipeline) -> str: + """This function converts a list of ingredients into a list of ingredient lemmas + It is intended to be used via an apply(lambda) until a better way is devised + + Args: + recipe_ingredients: List[str] + + Returns: + lemmafied: String + """ + lemmafied = " ".join( + str(word.lemma) + for sent in stanza_pipeline(recipe_ingredients).sentences + for word in sent.words + if ( + word.upos not in ["NUM", "DET", "ADV", "CCONJ", "ADP", "SCONJ", "PUNCT"] + and word is not None + ) + ) + return lemmafied + + def ingredient_lemmafier(df: pd.DataFrame, stanza_pipeline: stanza.Pipeline) -> pd.DataFrame: + """This function performs some text preprocessing: + 1. Converts the raw list of ingredients into a big string with ' brk ' token + 2. Remove accented characters + 3. Lowercase all characters + 4. Fill in nulls with filler + 5. 
Apply the lemmafier function above and store the results in a new column + """ + df["ingredients_lemmafied"] = ( + df["ingredients"] + .str.join(" brk ") + .str.normalize("NFKC") + .str.lower() + .fillna("Missing ingredients") + ).apply(lambda x: stanza_filterer(x, stanza_pipeline)) + + return df + + def link_maker(recipe_link: str) -> str: + """This function takes in the incomplete recipe link from the dataframe and returns the complete one.""" + full_link = f"https://www.epicurious.com{recipe_link}" + return full_link + + def cuisine_renamer(text: str) -> str: + """This function converts redundant and/or rare categories into more common + ones/umbrella ones. + + In the future, there's a hope that this renaming mechanism will not have + under sampled cuisine tags. + """ + if text == "Central American/Caribbean": + return "Caribbean" + elif text == "Jewish": + return "Kosher" + elif text == "Eastern European/Russian": + return "Eastern European" + elif text in ["Spanish/Portuguese", "Greek"]: + return "Mediterranean" + elif text == "Central/South American": + return "Latin American" + elif text == "Sushi": + return "Japanese" + elif text == "Southern Italian": + return "Italian" + elif text in ["Southern", "Tex-Mex"]: + return "American" + elif text in ["Southeast Asian", "Korean"]: + return "Asian" + else: + return text + + def null_filler(to_check: Dict[Text, Text], key_target: Text) -> Text: + """This function takes in a dictionary that is currently fed in with a lambda function and then performs column specific preprocessing. 
+ + Args: + to_check: dict + key_target: str + + Returns: + str + """ + + # Only look in the following keys, if the input isn't one of these, it should be recognized as an improper key + valid_keys = ["name", "filename", "credit"] + + # This dictionary converts the input keys into substrings that can be used in f-strings to fill in missing values in the record + translation_keys = { + "name": "Cuisine", + "filename": "Photo", + "credit": "Photo Credit", + } + + if key_target not in valid_keys: + # this logic makes sure we are only looking at valid keys + # this is not a real try/except + return ( + "Improper key target: can only pick from 'name', 'filename', 'credit'." + ) + + else: + if pd.isna(to_check): + # this logic checks to see if the dictionary exists at all. if so, return Missing + return f"Missing {translation_keys[key_target]}" + else: + if key_target == "name" and (to_check["category"] != "cuisine"): + # This logic checks for the cuisine, if the cuisine is not there (and instead has 'ingredient', 'type', 'item', 'equipment', 'meal'), mark as missing + return f"Missing {translation_keys[key_target]}" + else: + # Otherwise, there should be no issue with returning + return to_check[key_target] + + # separating out the below to execute with a __main__ would be cleaner + df = ingredient_lemmafier(df, nlp) + + # Dive into the tag column and extract the cuisine label. 
Put into new column or fills with "missing data" + df["cuisine_name"] = df["tag"].apply( + lambda x: null_filler(to_check=x, key_target="name") + ) + + # This apply uses the cuisine_renamer function above to relabel the cuisines to more general ones + df["cuisine_name"] = df["cuisine_name"].apply(cuisine_renamer) + + # this lambda function goes into the photo data column and extracts just the filename from the dictionary + df["photo_filename"] = df["photoData"].apply( + lambda x: null_filler(to_check=x, key_target="filename") + ) # type:ignore + + # This lambda function goes into the photo data column and extracts just the photo credit from the dictionary + df["photo_credit"] = df["photoData"].apply( + lambda x: null_filler(to_check=x, key_target="credit") + ) # type:ignore + + # for the above, maybe they can be refactored to one function where the arguments are a column name, dictionary key name, the substring return + + # this lambda function goes into the author column and extracts the author name or fills with "missing data" + df["author_name"] = df["author"].apply( + lambda x: x[0]["name"] if x else "Missing Author Name" + ) # type:ignore + + # This function takes in the given pubDate column and creates a new column with the pubDate values converted to datetime objects + df["date_published"] = pd.to_datetime( + df["pubDate"], infer_datetime_format=True + ) # type:ignore + + # this function takes in the given url column and prepends the full epicurious URL base + df["recipe_url"] = df["url"].apply(link_maker) # type:ignore + + # drop some original columns to clean up the dataframe + df.drop( + labels=["tag", "photoData", "author", "type", "dateCrawled", "pubDate", "url"], + axis=1, + inplace=True, + ) + + df.set_index("id", inplace=True) + + return df