From ab26b2e4a1dda08593a14f7afc36b7e5acc524b7 Mon Sep 17 00:00:00 2001 From: Aaron W Chen Date: Tue, 9 Apr 2024 13:53:50 -0700 Subject: [PATCH] Try making TFIDF embeddings Tested DVC and fixed commands Tried making TFIDF embeddings, but no space on laptop --- nbs/15_create_tfidf_embeddings.ipynb | 1249 ++++++++++++++++++++++++++ 1 file changed, 1249 insertions(+) create mode 100644 nbs/15_create_tfidf_embeddings.ipynb diff --git a/nbs/15_create_tfidf_embeddings.ipynb b/nbs/15_create_tfidf_embeddings.ipynb new file mode 100644 index 0000000..9e207ff --- /dev/null +++ b/nbs/15_create_tfidf_embeddings.ipynb @@ -0,0 +1,1249 @@ +{ + "cells": [ + { + "cell_type": "raw", + "metadata": {}, + "source": [ + "---\n", + "description: test\n", + "output-file: template.html\n", + "title: Template\n", + "\n", + "---\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# | default_exp core" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "d7088c22cb4c4a2aa598d9fb700e8af0", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.6.0.json: 0%| …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024-03-27 20:57:40 INFO: Downloading default packages for language: en (English) ...\n", + "2024-03-27 20:57:41 INFO: File exists: /home/awchen/stanza_resources/en/default.zip\n", + "2024-03-27 20:57:44 INFO: Finished downloading models and saved to /home/awchen/stanza_resources.\n", + "2024-03-27 20:57:44 INFO: Checking for updates to resources.json in case models have been updated. 
Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "f62ab630814a49449d6f0be2ac47e87e", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.6.0.json: 0%| …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024-03-27 20:57:45 INFO: Loading these models for language: en (English):\n", + "======================================\n", + "| Processor | Package |\n", + "--------------------------------------\n", + "| tokenize | combined |\n", + "| pos | combined_charlm |\n", + "| lemma | combined_nocharlm |\n", + "| constituency | ptb3-revised_charlm |\n", + "| depparse | combined_charlm |\n", + "| sentiment | sstplus |\n", + "| ner | ontonotes_charlm |\n", + "======================================\n", + "\n", + "2024-03-27 20:57:45 INFO: Using device: cpu\n", + "2024-03-27 20:57:45 INFO: Loading: tokenize\n", + "2024-03-27 20:57:45 INFO: Loading: pos\n", + "2024-03-27 20:57:45 INFO: Loading: lemma\n", + "2024-03-27 20:57:45 INFO: Loading: constituency\n", + "2024-03-27 20:57:45 INFO: Loading: depparse\n", + "2024-03-27 20:57:45 INFO: Loading: sentiment\n", + "2024-03-27 20:57:46 INFO: Loading: ner\n", + "2024-03-27 20:57:46 INFO: Done loading processors!\n" + ] + } + ], + "source": [ + "# | hide\n", + "# from bertopic import BERTopic\n", + "# from bertopic.vectorizers import OnlineCountVectorizer\n", + "import dagshub\n", + "from datetime import datetime\n", + "import dill as pickle\n", + "import dvc.api\n", + "# from hdbscan import HDBSCAN\n", + "from itertools import tee, islice, product\n", + "import joblib\n", + "import nbdev\n", + "from nbdev.showdoc import *\n", + "import pandas as pd\n", + "import re\n", + "from sentence_transformers import SentenceTransformer\n", + "from sklearn.feature_extraction.text import (\n", + " CountVectorizer\n", + " , TfidfTransformer\n", + " , TfidfVectorizer\n", + " , \n", + ")\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.pipeline import make_pipeline\n", + "from src.custom_sklearn_text_transformer_mlflow import CustomSKLearnAnalyzer\n", + "import src.dataframe_preprocessor as dfpp\n", + "import stanza\n", + "from tqdm import tqdm\n", + "# from umap import UMAP" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!export 'PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:128'" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# | export" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# | hide\n", + "# nbdev.nbdev_export()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Data Preparation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "e03d339b07814cbc840d2f131a85f927", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.6.0.json: 0%| …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024-03-27 20:57:46 
INFO: Downloading default packages for language: en (English) ...\n", + "2024-03-27 20:57:47 INFO: File exists: /home/awchen/stanza_resources/en/default.zip\n", + "2024-03-27 20:57:50 INFO: Finished downloading models and saved to /home/awchen/stanza_resources.\n", + "2024-03-27 20:57:50 INFO: Checking for updates to resources.json in case models have been updated. Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "c31d2969037c4ad1a79333ff34364488", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.6.0.json: 0%| …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024-03-27 20:57:51 INFO: Loading these models for language: en (English):\n", + "======================================\n", + "| Processor | Package |\n", + "--------------------------------------\n", + "| tokenize | combined |\n", + "| pos | combined_charlm |\n", + "| lemma | combined_nocharlm |\n", + "| constituency | ptb3-revised_charlm |\n", + "| depparse | combined_charlm |\n", + "| sentiment | sstplus |\n", + "| ner | ontonotes_charlm |\n", + "======================================\n", + "\n", + "2024-03-27 20:57:51 INFO: Using device: cuda\n", + "2024-03-27 20:57:51 INFO: Loading: tokenize\n", + "2024-03-27 20:57:54 INFO: Loading: pos\n", + "2024-03-27 20:57:54 INFO: Loading: lemma\n", + "2024-03-27 20:57:54 INFO: Loading: constituency\n", + "2024-03-27 20:57:54 INFO: Loading: depparse\n", + "2024-03-27 20:57:54 INFO: Loading: sentiment\n", + "2024-03-27 20:57:55 INFO: Loading: ner\n", + "2024-03-27 20:57:55 INFO: Done loading processors!\n" + ] + } + ], + "source": [ + "# instantiate stanza pipeline\n", + "stanza.download('en')\n", + "nlp = stanza.Pipeline('en', \n", + " depparse_batch_size=50, \n", + " depparse_min_length_to_batch_separately=50,\n", + " verbose=True,\n", + " use_gpu=True, # set to true when on cloud/not on streaming computer\n", + " batch_size=100\n", + " )\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# load raw data and preprocess/clean\n", + "data = dvc.api.read(\n", + " path='../data/recipes-en-201706/epicurious-recipes_m2.json'\n", + " , mode='r')\n", + "raw_df = pd.read_json(data)\n", + "print('\\n')\n", + "print('--------------')\n", + "print('Raw Dataframe:', end='\\n')\n", + "print(raw_df.head())\n", + "print(raw_df.shape)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# take sample and train/test split \n", + "subset_df = raw_df.sample(n=100, random_state=45)\n", + "train_df, test_df = train_test_split(subset_df,test_size=0.5, random_state=45)\n", + "\n", + "# pre_proc_df is cleaned dataframe\n", + "to_nlp_df = dfpp.preprocess_dataframe(train_df)\n", + "print('\\n')\n", + "print('--------------')\n", + "print('Preprocessed Dataframe:', end='\\n')\n", + "print(to_nlp_df.head())\n", + "print(to_nlp_df.shape)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# cv_params are parameters for the sklearn CountVectorizer or TFIDFVectorizer\n", + "sklearn_transformer_params = { \n", + " 'analyzer': CustomSKLearnAnalyzer().ngram_maker(\n", + " 
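# assumption: ngram_maker() (project helper in CustomSKLearnAnalyzer) returns a callable analyzer that turns a lemmatized ingredient string into 1- to 4-gram terms for TfidfVectorizer\n",
+    "        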
min_ngram_length=1,\n", + " max_ngram_length=4,\n", + " ),\n", + " 'min_df':3,\n", + " # 'binary':False\n", + "}\n", + "\n", + "sklearn_transformer = TfidfVectorizer(**sklearn_transformer_params)\n", + "\n", + "model_input = to_nlp_df['ingredients_lemmafied']\n", + "\n", + "# Do fit transform on data\n", + "print(\"fit_transform start: \" + str(datetime.now()))\n", + "response = sklearn_transformer.fit_transform(tqdm(model_input)) \n", + "print(\"fit_transform end: \" + str(datetime.now()))\n", + "\n", + "transformed_recipe = pd.DataFrame(\n", + " response.toarray(),\n", + " columns=sklearn_transformer.get_feature_names_out(),\n", + " index=model_input.index\n", + ")\n", + "\n", + "print(transformed_recipe.columns)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "transformed_recipe" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "transformed_recipe.columns.tolist()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "to_nlp_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Prepare whole dataframe for new processing\n", + "import mlflow\n", + "from mlflow.models import infer_signature\n", + "from src.custom_stanza_mlflow import CustomSKLearnWrapper" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# this function allows us to get the experiment ID from an experiment name\n", + "def get_experiment_id(name):\n", + " exp = mlflow.get_experiment_by_name(name)\n", + " if exp is None:\n", + " exp_id = mlflow.create_experiment(name)\n", + " return exp_id\n", + " return exp.experiment_id" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
Repository initialized!\n",
+       "</pre>
\n" + ], + "text/plain": [ + "Repository initialized!\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "#@markdown Enter the username of your DAGsHub account:\n", + "DAGSHUB_USER_NAME = \"AaronWChen\" #@param {type:\"string\"}\n", + "\n", + "#@markdown Enter the email for your DAGsHub account:\n", + "DAGSHUB_EMAIL = \"awc33@cornell.edu\" #@param {type:\"string\"}\n", + "\n", + "#@markdown Enter the repo name \n", + "DAGSHUB_REPO_NAME = \"MeaLeon\"\n", + "\n", + "#@markdown Enter the name of the branch you are working on \n", + "BRANCH = \"NGRAM-1/try-llm-code-speedup\"\n", + "dagshub.init(repo_name=DAGSHUB_REPO_NAME\n", + " , repo_owner=DAGSHUB_USER_NAME)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Starting DEV stage for TFIDF Encoded model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "mlflow.set_tracking_uri(f'https://dagshub.com/{DAGSHUB_USER_NAME}/MeaLeon.mlflow')\n", + "\n", + "# starter idea for making an experiment name can be the git branch, but need more specificity\n", + "experiment_name = f\"{DAGSHUB_EMAIL}/OHE_up_to_quadgrams\"\n", + "mlflow_exp_id = get_experiment_id(experiment_name)\n", + "\n", + "# define model location\n", + "# model_directory = \"/tmp/sklearn_model\"\n", + "model_directory = \"../models/sklearn_model\"\n", + "\n", + "# Define the required artifacts associated with the saved custom pyfunc\n", + "# sklearn_path = model_directory + \"\"\n", + "sklearn_model_path = model_directory + \"/python_model.pkl\"\n", + "sklearn_transformer_path = model_directory + \"/sklearn_transformer.pkl\"\n", + "transformed_recipes_path = model_directory + \"/transformed_recipes.pkl\"\n", + "combined_df_path = model_directory + \"/combined_df.pkl\"\n", + "\n", + "artifacts = {'sklearn_model': sklearn_model_path,\n", + " 'sklearn_transformer': sklearn_transformer_path,\n", + " 'transformed_recipes': transformed_recipes_path,\n", + " # 'combined_data': combined_df_path\n", + " }\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# pre_proc_df is cleaned dataframe\n", + "print(\"Preprocess start: \" + str(datetime.now()))\n", + "whole_nlp_df = dfpp.preprocess_dataframe(raw_df)\n", + "print(\"Preprocess end: \" + str(datetime.now()))\n", + "print('\\n')\n", + "print('--------------')\n", + "print('Preprocessed Dataframe: ', end='\\n')\n", + "print(whole_nlp_df.head())\n", + "print(whole_nlp_df.shape)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "whole_nlp_df.to_parquet('../joblib/2024.03.19/pre_proc_df.parquet.gzip', \n", + " compression='gzip',\n", + " index=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
<div>\n",
+       "  [HTML table rendering of whole_nlp_df.head() omitted; the markup did not survive transfer, and the same five rows appear in the text/plain output below.]\n",
+       "</div>
" + ], + "text/plain": [ + " dek \\\n", + "id \n", + "54a2b6b019925f464b373351 How does fried chicken achieve No. 1 status? B... \n", + "54a408a019925f464b3733bc Spinaci all'Ebraica \n", + "54a408a26529d92b2c003631 This majestic, moist, and richly spiced honey ... \n", + "54a408a66529d92b2c003638 The idea for this sandwich came to me when my ... \n", + "54a408a719925f464b3733cc In 1930, Simon Agranat, the chief justice of t... \n", + "\n", + " hed \\\n", + "id \n", + "54a2b6b019925f464b373351 Pickle-Brined Fried Chicken \n", + "54a408a019925f464b3733bc Spinach Jewish Style \n", + "54a408a26529d92b2c003631 New Year’s Honey Cake \n", + "54a408a66529d92b2c003638 The B.L.A.—Bagel with Lox and Avocado \n", + "54a408a719925f464b3733cc Shakshuka a la Doktor Shakshuka \n", + "\n", + " aggregateRating \\\n", + "id \n", + "54a2b6b019925f464b373351 3.11 \n", + "54a408a019925f464b3733bc 3.22 \n", + "54a408a26529d92b2c003631 3.62 \n", + "54a408a66529d92b2c003638 4.00 \n", + "54a408a719925f464b3733cc 2.71 \n", + "\n", + " ingredients \\\n", + "id \n", + "54a2b6b019925f464b373351 [1 tablespoons yellow mustard seeds, 1 tablesp... \n", + "54a408a019925f464b3733bc [3 pounds small-leaved bulk spinach, Salt, 1/2... \n", + "54a408a26529d92b2c003631 [3 1/2 cups all-purpose flour, 1 tablespoon ba... \n", + "54a408a66529d92b2c003638 [1 small ripe avocado, preferably Hass (see No... \n", + "54a408a719925f464b3733cc [2 pounds fresh tomatoes, unpeeled and cut in ... \n", + "\n", + " prepSteps \\\n", + "id \n", + "54a2b6b019925f464b373351 [Toast mustard and coriander seeds in a dry me... \n", + "54a408a019925f464b3733bc [Remove the stems and roots from the spinach. ... \n", + "54a408a26529d92b2c003631 [I like this cake best baked in a 9-inch angel... \n", + "54a408a66529d92b2c003638 [A short time before serving, mash avocado and... \n", + "54a408a719925f464b3733cc [1. Place the tomatoes, garlic, salt, paprika,... \n", + "\n", + " reviewsCount willMakeAgainPct \\\n", + "id \n", + "54a2b6b019925f464b373351 7 100 \n", + "54a408a019925f464b3733bc 5 80 \n", + "54a408a26529d92b2c003631 105 88 \n", + "54a408a66529d92b2c003638 7 100 \n", + "54a408a719925f464b3733cc 7 83 \n", + "\n", + " ingredients_lemmafied \\\n", + "id \n", + "54a2b6b019925f464b373351 tablespoon yellow mustard seed brk tablespoon ... \n", + "54a408a019925f464b3733bc pound small leave bulk spinach brk salt brk cu... \n", + "54a408a26529d92b2c003631 cup purpose flour brk tablespoon baking powder... \n", + "54a408a66529d92b2c003638 small ripe avocado hass see note brk teaspoon ... \n", + "54a408a719925f464b3733cc pound fresh tomato unpeeled cut quarter ounce ... \n", + "\n", + " cuisine_name \\\n", + "id \n", + "54a2b6b019925f464b373351 Missing Cuisine \n", + "54a408a019925f464b3733bc Italian \n", + "54a408a26529d92b2c003631 Kosher \n", + "54a408a66529d92b2c003638 Kosher \n", + "54a408a719925f464b3733cc Kosher \n", + "\n", + " photo_filename \\\n", + "id \n", + "54a2b6b019925f464b373351 51247610_fried-chicken_1x1.jpg \n", + "54a408a019925f464b3733bc EP_12162015_placeholders_rustic.jpg \n", + "54a408a26529d92b2c003631 EP_09022015_honeycake-2.jpg \n", + "54a408a66529d92b2c003638 EP_12162015_placeholders_casual.jpg \n", + "54a408a719925f464b3733cc EP_12162015_placeholders_formal.jpg \n", + "\n", + " photo_credit \\\n", + "id \n", + "54a2b6b019925f464b373351 Michael Graydon and Nikole Herriott \n", + "54a408a019925f464b3733bc Photo by Chelsea Kyle, Prop Styling by Anna St... \n", + "54a408a26529d92b2c003631 Photo by Chelsea Kyle, Food Styling by Anna St... 
\n", + "54a408a66529d92b2c003638 Photo by Chelsea Kyle, Prop Styling by Rhoda B... \n", + "54a408a719925f464b3733cc Photo by Chelsea Kyle, Prop Styling by Rhoda B... \n", + "\n", + " author_name date_published \\\n", + "id \n", + "54a2b6b019925f464b373351 Missing Author Name 2014-08-19 04:00:00+00:00 \n", + "54a408a019925f464b3733bc Edda Servi Machlin 2008-09-09 04:00:00+00:00 \n", + "54a408a26529d92b2c003631 Marcy Goldman 2008-09-10 04:00:00+00:00 \n", + "54a408a66529d92b2c003638 Faye Levy 2008-09-08 04:00:00+00:00 \n", + "54a408a719925f464b3733cc Joan Nathan 2008-09-09 04:00:00+00:00 \n", + "\n", + " recipe_url \n", + "id \n", + "54a2b6b019925f464b373351 https://www.epicurious.com/recipes/food/views/... \n", + "54a408a019925f464b3733bc https://www.epicurious.com/recipes/food/views/... \n", + "54a408a26529d92b2c003631 https://www.epicurious.com/recipes/food/views/... \n", + "54a408a66529d92b2c003638 https://www.epicurious.com/recipes/food/views/... \n", + "54a408a719925f464b3733cc https://www.epicurious.com/recipes/food/views/... " + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "whole_nlp_df = pd.read_parquet('../joblib/2024.03.19/pre_proc_df.parquet.gzip')\n", + "whole_nlp_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n", + "--------------------------------------------------------------------------------\n", + "sklearn fit transform on ingredients:\n", + "\n", + "\n", + "--------------------------------------------------------------------------------\n", + "Input Data: \n", + "id\n", + "54a2b6b019925f464b373351 tablespoon yellow mustard seed brk tablespoon ...\n", + "54a408a019925f464b3733bc pound small leave bulk spinach brk salt brk cu...\n", + "54a408a26529d92b2c003631 cup purpose flour brk tablespoon baking powder...\n", + "54a408a66529d92b2c003638 small ripe avocado hass see note brk teaspoon ...\n", + "54a408a719925f464b3733cc pound fresh tomato unpeeled cut quarter ounce ...\n", + " ... \n", + "59541a31bff3052847ae2107 tablespoon unsalt butter room temperature brk ...\n", + "5954233ad52ca90dc28200e7 tablespoon stick salt butter room temperature ...\n", + "595424c2109c972493636f83 tablespoon unsalted butter more greasing pan b...\n", + "5956638625dc3d1d829b7166 coarse salt brk lime wedge brk ounce tomato ju...\n", + "59566daa25dc3d1d829b7169 bottle millileter sour beer such almanac citra...\n", + "Name: ingredients_lemmafied, Length: 34756, dtype: object\n", + "\n", + "\n", + "--------------------------------------------------------------------------------\n", + "Input Data Shape: \n", + "(34756,)\n", + "\n", + "\n", + "--------------------------------------------------------------------------------\n", + "Random 3 Records from Input Data: \n", + "id\n", + "54a40caa19925f464b374017 boneless muscovy duck breast half pound total ...\n", + "55d4e08063b1ba1b5534b198 tablespoon white wine vinegar brk teaspoon sug...\n", + "54a43ad16529d92b2c019fc3 cup basmati rice ounce brk cup sweeten flake c...\n", + "Name: ingredients_lemmafied, dtype: object\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 34756/34756 [00:02<00:00, 11734.80it/s]\n", + "/home/awchen/Repos/Projects/MeaLeon/.venv/lib/python3.10/site-packages/mlflow/models/signature.py:213: UserWarning: Hint: Inferred schema contains integer column(s). 
Integer columns in Python cannot represent missing values. If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training dataset) that includes missing values. Alternatively, you can declare integer columns as doubles (float64) whenever these columns may have missing values. See `Handling Integers With Missing Values `_ for more details.\n", + " outputs = _infer_schema(model_output) if model_output is not None else None\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n", + "--------------------------------------------------------------------------------\n", + "Transformed Data:\n", + " 100g 125g 13x9x2 150g 1pound 1tablespoon \\\n", + "id \n", + "54a2b6b019925f464b373351 0 0 0 0 0 0 \n", + "54a408a019925f464b3733bc 0 0 0 0 0 0 \n", + "54a408a26529d92b2c003631 0 0 0 0 0 0 \n", + "54a408a66529d92b2c003638 0 0 0 0 0 0 \n", + "54a408a719925f464b3733cc 0 0 0 0 0 0 \n", + "\n", + " 1teaspoon 200g 250g 2cup ... árbol divide \\\n", + "id ... \n", + "54a2b6b019925f464b373351 0 0 0 0 ... 0 \n", + "54a408a019925f464b3733bc 0 0 0 0 ... 0 \n", + "54a408a26529d92b2c003631 0 0 0 0 ... 0 \n", + "54a408a66529d92b2c003638 0 0 0 0 ... 0 \n", + "54a408a719925f464b3733cc 0 0 0 0 ... 0 \n", + "\n", + " árbol seed árbol seed remove árbol stem \\\n", + "id \n", + "54a2b6b019925f464b373351 0 0 0 \n", + "54a408a019925f464b3733bc 0 0 0 \n", + "54a408a26529d92b2c003631 0 0 0 \n", + "54a408a66529d92b2c003638 0 0 0 \n", + "54a408a719925f464b3733cc 0 0 0 \n", + "\n", + " árbol teaspoon árbol teaspoon crush \\\n", + "id \n", + "54a2b6b019925f464b373351 0 0 \n", + "54a408a019925f464b3733bc 0 0 \n", + "54a408a26529d92b2c003631 0 0 \n", + "54a408a66529d92b2c003638 0 0 \n", + "54a408a719925f464b3733cc 0 0 \n", + "\n", + " árbol teaspoon crush red árbol wipe \\\n", + "id \n", + "54a2b6b019925f464b373351 0 0 \n", + "54a408a019925f464b3733bc 0 0 \n", + "54a408a26529d92b2c003631 0 0 \n", + "54a408a66529d92b2c003638 0 0 \n", + "54a408a719925f464b3733cc 0 0 \n", + "\n", + " árbol wipe clean épice \n", + "id \n", + "54a2b6b019925f464b373351 0 0 \n", + "54a408a019925f464b3733bc 0 0 \n", + "54a408a26529d92b2c003631 0 0 \n", + "54a408a66529d92b2c003638 0 0 \n", + "54a408a719925f464b3733cc 0 0 \n", + "\n", + "[5 rows x 78378 columns]\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "75ae1f83e714420fafae1ba91d492f9a", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Downloading artifacts: 0%| | 0/1 [00:00 715\u001b[0m httplib_response \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_make_request\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 716\u001b[0m \u001b[43m \u001b[49m\u001b[43mconn\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 717\u001b[0m \u001b[43m \u001b[49m\u001b[43mmethod\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 718\u001b[0m \u001b[43m \u001b[49m\u001b[43murl\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 719\u001b[0m \u001b[43m \u001b[49m\u001b[43mtimeout\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtimeout_obj\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 720\u001b[0m \u001b[43m \u001b[49m\u001b[43mbody\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mbody\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 721\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mheaders\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mheaders\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 722\u001b[0m \u001b[43m \u001b[49m\u001b[43mchunked\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mchunked\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 723\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 725\u001b[0m \u001b[38;5;66;03m# If we're going to release the connection in ``finally:``, then\u001b[39;00m\n\u001b[1;32m 726\u001b[0m \u001b[38;5;66;03m# the response doesn't need to know about the connection. Otherwise\u001b[39;00m\n\u001b[1;32m 727\u001b[0m \u001b[38;5;66;03m# it will also try to release it and we'll have a double-release\u001b[39;00m\n\u001b[1;32m 728\u001b[0m \u001b[38;5;66;03m# mess.\u001b[39;00m\n", + "File \u001b[0;32m~/Repos/Projects/MeaLeon/.venv/lib/python3.10/site-packages/urllib3/connectionpool.py:416\u001b[0m, in \u001b[0;36mHTTPConnectionPool._make_request\u001b[0;34m(self, conn, method, url, timeout, chunked, **httplib_request_kw)\u001b[0m\n\u001b[1;32m 415\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 416\u001b[0m \u001b[43mconn\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrequest\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmethod\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43murl\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mhttplib_request_kw\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 418\u001b[0m \u001b[38;5;66;03m# We are swallowing BrokenPipeError (errno.EPIPE) since the server is\u001b[39;00m\n\u001b[1;32m 419\u001b[0m \u001b[38;5;66;03m# legitimately able to close the connection after sending a valid response.\u001b[39;00m\n\u001b[1;32m 420\u001b[0m \u001b[38;5;66;03m# With this behaviour, the received response is still readable.\u001b[39;00m\n", + "File \u001b[0;32m~/Repos/Projects/MeaLeon/.venv/lib/python3.10/site-packages/urllib3/connection.py:244\u001b[0m, in \u001b[0;36mHTTPConnection.request\u001b[0;34m(self, method, url, body, headers)\u001b[0m\n\u001b[1;32m 243\u001b[0m headers[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mUser-Agent\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m _get_default_user_agent()\n\u001b[0;32m--> 244\u001b[0m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mHTTPConnection\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrequest\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmethod\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43murl\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mbody\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mbody\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mheaders\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mheaders\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/.asdf/installs/python/3.10.10/lib/python3.10/http/client.py:1282\u001b[0m, in \u001b[0;36mHTTPConnection.request\u001b[0;34m(self, method, url, body, headers, encode_chunked)\u001b[0m\n\u001b[1;32m 1281\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"Send a complete request to the server.\"\"\"\u001b[39;00m\n\u001b[0;32m-> 1282\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_send_request\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmethod\u001b[49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[43murl\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mbody\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mheaders\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mencode_chunked\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/.asdf/installs/python/3.10.10/lib/python3.10/http/client.py:1328\u001b[0m, in \u001b[0;36mHTTPConnection._send_request\u001b[0;34m(self, method, url, body, headers, encode_chunked)\u001b[0m\n\u001b[1;32m 1327\u001b[0m body \u001b[38;5;241m=\u001b[39m _encode(body, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mbody\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[0;32m-> 1328\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mendheaders\u001b[49m\u001b[43m(\u001b[49m\u001b[43mbody\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mencode_chunked\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mencode_chunked\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/.asdf/installs/python/3.10.10/lib/python3.10/http/client.py:1277\u001b[0m, in \u001b[0;36mHTTPConnection.endheaders\u001b[0;34m(self, message_body, encode_chunked)\u001b[0m\n\u001b[1;32m 1276\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m CannotSendHeader()\n\u001b[0;32m-> 1277\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_send_output\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmessage_body\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mencode_chunked\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mencode_chunked\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/.asdf/installs/python/3.10.10/lib/python3.10/http/client.py:1076\u001b[0m, in \u001b[0;36mHTTPConnection._send_output\u001b[0;34m(self, message_body, encode_chunked)\u001b[0m\n\u001b[1;32m 1074\u001b[0m chunk \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mlen\u001b[39m(chunk)\u001b[38;5;132;01m:\u001b[39;00m\u001b[38;5;124mX\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;130;01m\\r\u001b[39;00m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m'\u001b[39m\u001b[38;5;241m.\u001b[39mencode(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mascii\u001b[39m\u001b[38;5;124m'\u001b[39m) \u001b[38;5;241m+\u001b[39m chunk \\\n\u001b[1;32m 1075\u001b[0m \u001b[38;5;241m+\u001b[39m \u001b[38;5;124mb\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;130;01m\\r\u001b[39;00m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m'\u001b[39m\n\u001b[0;32m-> 1076\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msend\u001b[49m\u001b[43m(\u001b[49m\u001b[43mchunk\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1078\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m encode_chunked \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_http_vsn \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m11\u001b[39m:\n\u001b[1;32m 1079\u001b[0m \u001b[38;5;66;03m# end chunked transfer\u001b[39;00m\n", + "File \u001b[0;32m~/.asdf/installs/python/3.10.10/lib/python3.10/http/client.py:998\u001b[0m, in \u001b[0;36mHTTPConnection.send\u001b[0;34m(self, data)\u001b[0m\n\u001b[1;32m 997\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 998\u001b[0m 
\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msock\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msendall\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdata\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 999\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mTypeError\u001b[39;00m:\n", + "File \u001b[0;32m~/.asdf/installs/python/3.10.10/lib/python3.10/ssl.py:1237\u001b[0m, in \u001b[0;36mSSLSocket.sendall\u001b[0;34m(self, data, flags)\u001b[0m\n\u001b[1;32m 1236\u001b[0m \u001b[38;5;28;01mwhile\u001b[39;00m count \u001b[38;5;241m<\u001b[39m amount:\n\u001b[0;32m-> 1237\u001b[0m v \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msend\u001b[49m\u001b[43m(\u001b[49m\u001b[43mbyte_view\u001b[49m\u001b[43m[\u001b[49m\u001b[43mcount\u001b[49m\u001b[43m:\u001b[49m\u001b[43m]\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1238\u001b[0m count \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m v\n", + "File \u001b[0;32m~/.asdf/installs/python/3.10.10/lib/python3.10/ssl.py:1206\u001b[0m, in \u001b[0;36mSSLSocket.send\u001b[0;34m(self, data, flags)\u001b[0m\n\u001b[1;32m 1203\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[1;32m 1204\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mnon-zero flags not allowed in calls to send() on \u001b[39m\u001b[38;5;132;01m%s\u001b[39;00m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;241m%\u001b[39m\n\u001b[1;32m 1205\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__class__\u001b[39m)\n\u001b[0;32m-> 1206\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_sslobj\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mwrite\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdata\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1207\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n", + "\u001b[0;31mSSLEOFError\u001b[0m: EOF occurred in violation of protocol (_ssl.c:2396)", + "\nDuring handling of the above exception, another exception occurred:\n", + "\u001b[0;31mMaxRetryError\u001b[0m Traceback (most recent call last)", + "File \u001b[0;32m~/Repos/Projects/MeaLeon/.venv/lib/python3.10/site-packages/requests/adapters.py:486\u001b[0m, in \u001b[0;36mHTTPAdapter.send\u001b[0;34m(self, request, stream, timeout, verify, cert, proxies)\u001b[0m\n\u001b[1;32m 485\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 486\u001b[0m resp \u001b[38;5;241m=\u001b[39m \u001b[43mconn\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43murlopen\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 487\u001b[0m \u001b[43m \u001b[49m\u001b[43mmethod\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrequest\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmethod\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 488\u001b[0m \u001b[43m \u001b[49m\u001b[43murl\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43murl\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 489\u001b[0m \u001b[43m \u001b[49m\u001b[43mbody\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrequest\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mbody\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 490\u001b[0m \u001b[43m \u001b[49m\u001b[43mheaders\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrequest\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mheaders\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 491\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mredirect\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 492\u001b[0m \u001b[43m \u001b[49m\u001b[43massert_same_host\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 493\u001b[0m \u001b[43m \u001b[49m\u001b[43mpreload_content\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 494\u001b[0m \u001b[43m \u001b[49m\u001b[43mdecode_content\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 495\u001b[0m \u001b[43m \u001b[49m\u001b[43mretries\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmax_retries\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 496\u001b[0m \u001b[43m \u001b[49m\u001b[43mtimeout\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtimeout\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 497\u001b[0m \u001b[43m \u001b[49m\u001b[43mchunked\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mchunked\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 498\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 500\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m (ProtocolError, \u001b[38;5;167;01mOSError\u001b[39;00m) \u001b[38;5;28;01mas\u001b[39;00m err:\n", + "File \u001b[0;32m~/Repos/Projects/MeaLeon/.venv/lib/python3.10/site-packages/urllib3/connectionpool.py:827\u001b[0m, in \u001b[0;36mHTTPConnectionPool.urlopen\u001b[0;34m(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)\u001b[0m\n\u001b[1;32m 824\u001b[0m log\u001b[38;5;241m.\u001b[39mwarning(\n\u001b[1;32m 825\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mRetrying (\u001b[39m\u001b[38;5;132;01m%r\u001b[39;00m\u001b[38;5;124m) after connection broken by \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;132;01m%r\u001b[39;00m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m: \u001b[39m\u001b[38;5;132;01m%s\u001b[39;00m\u001b[38;5;124m\"\u001b[39m, retries, err, url\n\u001b[1;32m 826\u001b[0m )\n\u001b[0;32m--> 827\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43murlopen\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 828\u001b[0m \u001b[43m \u001b[49m\u001b[43mmethod\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 829\u001b[0m \u001b[43m \u001b[49m\u001b[43murl\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 830\u001b[0m \u001b[43m \u001b[49m\u001b[43mbody\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 831\u001b[0m \u001b[43m \u001b[49m\u001b[43mheaders\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 832\u001b[0m \u001b[43m \u001b[49m\u001b[43mretries\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 833\u001b[0m \u001b[43m \u001b[49m\u001b[43mredirect\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 834\u001b[0m \u001b[43m \u001b[49m\u001b[43massert_same_host\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 835\u001b[0m \u001b[43m \u001b[49m\u001b[43mtimeout\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtimeout\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 836\u001b[0m \u001b[43m \u001b[49m\u001b[43mpool_timeout\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpool_timeout\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 837\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mrelease_conn\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrelease_conn\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 838\u001b[0m \u001b[43m \u001b[49m\u001b[43mchunked\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mchunked\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 839\u001b[0m \u001b[43m \u001b[49m\u001b[43mbody_pos\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mbody_pos\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 840\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mresponse_kw\u001b[49m\n\u001b[1;32m 841\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 843\u001b[0m \u001b[38;5;66;03m# Handle redirect?\u001b[39;00m\n", + "File \u001b[0;32m~/Repos/Projects/MeaLeon/.venv/lib/python3.10/site-packages/urllib3/connectionpool.py:827\u001b[0m, in \u001b[0;36mHTTPConnectionPool.urlopen\u001b[0;34m(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)\u001b[0m\n\u001b[1;32m 824\u001b[0m log\u001b[38;5;241m.\u001b[39mwarning(\n\u001b[1;32m 825\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mRetrying (\u001b[39m\u001b[38;5;132;01m%r\u001b[39;00m\u001b[38;5;124m) after connection broken by \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;132;01m%r\u001b[39;00m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m: \u001b[39m\u001b[38;5;132;01m%s\u001b[39;00m\u001b[38;5;124m\"\u001b[39m, retries, err, url\n\u001b[1;32m 826\u001b[0m )\n\u001b[0;32m--> 827\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43murlopen\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 828\u001b[0m \u001b[43m \u001b[49m\u001b[43mmethod\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 829\u001b[0m \u001b[43m \u001b[49m\u001b[43murl\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 830\u001b[0m \u001b[43m \u001b[49m\u001b[43mbody\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 831\u001b[0m \u001b[43m \u001b[49m\u001b[43mheaders\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 832\u001b[0m \u001b[43m \u001b[49m\u001b[43mretries\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 833\u001b[0m \u001b[43m \u001b[49m\u001b[43mredirect\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 834\u001b[0m \u001b[43m \u001b[49m\u001b[43massert_same_host\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 835\u001b[0m \u001b[43m \u001b[49m\u001b[43mtimeout\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtimeout\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 836\u001b[0m \u001b[43m \u001b[49m\u001b[43mpool_timeout\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpool_timeout\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 837\u001b[0m \u001b[43m \u001b[49m\u001b[43mrelease_conn\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrelease_conn\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 838\u001b[0m \u001b[43m \u001b[49m\u001b[43mchunked\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mchunked\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 839\u001b[0m \u001b[43m \u001b[49m\u001b[43mbody_pos\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mbody_pos\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 840\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mresponse_kw\u001b[49m\n\u001b[1;32m 841\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 843\u001b[0m \u001b[38;5;66;03m# Handle redirect?\u001b[39;00m\n", + " \u001b[0;31m[... 
skipping similar frames: HTTPConnectionPool.urlopen at line 827 (2 times)]\u001b[0m\n", + "File \u001b[0;32m~/Repos/Projects/MeaLeon/.venv/lib/python3.10/site-packages/urllib3/connectionpool.py:827\u001b[0m, in \u001b[0;36mHTTPConnectionPool.urlopen\u001b[0;34m(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)\u001b[0m\n\u001b[1;32m 824\u001b[0m log\u001b[38;5;241m.\u001b[39mwarning(\n\u001b[1;32m 825\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mRetrying (\u001b[39m\u001b[38;5;132;01m%r\u001b[39;00m\u001b[38;5;124m) after connection broken by \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;132;01m%r\u001b[39;00m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m: \u001b[39m\u001b[38;5;132;01m%s\u001b[39;00m\u001b[38;5;124m\"\u001b[39m, retries, err, url\n\u001b[1;32m 826\u001b[0m )\n\u001b[0;32m--> 827\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43murlopen\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 828\u001b[0m \u001b[43m \u001b[49m\u001b[43mmethod\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 829\u001b[0m \u001b[43m \u001b[49m\u001b[43murl\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 830\u001b[0m \u001b[43m \u001b[49m\u001b[43mbody\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 831\u001b[0m \u001b[43m \u001b[49m\u001b[43mheaders\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 832\u001b[0m \u001b[43m \u001b[49m\u001b[43mretries\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 833\u001b[0m \u001b[43m \u001b[49m\u001b[43mredirect\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 834\u001b[0m \u001b[43m \u001b[49m\u001b[43massert_same_host\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 835\u001b[0m \u001b[43m \u001b[49m\u001b[43mtimeout\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtimeout\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 836\u001b[0m \u001b[43m \u001b[49m\u001b[43mpool_timeout\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpool_timeout\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 837\u001b[0m \u001b[43m \u001b[49m\u001b[43mrelease_conn\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrelease_conn\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 838\u001b[0m \u001b[43m \u001b[49m\u001b[43mchunked\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mchunked\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 839\u001b[0m \u001b[43m \u001b[49m\u001b[43mbody_pos\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mbody_pos\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 840\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mresponse_kw\u001b[49m\n\u001b[1;32m 841\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 843\u001b[0m \u001b[38;5;66;03m# Handle redirect?\u001b[39;00m\n", + "File \u001b[0;32m~/Repos/Projects/MeaLeon/.venv/lib/python3.10/site-packages/urllib3/connectionpool.py:799\u001b[0m, in \u001b[0;36mHTTPConnectionPool.urlopen\u001b[0;34m(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)\u001b[0m\n\u001b[1;32m 797\u001b[0m e \u001b[38;5;241m=\u001b[39m ProtocolError(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mConnection aborted.\u001b[39m\u001b[38;5;124m\"\u001b[39m, e)\n\u001b[0;32m--> 799\u001b[0m retries \u001b[38;5;241m=\u001b[39m \u001b[43mretries\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mincrement\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 
800\u001b[0m \u001b[43m \u001b[49m\u001b[43mmethod\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43murl\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43merror\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43me\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m_pool\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m_stacktrace\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43msys\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mexc_info\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;241;43m2\u001b[39;49m\u001b[43m]\u001b[49m\n\u001b[1;32m 801\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 802\u001b[0m retries\u001b[38;5;241m.\u001b[39msleep()\n", + "File \u001b[0;32m~/Repos/Projects/MeaLeon/.venv/lib/python3.10/site-packages/urllib3/util/retry.py:592\u001b[0m, in \u001b[0;36mRetry.increment\u001b[0;34m(self, method, url, response, error, _pool, _stacktrace)\u001b[0m\n\u001b[1;32m 591\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m new_retry\u001b[38;5;241m.\u001b[39mis_exhausted():\n\u001b[0;32m--> 592\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m MaxRetryError(_pool, url, error \u001b[38;5;129;01mor\u001b[39;00m ResponseError(cause))\n\u001b[1;32m 594\u001b[0m log\u001b[38;5;241m.\u001b[39mdebug(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mIncremented Retry for (url=\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;132;01m%s\u001b[39;00m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m): \u001b[39m\u001b[38;5;132;01m%r\u001b[39;00m\u001b[38;5;124m\"\u001b[39m, url, new_retry)\n", + "\u001b[0;31mMaxRetryError\u001b[0m: HTTPSConnectionPool(host='dagshub.com', port=443): Max retries exceeded with url: /AaronWChen/MeaLeon.mlflow/api/2.0/mlflow-artifacts/artifacts/ad83ec0a104a44b5a16da48605603245/2e72322335494f84af9ff7e7e44c3ff9/artifacts/sklearn_model/artifacts/transformed_recipes.pkl (Caused by SSLError(SSLEOFError(8, 'EOF occurred in violation of protocol (_ssl.c:2396)')))", + "\nDuring handling of the above exception, another exception occurred:\n", + "\u001b[0;31mSSLError\u001b[0m Traceback (most recent call last)", + "File \u001b[0;32m~/Repos/Projects/MeaLeon/.venv/lib/python3.10/site-packages/mlflow/utils/rest_utils.py:99\u001b[0m, in \u001b[0;36mhttp_request\u001b[0;34m(host_creds, endpoint, method, max_retries, backoff_factor, extra_headers, retry_codes, timeout, raise_on_status, **kwargs)\u001b[0m\n\u001b[1;32m 98\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m---> 99\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43m_get_http_response_with_retries\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 100\u001b[0m \u001b[43m \u001b[49m\u001b[43mmethod\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 101\u001b[0m \u001b[43m \u001b[49m\u001b[43murl\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 102\u001b[0m \u001b[43m \u001b[49m\u001b[43mmax_retries\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 103\u001b[0m \u001b[43m \u001b[49m\u001b[43mbackoff_factor\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 104\u001b[0m \u001b[43m \u001b[49m\u001b[43mretry_codes\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 105\u001b[0m \u001b[43m \u001b[49m\u001b[43mraise_on_status\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 106\u001b[0m \u001b[43m \u001b[49m\u001b[43mheaders\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mheaders\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 107\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mverify\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mhost_creds\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mverify\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 108\u001b[0m \u001b[43m \u001b[49m\u001b[43mtimeout\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtimeout\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 109\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 110\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 111\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m requests\u001b[38;5;241m.\u001b[39mexceptions\u001b[38;5;241m.\u001b[39mTimeout \u001b[38;5;28;01mas\u001b[39;00m to:\n", + "File \u001b[0;32m~/Repos/Projects/MeaLeon/.venv/lib/python3.10/site-packages/mlflow/utils/request_utils.py:151\u001b[0m, in \u001b[0;36m_get_http_response_with_retries\u001b[0;34m(method, url, max_retries, backoff_factor, retry_codes, raise_on_status, **kwargs)\u001b[0m\n\u001b[1;32m 150\u001b[0m session \u001b[38;5;241m=\u001b[39m _get_request_session(max_retries, backoff_factor, retry_codes, raise_on_status)\n\u001b[0;32m--> 151\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43msession\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrequest\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmethod\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43murl\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/Repos/Projects/MeaLeon/.venv/lib/python3.10/site-packages/requests/sessions.py:589\u001b[0m, in \u001b[0;36mSession.request\u001b[0;34m(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json)\u001b[0m\n\u001b[1;32m 588\u001b[0m send_kwargs\u001b[38;5;241m.\u001b[39mupdate(settings)\n\u001b[0;32m--> 589\u001b[0m resp \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msend\u001b[49m\u001b[43m(\u001b[49m\u001b[43mprep\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43msend_kwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 591\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m resp\n", + "File \u001b[0;32m~/Repos/Projects/MeaLeon/.venv/lib/python3.10/site-packages/requests/sessions.py:703\u001b[0m, in \u001b[0;36mSession.send\u001b[0;34m(self, request, **kwargs)\u001b[0m\n\u001b[1;32m 702\u001b[0m \u001b[38;5;66;03m# Send the request\u001b[39;00m\n\u001b[0;32m--> 703\u001b[0m r \u001b[38;5;241m=\u001b[39m \u001b[43madapter\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msend\u001b[49m\u001b[43m(\u001b[49m\u001b[43mrequest\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 705\u001b[0m \u001b[38;5;66;03m# Total elapsed time of the request (approximately)\u001b[39;00m\n", + "File \u001b[0;32m~/Repos/Projects/MeaLeon/.venv/lib/python3.10/site-packages/requests/adapters.py:517\u001b[0m, in \u001b[0;36mHTTPAdapter.send\u001b[0;34m(self, request, stream, timeout, verify, cert, proxies)\u001b[0m\n\u001b[1;32m 515\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(e\u001b[38;5;241m.\u001b[39mreason, _SSLError):\n\u001b[1;32m 516\u001b[0m 
\u001b[38;5;66;03m# This branch is for urllib3 v1.22 and later.\u001b[39;00m\n\u001b[0;32m--> 517\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m SSLError(e, request\u001b[38;5;241m=\u001b[39mrequest)\n\u001b[1;32m 519\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mConnectionError\u001b[39;00m(e, request\u001b[38;5;241m=\u001b[39mrequest)\n", + "\u001b[0;31mSSLError\u001b[0m: HTTPSConnectionPool(host='dagshub.com', port=443): Max retries exceeded with url: /AaronWChen/MeaLeon.mlflow/api/2.0/mlflow-artifacts/artifacts/ad83ec0a104a44b5a16da48605603245/2e72322335494f84af9ff7e7e44c3ff9/artifacts/sklearn_model/artifacts/transformed_recipes.pkl (Caused by SSLError(SSLEOFError(8, 'EOF occurred in violation of protocol (_ssl.c:2396)')))", + "\nDuring handling of the above exception, another exception occurred:\n", + "\u001b[0;31mMlflowException\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[8], line 103\u001b[0m\n\u001b[1;32m 97\u001b[0m pickle\u001b[38;5;241m.\u001b[39mdump(transformed_recipe, fo)\n\u001b[1;32m 99\u001b[0m \u001b[38;5;66;03m# with open(combined_df_path, 'wb') as fo:\u001b[39;00m\n\u001b[1;32m 100\u001b[0m \u001b[38;5;66;03m# pickle.dump(combined_df, fo)\u001b[39;00m\n\u001b[0;32m--> 103\u001b[0m model_info \u001b[38;5;241m=\u001b[39m \u001b[43mmlflow\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpyfunc\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mlog_model\u001b[49m\u001b[43m(\u001b[49m\u001b[43m \u001b[49m\n\u001b[1;32m 104\u001b[0m \u001b[43m \u001b[49m\u001b[43mcode_path\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43m../src/\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 105\u001b[0m \u001b[43m \u001b[49m\u001b[43mpython_model\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mCustomSKLearnWrapper\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 106\u001b[0m \u001b[43m \u001b[49m\u001b[43minput_example\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mwhole_nlp_df\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mingredients_lemmafied\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;241;43m0\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 107\u001b[0m \u001b[43m \u001b[49m\u001b[43msignature\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43msignature\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\n\u001b[1;32m 108\u001b[0m \u001b[43m \u001b[49m\u001b[43martifact_path\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43msklearn_model\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 109\u001b[0m \u001b[43m \u001b[49m\u001b[43martifacts\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43martifacts\u001b[49m\n\u001b[1;32m 110\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m \n\u001b[1;32m 112\u001b[0m \u001b[38;5;66;03m# since this uses a custom Stanza analyzer, we have to use a custom mlflow.Pyfunc.PythonModel\u001b[39;00m\n", + "File \u001b[0;32m~/Repos/Projects/MeaLeon/.venv/lib/python3.10/site-packages/mlflow/pyfunc/__init__.py:2116\u001b[0m, in \u001b[0;36mlog_model\u001b[0;34m(artifact_path, loader_module, data_path, code_path, conda_env, python_model, artifacts, registered_model_name, signature, input_example, await_registration_for, pip_requirements, extra_pip_requirements, 
metadata, model_config)\u001b[0m\n\u001b[1;32m 1949\u001b[0m \u001b[38;5;129m@format_docstring\u001b[39m(LOG_MODEL_PARAM_DOCS\u001b[38;5;241m.\u001b[39mformat(package_name\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mscikit-learn\u001b[39m\u001b[38;5;124m\"\u001b[39m))\n\u001b[1;32m 1950\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mlog_model\u001b[39m(\n\u001b[1;32m 1951\u001b[0m artifact_path,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 1965\u001b[0m model_config\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m 1966\u001b[0m ):\n\u001b[1;32m 1967\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 1968\u001b[0m \u001b[38;5;124;03m Log a Pyfunc model with custom inference logic and optional data dependencies as an MLflow\u001b[39;00m\n\u001b[1;32m 1969\u001b[0m \u001b[38;5;124;03m artifact for the current run.\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 2114\u001b[0m \u001b[38;5;124;03m metadata of the logged model.\u001b[39;00m\n\u001b[1;32m 2115\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m-> 2116\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mModel\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mlog\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 2117\u001b[0m \u001b[43m \u001b[49m\u001b[43martifact_path\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43martifact_path\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2118\u001b[0m \u001b[43m \u001b[49m\u001b[43mflavor\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmlflow\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpyfunc\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2119\u001b[0m \u001b[43m \u001b[49m\u001b[43mloader_module\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mloader_module\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2120\u001b[0m \u001b[43m \u001b[49m\u001b[43mdata_path\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdata_path\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2121\u001b[0m \u001b[43m \u001b[49m\u001b[43mcode_path\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcode_path\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2122\u001b[0m \u001b[43m \u001b[49m\u001b[43mpython_model\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpython_model\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2123\u001b[0m \u001b[43m \u001b[49m\u001b[43martifacts\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43martifacts\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2124\u001b[0m \u001b[43m \u001b[49m\u001b[43mconda_env\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mconda_env\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2125\u001b[0m \u001b[43m \u001b[49m\u001b[43mregistered_model_name\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mregistered_model_name\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2126\u001b[0m \u001b[43m \u001b[49m\u001b[43msignature\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43msignature\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2127\u001b[0m \u001b[43m \u001b[49m\u001b[43minput_example\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43minput_example\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2128\u001b[0m \u001b[43m \u001b[49m\u001b[43mawait_registration_for\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mawait_registration_for\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2129\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mpip_requirements\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpip_requirements\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2130\u001b[0m \u001b[43m \u001b[49m\u001b[43mextra_pip_requirements\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mextra_pip_requirements\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2131\u001b[0m \u001b[43m \u001b[49m\u001b[43mmetadata\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmetadata\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2132\u001b[0m \u001b[43m \u001b[49m\u001b[43mmodel_config\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmodel_config\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2133\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/Repos/Projects/MeaLeon/.venv/lib/python3.10/site-packages/mlflow/models/model.py:620\u001b[0m, in \u001b[0;36mModel.log\u001b[0;34m(cls, artifact_path, flavor, registered_model_name, await_registration_for, metadata, **kwargs)\u001b[0m\n\u001b[1;32m 618\u001b[0m _logger\u001b[38;5;241m.\u001b[39mwarning(_LOG_MODEL_MISSING_SIGNATURE_WARNING)\n\u001b[1;32m 619\u001b[0m flavor\u001b[38;5;241m.\u001b[39msave_model(path\u001b[38;5;241m=\u001b[39mlocal_path, mlflow_model\u001b[38;5;241m=\u001b[39mmlflow_model, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[0;32m--> 620\u001b[0m \u001b[43mmlflow\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtracking\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfluent\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mlog_artifacts\u001b[49m\u001b[43m(\u001b[49m\u001b[43mlocal_path\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmlflow_model\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43martifact_path\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 621\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 622\u001b[0m mlflow\u001b[38;5;241m.\u001b[39mtracking\u001b[38;5;241m.\u001b[39mfluent\u001b[38;5;241m.\u001b[39m_record_logged_model(mlflow_model)\n", + "File \u001b[0;32m~/Repos/Projects/MeaLeon/.venv/lib/python3.10/site-packages/mlflow/tracking/fluent.py:1008\u001b[0m, in \u001b[0;36mlog_artifacts\u001b[0;34m(local_dir, artifact_path)\u001b[0m\n\u001b[1;32m 978\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 979\u001b[0m \u001b[38;5;124;03mLog all the contents of a local directory as artifacts of the run. 
If no run is active,\u001b[39;00m\n\u001b[1;32m 980\u001b[0m \u001b[38;5;124;03mthis method will create a new active run.\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 1005\u001b[0m \u001b[38;5;124;03m mlflow.log_artifacts(\"data\", artifact_path=\"states\")\u001b[39;00m\n\u001b[1;32m 1006\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 1007\u001b[0m run_id \u001b[38;5;241m=\u001b[39m _get_or_start_run()\u001b[38;5;241m.\u001b[39minfo\u001b[38;5;241m.\u001b[39mrun_id\n\u001b[0;32m-> 1008\u001b[0m \u001b[43mMlflowClient\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mlog_artifacts\u001b[49m\u001b[43m(\u001b[49m\u001b[43mrun_id\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mlocal_dir\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43martifact_path\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/Repos/Projects/MeaLeon/.venv/lib/python3.10/site-packages/mlflow/tracking/client.py:1188\u001b[0m, in \u001b[0;36mMlflowClient.log_artifacts\u001b[0;34m(self, run_id, local_dir, artifact_path)\u001b[0m\n\u001b[1;32m 1144\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mlog_artifacts\u001b[39m(\n\u001b[1;32m 1145\u001b[0m \u001b[38;5;28mself\u001b[39m, run_id: \u001b[38;5;28mstr\u001b[39m, local_dir: \u001b[38;5;28mstr\u001b[39m, artifact_path: Optional[\u001b[38;5;28mstr\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 1146\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 1147\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 1148\u001b[0m \u001b[38;5;124;03m Write a directory of files to the remote ``artifact_uri``.\u001b[39;00m\n\u001b[1;32m 1149\u001b[0m \n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 1186\u001b[0m \u001b[38;5;124;03m is_dir: True\u001b[39;00m\n\u001b[1;32m 1187\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m-> 1188\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_tracking_client\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mlog_artifacts\u001b[49m\u001b[43m(\u001b[49m\u001b[43mrun_id\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mlocal_dir\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43martifact_path\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/Repos/Projects/MeaLeon/.venv/lib/python3.10/site-packages/mlflow/tracking/_tracking_service/client.py:538\u001b[0m, in \u001b[0;36mTrackingServiceClient.log_artifacts\u001b[0;34m(self, run_id, local_dir, artifact_path)\u001b[0m\n\u001b[1;32m 531\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mlog_artifacts\u001b[39m(\u001b[38;5;28mself\u001b[39m, run_id, local_dir, artifact_path\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m):\n\u001b[1;32m 532\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 533\u001b[0m \u001b[38;5;124;03m Write a directory of files to the remote ``artifact_uri``.\u001b[39;00m\n\u001b[1;32m 534\u001b[0m \n\u001b[1;32m 535\u001b[0m \u001b[38;5;124;03m :param local_dir: Path to the directory of files to write.\u001b[39;00m\n\u001b[1;32m 536\u001b[0m \u001b[38;5;124;03m :param artifact_path: If provided, the directory in ``artifact_uri`` to write to.\u001b[39;00m\n\u001b[1;32m 537\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m--> 538\u001b[0m 
\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_get_artifact_repo\u001b[49m\u001b[43m(\u001b[49m\u001b[43mrun_id\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mlog_artifacts\u001b[49m\u001b[43m(\u001b[49m\u001b[43mlocal_dir\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43martifact_path\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/Repos/Projects/MeaLeon/.venv/lib/python3.10/site-packages/mlflow/store/artifact/http_artifact_repo.py:45\u001b[0m, in \u001b[0;36mHttpArtifactRepository.log_artifacts\u001b[0;34m(self, local_dir, artifact_path)\u001b[0m\n\u001b[1;32m 41\u001b[0m artifact_dir \u001b[38;5;241m=\u001b[39m (\n\u001b[1;32m 42\u001b[0m posixpath\u001b[38;5;241m.\u001b[39mjoin(artifact_path, rel_path) \u001b[38;5;28;01mif\u001b[39;00m artifact_path \u001b[38;5;28;01melse\u001b[39;00m rel_path\n\u001b[1;32m 43\u001b[0m )\n\u001b[1;32m 44\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m f \u001b[38;5;129;01min\u001b[39;00m filenames:\n\u001b[0;32m---> 45\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mlog_artifact\u001b[49m\u001b[43m(\u001b[49m\u001b[43mos\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpath\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mjoin\u001b[49m\u001b[43m(\u001b[49m\u001b[43mroot\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mf\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43martifact_dir\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/Repos/Projects/MeaLeon/.venv/lib/python3.10/site-packages/mlflow/store/artifact/http_artifact_repo.py:28\u001b[0m, in \u001b[0;36mHttpArtifactRepository.log_artifact\u001b[0;34m(self, local_file, artifact_path)\u001b[0m\n\u001b[1;32m 26\u001b[0m extra_headers \u001b[38;5;241m=\u001b[39m {\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mContent-Type\u001b[39m\u001b[38;5;124m\"\u001b[39m: mime_type}\n\u001b[1;32m 27\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mopen\u001b[39m(local_file, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mrb\u001b[39m\u001b[38;5;124m\"\u001b[39m) \u001b[38;5;28;01mas\u001b[39;00m f:\n\u001b[0;32m---> 28\u001b[0m resp \u001b[38;5;241m=\u001b[39m \u001b[43mhttp_request\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 29\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_host_creds\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mendpoint\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mPUT\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdata\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mf\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mextra_headers\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mextra_headers\u001b[49m\n\u001b[1;32m 30\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 31\u001b[0m augmented_raise_for_status(resp)\n", + "File \u001b[0;32m~/Repos/Projects/MeaLeon/.venv/lib/python3.10/site-packages/mlflow/utils/rest_utils.py:120\u001b[0m, in \u001b[0;36mhttp_request\u001b[0;34m(host_creds, endpoint, method, max_retries, backoff_factor, extra_headers, retry_codes, timeout, raise_on_status, **kwargs)\u001b[0m\n\u001b[1;32m 118\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m InvalidUrlException(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mInvalid url: 
\u001b[39m\u001b[38;5;132;01m{\u001b[39;00murl\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01miu\u001b[39;00m\n\u001b[1;32m 119\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[0;32m--> 120\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m MlflowException(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mAPI request to \u001b[39m\u001b[38;5;132;01m{\u001b[39;00murl\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m failed with exception \u001b[39m\u001b[38;5;132;01m{\u001b[39;00me\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n", + "\u001b[0;31mMlflowException\u001b[0m: API request to https://dagshub.com/AaronWChen/MeaLeon.mlflow/api/2.0/mlflow-artifacts/artifacts/ad83ec0a104a44b5a16da48605603245/2e72322335494f84af9ff7e7e44c3ff9/artifacts/sklearn_model/artifacts/transformed_recipes.pkl failed with exception HTTPSConnectionPool(host='dagshub.com', port=443): Max retries exceeded with url: /AaronWChen/MeaLeon.mlflow/api/2.0/mlflow-artifacts/artifacts/ad83ec0a104a44b5a16da48605603245/2e72322335494f84af9ff7e7e44c3ff9/artifacts/sklearn_model/artifacts/transformed_recipes.pkl (Caused by SSLError(SSLEOFError(8, 'EOF occurred in violation of protocol (_ssl.c:2396)')))" + ] + } + ], + "source": [ + "# load from MLflow\n", + "mlflow_client = mlflow.tracking.MlflowClient(\n", + " tracking_uri=f'https://dagshub.com/{DAGSHUB_USER_NAME}/MeaLeon.mlflow')\n", + "\n", + "# cv_params are parameters for the sklearn CountVectorizer or TFIDFVectorizer\n", + "sklearn_transformer_params = { \n", + " 'analyzer': CustomSKLearnAnalyzer().ngram_maker(\n", + " min_ngram_length=1,\n", + " max_ngram_length=4,\n", + " ),\n", + " 'min_df':3,\n", + " 'binary':True\n", + "}\n", + "\n", + "# pipeline_params are parameters that will be logged in MLFlow and are a superset of library parameters\n", + "pipeline_params = {\n", + " 'stanza_model': 'en',\n", + " 'sklearn-transformer': 'OHE'\n", + "}\n", + "\n", + "# update the pipeline parameters with the library-specific ones so that they show up in MLflow Tracking\n", + "pipeline_params.update(sklearn_transformer_params)\n", + "\n", + "with mlflow.start_run(experiment_id=mlflow_exp_id): \n", + " # LOG PARAMETERS\n", + " mlflow.log_params(pipeline_params)\n", + "\n", + " # LOG INPUTS (QUERIES) AND OUTPUTS\n", + " # MLflow example uses a list of strings or a list of str->str dicts\n", + " # Will be useful in STAGING/Evaluation\n", + " \n", + " # LOG MODEL\n", + " # Instantiate sklearn OneHotEncoder\n", + " sklearn_transformer = CountVectorizer(**sklearn_transformer_params)\n", + "\n", + " print('\\n')\n", + " print('-' * 80)\n", + " print('sklearn fit transform on ingredients:', end='\\n')\n", + "\n", + " model_input = whole_nlp_df['ingredients_lemmafied']\n", + "\n", + " print('\\n')\n", + " print('-' * 80)\n", + " print('Input Data: ', end='\\n')\n", + " print(model_input)\n", + "\n", + " print('\\n')\n", + " print('-' * 80)\n", + " print('Input Data Shape: ', end='\\n')\n", + " print(model_input.shape)\n", + "\n", + " print('\\n')\n", + " print('-' * 80)\n", + " print('Random 3 Records from Input Data: ', end='\\n')\n", + " print(model_input.sample(3, random_state=200))\n", + "\n", + " # Do fit transform on data\n", + " response = sklearn_transformer.fit_transform(tqdm(model_input)) \n", + " \n", + " transformed_recipe = pd.DataFrame(\n", + " response.toarray(),\n", + " 
columns=sklearn_transformer.get_feature_names_out(),\n",
+    "        index=model_input.index\n",
+    "    )\n",
+    "\n",
+    "    signature = infer_signature(model_input=model_input,\n",
+    "                                model_output=transformed_recipe\n",
+    "                                )\n",
+    "\n",
+    "    print('\\n')\n",
+    "    print('-' * 80)\n",
+    "    print('Transformed Data:', end='\\n')\n",
+    "    print(transformed_recipe.head())\n",
+    "    \n",
+    "    # mlflow.pyfunc.save_model(\n",
+    "    #     path=model_directory,\n",
+    "    #     code_path=[\"../src/\"],\n",
+    "    #     python_model=CustomSKLearnWrapper(),\n",
+    "    #     input_example=to_nlp_df['ingredients'][0], \n",
+    "    #     artifacts=artifacts\n",
+    "    # )\n",
+    "\n",
+    "    # combined_df = pd.concat(\n",
+    "    #     [transformed_recipe,\n",
+    "    #      whole_nlp_df\n",
+    "    #     ]\n",
+    "    #     , axis=1)\n",
+    "    # print('\\n')\n",
+    "    # print('-' * 80)\n",
+    "    # print('Combined Data:', end='\\n')\n",
+    "    # print(combined_df.head())\n",
+    "\n",
+    "    with open(sklearn_transformer_path, \"wb\") as fo:\n",
+    "        pickle.dump(sklearn_transformer, fo)\n",
+    "    \n",
+    "    with open(transformed_recipes_path, \"wb\") as fo:\n",
+    "        pickle.dump(transformed_recipe, fo)\n",
+    "    \n",
+    "    # with open(combined_df_path, 'wb') as fo:\n",
+    "    #     pickle.dump(combined_df, fo)\n",
+    "\n",
+    "\n",
+    "    model_info = mlflow.pyfunc.log_model( \n",
+    "        code_path=[\"../src/\"],\n",
+    "        python_model=CustomSKLearnWrapper(),\n",
+    "        input_example=whole_nlp_df['ingredients_lemmafied'][0],\n",
+    "        signature=signature, \n",
+    "        artifact_path=\"sklearn_model\",\n",
+    "        artifacts=artifacts\n",
+    "    ) \n",
+    "\n",
+    "    # since this uses a custom Stanza analyzer, we have to use a custom mlflow.Pyfunc.PythonModel\n",
+    "    "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# whole_nlp_df is the cleaned dataframe\n",
+    "whole_nlp_df = dfpp.preprocess_dataframe(raw_df)\n",
+    "print('\\n')\n",
+    "print('--------------')\n",
+    "print('Preprocessed Dataframe:', end='\\n')\n",
+    "print(whole_nlp_df.head())\n",
+    "print(whole_nlp_df.shape)\n",
+    "\n",
+    "# cv_params are parameters for the sklearn CountVectorizer or TFIDFVectorizer\n",
+    "sklearn_transformer_params = { \n",
+    "    'analyzer': CustomSKLearnAnalyzer().ngram_maker(\n",
+    "        min_ngram_length=1,\n",
+    "        max_ngram_length=4,\n",
+    "    ),\n",
+    "    'min_df':3,\n",
+    "}\n",
+    "\n",
+    "sklearn_transformer = TfidfVectorizer(**sklearn_transformer_params)\n",
+    "\n",
+    "model_input = whole_nlp_df['ingredients_lemmafied']\n",
+    "\n",
+    "# Do fit transform on data\n",
+    "print(\"fit_transform start: \" + str(datetime.now()))\n",
+    "response = sklearn_transformer.fit_transform(tqdm(model_input)) \n",
+    "print(\"fit_transform end: \" + str(datetime.now()))\n",
+    "\n",
+    "transformed_recipe = pd.DataFrame(\n",
+    "    response.toarray(),\n",
+    "    columns=sklearn_transformer.get_feature_names_out(),\n",
+    "    index=model_input.index\n",
+    ")\n",
+    "\n",
+    "combined_df = pd.concat([transformed_recipe, whole_nlp_df], axis=1)\n",
+    "\n",
+    "with open(\"../joblib/2024.03.19/combined_df.joblib\", 'wb') as fo:\n",
+    "    # joblib.dump needs the object to persist plus a filename or open file handle\n",
+    "    joblib.dump(combined_df, fo)\n"
+   ]
+  },
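+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The cell above densifies the TF-IDF matrix with `response.toarray()` before building the DataFrame; on a vocabulary this large, that dense copy (and pickling it) is the step most likely to exhaust RAM and disk.\n",
+    "\n",
+    "The sketch below is not part of the original run. It assumes the fitted `sklearn_transformer` and the sparse `response` from the cell above, and the output paths are hypothetical placeholders; it keeps the TF-IDF result in sparse form instead of densifying it."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Hedged sketch (not executed in this notebook): persist the TF-IDF output\n",
+    "# in sparse form instead of calling .toarray(), which is what blows up\n",
+    "# memory and disk usage. Assumes `sklearn_transformer` and `response` from\n",
+    "# the cell above; the paths below are hypothetical placeholders.\n",
+    "from scipy import sparse\n",
+    "\n",
+    "sparse_tfidf_path = '../joblib/2024.03.19/tfidf_sparse.npz'  # hypothetical\n",
+    "vectorizer_path = '../joblib/2024.03.19/tfidf_vectorizer.pkl'  # hypothetical\n",
+    "\n",
+    "# keep the scipy sparse matrix returned by fit_transform as-is\n",
+    "sparse.save_npz(sparse_tfidf_path, response)\n",
+    "\n",
+    "# the custom Stanza-backed analyzer may not survive plain pickle/joblib,\n",
+    "# so reuse dill (imported above as pickle), as the earlier cell does\n",
+    "with open(vectorizer_path, 'wb') as fo:\n",
+    "    pickle.dump(sklearn_transformer, fo)\n",
+    "\n",
+    "# if a DataFrame view is needed later, a sparse-backed one avoids the dense copy:\n",
+    "# tfidf_df = pd.DataFrame.sparse.from_spmatrix(\n",
+    "#     sparse.load_npz(sparse_tfidf_path),\n",
+    "#     columns=sklearn_transformer.get_feature_names_out(),\n",
+    "#     index=model_input.index,\n",
+    "# )\n"
+   ]
+  },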
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "response"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "test_predictor = mlflow.pyfunc.load_model(model_uri=model_info.model_uri)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# pre_proc_test_df is the cleaned test dataframe\n",
+    "pre_proc_test_df = dfpp.preprocess_dataframe(test_df)\n",
+    "print('\\n')\n",
+    "print('--------------')\n",
+    "print('Preprocessed Dataframe: ', end='\\n')\n",
+    "print(pre_proc_test_df.head())\n",
+    "print(pre_proc_test_df.shape)\n",
+    "\n",
+    "# create subset for dev purposes\n",
+    "# to_nlp_test_df = pre_proc_test_df\n",
+    "# print('\\n')\n",
+    "# print('-' * 80)\n",
+    "# print('Subset Dataframe:', end='\\n')\n",
+    "# print(to_nlp_test_df.head())\n",
+    "# print(to_nlp_test_df.shape)\n",
+    "\n",
+    "test_model_input = pre_proc_test_df['ingredients']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "test_model_input"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "test_model_input.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "test_model_input.values"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model_info.signature.to_dict()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "test_predictor.predict(test_model_input)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print('\\n')\n",
+    "print('-' * 80)\n",
+    "print('Input Data: ', end='\\n')\n",
+    "print(test_model_input)\n",
+    "\n",
+    "print('\\n')\n",
+    "print('-' * 80)\n",
+    "print('Input Data Shape: ', end='\\n')\n",
+    "print(test_model_input.shape)\n",
+    "\n",
+    "print('\\n')\n",
+    "print('-' * 80)\n",
+    "print('Random 3 Records from Input Data: ', end='\\n')\n",
+    "print(test_model_input.sample(3, random_state=200))\n",
+    "\n",
+    "# test_response = sklearn_transformer.transform(tqdm(test_model_input)) \n",
+    "test_response = sklearn_transformer.transform(test_model_input)\n",
+    " \n",
+    " \n",
+    "test_transformed_recipe = pd.DataFrame(\n",
+    "    test_response.toarray(),\n",
+    "    columns=sklearn_transformer.get_feature_names_out(),\n",
+    "    index=test_model_input.index\n",
+    "    )"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "type(test_predictor)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "test_transformed_recipe"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "python3",
+   "language": "python",
+   "name": "python3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}