From 89fc77a0017ff14131f5a4871eeeadb1c8f81e16 Mon Sep 17 00:00:00 2001
From: Aaron W Chen
Date: Wed, 8 May 2024 10:18:47 -0700
Subject: [PATCH] Close out open notebooks, update old NLTK code

---
 .gitconfig                               |   11 +
 nbs/01_initial_eda.ipynb                 |    8 +-
 nbs/04_word_lists_combine_features.ipynb |    5 +
 nbs/07_bertopic_testing.ipynb            |   47 +-
 nbs/11_sklearn_mlfow_model_testing.ipynb |  151 +-
 nbs/13_new_preproc_test.ipynb            | 2486 +++++++++++++++++++++-
 src/nltk/dish_predictor.py               |  485 +++--
 src/nltk/ohe_dish_predictor.py           |  491 +++--
 src/nltk/prepare_ohe_database.py         |  179 +-
 9 files changed, 3106 insertions(+), 757 deletions(-)
 create mode 100644 .gitconfig

diff --git a/.gitconfig b/.gitconfig
new file mode 100644
index 0000000..9054574
--- /dev/null
+++ b/.gitconfig
@@ -0,0 +1,11 @@
+# Generated by nbdev_install_hooks
+#
+# If you need to disable this instrumentation do:
+#   git config --local --unset include.path
+#
+# To restore:
+#   git config --local include.path ../.gitconfig
+#
+[merge "nbdev-merge"]
+	name = resolve conflicts with nbdev_fix
+	driver = nbdev_merge %O %A %B %P

diff --git a/nbs/01_initial_eda.ipynb b/nbs/01_initial_eda.ipynb
index df28015..620ee2c 100644
--- a/nbs/01_initial_eda.ipynb
+++ b/nbs/01_initial_eda.ipynb
@@ -3059,7 +3059,13 @@
    "source": []
   }
  ],
- "metadata": {},
+ "metadata": {
+  "kernelspec": {
+   "display_name": "python3",
+   "language": "python",
+   "name": "python3"
+  }
+ },
 "nbformat": 4,
 "nbformat_minor": 2
}

diff --git a/nbs/04_word_lists_combine_features.ipynb b/nbs/04_word_lists_combine_features.ipynb
index eacce42..e4cbfb9 100644
--- a/nbs/04_word_lists_combine_features.ipynb
+++ b/nbs/04_word_lists_combine_features.ipynb
@@ -1547,6 +1547,11 @@
    "filtered_df.head()"
   ]
  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": []
+  },
  {
   "cell_type": "markdown",
   "metadata": {},

diff --git a/nbs/07_bertopic_testing.ipynb b/nbs/07_bertopic_testing.ipynb
index 58ea557..505f9ab 100644
--- a/nbs/07_bertopic_testing.ipynb
+++ b/nbs/07_bertopic_testing.ipynb
@@ -132,54 +132,13 @@
    "metadata": {},
    "outputs": [
     {
-     "data": {
-      "text/html": [
-       "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">                                       ❗❗❗ AUTHORIZATION REQUIRED ❗❗❗                                        \n",
-       "</pre>\n"
-      ],
-      "text/plain": [
-       "                                       \u001b[1m❗❗❗ AUTHORIZATION REQUIRED ❗❗❗\u001b[0m                                        \n"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "084490be36a047529096776bcb49e6de",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "Output()"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "name": "stdout",
+     "name": "stderr",
      "output_type": "stream",
      "text": [
-      "\n",
-      "\n",
-      "Open the following link in your browser to authorize the client:\n",
-      "https://dagshub.com/login/oauth/authorize?state=1bf46b97-13cb-4873-9c57-af8141828675&client_id=32b60ba385aa7cecf24046d8195a71c07dd345d9657977863b52e7748e0f0f28&middleman_request_id=b534b316020d1f4e499a4a24567fc8ff36a49714fc1a4e01f004354ef9e20587\n",
-      "\n",
-      "\n"
+      "Token Dagshub OAuth token, valid until 2023-03-15 05:42:01.541960+00:00 does not exist in the storage\n",
+      "Token Dagshub OAuth token, valid until 2023-07-27 17:31:38.987842+00:00 does not exist in the storage\n"
      ]
     },
-    {
-     "data": {
-      "text/html": [
-       "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">\n"
-      ],
-      "text/plain": []
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
     {
      "data": {
       "text/html": [
diff --git a/nbs/11_sklearn_mlfow_model_testing.ipynb b/nbs/11_sklearn_mlfow_model_testing.ipynb
index 85645c8..b6393e6 100644
--- a/nbs/11_sklearn_mlfow_model_testing.ipynb
+++ b/nbs/11_sklearn_mlfow_model_testing.ipynb
@@ -242,149 +242,14 @@
    "metadata": {},
    "outputs": [
     {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "5c7f9ca7c04d4d61a1d93d784e67a1e8",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.6.0.json:   0%|   …"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "2024-02-13 11:12:24 INFO: Downloading default packages for language: en (English) ...\n",
-      "2024-02-13 11:12:24 INFO: File exists: /home/awchen/stanza_resources/en/default.zip\n",
-      "2024-02-13 11:12:27 INFO: Finished downloading models and saved to /home/awchen/stanza_resources.\n",
-      "2024-02-13 11:12:27 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES\n"
-     ]
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "341eed1075774bfaa7c7873d53723222",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.6.0.json:   0%|   …"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "2024-02-13 11:12:28 INFO: Loading these models for language: en (English):\n",
-      "======================================\n",
-      "| Processor    | Package             |\n",
-      "--------------------------------------\n",
-      "| tokenize     | combined            |\n",
-      "| pos          | combined_charlm     |\n",
-      "| lemma        | combined_nocharlm   |\n",
-      "| constituency | ptb3-revised_charlm |\n",
-      "| depparse     | combined_charlm     |\n",
-      "| sentiment    | sstplus             |\n",
-      "| ner          | ontonotes_charlm    |\n",
-      "======================================\n",
-      "\n",
-      "2024-02-13 11:12:28 INFO: Using device: cpu\n",
-      "2024-02-13 11:12:28 INFO: Loading: tokenize\n",
-      "2024-02-13 11:12:28 INFO: Loading: pos\n",
-      "2024-02-13 11:12:28 INFO: Loading: lemma\n",
-      "2024-02-13 11:12:29 INFO: Loading: constituency\n",
-      "2024-02-13 11:12:29 INFO: Loading: depparse\n",
-      "2024-02-13 11:12:29 INFO: Loading: sentiment\n",
-      "2024-02-13 11:12:29 INFO: Loading: ner\n",
-      "2024-02-13 11:12:29 INFO: Done loading processors!\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "\n",
-      "\n",
-      "--------------\n",
-      "Raw Dataframe:\n",
-      "                         id  \\\n",
-      "0  54a2b6b019925f464b373351   \n",
-      "1  54a408a019925f464b3733bc   \n",
-      "2  54a408a26529d92b2c003631   \n",
-      "3  54a408a66529d92b2c003638   \n",
-      "4  54a408a719925f464b3733cc   \n",
-      "\n",
-      "                                                 dek  \\\n",
-      "0  How does fried chicken achieve No. 1 status? B...   \n",
-      "1                                Spinaci all'Ebraica   \n",
-      "2  This majestic, moist, and richly spiced honey ...   \n",
-      "3  The idea for this sandwich came to me when my ...   \n",
-      "4  In 1930, Simon Agranat, the chief justice of t...   \n",
-      "\n",
-      "                                     hed                   pubDate  \\\n",
-      "0            Pickle-Brined Fried Chicken  2014-08-19T04:00:00.000Z   \n",
-      "1                   Spinach Jewish Style  2008-09-09T04:00:00.000Z   \n",
-      "2                  New Year’s Honey Cake  2008-09-10T04:00:00.000Z   \n",
-      "3  The B.L.A.—Bagel with Lox and Avocado  2008-09-08T04:00:00.000Z   \n",
-      "4        Shakshuka a la Doktor Shakshuka  2008-09-09T04:00:00.000Z   \n",
-      "\n",
-      "                             author    type  \\\n",
-      "0                                []  recipe   \n",
-      "1  [{'name': 'Edda Servi Machlin'}]  recipe   \n",
-      "2       [{'name': 'Marcy Goldman'}]  recipe   \n",
-      "3           [{'name': 'Faye Levy'}]  recipe   \n",
-      "4         [{'name': 'Joan Nathan'}]  recipe   \n",
-      "\n",
-      "                                                 url  \\\n",
-      "0  /recipes/food/views/pickle-brined-fried-chicke...   \n",
-      "1    /recipes/food/views/spinach-jewish-style-350152   \n",
-      "2  /recipes/food/views/majestic-and-moist-new-yea...   \n",
-      "3  /recipes/food/views/the-b-l-a-bagel-with-lox-a...   \n",
-      "4  /recipes/food/views/shakshuka-a-la-doktor-shak...   \n",
-      "\n",
-      "                                           photoData  \\\n",
-      "0  {'id': '54a2b64a6529d92b2c003409', 'filename':...   \n",
-      "1  {'id': '56746182accb4c9831e45e0a', 'filename':...   \n",
-      "2  {'id': '55e85ba4cf90d6663f728014', 'filename':...   \n",
-      "3  {'id': '5674617e47d1a28026045e4f', 'filename':...   \n",
-      "4  {'id': '56746183b47c050a284a4e15', 'filename':...   \n",
-      "\n",
-      "                                                 tag  aggregateRating  \\\n",
-      "0  {'category': 'ingredient', 'name': 'Chicken', ...             3.11   \n",
-      "1  {'category': 'cuisine', 'name': 'Italian', 'ur...             3.22   \n",
-      "2  {'category': 'cuisine', 'name': 'Jewish', 'url...             3.62   \n",
-      "3  {'category': 'cuisine', 'name': 'Jewish', 'url...             4.00   \n",
-      "4  {'category': 'cuisine', 'name': 'Jewish', 'url...             2.71   \n",
-      "\n",
-      "                                         ingredients  \\\n",
-      "0  [1 tablespoons yellow mustard seeds, 1 tablesp...   \n",
-      "1  [3 pounds small-leaved bulk spinach, Salt, 1/2...   \n",
-      "2  [3 1/2 cups all-purpose flour, 1 tablespoon ba...   \n",
-      "3  [1 small ripe avocado, preferably Hass (see No...   \n",
-      "4  [2 pounds fresh tomatoes, unpeeled and cut in ...   \n",
-      "\n",
-      "                                           prepSteps  reviewsCount  \\\n",
-      "0  [Toast mustard and coriander seeds in a dry me...             7   \n",
-      "1  [Remove the stems and roots from the spinach. ...             5   \n",
-      "2  [I like this cake best baked in a 9-inch angel...           105   \n",
-      "3  [A short time before serving, mash avocado and...             7   \n",
-      "4  [1. Place the tomatoes, garlic, salt, paprika,...             7   \n",
-      "\n",
-      "   willMakeAgainPct  dateCrawled  \n",
-      "0               100   1498547035  \n",
-      "1                80   1498547740  \n",
-      "2                88   1498547738  \n",
-      "3               100   1498547740  \n",
-      "4                83   1498547740  \n",
-      "(34756, 15)\n"
+     "ename": "NameError",
+     "evalue": "name 'stanza' is not defined",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
+      "Cell \u001b[0;32mIn[1], line 2\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[38;5;66;03m# instantiate stanza pipeline\u001b[39;00m\n\u001b[0;32m----> 2\u001b[0m \u001b[43mstanza\u001b[49m\u001b[38;5;241m.\u001b[39mdownload(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124men\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[1;32m      3\u001b[0m nlp \u001b[38;5;241m=\u001b[39m stanza\u001b[38;5;241m.\u001b[39mPipeline(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124men\u001b[39m\u001b[38;5;124m'\u001b[39m, \n\u001b[1;32m      4\u001b[0m                     depparse_batch_size\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m50\u001b[39m, \n\u001b[1;32m      5\u001b[0m                     depparse_min_length_to_batch_separately\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m50\u001b[39m,\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m      8\u001b[0m                     batch_size\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m100\u001b[39m\n\u001b[1;32m      9\u001b[0m                     )\n\u001b[1;32m     11\u001b[0m \u001b[38;5;66;03m# load raw data and preprocess/clean\u001b[39;00m\n",
+      "\u001b[0;31mNameError\u001b[0m: name 'stanza' is not defined"
      ]
     }
    ],
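
Note on the hunk above: the notebook now commits a NameError because the cell calls stanza.download('en') before stanza is imported in that kernel. A minimal sketch of the corrected cell, reproducing only the keyword arguments visible in the traceback; the lines elided by "(...)" are left out rather than guessed:

    # Import first, then fetch models and build the pipeline.
    import stanza

    stanza.download('en')
    nlp = stanza.Pipeline(
        'en',
        depparse_batch_size=50,
        depparse_min_length_to_batch_separately=50,
        batch_size=100,
    )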
diff --git a/nbs/13_new_preproc_test.ipynb b/nbs/13_new_preproc_test.ipynb
index 9e207ff..7dea6f0 100644
--- a/nbs/13_new_preproc_test.ipynb
+++ b/nbs/13_new_preproc_test.ipynb
@@ -28,7 +28,7 @@
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "d7088c22cb4c4a2aa598d9fb700e8af0",
+       "model_id": "d85c6278f7234ed288c39189c62f316e",
        "version_major": 2,
        "version_minor": 0
       },
@@ -43,16 +43,16 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "2024-03-27 20:57:40 INFO: Downloading default packages for language: en (English) ...\n",
-      "2024-03-27 20:57:41 INFO: File exists: /home/awchen/stanza_resources/en/default.zip\n",
-      "2024-03-27 20:57:44 INFO: Finished downloading models and saved to /home/awchen/stanza_resources.\n",
-      "2024-03-27 20:57:44 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES\n"
+      "2024-04-03 18:58:24 INFO: Downloading default packages for language: en (English) ...\n",
+      "2024-04-03 18:58:24 INFO: File exists: /home/awchen/stanza_resources/en/default.zip\n",
+      "2024-04-03 18:58:27 INFO: Finished downloading models and saved to /home/awchen/stanza_resources.\n",
+      "2024-04-03 18:58:27 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES\n"
      ]
     },
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "f62ab630814a49449d6f0be2ac47e87e",
+       "model_id": "646c5253882946ff89f2460e06d236a4",
        "version_major": 2,
        "version_minor": 0
       },
@@ -67,7 +67,7 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "2024-03-27 20:57:45 INFO: Loading these models for language: en (English):\n",
+      "2024-04-03 18:58:28 INFO: Loading these models for language: en (English):\n",
       "======================================\n",
       "| Processor    | Package             |\n",
       "--------------------------------------\n",
@@ -80,15 +80,15 @@
       "| ner          | ontonotes_charlm    |\n",
       "======================================\n",
       "\n",
-      "2024-03-27 20:57:45 INFO: Using device: cpu\n",
-      "2024-03-27 20:57:45 INFO: Loading: tokenize\n",
-      "2024-03-27 20:57:45 INFO: Loading: pos\n",
-      "2024-03-27 20:57:45 INFO: Loading: lemma\n",
-      "2024-03-27 20:57:45 INFO: Loading: constituency\n",
-      "2024-03-27 20:57:45 INFO: Loading: depparse\n",
-      "2024-03-27 20:57:45 INFO: Loading: sentiment\n",
-      "2024-03-27 20:57:46 INFO: Loading: ner\n",
-      "2024-03-27 20:57:46 INFO: Done loading processors!\n"
+      "2024-04-03 18:58:28 INFO: Using device: cpu\n",
+      "2024-04-03 18:58:28 INFO: Loading: tokenize\n",
+      "2024-04-03 18:58:28 INFO: Loading: pos\n",
+      "2024-04-03 18:58:28 INFO: Loading: lemma\n",
+      "2024-04-03 18:58:29 INFO: Loading: constituency\n",
+      "2024-04-03 18:58:29 INFO: Loading: depparse\n",
+      "2024-04-03 18:58:29 INFO: Loading: sentiment\n",
+      "2024-04-03 18:58:29 INFO: Loading: ner\n",
+      "2024-04-03 18:58:30 INFO: Done loading processors!\n"
      ]
     }
    ],
@@ -164,7 +164,7 @@
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "e03d339b07814cbc840d2f131a85f927",
+       "model_id": "7bd9eefff2a34d3dab46d8aa6564f18a",
        "version_major": 2,
        "version_minor": 0
       },
@@ -179,16 +179,16 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "2024-03-27 20:57:46 INFO: Downloading default packages for language: en (English) ...\n",
-      "2024-03-27 20:57:47 INFO: File exists: /home/awchen/stanza_resources/en/default.zip\n",
-      "2024-03-27 20:57:50 INFO: Finished downloading models and saved to /home/awchen/stanza_resources.\n",
-      "2024-03-27 20:57:50 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES\n"
+      "2024-04-03 18:58:30 INFO: Downloading default packages for language: en (English) ...\n",
+      "2024-04-03 18:58:31 INFO: File exists: /home/awchen/stanza_resources/en/default.zip\n",
+      "2024-04-03 18:58:34 INFO: Finished downloading models and saved to /home/awchen/stanza_resources.\n",
+      "2024-04-03 18:58:34 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES\n"
      ]
     },
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "c31d2969037c4ad1a79333ff34364488",
+       "model_id": "e2c669f0e57b433b9d1afce065617d0e",
        "version_major": 2,
        "version_minor": 0
       },
@@ -203,7 +203,7 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "2024-03-27 20:57:51 INFO: Loading these models for language: en (English):\n",
+      "2024-04-03 18:58:35 INFO: Loading these models for language: en (English):\n",
       "======================================\n",
       "| Processor    | Package             |\n",
       "--------------------------------------\n",
@@ -216,15 +216,15 @@
       "| ner          | ontonotes_charlm    |\n",
       "======================================\n",
       "\n",
-      "2024-03-27 20:57:51 INFO: Using device: cuda\n",
-      "2024-03-27 20:57:51 INFO: Loading: tokenize\n",
-      "2024-03-27 20:57:54 INFO: Loading: pos\n",
-      "2024-03-27 20:57:54 INFO: Loading: lemma\n",
-      "2024-03-27 20:57:54 INFO: Loading: constituency\n",
-      "2024-03-27 20:57:54 INFO: Loading: depparse\n",
-      "2024-03-27 20:57:54 INFO: Loading: sentiment\n",
-      "2024-03-27 20:57:55 INFO: Loading: ner\n",
-      "2024-03-27 20:57:55 INFO: Done loading processors!\n"
+      "2024-04-03 18:58:35 INFO: Using device: cuda\n",
+      "2024-04-03 18:58:35 INFO: Loading: tokenize\n",
+      "2024-04-03 18:58:38 INFO: Loading: pos\n",
+      "2024-04-03 18:58:38 INFO: Loading: lemma\n",
+      "2024-04-03 18:58:38 INFO: Loading: constituency\n",
+      "2024-04-03 18:58:39 INFO: Loading: depparse\n",
+      "2024-04-03 18:58:39 INFO: Loading: sentiment\n",
+      "2024-04-03 18:58:39 INFO: Loading: ner\n",
+      "2024-04-03 18:58:40 INFO: Done loading processors!\n"
      ]
     }
    ],
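
Note on the two loader logs above: the same English pipeline is built twice, once reporting "Using device: cpu" and once "Using device: cuda". A minimal sketch of how that difference can be controlled explicitly, assuming the notebook toggles stanza's use_gpu flag (the actual constructor arguments sit outside these hunks):

    import stanza

    nlp_cpu = stanza.Pipeline('en', use_gpu=False)  # pin to CPU
    nlp_gpu = stanza.Pipeline('en', use_gpu=True)   # use CUDA when available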
@@ -244,7 +244,88 @@
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "\n",
+      "--------------\n",
+      "Raw Dataframe:\n",
+      "                         id  \\\n",
+      "0  54a2b6b019925f464b373351   \n",
+      "1  54a408a019925f464b3733bc   \n",
+      "2  54a408a26529d92b2c003631   \n",
+      "3  54a408a66529d92b2c003638   \n",
+      "4  54a408a719925f464b3733cc   \n",
+      "\n",
+      "                                                 dek  \\\n",
+      "0  How does fried chicken achieve No. 1 status? B...   \n",
+      "1                                Spinaci all'Ebraica   \n",
+      "2  This majestic, moist, and richly spiced honey ...   \n",
+      "3  The idea for this sandwich came to me when my ...   \n",
+      "4  In 1930, Simon Agranat, the chief justice of t...   \n",
+      "\n",
+      "                                     hed                   pubDate  \\\n",
+      "0            Pickle-Brined Fried Chicken  2014-08-19T04:00:00.000Z   \n",
+      "1                   Spinach Jewish Style  2008-09-09T04:00:00.000Z   \n",
+      "2                  New Year’s Honey Cake  2008-09-10T04:00:00.000Z   \n",
+      "3  The B.L.A.—Bagel with Lox and Avocado  2008-09-08T04:00:00.000Z   \n",
+      "4        Shakshuka a la Doktor Shakshuka  2008-09-09T04:00:00.000Z   \n",
+      "\n",
+      "                             author    type  \\\n",
+      "0                                []  recipe   \n",
+      "1  [{'name': 'Edda Servi Machlin'}]  recipe   \n",
+      "2       [{'name': 'Marcy Goldman'}]  recipe   \n",
+      "3           [{'name': 'Faye Levy'}]  recipe   \n",
+      "4         [{'name': 'Joan Nathan'}]  recipe   \n",
+      "\n",
+      "                                                 url  \\\n",
+      "0  /recipes/food/views/pickle-brined-fried-chicke...   \n",
+      "1    /recipes/food/views/spinach-jewish-style-350152   \n",
+      "2  /recipes/food/views/majestic-and-moist-new-yea...   \n",
+      "3  /recipes/food/views/the-b-l-a-bagel-with-lox-a...   \n",
+      "4  /recipes/food/views/shakshuka-a-la-doktor-shak...   \n",
+      "\n",
+      "                                           photoData  \\\n",
+      "0  {'id': '54a2b64a6529d92b2c003409', 'filename':...   \n",
+      "1  {'id': '56746182accb4c9831e45e0a', 'filename':...   \n",
+      "2  {'id': '55e85ba4cf90d6663f728014', 'filename':...   \n",
+      "3  {'id': '5674617e47d1a28026045e4f', 'filename':...   \n",
+      "4  {'id': '56746183b47c050a284a4e15', 'filename':...   \n",
+      "\n",
+      "                                                 tag  aggregateRating  \\\n",
+      "0  {'category': 'ingredient', 'name': 'Chicken', ...             3.11   \n",
+      "1  {'category': 'cuisine', 'name': 'Italian', 'ur...             3.22   \n",
+      "2  {'category': 'cuisine', 'name': 'Jewish', 'url...             3.62   \n",
+      "3  {'category': 'cuisine', 'name': 'Jewish', 'url...             4.00   \n",
+      "4  {'category': 'cuisine', 'name': 'Jewish', 'url...             2.71   \n",
+      "\n",
+      "                                         ingredients  \\\n",
+      "0  [1 tablespoons yellow mustard seeds, 1 tablesp...   \n",
+      "1  [3 pounds small-leaved bulk spinach, Salt, 1/2...   \n",
+      "2  [3 1/2 cups all-purpose flour, 1 tablespoon ba...   \n",
+      "3  [1 small ripe avocado, preferably Hass (see No...   \n",
+      "4  [2 pounds fresh tomatoes, unpeeled and cut in ...   \n",
+      "\n",
+      "                                           prepSteps  reviewsCount  \\\n",
+      "0  [Toast mustard and coriander seeds in a dry me...             7   \n",
+      "1  [Remove the stems and roots from the spinach. ...             5   \n",
+      "2  [I like this cake best baked in a 9-inch angel...           105   \n",
+      "3  [A short time before serving, mash avocado and...             7   \n",
+      "4  [1. Place the tomatoes, garlic, salt, paprika,...             7   \n",
+      "\n",
+      "   willMakeAgainPct  dateCrawled  \n",
+      "0               100   1498547035  \n",
+      "1                80   1498547740  \n",
+      "2                88   1498547738  \n",
+      "3               100   1498547740  \n",
+      "4                83   1498547740  \n",
+      "(34756, 15)\n"
+     ]
+    }
+   ],
    "source": [
     "# load raw data and preprocess/clean\n",
     "data = dvc.api.read(\n",
@@ -262,7 +343,114 @@
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "\n",
+      "--------------\n",
+      "Preprocessed Dataframe:\n",
+      "                                                                        dek  \\\n",
+      "id                                                                            \n",
+      "54a4270b19925f464b37c1dc                                                      \n",
+      "54a42cde19925f464b3809d2  Green chiles pickled in soy sauce and vinegar ...   \n",
+      "54a433036529d92b2c015de3  This soup features the flavors of India: aroma...   \n",
+      "54a451926529d92b2c01eda8                                                      \n",
+      "54a430876529d92b2c013e2b  Brown sugar and molasses are balanced by fresh...   \n",
+      "\n",
+      "                                                                        hed  \\\n",
+      "id                                                                            \n",
+      "54a4270b19925f464b37c1dc  Grilled Hearts of Romaine with Blue Cheese Vin...   \n",
+      "54a42cde19925f464b3809d2                              Soy-Pickled Jalapeños   \n",
+      "54a433036529d92b2c015de3  Curried Potato and Spinach Soup with Onion Sal...   \n",
+      "54a451926529d92b2c01eda8                                       Chicken Soup   \n",
+      "54a430876529d92b2c013e2b                           Sweet-Hot Barbecue Sauce   \n",
+      "\n",
+      "                          aggregateRating  \\\n",
+      "id                                          \n",
+      "54a4270b19925f464b37c1dc             3.64   \n",
+      "54a42cde19925f464b3809d2             3.43   \n",
+      "54a433036529d92b2c015de3             3.00   \n",
+      "54a451926529d92b2c01eda8             3.19   \n",
+      "54a430876529d92b2c013e2b             0.00   \n",
+      "\n",
+      "                                                                ingredients  \\\n",
+      "id                                                                            \n",
+      "54a4270b19925f464b37c1dc  [1 1/2 cups white wine vinegar, 1/2 cup sugar,...   \n",
+      "54a42cde19925f464b3809d2  [3 large fresh jalapeños (4 inches), sliced 1/...   \n",
+      "54a433036529d92b2c015de3  [4 cups chopped red onions (about 2 large), 1 ...   \n",
+      "54a451926529d92b2c01eda8  [1 pound chicken parts, 2 stalks celery, inclu...   \n",
+      "54a430876529d92b2c013e2b  [2 tablespoons olive oil, 1 cup chopped onion,...   \n",
+      "\n",
+      "                                                                  prepSteps  \\\n",
+      "id                                                                            \n",
+      "54a4270b19925f464b37c1dc  [Combine first 5 ingredients and 1/4 teaspoon ...   \n",
+      "54a42cde19925f464b3809d2  [Combine all ingredients in a small heavy sauc...   \n",
+      "54a433036529d92b2c015de3  [Combine first 5 ingredients in heavy medium s...   \n",
+      "54a451926529d92b2c01eda8  [1. Pour 12 cups of cold water into a large st...   \n",
+      "54a430876529d92b2c013e2b  [Heat oil in large saucepan over medium-high h...   \n",
+      "\n",
+      "                          reviewsCount  willMakeAgainPct  \\\n",
+      "id                                                         \n",
+      "54a4270b19925f464b37c1dc             9               100   \n",
+      "54a42cde19925f464b3809d2             6               100   \n",
+      "54a433036529d92b2c015de3             6                67   \n",
+      "54a451926529d92b2c01eda8            32                87   \n",
+      "54a430876529d92b2c013e2b             0                 0   \n",
+      "\n",
+      "                                                      ingredients_lemmafied  \\\n",
+      "id                                                                            \n",
+      "54a4270b19925f464b37c1dc  cup white wine vinegar brk cup sugar brk cup w...   \n",
+      "54a42cde19925f464b3809d2  large fresh jalapeño inch slice inch thick brk...   \n",
+      "54a433036529d92b2c015de3  cup chop red onion large brk tablespoon sunflo...   \n",
+      "54a451926529d92b2c01eda8  pound chicken part brk stalk celery include le...   \n",
+      "54a430876529d92b2c013e2b  tablespoon olive oil brk cup chop onion brk cu...   \n",
+      "\n",
+      "                             cuisine_name  \\\n",
+      "id                                          \n",
+      "54a4270b19925f464b37c1dc  Missing Cuisine   \n",
+      "54a42cde19925f464b3809d2  Missing Cuisine   \n",
+      "54a433036529d92b2c015de3           Indian   \n",
+      "54a451926529d92b2c01eda8           Kosher   \n",
+      "54a430876529d92b2c013e2b  Missing Cuisine   \n",
+      "\n",
+      "                                               photo_filename  \\\n",
+      "id                                                              \n",
+      "54a4270b19925f464b37c1dc  EP_12162015_placeholders_casual.jpg   \n",
+      "54a42cde19925f464b3809d2  EP_12162015_placeholders_rustic.jpg   \n",
+      "54a433036529d92b2c015de3                           234125.jpg   \n",
+      "54a451926529d92b2c01eda8  EP_12162015_placeholders_formal.jpg   \n",
+      "54a430876529d92b2c013e2b  EP_12162015_placeholders_rustic.jpg   \n",
+      "\n",
+      "                                                               photo_credit  \\\n",
+      "id                                                                            \n",
+      "54a4270b19925f464b37c1dc  Photo by Chelsea Kyle, Prop Styling by Rhoda B...   \n",
+      "54a42cde19925f464b3809d2  Photo by Chelsea Kyle, Prop Styling by Anna St...   \n",
+      "54a433036529d92b2c015de3                                      Brian Leatart   \n",
+      "54a451926529d92b2c01eda8  Photo by Chelsea Kyle, Prop Styling by Rhoda B...   \n",
+      "54a430876529d92b2c013e2b  Photo by Chelsea Kyle, Prop Styling by Anna St...   \n",
+      "\n",
+      "                              author_name            date_published  \\\n",
+      "id                                                                    \n",
+      "54a4270b19925f464b37c1dc    Kate Higgins  2010-12-16 04:00:00+00:00   \n",
+      "54a42cde19925f464b3809d2     Lillian Chou 2009-02-19 04:00:00+00:00   \n",
+      "54a433036529d92b2c015de3     Peter Gordon 2006-03-07 04:00:00+00:00   \n",
+      "54a451926529d92b2c01eda8  Sharon Lebewohl 2004-08-20 04:00:00+00:00   \n",
+      "54a430876529d92b2c013e2b   Suzanne Tracht 2007-12-03 20:11:11+00:00   \n",
+      "\n",
+      "                                                                 recipe_url  \n",
+      "id                                                                           \n",
+      "54a4270b19925f464b37c1dc  https://www.epicurious.com/recipes/food/views/...  \n",
+      "54a42cde19925f464b3809d2  https://www.epicurious.com/recipes/food/views/...  \n",
+      "54a433036529d92b2c015de3  https://www.epicurious.com/recipes/food/views/...  \n",
+      "54a451926529d92b2c01eda8  https://www.epicurious.com/recipes/food/views/...  \n",
+      "54a430876529d92b2c013e2b  https://www.epicurious.com/recipes/food/views/...  \n",
+      "(50, 14)\n"
+     ]
+    }
+   ],
    "source": [
     "# take sample and train/test split \n",
     "subset_df = raw_df.sample(n=100, random_state=45)\n",
@@ -281,7 +469,35 @@
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "fit_transform start: 2024-04-03 19:00:04.028764\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "100%|██████████| 50/50 [00:00<00:00, 8700.07it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "fit_transform end: 2024-04-03 19:00:04.049144\n",
+      "Index(['English', 'English hothouse', 'English hothouse cucumber', 'available',\n",
+      "       'baby', 'baking', 'baking powder', 'bay', 'bay leave', 'beef',\n",
+      "       ...\n",
+      "       'white', 'white vinegar', 'white wine', 'white wine vinegar', 'whole',\n",
+      "       'wine', 'wine vinegar', 'yukon', 'yukon gold', 'yukon gold potato'],\n",
+      "      dtype='object', length=283)\n"
+     ]
+    }
+   ],
    "source": [
     "# cv_params are parameters for the sklearn CountVectorizer or TFIDFVectorizer\n",
     "sklearn_transformer_params = {    \n",
@@ -315,7 +531,1661 @@
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>… [HTML rendering of the fitted TF-IDF DataFrame: 50 recipe ids × 283 n-gram ingredient features, 'English' through 'yukon gold potato'; footer reads '50 rows × 283 columns'] …</div>"
+      ],
+      "text/plain": [
+       "… [plain-text rendering of the same 50 × 283 TF-IDF DataFrame; truncated in the original] …"
0.000000 0.000000 0.000000 \n", + "54a434d819925f464b386e62 0.000000 ... 0.000000 0.000000 0.000000 \n", + "54a428116529d92b2c00d1a7 0.000000 ... 0.000000 0.000000 0.000000 \n", + "54a436036529d92b2c01859e 0.000000 ... 0.000000 0.000000 0.000000 \n", + "54a47edf19925f464b39c58d 0.000000 ... 0.000000 0.000000 0.000000 \n", + "54a419706529d92b2c006650 0.000000 ... 0.000000 0.000000 0.000000 \n", + "54a4349619925f464b386b12 0.000000 ... 0.000000 0.000000 0.000000 \n", + "54a4340f6529d92b2c016be8 0.000000 ... 0.000000 0.000000 0.000000 \n", + "54a40e546529d92b2c004606 0.000000 ... 0.000000 0.000000 0.000000 \n", + "54a428b419925f464b37d5ce 0.000000 ... 0.000000 0.000000 0.000000 \n", + "54a453a519925f464b38fd16 0.000000 ... 0.000000 0.000000 0.000000 \n", + "54a41cb219925f464b376d82 0.000000 ... 0.081131 0.000000 0.000000 \n", + "54a431896529d92b2c014b27 0.147772 ... 0.000000 0.000000 0.000000 \n", + "54a423ab19925f464b3799f2 0.000000 ... 0.000000 0.000000 0.000000 \n", + "54a47c1419925f464b39bb28 0.000000 ... 0.000000 0.000000 0.000000 \n", + "593ee3ba12c27b182380821f 0.000000 ... 0.067952 0.000000 0.101522 \n", + "54a456366529d92b2c02235a 0.120817 ... 0.000000 0.000000 0.000000 \n", + "54a452d419925f464b38f1b5 0.000000 ... 0.111927 0.000000 0.000000 \n", + "54a4659b6529d92b2c026a53 0.000000 ... 0.000000 0.000000 0.000000 \n", + "54a46d5d19925f464b3982d3 0.000000 ... 0.111053 0.000000 0.000000 \n", + "54a4582119925f464b3927a1 0.000000 ... 0.000000 0.000000 0.000000 \n", + "54a4205319925f464b377c9f 0.000000 ... 0.000000 0.000000 0.000000 \n", + "54a470cc19925f464b39906b 0.000000 ... 0.000000 0.000000 0.000000 \n", + "54a44f4a6529d92b2c01de45 0.000000 ... 0.259213 0.000000 0.000000 \n", + "592ef494ae10ad089795ebfa 0.000000 ... 0.098560 0.000000 0.147252 \n", + "54a41f016529d92b2c00757d 0.000000 ... 0.124263 0.198122 0.000000 \n", + "54a436266529d92b2c01876e 0.000000 ... 0.000000 0.000000 0.000000 \n", + "54a428bd19925f464b37d63e 0.000000 ... 0.000000 0.000000 0.000000 \n", + "54a45a4f6529d92b2c0234d1 0.000000 ... 0.000000 0.000000 0.000000 \n", + "54a41ed76529d92b2c007440 0.000000 ... 0.000000 0.000000 0.000000 \n", + "54a42c906529d92b2c010b74 0.000000 ... 0.000000 0.000000 0.000000 \n", + "569519a6dc18ea6c22c9b9ab 0.000000 ... 0.407226 0.000000 0.000000 \n", + "54a438d56529d92b2c019648 0.000000 ... 
0.000000 0.000000 0.000000 \n", + "\n", + " white wine vinegar whole wine \\\n", + "id \n", + "54a4270b19925f464b37c1dc 0.281110 0.000000 0.241343 \n", + "54a42cde19925f464b3809d2 0.000000 0.000000 0.000000 \n", + "54a433036529d92b2c015de3 0.000000 0.000000 0.000000 \n", + "54a451926529d92b2c01eda8 0.000000 0.478039 0.000000 \n", + "54a430876529d92b2c013e2b 0.000000 0.000000 0.097329 \n", + "54a453df6529d92b2c020687 0.000000 0.115469 0.000000 \n", + "55b0e7116284773353bf4580 0.000000 0.000000 0.000000 \n", + "54a42bab6529d92b2c00ffa7 0.000000 0.000000 0.000000 \n", + "54a4748f19925f464b399ef2 0.000000 0.000000 0.000000 \n", + "54a4356a19925f464b3875bb 0.147102 0.126292 0.126292 \n", + "54a4697e6529d92b2c0279d3 0.000000 0.000000 0.000000 \n", + "54a45e426529d92b2c02488f 0.000000 0.000000 0.000000 \n", + "54a452c96529d92b2c01f889 0.000000 0.000000 0.000000 \n", + "54a4323619925f464b384bcc 0.000000 0.000000 0.000000 \n", + "54a4259119925f464b37af9c 0.000000 0.000000 0.000000 \n", + "54a431da6529d92b2c014ee9 0.000000 0.000000 0.000000 \n", + "54a426fd19925f464b37c125 0.000000 0.000000 0.000000 \n", + "54a47bb019925f464b39b9b7 0.000000 0.000000 0.000000 \n", + "54a434d819925f464b386e62 0.000000 0.000000 0.000000 \n", + "54a428116529d92b2c00d1a7 0.000000 0.000000 0.000000 \n", + "54a436036529d92b2c01859e 0.000000 0.000000 0.000000 \n", + "54a47edf19925f464b39c58d 0.000000 0.000000 0.000000 \n", + "54a419706529d92b2c006650 0.000000 0.000000 0.000000 \n", + "54a4349619925f464b386b12 0.000000 0.107952 0.000000 \n", + "54a4340f6529d92b2c016be8 0.000000 0.000000 0.000000 \n", + "54a40e546529d92b2c004606 0.000000 0.000000 0.000000 \n", + "54a428b419925f464b37d5ce 0.000000 0.103235 0.000000 \n", + "54a453a519925f464b38fd16 0.000000 0.000000 0.000000 \n", + "54a41cb219925f464b376d82 0.000000 0.000000 0.000000 \n", + "54a431896529d92b2c014b27 0.000000 0.000000 0.000000 \n", + "54a423ab19925f464b3799f2 0.000000 0.132349 0.000000 \n", + "54a47c1419925f464b39bb28 0.000000 0.000000 0.000000 \n", + "593ee3ba12c27b182380821f 0.101522 0.000000 0.087160 \n", + "54a456366529d92b2c02235a 0.000000 0.000000 0.000000 \n", + "54a452d419925f464b38f1b5 0.000000 0.000000 0.000000 \n", + "54a4659b6529d92b2c026a53 0.000000 0.000000 0.000000 \n", + "54a46d5d19925f464b3982d3 0.000000 0.000000 0.000000 \n", + "54a4582119925f464b3927a1 0.000000 0.000000 0.000000 \n", + "54a4205319925f464b377c9f 0.000000 0.000000 0.000000 \n", + "54a470cc19925f464b39906b 0.000000 0.000000 0.168937 \n", + "54a44f4a6529d92b2c01de45 0.000000 0.000000 0.000000 \n", + "592ef494ae10ad089795ebfa 0.147252 0.000000 0.126421 \n", + "54a41f016529d92b2c00757d 0.000000 0.000000 0.000000 \n", + "54a436266529d92b2c01876e 0.000000 0.000000 0.000000 \n", + "54a428bd19925f464b37d63e 0.000000 0.000000 0.000000 \n", + "54a45a4f6529d92b2c0234d1 0.000000 0.000000 0.000000 \n", + "54a41ed76529d92b2c007440 0.000000 0.000000 0.000000 \n", + "54a42c906529d92b2c010b74 0.000000 0.148937 0.000000 \n", + "569519a6dc18ea6c22c9b9ab 0.000000 0.000000 0.000000 \n", + "54a438d56529d92b2c019648 0.000000 0.000000 0.121613 \n", + "\n", + " wine vinegar yukon yukon gold \\\n", + "id \n", + "54a4270b19925f464b37c1dc 0.252641 0.000000 0.000000 \n", + "54a42cde19925f464b3809d2 0.000000 0.000000 0.000000 \n", + "54a433036529d92b2c015de3 0.000000 0.089666 0.089666 \n", + "54a451926529d92b2c01eda8 0.000000 0.000000 0.000000 \n", + "54a430876529d92b2c013e2b 0.000000 0.000000 0.000000 \n", + "54a453df6529d92b2c020687 0.000000 0.000000 0.000000 \n", + "55b0e7116284773353bf4580 0.000000 
0.000000 0.000000 \n", + "54a42bab6529d92b2c00ffa7 0.000000 0.000000 0.000000 \n", + "54a4748f19925f464b399ef2 0.000000 0.154879 0.154879 \n", + "54a4356a19925f464b3875bb 0.132204 0.000000 0.000000 \n", + "54a4697e6529d92b2c0279d3 0.000000 0.000000 0.000000 \n", + "54a45e426529d92b2c02488f 0.000000 0.000000 0.000000 \n", + "54a452c96529d92b2c01f889 0.000000 0.000000 0.000000 \n", + "54a4323619925f464b384bcc 0.000000 0.000000 0.000000 \n", + "54a4259119925f464b37af9c 0.000000 0.000000 0.000000 \n", + "54a431da6529d92b2c014ee9 0.000000 0.184132 0.184132 \n", + "54a426fd19925f464b37c125 0.000000 0.000000 0.000000 \n", + "54a47bb019925f464b39b9b7 0.000000 0.000000 0.000000 \n", + "54a434d819925f464b386e62 0.000000 0.000000 0.000000 \n", + "54a428116529d92b2c00d1a7 0.000000 0.000000 0.000000 \n", + "54a436036529d92b2c01859e 0.000000 0.000000 0.000000 \n", + "54a47edf19925f464b39c58d 0.000000 0.000000 0.000000 \n", + "54a419706529d92b2c006650 0.000000 0.000000 0.000000 \n", + "54a4349619925f464b386b12 0.000000 0.000000 0.000000 \n", + "54a4340f6529d92b2c016be8 0.000000 0.000000 0.000000 \n", + "54a40e546529d92b2c004606 0.000000 0.000000 0.000000 \n", + "54a428b419925f464b37d5ce 0.000000 0.000000 0.000000 \n", + "54a453a519925f464b38fd16 0.000000 0.000000 0.000000 \n", + "54a41cb219925f464b376d82 0.000000 0.000000 0.000000 \n", + "54a431896529d92b2c014b27 0.000000 0.000000 0.000000 \n", + "54a423ab19925f464b3799f2 0.000000 0.000000 0.000000 \n", + "54a47c1419925f464b39bb28 0.000000 0.000000 0.000000 \n", + "593ee3ba12c27b182380821f 0.091240 0.000000 0.000000 \n", + "54a456366529d92b2c02235a 0.000000 0.000000 0.000000 \n", + "54a452d419925f464b38f1b5 0.000000 0.000000 0.000000 \n", + "54a4659b6529d92b2c026a53 0.000000 0.000000 0.000000 \n", + "54a46d5d19925f464b3982d3 0.000000 0.000000 0.000000 \n", + "54a4582119925f464b3927a1 0.000000 0.000000 0.000000 \n", + "54a4205319925f464b377c9f 0.000000 0.000000 0.000000 \n", + "54a470cc19925f464b39906b 0.176845 0.000000 0.000000 \n", + "54a44f4a6529d92b2c01de45 0.000000 0.000000 0.000000 \n", + "592ef494ae10ad089795ebfa 0.132339 0.000000 0.000000 \n", + "54a41f016529d92b2c00757d 0.000000 0.000000 0.000000 \n", + "54a436266529d92b2c01876e 0.000000 0.000000 0.000000 \n", + "54a428bd19925f464b37d63e 0.000000 0.000000 0.000000 \n", + "54a45a4f6529d92b2c0234d1 0.000000 0.000000 0.000000 \n", + "54a41ed76529d92b2c007440 0.000000 0.000000 0.000000 \n", + "54a42c906529d92b2c010b74 0.000000 0.000000 0.000000 \n", + "569519a6dc18ea6c22c9b9ab 0.000000 0.000000 0.000000 \n", + "54a438d56529d92b2c019648 0.127306 0.000000 0.000000 \n", + "\n", + " yukon gold potato \n", + "id \n", + "54a4270b19925f464b37c1dc 0.000000 \n", + "54a42cde19925f464b3809d2 0.000000 \n", + "54a433036529d92b2c015de3 0.089666 \n", + "54a451926529d92b2c01eda8 0.000000 \n", + "54a430876529d92b2c013e2b 0.000000 \n", + "54a453df6529d92b2c020687 0.000000 \n", + "55b0e7116284773353bf4580 0.000000 \n", + "54a42bab6529d92b2c00ffa7 0.000000 \n", + "54a4748f19925f464b399ef2 0.154879 \n", + "54a4356a19925f464b3875bb 0.000000 \n", + "54a4697e6529d92b2c0279d3 0.000000 \n", + "54a45e426529d92b2c02488f 0.000000 \n", + "54a452c96529d92b2c01f889 0.000000 \n", + "54a4323619925f464b384bcc 0.000000 \n", + "54a4259119925f464b37af9c 0.000000 \n", + "54a431da6529d92b2c014ee9 0.184132 \n", + "54a426fd19925f464b37c125 0.000000 \n", + "54a47bb019925f464b39b9b7 0.000000 \n", + "54a434d819925f464b386e62 0.000000 \n", + "54a428116529d92b2c00d1a7 0.000000 \n", + "54a436036529d92b2c01859e 0.000000 \n", + 
"54a47edf19925f464b39c58d 0.000000 \n", + "54a419706529d92b2c006650 0.000000 \n", + "54a4349619925f464b386b12 0.000000 \n", + "54a4340f6529d92b2c016be8 0.000000 \n", + "54a40e546529d92b2c004606 0.000000 \n", + "54a428b419925f464b37d5ce 0.000000 \n", + "54a453a519925f464b38fd16 0.000000 \n", + "54a41cb219925f464b376d82 0.000000 \n", + "54a431896529d92b2c014b27 0.000000 \n", + "54a423ab19925f464b3799f2 0.000000 \n", + "54a47c1419925f464b39bb28 0.000000 \n", + "593ee3ba12c27b182380821f 0.000000 \n", + "54a456366529d92b2c02235a 0.000000 \n", + "54a452d419925f464b38f1b5 0.000000 \n", + "54a4659b6529d92b2c026a53 0.000000 \n", + "54a46d5d19925f464b3982d3 0.000000 \n", + "54a4582119925f464b3927a1 0.000000 \n", + "54a4205319925f464b377c9f 0.000000 \n", + "54a470cc19925f464b39906b 0.000000 \n", + "54a44f4a6529d92b2c01de45 0.000000 \n", + "592ef494ae10ad089795ebfa 0.000000 \n", + "54a41f016529d92b2c00757d 0.000000 \n", + "54a436266529d92b2c01876e 0.000000 \n", + "54a428bd19925f464b37d63e 0.000000 \n", + "54a45a4f6529d92b2c0234d1 0.000000 \n", + "54a41ed76529d92b2c007440 0.000000 \n", + "54a42c906529d92b2c010b74 0.000000 \n", + "569519a6dc18ea6c22c9b9ab 0.000000 \n", + "54a438d56529d92b2c019648 0.000000 \n", + "\n", + "[50 rows x 283 columns]" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "transformed_recipe" ] @@ -324,7 +2194,300 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "['English',\n", + " 'English hothouse',\n", + " 'English hothouse cucumber',\n", + " 'available',\n", + " 'baby',\n", + " 'baking',\n", + " 'baking powder',\n", + " 'bay',\n", + " 'bay leave',\n", + " 'beef',\n", + " 'bell',\n", + " 'bell pepper',\n", + " 'black',\n", + " 'black pepper',\n", + " 'bread',\n", + " 'brk',\n", + " 'broth',\n", + " 'brown',\n", + " 'brown sugar',\n", + " 'bunch',\n", + " 'butter',\n", + " 'can',\n", + " 'carrot',\n", + " 'cayenne',\n", + " 'cayenne pepper',\n", + " 'celery',\n", + " 'cheese',\n", + " 'cherry',\n", + " 'chicken',\n", + " 'chile',\n", + " 'chop',\n", + " 'chop fresh',\n", + " 'chop fresh cilantro',\n", + " 'chop onion',\n", + " 'chop red',\n", + " 'cilantro',\n", + " 'cinnamon',\n", + " 'clove',\n", + " 'clove mince',\n", + " 'coarse',\n", + " 'coarse kosher',\n", + " 'coarse kosher salt',\n", + " 'core',\n", + " 'core cut',\n", + " 'core cut inch',\n", + " 'coriander',\n", + " 'cream',\n", + " 'crosswise',\n", + " 'cube',\n", + " 'cucumber',\n", + " 'cumin',\n", + " 'cup',\n", + " 'cup chop',\n", + " 'cup chop onion',\n", + " 'cup dry',\n", + " 'cup fresh',\n", + " 'cup grate',\n", + " 'cup olive',\n", + " 'cup olive oil',\n", + " 'cup pack',\n", + " 'cup purpose',\n", + " 'cup purpose flour',\n", + " 'cup slice',\n", + " 'cup sour',\n", + " 'cup sour cream',\n", + " 'cup stick',\n", + " 'cup sugar',\n", + " 'cup tablespoon',\n", + " 'cup water',\n", + " 'curry',\n", + " 'curry powder',\n", + " 'cut',\n", + " 'cut inch',\n", + " 'cut inch cube',\n", + " 'cut inch thick',\n", + " 'dark',\n", + " 'dark brown',\n", + " 'dark brown sugar',\n", + " 'dijon',\n", + " 'dijon mustard',\n", + " 'distil',\n", + " 'distil white',\n", + " 'distil white vinegar',\n", + " 'divide',\n", + " 'drain',\n", + " 'dry',\n", + " 'egg',\n", + " 'equipment',\n", + " 'extra',\n", + " 'extra virgin',\n", + " 'extra virgin olive',\n", + " 'extra virgin olive oil',\n", + " 'extract',\n", + " 'fillet',\n", + " 'firm',\n", + " 'flour',\n", + " 'food',\n", + " 'food 
store',\n", + " 'fresh',\n", + " 'fresh cilantro',\n", + " 'fresh lemon',\n", + " 'fresh lemon juice',\n", + " 'fresh lime',\n", + " 'fresh lime juice',\n", + " 'fresh mint',\n", + " 'garlic',\n", + " 'garlic clove',\n", + " 'garlic clove mince',\n", + " 'garlic powder',\n", + " 'garnish',\n", + " 'ginger',\n", + " 'gold',\n", + " 'gold potato',\n", + " 'golden',\n", + " 'golden brown',\n", + " 'grain',\n", + " 'grate',\n", + " 'grate lemon',\n", + " 'green',\n", + " 'green onion',\n", + " 'ground',\n", + " 'ground black',\n", + " 'ground black pepper',\n", + " 'ground cinnamon',\n", + " 'ground cumin',\n", + " 'halve',\n", + " 'halve pit',\n", + " 'hot',\n", + " 'hothouse',\n", + " 'hothouse cucumber',\n", + " 'inch',\n", + " 'inch cube',\n", + " 'inch long',\n", + " 'inch piece',\n", + " 'inch thick',\n", + " 'jalape',\n", + " 'juice',\n", + " 'kosher',\n", + " 'kosher salt',\n", + " 'large',\n", + " 'large egg',\n", + " 'large garlic',\n", + " 'large garlic clove',\n", + " 'leave',\n", + " 'lemon',\n", + " 'lemon juice',\n", + " 'light',\n", + " 'lime',\n", + " 'lime juice',\n", + " 'liqueur',\n", + " 'long',\n", + " 'low',\n", + " 'low salt',\n", + " 'medium',\n", + " 'milk',\n", + " 'mince',\n", + " 'mint',\n", + " 'mustard',\n", + " 'oil',\n", + " 'olive',\n", + " 'olive oil',\n", + " 'onion',\n", + " 'onion chop',\n", + " 'onion slice',\n", + " 'orange',\n", + " 'other',\n", + " 'ounce',\n", + " 'ounce can',\n", + " 'pack',\n", + " 'pack dark',\n", + " 'pack dark brown',\n", + " 'pack dark brown sugar',\n", + " 'parsley',\n", + " 'paste',\n", + " 'peel',\n", + " 'pepper',\n", + " 'piece',\n", + " 'pinch',\n", + " 'pit',\n", + " 'potato',\n", + " 'pound',\n", + " 'powder',\n", + " 'purpose',\n", + " 'purpose flour',\n", + " 'red',\n", + " 'red bell',\n", + " 'red bell pepper',\n", + " 'red onion',\n", + " 'red wine',\n", + " 'rice',\n", + " 'roast',\n", + " 'romaine',\n", + " 'room',\n", + " 'room temperature',\n", + " 'rosemary',\n", + " 'salt',\n", + " 'salt ground',\n", + " 'sauce',\n", + " 'scallion',\n", + " 'seed',\n", + " 'sesame',\n", + " 'shallot',\n", + " 'slice',\n", + " 'small',\n", + " 'sour',\n", + " 'sour cream',\n", + " 'soy',\n", + " 'soy sauce',\n", + " 'special',\n", + " 'special equipment',\n", + " 'specialty',\n", + " 'specialty food',\n", + " 'sprig',\n", + " 'stem',\n", + " 'stick',\n", + " 'store',\n", + " 'strip',\n", + " 'style',\n", + " 'such',\n", + " 'sugar',\n", + " 'tablespoon',\n", + " 'tablespoon chop',\n", + " 'tablespoon chop fresh',\n", + " 'tablespoon chop fresh cilantro',\n", + " 'tablespoon extra',\n", + " 'tablespoon extra virgin',\n", + " 'tablespoon extra virgin olive',\n", + " 'tablespoon fresh',\n", + " 'tablespoon fresh lemon',\n", + " 'tablespoon fresh lemon juice',\n", + " 'tablespoon light',\n", + " 'tablespoon olive',\n", + " 'tablespoon olive oil',\n", + " 'tablespoon sugar',\n", + " 'tablespoon white',\n", + " 'tablespoon white wine',\n", + " 'tablespoon white wine vinegar',\n", + " 'taste',\n", + " 'teaspoon',\n", + " 'teaspoon baking',\n", + " 'teaspoon dry',\n", + " 'teaspoon grate',\n", + " 'teaspoon grate lemon',\n", + " 'teaspoon ground',\n", + " 'teaspoon ground cumin',\n", + " 'teaspoon salt',\n", + " 'teaspoon vanilla',\n", + " 'teaspoon vanilla extract',\n", + " 'temperature',\n", + " 'thick',\n", + " 'thick slice',\n", + " 'thyme',\n", + " 'to',\n", + " 'to taste',\n", + " 'toast',\n", + " 'tomato',\n", + " 'tomato halve',\n", + " 'trim',\n", + " 'unpeeled',\n", + " 'unsalt',\n", + " 'unsalt butter',\n", + " 
'unsweetened',\n", + " 'use',\n", + " 'vanilla',\n", + " 'vanilla extract',\n", + " 'vegetable',\n", + " 'vegetable oil',\n", + " 'vinegar',\n", + " 'virgin',\n", + " 'virgin olive',\n", + " 'virgin olive oil',\n", + " 'water',\n", + " 'wedge',\n", + " 'white',\n", + " 'white vinegar',\n", + " 'white wine',\n", + " 'white wine vinegar',\n", + " 'whole',\n", + " 'wine',\n", + " 'wine vinegar',\n", + " 'yukon',\n", + " 'yukon gold',\n", + " 'yukon gold potato']" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "transformed_recipe.columns.tolist()" ] @@ -333,7 +2496,254 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
dekhedaggregateRatingingredientsprepStepsreviewsCountwillMakeAgainPctingredients_lemmafiedcuisine_namephoto_filenamephoto_creditauthor_namedate_publishedrecipe_url
id
54a4270b19925f464b37c1dcGrilled Hearts of Romaine with Blue Cheese Vin...3.64[1 1/2 cups white wine vinegar, 1/2 cup sugar,...[Combine first 5 ingredients and 1/4 teaspoon ...9100cup white wine vinegar brk cup sugar brk cup w...Missing CuisineEP_12162015_placeholders_casual.jpgPhoto by Chelsea Kyle, Prop Styling by Rhoda B...Kate Higgins2010-12-16 04:00:00+00:00https://www.epicurious.com/recipes/food/views/...
54a42cde19925f464b3809d2Green chiles pickled in soy sauce and vinegar ...Soy-Pickled Jalapeños3.43[3 large fresh jalapeños (4 inches), sliced 1/...[Combine all ingredients in a small heavy sauc...6100large fresh jalapeño inch slice inch thick brk...Missing CuisineEP_12162015_placeholders_rustic.jpgPhoto by Chelsea Kyle, Prop Styling by Anna St...Lillian Chou2009-02-19 04:00:00+00:00https://www.epicurious.com/recipes/food/views/...
54a433036529d92b2c015de3This soup features the flavors of India: aroma...Curried Potato and Spinach Soup with Onion Sal...3.00[4 cups chopped red onions (about 2 large), 1 ...[Combine first 5 ingredients in heavy medium s...667cup chop red onion large brk tablespoon sunflo...Indian234125.jpgBrian LeatartPeter Gordon2006-03-07 04:00:00+00:00https://www.epicurious.com/recipes/food/views/...
54a451926529d92b2c01eda8Chicken Soup3.19[1 pound chicken parts, 2 stalks celery, inclu...[1. Pour 12 cups of cold water into a large st...3287pound chicken part brk stalk celery include le...KosherEP_12162015_placeholders_formal.jpgPhoto by Chelsea Kyle, Prop Styling by Rhoda B...Sharon Lebewohl2004-08-20 04:00:00+00:00https://www.epicurious.com/recipes/food/views/...
54a430876529d92b2c013e2bBrown sugar and molasses are balanced by fresh...Sweet-Hot Barbecue Sauce0.00[2 tablespoons olive oil, 1 cup chopped onion,...[Heat oil in large saucepan over medium-high h...00tablespoon olive oil brk cup chop onion brk cu...Missing CuisineEP_12162015_placeholders_rustic.jpgPhoto by Chelsea Kyle, Prop Styling by Anna St...Suzanne Tracht2007-12-03 20:11:11+00:00https://www.epicurious.com/recipes/food/views/...
\n", + "
" + ], + "text/plain": [ + " dek \\\n", + "id \n", + "54a4270b19925f464b37c1dc \n", + "54a42cde19925f464b3809d2 Green chiles pickled in soy sauce and vinegar ... \n", + "54a433036529d92b2c015de3 This soup features the flavors of India: aroma... \n", + "54a451926529d92b2c01eda8 \n", + "54a430876529d92b2c013e2b Brown sugar and molasses are balanced by fresh... \n", + "\n", + " hed \\\n", + "id \n", + "54a4270b19925f464b37c1dc Grilled Hearts of Romaine with Blue Cheese Vin... \n", + "54a42cde19925f464b3809d2 Soy-Pickled Jalapeños \n", + "54a433036529d92b2c015de3 Curried Potato and Spinach Soup with Onion Sal... \n", + "54a451926529d92b2c01eda8 Chicken Soup \n", + "54a430876529d92b2c013e2b Sweet-Hot Barbecue Sauce \n", + "\n", + " aggregateRating \\\n", + "id \n", + "54a4270b19925f464b37c1dc 3.64 \n", + "54a42cde19925f464b3809d2 3.43 \n", + "54a433036529d92b2c015de3 3.00 \n", + "54a451926529d92b2c01eda8 3.19 \n", + "54a430876529d92b2c013e2b 0.00 \n", + "\n", + " ingredients \\\n", + "id \n", + "54a4270b19925f464b37c1dc [1 1/2 cups white wine vinegar, 1/2 cup sugar,... \n", + "54a42cde19925f464b3809d2 [3 large fresh jalapeños (4 inches), sliced 1/... \n", + "54a433036529d92b2c015de3 [4 cups chopped red onions (about 2 large), 1 ... \n", + "54a451926529d92b2c01eda8 [1 pound chicken parts, 2 stalks celery, inclu... \n", + "54a430876529d92b2c013e2b [2 tablespoons olive oil, 1 cup chopped onion,... \n", + "\n", + " prepSteps \\\n", + "id \n", + "54a4270b19925f464b37c1dc [Combine first 5 ingredients and 1/4 teaspoon ... \n", + "54a42cde19925f464b3809d2 [Combine all ingredients in a small heavy sauc... \n", + "54a433036529d92b2c015de3 [Combine first 5 ingredients in heavy medium s... \n", + "54a451926529d92b2c01eda8 [1. Pour 12 cups of cold water into a large st... \n", + "54a430876529d92b2c013e2b [Heat oil in large saucepan over medium-high h... \n", + "\n", + " reviewsCount willMakeAgainPct \\\n", + "id \n", + "54a4270b19925f464b37c1dc 9 100 \n", + "54a42cde19925f464b3809d2 6 100 \n", + "54a433036529d92b2c015de3 6 67 \n", + "54a451926529d92b2c01eda8 32 87 \n", + "54a430876529d92b2c013e2b 0 0 \n", + "\n", + " ingredients_lemmafied \\\n", + "id \n", + "54a4270b19925f464b37c1dc cup white wine vinegar brk cup sugar brk cup w... \n", + "54a42cde19925f464b3809d2 large fresh jalapeño inch slice inch thick brk... \n", + "54a433036529d92b2c015de3 cup chop red onion large brk tablespoon sunflo... \n", + "54a451926529d92b2c01eda8 pound chicken part brk stalk celery include le... \n", + "54a430876529d92b2c013e2b tablespoon olive oil brk cup chop onion brk cu... \n", + "\n", + " cuisine_name \\\n", + "id \n", + "54a4270b19925f464b37c1dc Missing Cuisine \n", + "54a42cde19925f464b3809d2 Missing Cuisine \n", + "54a433036529d92b2c015de3 Indian \n", + "54a451926529d92b2c01eda8 Kosher \n", + "54a430876529d92b2c013e2b Missing Cuisine \n", + "\n", + " photo_filename \\\n", + "id \n", + "54a4270b19925f464b37c1dc EP_12162015_placeholders_casual.jpg \n", + "54a42cde19925f464b3809d2 EP_12162015_placeholders_rustic.jpg \n", + "54a433036529d92b2c015de3 234125.jpg \n", + "54a451926529d92b2c01eda8 EP_12162015_placeholders_formal.jpg \n", + "54a430876529d92b2c013e2b EP_12162015_placeholders_rustic.jpg \n", + "\n", + " photo_credit \\\n", + "id \n", + "54a4270b19925f464b37c1dc Photo by Chelsea Kyle, Prop Styling by Rhoda B... \n", + "54a42cde19925f464b3809d2 Photo by Chelsea Kyle, Prop Styling by Anna St... 
\n", + "54a433036529d92b2c015de3 Brian Leatart \n", + "54a451926529d92b2c01eda8 Photo by Chelsea Kyle, Prop Styling by Rhoda B... \n", + "54a430876529d92b2c013e2b Photo by Chelsea Kyle, Prop Styling by Anna St... \n", + "\n", + " author_name date_published \\\n", + "id \n", + "54a4270b19925f464b37c1dc Kate Higgins 2010-12-16 04:00:00+00:00 \n", + "54a42cde19925f464b3809d2 Lillian Chou 2009-02-19 04:00:00+00:00 \n", + "54a433036529d92b2c015de3 Peter Gordon 2006-03-07 04:00:00+00:00 \n", + "54a451926529d92b2c01eda8 Sharon Lebewohl 2004-08-20 04:00:00+00:00 \n", + "54a430876529d92b2c013e2b Suzanne Tracht 2007-12-03 20:11:11+00:00 \n", + "\n", + " recipe_url \n", + "id \n", + "54a4270b19925f464b37c1dc https://www.epicurious.com/recipes/food/views/... \n", + "54a42cde19925f464b3809d2 https://www.epicurious.com/recipes/food/views/... \n", + "54a433036529d92b2c015de3 https://www.epicurious.com/recipes/food/views/... \n", + "54a451926529d92b2c01eda8 https://www.epicurious.com/recipes/food/views/... \n", + "54a430876529d92b2c013e2b https://www.epicurious.com/recipes/food/views/... " + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "to_nlp_df.head()" ] diff --git a/src/nltk/dish_predictor.py b/src/nltk/dish_predictor.py index a0c1ffd..fb68b78 100644 --- a/src/nltk/dish_predictor.py +++ b/src/nltk/dish_predictor.py @@ -15,8 +15,9 @@ import numpy as np from pandas.io.json import json_normalize import nltk -nltk.download('wordnet') -nltk.download('stopwords') + +nltk.download("wordnet") +nltk.download("stopwords") from nltk.corpus import stopwords from nltk import word_tokenize, FreqDist from nltk.stem.wordnet import WordNetLemmatizer @@ -28,235 +29,283 @@ # Define all functions + def import_stored_files(): - # Load in the stored Epicurious database, TFIDF Vectorizer object to transform, - # the input, and the TFIDF word matrix from joblib and created by - # prepare_database.py - with open("joblib/tfidf_recipe_dataframe.joblib", "rb") as fo: - prepped = joblib.load("joblib/tfidf_recipe_dataframe.joblib") + # Load in the stored Epicurious database, TFIDF Vectorizer object to transform, + # the input, and the TFIDF word matrix from joblib and created by + # prepare_database.py + with open("joblib/tfidf_recipe_dataframe.joblib", "rb") as fo: + prepped = joblib.load("joblib/tfidf_recipe_dataframe.joblib") - with open("joblib/recipe_tfidf.joblib", "rb") as fo: - ingred_tfidf = joblib.load("joblib/recipe_tfidf.joblib") + with open("joblib/recipe_tfidf.joblib", "rb") as fo: + ingred_tfidf = joblib.load("joblib/recipe_tfidf.joblib") - with open("joblib/recipe_word_matrix_tfidf.joblib", "rb") as fo: - ingred_word_matrix = joblib.load("joblib/recipe_word_matrix_tfidf.joblib") + with open("joblib/recipe_word_matrix_tfidf.joblib", "rb") as fo: + ingred_word_matrix = joblib.load("joblib/recipe_word_matrix_tfidf.joblib") - return prepped, ingred_tfidf, ingred_word_matrix + return prepped, ingred_tfidf, ingred_word_matrix def transform_tfidf(ingred_tfidf, recipe): - # This function takes in a TFIDF Vectorizer object and a recipe, then - # creates/transforms the given recipe into a TFIDF form - - ingreds = recipe['ingredients'].apply(" ".join).str.lower() - response = ingred_tfidf.transform(ingreds) - transformed_recipe = pd.DataFrame(response.toarray(), - columns=ingred_tfidf.get_feature_names(), - index=recipe.index) - return transformed_recipe - - -def filter_out_cuisine(ingred_word_matrix, - X_df, - cuisine_name, - tfidf): - # This function takes in the 
ingredient word matrix (from joblib), a - # dataframe made from the database (from joblib), the user inputted cuisine - # name, and the ingredient TFIDF Vectorizer object (from joblib) and returns - # a word sub matrix that removes all recipes with the same cuisine as the - # inputted recipe. - - east_asian = ['Asian', 'Chinese', 'Japanese'] - - southeast_asian = ['Thai', 'Vietnamese'] - - euro_islands = ['English', 'Irish'] - - euro_continental = ['French', 'German', 'Eastern European'] - - mediterranean = ['Italian', 'Mediterranean', 'Kosher', 'Middle Eastern'] - - all_cuisines = ['African', 'American', 'Asian', 'Cajun/Creole', 'Chinese', 'Eastern European', 'English', - 'French', 'German', 'Indian', 'Irish', 'Italian', 'Japanese', 'Kosher', 'Latin American', - 'Mediterranean', 'Mexican', 'Middle Eastern', 'Moroccan', 'Scandinavian', 'Southwestern', - 'Thai', 'Vietnamese'] - - if cuisine_name in east_asian: - choices = [cuis for cuis in all_cuisines if cuis not in east_asian] - elif cuisine_name in southeast_asian: - choices = [cuis for cuis in all_cuisines if cuis not in southeast_asian] - elif cuisine_name in euro_islands: - choices = [cuis for cuis in all_cuisines if cuis not in euro_islands] - elif cuisine_name in euro_continental: - choices = [cuis for cuis in all_cuisines if cuis not in euro_continental] - elif cuisine_name in mediterranean: - choices = [cuis for cuis in all_cuisines if cuis not in mediterranean] - else: - choices = [cuis for cuis in all_cuisines if cuis != cuisine_name] - - combo = pd.concat([ingred_word_matrix, X_df['imputed_label']], axis=1) - filtered_ingred_word_matrix = combo[combo['imputed_label'].isin(choices)].drop('imputed_label', - axis=1) - return filtered_ingred_word_matrix + # This function takes in a TFIDF Vectorizer object and a recipe, then + # creates/transforms the given recipe into a TFIDF form + + ingreds = recipe["ingredients"].apply(" ".join).str.lower() + response = ingred_tfidf.transform(ingreds) + transformed_recipe = pd.DataFrame( + response.toarray(), columns=ingred_tfidf.get_feature_names(), index=recipe.index + ) + return transformed_recipe + + +def filter_out_cuisine(ingred_word_matrix, X_df, cuisine_name, tfidf): + # This function takes in the ingredient word matrix (from joblib), a + # dataframe made from the database (from joblib), the user inputted cuisine + # name, and the ingredient TFIDF Vectorizer object (from joblib) and returns + # a word sub matrix that removes all recipes with the same cuisine as the + # inputted recipe. 
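As a quick illustration of the reformatted transform_tfidf above: it joins a recipe's ingredient strings, lowercases them, and runs them through the already-fitted vectorizer so the query lands in the same feature space as the stored matrix. A minimal sketch with toy data (the two-document corpus and ngram_range here are invented stand-ins; note also that scikit-learn renamed get_feature_names to get_feature_names_out in 1.0 and removed the old name in 1.2, so the call in this patch assumes an older sklearn):

    import pandas as pd
    from sklearn.feature_extraction.text import TfidfVectorizer

    # Stand-in for the corpus the stored vectorizer was fitted on.
    corpus = ["olive oil garlic clove", "sugar vanilla extract flour"]
    tfidf = TfidfVectorizer(ngram_range=(1, 2)).fit(corpus)

    # One query recipe, shaped like the DataFrame transform_tfidf expects.
    recipe = pd.DataFrame(
        {"ingredients": [["2 garlic cloves", "1 tbsp olive oil"]]}, index=["query"]
    )
    ingreds = recipe["ingredients"].apply(" ".join).str.lower()
    response = tfidf.transform(ingreds)
    row = pd.DataFrame(
        response.toarray(),
        columns=tfidf.get_feature_names_out(),  # get_feature_names() on older sklearn
        index=recipe.index,
    )
    print(row.loc[:, (row != 0).any()])  # only the matched ingredient columns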
+ + east_asian = ["Asian", "Chinese", "Japanese"] + + southeast_asian = ["Thai", "Vietnamese"] + + euro_islands = ["English", "Irish"] + + euro_continental = ["French", "German", "Eastern European"] + + mediterranean = ["Italian", "Mediterranean", "Kosher", "Middle Eastern"] + + all_cuisines = [ + "African", + "American", + "Asian", + "Cajun/Creole", + "Chinese", + "Eastern European", + "English", + "French", + "German", + "Indian", + "Irish", + "Italian", + "Japanese", + "Kosher", + "Latin American", + "Mediterranean", + "Mexican", + "Middle Eastern", + "Moroccan", + "Scandinavian", + "Southwestern", + "Thai", + "Vietnamese", + ] + + if cuisine_name in east_asian: + choices = [cuis for cuis in all_cuisines if cuis not in east_asian] + elif cuisine_name in southeast_asian: + choices = [cuis for cuis in all_cuisines if cuis not in southeast_asian] + elif cuisine_name in euro_islands: + choices = [cuis for cuis in all_cuisines if cuis not in euro_islands] + elif cuisine_name in euro_continental: + choices = [cuis for cuis in all_cuisines if cuis not in euro_continental] + elif cuisine_name in mediterranean: + choices = [cuis for cuis in all_cuisines if cuis not in mediterranean] + else: + choices = [cuis for cuis in all_cuisines if cuis != cuisine_name] + + combo = pd.concat([ingred_word_matrix, X_df["imputed_label"]], axis=1) + filtered_ingred_word_matrix = combo[combo["imputed_label"].isin(choices)].drop( + "imputed_label", axis=1 + ) + return filtered_ingred_word_matrix def picture_placer(filename): - # This function takes in a filename and returns the relative location inside - # an HTML tag - location = f'photos/{filename}' - return location + # This function takes in a filename and returns the relative location inside + # an HTML tag + location = f"photos/{filename}" + return location def link_maker(recipe_link): - # This function takes in the incomplete recipe link from the dataframe and - # returns the complete one. 
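The cuisine lists above implement a grouping rule worth spelling out: filtering is by family, not exact label, so a query tagged with one member of a group excludes candidates from the whole group. A self-contained sketch of just that rule (the function name is mine; the group lists are copied from this patch):

    GROUPS = [
        ["Asian", "Chinese", "Japanese"],                           # east_asian
        ["Thai", "Vietnamese"],                                     # southeast_asian
        ["English", "Irish"],                                       # euro_islands
        ["French", "German", "Eastern European"],                   # euro_continental
        ["Italian", "Mediterranean", "Kosher", "Middle Eastern"],   # mediterranean
    ]

    def allowed_cuisines(cuisine_name, all_cuisines):
        # An input cuisine knocks out its whole family; an ungrouped
        # cuisine only excludes itself.
        for group in GROUPS:
            if cuisine_name in group:
                return [c for c in all_cuisines if c not in group]
        return [c for c in all_cuisines if c != cuisine_name]

    print(allowed_cuisines("Thai", ["Thai", "Vietnamese", "Mexican", "Indian"]))
    # -> ['Mexican', 'Indian']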
- full_link = f'https://www.epicurious.com{recipe_link}' - return full_link - - -def find_closest_recipes(filtered_ingred_word_matrix, - recipe_tfidf, - X_df): - # This function takes in the filtered ingredient word matrix from function - # filter_out_cuisine, the TFIDF recipe from function transform_tfidf, and - # a dataframe made from the database (from joblib) and returns a Pandas - # DataFrame with the top five most similar recipes and a Pandas Series - # containing the similarity amount - - m2 = (recipe_tfidf != 0).any() - recipe_weights = recipe_tfidf.iloc[0][recipe_tfidf.iloc[0] != 0].to_dict() - - ingreds_used = m2.index[m2].tolist() - search_vec = np.array(recipe_tfidf).reshape(1,-1) - res_cos_sim = cosine_similarity(filtered_ingred_word_matrix, search_vec) - top_five = np.argsort(res_cos_sim.flatten())[-5:][::-1] - top_five_list = top_five.tolist() - - recipe_ids = [filtered_ingred_word_matrix.iloc[idx].name for idx in top_five] - - suggest_df = X_df.loc[recipe_ids] - proximity = pd.DataFrame(data=res_cos_sim[top_five], - columns=['cosine_similarity'], - index=suggest_df.index) - - full_df = pd.concat([suggest_df, proximity], axis=1) - expand_photo_df = pd.concat([full_df.drop(["photoData"], axis=1), - full_df["photoData"].apply(pd.Series)], axis=1) - reduced = expand_photo_df[['hed', 'recipe_url', 'filename', 'imputed_label', 'ingredients', 'cosine_similarity']].dropna(axis=1) - reduced['photo'] = reduced['filename'].apply(picture_placer) - reduced['fixed_url'] = reduced["recipe_url"].apply(link_maker) - reduced['rounded'] = reduced['cosine_similarity'].round(3) - - reduced = reduced.drop('recipe_url', axis=1) - - ingr_weights = [filtered_ingred_word_matrix.iloc[num][filtered_ingred_word_matrix.iloc[num] != 0].to_dict() for num in top_five_list] - reduced['ingred_weights'] = ingr_weights - - return reduced, ingreds_used, recipe_weights + # This function takes in the incomplete recipe link from the dataframe and + # returns the complete one. 
+ full_link = f"https://www.epicurious.com{recipe_link}" + return full_link + + +def find_closest_recipes(filtered_ingred_word_matrix, recipe_tfidf, X_df): + # This function takes in the filtered ingredient word matrix from function + # filter_out_cuisine, the TFIDF recipe from function transform_tfidf, and + # a dataframe made from the database (from joblib) and returns a Pandas + # DataFrame with the top five most similar recipes and a Pandas Series + # containing the similarity amount + + m2 = (recipe_tfidf != 0).any() + recipe_weights = recipe_tfidf.iloc[0][recipe_tfidf.iloc[0] != 0].to_dict() + + ingreds_used = m2.index[m2].tolist() + search_vec = np.array(recipe_tfidf).reshape(1, -1) + res_cos_sim = cosine_similarity(filtered_ingred_word_matrix, search_vec) + top_five = np.argsort(res_cos_sim.flatten())[-5:][::-1] + top_five_list = top_five.tolist() + + recipe_ids = [filtered_ingred_word_matrix.iloc[idx].name for idx in top_five] + + suggest_df = X_df.loc[recipe_ids] + proximity = pd.DataFrame( + data=res_cos_sim[top_five], + columns=["cosine_similarity"], + index=suggest_df.index, + ) + + full_df = pd.concat([suggest_df, proximity], axis=1) + expand_photo_df = pd.concat( + [full_df.drop(["photoData"], axis=1), full_df["photoData"].apply(pd.Series)], + axis=1, + ) + reduced = expand_photo_df[ + [ + "hed", + "recipe_url", + "filename", + "imputed_label", + "ingredients", + "cosine_similarity", + ] + ].dropna(axis=1) + reduced["photo"] = reduced["filename"].apply(picture_placer) + reduced["fixed_url"] = reduced["recipe_url"].apply(link_maker) + reduced["rounded"] = reduced["cosine_similarity"].round(3) + + reduced = reduced.drop("recipe_url", axis=1) + + ingr_weights = [ + filtered_ingred_word_matrix.iloc[num][ + filtered_ingred_word_matrix.iloc[num] != 0 + ].to_dict() + for num in top_five_list + ] + reduced["ingred_weights"] = ingr_weights + + return reduced, ingreds_used, recipe_weights def find_similar_dishes(dish_name, cuisine_name): - prepped, ingred_tfidf, ingred_word_matrix = import_stored_files() - # This function calls the Edamam API, stores the results as a JSON, and - # stores the timestamp, dish name, and cuisine name/classification in a - # separate csv. - now = datetime.now() - dt_string = now.strftime("%d_%m_%Y_%H_%M_%S") - - api_base = "https://api.edamam.com/search?" 
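The core of the reformatted find_closest_recipes is a one-shot cosine-similarity ranking: score every stored recipe row against the query vector, then take the five largest scores in descending order. A toy sketch of that step (random matrices stand in for the joblib artifacts):

    import numpy as np
    from sklearn.metrics.pairwise import cosine_similarity

    rng = np.random.default_rng(0)
    word_matrix = rng.random((20, 283))  # 20 stored recipes x 283 features
    search_vec = rng.random((1, 283))    # one transformed query recipe

    scores = cosine_similarity(word_matrix, search_vec)   # shape (20, 1)
    top_five = np.argsort(scores.flatten())[-5:][::-1]    # best match first
    print(top_five, scores[top_five].flatten().round(3))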
- - # Level up: - # Implement lemmatization using trained dataset on input in order to make - # future database be less likely to have redundant entries - # (e.g., taco vs tacos) - - q = f"q={dish_name}" - - # Level up: - # Check a database of dishes to see if this query has been asked for already - # If not, do an API call - - # Currently, just does an API call, may hit API limit if continuing with this - # version - f = open("secrets/edamam.json","r") - cred = json.load(f) - f.close() - - app_id = cred["id"] - app_id_s = f"&app_id={app_id}" - - app_key = cred["key"] - app_key_s = f"&app_key={app_key}" - - # Level up: - # Explicitly ask for a few recipes using limiter and make an "average version" - # of the input in order to get better results from the API call - # limiter = "&from=0&to=4" - # API currently defaults to returning 10 - - api_call = api_base + q+ app_id_s + app_key_s #+ limiter - - resp = requests.get(api_call) - - if resp.status_code == 200: - response_dict = resp.json() - resp_dict_hits = response_dict['hits'] - - # Store the API result into a JSON and the cuisine type and dish name into a - # csv - # Heroku does not save files to directory - # Can work with EC2 - # with open(f"../write_data/{dt_string}_{dish_name}_edamam_api_return.json", "w") as f: - # json.dump(resp_dict_hits, f) - - # fields = [dt_string, dish_name, cuisine_name] - # with open("../write_data/user_requests.csv", "a", newline='') as f: - # writer = csv.writer(f) - # writer.writerow(fields) - - urls = [] - labels = [] - sources = [] - ingreds = [] - - for recipe in resp_dict_hits: - recipe_path = recipe['recipe'] - urls.append(recipe_path['url']) - labels.append(recipe_path['label']) - sources.append(recipe_path['source']) - ingreds.append([item['text'] for item in recipe_path['ingredients']]) - - all_recipes = {'url': urls, - 'label': labels, - 'source': sources, - 'ingredients': ingreds - } - - recipe_df = pd.DataFrame(all_recipes) - - one_recipe = [] - - for listing in recipe_df['ingredients']: - for ingred in listing: - one_recipe.append(ingred.lower()) - - one_recipe = list(set(one_recipe)) - - query_df = pd.DataFrame(data={'name': dish_name, 'ingredients': [one_recipe], 'cuisine': cuisine_name}) - - query_tfidf = transform_tfidf(ingred_tfidf=ingred_tfidf, recipe=query_df) - query_matrix = filter_out_cuisine(ingred_word_matrix=ingred_word_matrix, - X_df=prepped, - cuisine_name=cuisine_name, - tfidf=ingred_tfidf) - - query_similar, ingreds_used, recipe_weights = find_closest_recipes(filtered_ingred_word_matrix=query_matrix, - recipe_tfidf=query_tfidf, - X_df=prepped) - - return query_similar.to_dict(orient='records'), ingreds_used, recipe_weights - - - else: - return("Error, unable to retrieve. Server response code is: ", - resp.status_code) + prepped, ingred_tfidf, ingred_word_matrix = import_stored_files() + # This function calls the Edamam API, stores the results as a JSON, and + # stores the timestamp, dish name, and cuisine name/classification in a + # separate csv. + now = datetime.now() + dt_string = now.strftime("%d_%m_%Y_%H_%M_%S") + + api_base = "https://api.edamam.com/search?" 
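Since the query URL here is still built by string concatenation, a hedged aside: requests can assemble and URL-encode the same call via params, which avoids broken URLs when a dish name contains spaces or accents. A sketch assuming the secrets/edamam.json layout used in this file ({"id": ..., "key": ...}); the dish name is an invented example:

    import json
    import requests

    with open("secrets/edamam.json") as f:
        cred = json.load(f)

    resp = requests.get(
        "https://api.edamam.com/search",
        params={"q": "pad thai", "app_id": cred["id"], "app_key": cred["key"]},
    )
    hits = resp.json()["hits"] if resp.status_code == 200 else []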
+ + # Level up: + # Implement lemmatization using trained dataset on input in order to make + # future database be less likely to have redundant entries + # (e.g., taco vs tacos) + + q = f"q={dish_name}" + + # Level up: + # Check a database of dishes to see if this query has been asked for already + # If not, do an API call + + # Currently, just does an API call, may hit API limit if continuing with this + # version + f = open("secrets/edamam.json", "r") + cred = json.load(f) + f.close() + + app_id = cred["id"] + app_id_s = f"&app_id={app_id}" + + app_key = cred["key"] + app_key_s = f"&app_key={app_key}" + + # Level up: + # Explicitly ask for a few recipes using limiter and make an "average version" + # of the input in order to get better results from the API call + # limiter = "&from=0&to=4" + # API currently defaults to returning 10 + + api_call = api_base + q + app_id_s + app_key_s # + limiter + + resp = requests.get(api_call) + + if resp.status_code == 200: + response_dict = resp.json() + resp_dict_hits = response_dict["hits"] + + # Store the API result into a JSON and the cuisine type and dish name into a + # csv + # Heroku does not save files to directory + # Can work with EC2 + # with open(f"../write_data/{dt_string}_{dish_name}_edamam_api_return.json", "w") as f: + # json.dump(resp_dict_hits, f) + + # fields = [dt_string, dish_name, cuisine_name] + # with open("../write_data/user_requests.csv", "a", newline='') as f: + # writer = csv.writer(f) + # writer.writerow(fields) + + urls = [] + labels = [] + sources = [] + ingreds = [] + + for recipe in resp_dict_hits: + recipe_path = recipe["recipe"] + urls.append(recipe_path["url"]) + labels.append(recipe_path["label"]) + sources.append(recipe_path["source"]) + ingreds.append([item["text"] for item in recipe_path["ingredients"]]) + + all_recipes = { + "url": urls, + "label": labels, + "source": sources, + "ingredients": ingreds, + } + + recipe_df = pd.DataFrame(all_recipes) + + one_recipe = [] + + for listing in recipe_df["ingredients"]: + for ingred in listing: + one_recipe.append(ingred.lower()) + + one_recipe = list(set(one_recipe)) + + query_df = pd.DataFrame( + data={ + "name": dish_name, + "ingredients": [one_recipe], + "cuisine": cuisine_name, + } + ) + + query_tfidf = transform_tfidf(ingred_tfidf=ingred_tfidf, recipe=query_df) + query_matrix = filter_out_cuisine( + ingred_word_matrix=ingred_word_matrix, + X_df=prepped, + cuisine_name=cuisine_name, + tfidf=ingred_tfidf, + ) + + query_similar, ingreds_used, recipe_weights = find_closest_recipes( + filtered_ingred_word_matrix=query_matrix, + recipe_tfidf=query_tfidf, + X_df=prepped, + ) + + return query_similar.to_dict(orient="records"), ingreds_used, recipe_weights + + else: + return ( + "Error, unable to retrieve. 
Server response code is: ", + resp.status_code, + ) diff --git a/src/nltk/ohe_dish_predictor.py b/src/nltk/ohe_dish_predictor.py index accbcf6..691820f 100644 --- a/src/nltk/ohe_dish_predictor.py +++ b/src/nltk/ohe_dish_predictor.py @@ -15,8 +15,9 @@ import numpy as np from pandas.io.json import json_normalize import nltk -nltk.download('wordnet') -nltk.download('stopwords') + +nltk.download("wordnet") +nltk.download("stopwords") from nltk.corpus import stopwords from nltk import word_tokenize, FreqDist from nltk.stem.wordnet import WordNetLemmatizer @@ -28,239 +29,285 @@ # Define all functions + def import_stored_files(): - # Load in the stored Epicurious database, OHE/CountVectorizer object to transform, - # the input, and the OHE/CountVectorizer word matrix from joblib and created by - # prepare_database.py - with open("joblib/ohe_recipe_dataframe.joblib", "rb") as fo: - prepped = joblib.load("joblib/ohe_recipe_dataframe.joblib") + # Load in the stored Epicurious database, OHE/CountVectorizer object to transform, + # the input, and the OHE/CountVectorizer word matrix from joblib and created by + # prepare_database.py + with open("joblib/ohe_recipe_dataframe.joblib", "rb") as fo: + prepped = joblib.load("joblib/ohe_recipe_dataframe.joblib") - with open("joblib/recipe_ohe.joblib", "rb") as fo: - ingred_ohe = joblib.load("joblib/recipe_ohe.joblib") + with open("joblib/recipe_ohe.joblib", "rb") as fo: + ingred_ohe = joblib.load("joblib/recipe_ohe.joblib") - with open("joblib/recipe_word_matrix_ohe.joblib", "rb") as fo: - ingred_word_matrix = joblib.load("joblib/recipe_word_matrix_ohe.joblib") + with open("joblib/recipe_word_matrix_ohe.joblib", "rb") as fo: + ingred_word_matrix = joblib.load("joblib/recipe_word_matrix_ohe.joblib") - return prepped, ingred_ohe, ingred_word_matrix + return prepped, ingred_ohe, ingred_word_matrix def transform_ohe(ingred_ohe, recipe): - # This function takes in a OHE/CountVectorizer object and a recipe, then - # creates/transforms the given recipe into a CountVectorizer form - pd.set_option('display.max_colwidth', 500) - print('Raw ingredients') - print(recipe['ingredients']) - ingreds = recipe['ingredients'].apply(" ".join).str.lower() - print(ingreds) - response = ingred_ohe.transform(ingreds) - transformed_recipe = pd.DataFrame(response.toarray(), - columns=ingred_ohe.get_feature_names(), - index=recipe.index) - return transformed_recipe - - -def filter_out_cuisine(ingred_word_matrix, - X_df, - cuisine_name, - ohe): - # This function takes in the ingredient word matrix (from joblib), a - # dataframe made from the database (from joblib), the user inputted cuisine - # name, and the ingredient OHE/CountVectorizer object (from joblib) and returns - # a word sub matrix that removes all recipes with the same cuisine as the - # inputted recipe. 
- - east_asian = ['Asian', 'Chinese', 'Japanese'] - - southeast_asian = ['Thai', 'Vietnamese'] - - euro_islands = ['English', 'Irish'] - - euro_continental = ['French', 'German', 'Eastern European'] - - mediterranean = ['Italian', 'Mediterranean', 'Kosher', 'Middle Eastern'] - - all_cuisines = ['African', 'American', 'Asian', 'Cajun/Creole', 'Chinese', 'Eastern European', 'English', - 'French', 'German', 'Indian', 'Irish', 'Italian', 'Japanese', 'Kosher', 'Latin American', - 'Mediterranean', 'Mexican', 'Middle Eastern', 'Moroccan', 'Scandinavian', 'Southwestern', - 'Thai', 'Vietnamese'] - - if cuisine_name in east_asian: - choices = [cuis for cuis in all_cuisines if cuis not in east_asian] - elif cuisine_name in southeast_asian: - choices = [cuis for cuis in all_cuisines if cuis not in southeast_asian] - elif cuisine_name in euro_islands: - choices = [cuis for cuis in all_cuisines if cuis not in euro_islands] - elif cuisine_name in euro_continental: - choices = [cuis for cuis in all_cuisines if cuis not in euro_continental] - elif cuisine_name in mediterranean: - choices = [cuis for cuis in all_cuisines if cuis not in mediterranean] - else: - choices = [cuis for cuis in all_cuisines if cuis != cuisine_name] - - combo = pd.concat([ingred_word_matrix, X_df['imputed_label']], axis=1) - filtered_ingred_word_matrix = combo[combo['imputed_label'].isin(choices)].drop('imputed_label', - axis=1) - return filtered_ingred_word_matrix + # This function takes in a OHE/CountVectorizer object and a recipe, then + # creates/transforms the given recipe into a CountVectorizer form + pd.set_option("display.max_colwidth", 500) + print("Raw ingredients") + print(recipe["ingredients"]) + ingreds = recipe["ingredients"].apply(" ".join).str.lower() + print(ingreds) + response = ingred_ohe.transform(ingreds) + transformed_recipe = pd.DataFrame( + response.toarray(), columns=ingred_ohe.get_feature_names(), index=recipe.index + ) + return transformed_recipe + + +def filter_out_cuisine(ingred_word_matrix, X_df, cuisine_name, ohe): + # This function takes in the ingredient word matrix (from joblib), a + # dataframe made from the database (from joblib), the user inputted cuisine + # name, and the ingredient OHE/CountVectorizer object (from joblib) and returns + # a word sub matrix that removes all recipes with the same cuisine as the + # inputted recipe. 
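Worth making the relationship between the two modules explicit: ohe_dish_predictor.py is a near line-for-line twin of dish_predictor.py, swapping the TfidfVectorizer artifacts for CountVectorizer ("OHE") ones, so matches are ranked on raw term counts instead of idf-weighted values. A tiny sketch of the difference between the two featurisations (toy documents):

    from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

    docs = ["olive oil garlic", "olive oil sugar", "sugar vanilla"]
    counts = CountVectorizer().fit_transform(docs)   # integer occurrence counts
    weights = TfidfVectorizer().fit_transform(docs)  # L2-normalised, idf-weighted

    print(counts.toarray())
    print(weights.toarray().round(2))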
+ + east_asian = ["Asian", "Chinese", "Japanese"] + + southeast_asian = ["Thai", "Vietnamese"] + + euro_islands = ["English", "Irish"] + + euro_continental = ["French", "German", "Eastern European"] + + mediterranean = ["Italian", "Mediterranean", "Kosher", "Middle Eastern"] + + all_cuisines = [ + "African", + "American", + "Asian", + "Cajun/Creole", + "Chinese", + "Eastern European", + "English", + "French", + "German", + "Indian", + "Irish", + "Italian", + "Japanese", + "Kosher", + "Latin American", + "Mediterranean", + "Mexican", + "Middle Eastern", + "Moroccan", + "Scandinavian", + "Southwestern", + "Thai", + "Vietnamese", + ] + + if cuisine_name in east_asian: + choices = [cuis for cuis in all_cuisines if cuis not in east_asian] + elif cuisine_name in southeast_asian: + choices = [cuis for cuis in all_cuisines if cuis not in southeast_asian] + elif cuisine_name in euro_islands: + choices = [cuis for cuis in all_cuisines if cuis not in euro_islands] + elif cuisine_name in euro_continental: + choices = [cuis for cuis in all_cuisines if cuis not in euro_continental] + elif cuisine_name in mediterranean: + choices = [cuis for cuis in all_cuisines if cuis not in mediterranean] + else: + choices = [cuis for cuis in all_cuisines if cuis != cuisine_name] + + combo = pd.concat([ingred_word_matrix, X_df["imputed_label"]], axis=1) + filtered_ingred_word_matrix = combo[combo["imputed_label"].isin(choices)].drop( + "imputed_label", axis=1 + ) + return filtered_ingred_word_matrix def picture_placer(filename): - # This function takes in a filename and returns the relative location inside - # an HTML tag - location = f'photos/{filename}' - return location + # This function takes in a filename and returns the relative location inside + # an HTML tag + location = f"photos/{filename}" + return location def link_maker(recipe_link): - # This function takes in the incomplete recipe link from the dataframe and - # returns the complete one. 
- full_link = f'https://www.epicurious.com{recipe_link}' - return full_link - - -def find_closest_recipes(filtered_ingred_word_matrix, - recipe_ohe, - X_df): - # This function takes in the filtered ingredient word matrix from function - # filter_out_cuisine, the ohe recipe from function transform_ohe, and - # a dataframe made from the database (from joblib) and returns a Pandas - # DataFrame with the top five most similar recipes and a Pandas Series - # containing the similarity amount - - m2 = (recipe_ohe != 0).any() - recipe_ohe_weights = recipe_ohe.iloc[0][recipe_ohe.iloc[0] != 0].to_dict() - - ingreds_used = m2.index[m2].tolist() - search_vec = np.array(recipe_ohe).reshape(1,-1) - res_cos_sim = cosine_similarity(filtered_ingred_word_matrix, search_vec) - top_five = np.argsort(res_cos_sim.flatten())[-5:][::-1] - - top_five_list = top_five.tolist() - - recipe_ids = [filtered_ingred_word_matrix.iloc[idx].name for idx in top_five] - - suggest_df = X_df.loc[recipe_ids] - proximity = pd.DataFrame(data=res_cos_sim[top_five], - columns=['cosine_similarity'], - index=suggest_df.index) - - full_df = pd.concat([suggest_df, proximity], axis=1) - expand_photo_df = pd.concat([full_df.drop(["photoData"], axis=1), - full_df["photoData"].apply(pd.Series)], axis=1) - reduced = expand_photo_df[['hed', 'recipe_url', 'filename', 'imputed_label', 'ingredients', 'cosine_similarity']].dropna(axis=1) - reduced['photo'] = reduced['filename'].apply(picture_placer) - reduced['fixed_url'] = reduced["recipe_url"].apply(link_maker) - reduced['rounded'] = reduced['cosine_similarity'].round(3) - - reduced = reduced.drop('recipe_url', axis=1) - - ingr_weights = [filtered_ingred_word_matrix.iloc[num][filtered_ingred_word_matrix.iloc[num] != 0].to_dict() for num in top_five_list] - reduced['ingred_weights'] = ingr_weights - print(ingr_weights) - return reduced, ingreds_used, recipe_ohe_weights + # This function takes in the incomplete recipe link from the dataframe and + # returns the complete one. 
+    full_link = f"https://www.epicurious.com{recipe_link}"
+    return full_link
+
+
+def find_closest_recipes(filtered_ingred_word_matrix, recipe_ohe, X_df):
+    # This function takes in the filtered ingredient word matrix from function
+    # filter_out_cuisine, the ohe recipe from function transform_ohe, and
+    # a dataframe made from the database (from joblib) and returns a Pandas
+    # DataFrame with the top five most similar recipes and a Pandas Series
+    # containing the similarity amount
+
+    m2 = (recipe_ohe != 0).any()
+    recipe_ohe_weights = recipe_ohe.iloc[0][recipe_ohe.iloc[0] != 0].to_dict()
+
+    ingreds_used = m2.index[m2].tolist()
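+    # The ranking below is plain cosine similarity: the query is a 1 x V
+    # binary row and the filtered matrix is N x V, so cosine_similarity
+    # returns an N x 1 array; argsort(...)[-5:][::-1] keeps the five
+    # highest-scoring rows, best first.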
+    search_vec = np.array(recipe_ohe).reshape(1, -1)
+    res_cos_sim = cosine_similarity(filtered_ingred_word_matrix, search_vec)
+    top_five = np.argsort(res_cos_sim.flatten())[-5:][::-1]
+
+    top_five_list = top_five.tolist()
+
+    recipe_ids = [filtered_ingred_word_matrix.iloc[idx].name for idx in top_five]
+
+    suggest_df = X_df.loc[recipe_ids]
+    proximity = pd.DataFrame(
+        data=res_cos_sim[top_five],
+        columns=["cosine_similarity"],
+        index=suggest_df.index,
+    )
+
+    full_df = pd.concat([suggest_df, proximity], axis=1)
+    expand_photo_df = pd.concat(
+        [full_df.drop(["photoData"], axis=1), full_df["photoData"].apply(pd.Series)],
+        axis=1,
+    )
+    reduced = expand_photo_df[
+        [
+            "hed",
+            "recipe_url",
+            "filename",
+            "imputed_label",
+            "ingredients",
+            "cosine_similarity",
+        ]
+    ].dropna(axis=1)
+    reduced["photo"] = reduced["filename"].apply(picture_placer)
+    reduced["fixed_url"] = reduced["recipe_url"].apply(link_maker)
+    reduced["rounded"] = reduced["cosine_similarity"].round(3)
+
+    reduced = reduced.drop("recipe_url", axis=1)
+
+    ingr_weights = [
+        filtered_ingred_word_matrix.iloc[num][
+            filtered_ingred_word_matrix.iloc[num] != 0
+        ].to_dict()
+        for num in top_five_list
+    ]
+    reduced["ingred_weights"] = ingr_weights
+    print(ingr_weights)
+    return reduced, ingreds_used, recipe_ohe_weights


 def find_similar_dishes(dish_name, cuisine_name):
-    prepped, ingred_ohe, ingred_word_matrix = import_stored_files()
-    # This function calls the Edamam API, stores the results as a JSON, and
-    # stores the timestamp, dish name, and cuisine name/classification in a
-    # separate csv.
-    now = datetime.now()
-    dt_string = now.strftime("%d_%m_%Y_%H_%M_%S")
-
-    api_base = "https://api.edamam.com/search?"
-
-    # Level up:
-    # Implement lemmatization using trained dataset on input in order to make
-    # future database be less likely to have redundant entries
-    # (e.g., taco vs tacos)
-
-    q = f"q={dish_name}"
-
-    # Level up:
-    # Check a database of dishes to see if this query has been asked for already
-    # If not, do an API call
-
-    # Currently, just does an API call, may hit API limit if continuing with this
-    # version
-    f = open("secrets/edamam.json","r")
-    cred = json.load(f)
-    f.close()
-
-    app_id = cred["id"]
-    app_id_s = f"&app_id={app_id}"
-
-    app_key = cred["key"]
-    app_key_s = f"&app_key={app_key}"
-
-    # Level up:
-    # Explicitly ask for a few recipes using limiter and make an "average version"
-    # of the input in order to get better results from the API call
-    # limiter = "&from=0&to=4"
-    # API currently defaults to returning 10
-
-    api_call = api_base + q+ app_id_s + app_key_s #+ limiter
-
-    resp = requests.get(api_call)
-
-    if resp.status_code == 200:
-        response_dict = resp.json()
-        resp_dict_hits = response_dict['hits']
-
-    # Store the API result into a JSON and the cuisine type and dish name into a
-    # csv
-    # Heroku does not save files to directory
-    # Can work with EC2
-    # with open(f"../write_data/{dt_string}_{dish_name}_edamam_api_return.json", "w") as f:
-    #     json.dump(resp_dict_hits, f)
-
-    # fields = [dt_string, dish_name, cuisine_name]
-    # with open("../write_data/user_requests.csv", "a", newline='') as f:
-    #     writer = csv.writer(f)
-    #     writer.writerow(fields)
-
-    urls = []
-    labels = []
-    sources = []
-    ingreds = []
-
-    for recipe in resp_dict_hits:
-        recipe_path = recipe['recipe']
-        urls.append(recipe_path['url'])
-        labels.append(recipe_path['label'])
-        sources.append(recipe_path['source'])
-        ingreds.append([item['text'] for item in recipe_path['ingredients']])
-
-    all_recipes = {'url': urls,
-                   'label': labels,
-                   'source': sources,
-                   'ingredients': ingreds
-                   }
-
-    recipe_df = pd.DataFrame(all_recipes)
-
-    one_recipe = []
-
-    for listing in recipe_df['ingredients']:
-        for ingred in listing:
-            one_recipe.append(ingred.lower())
-
-    one_recipe = list(set(one_recipe))
-
-    query_df = pd.DataFrame(data={'name': dish_name, 'ingredients': [one_recipe], 'cuisine': cuisine_name})
-
-    query_ohe = transform_ohe(ingred_ohe=ingred_ohe, recipe=query_df)
-    query_matrix = filter_out_cuisine(ingred_word_matrix=ingred_word_matrix,
-                                      X_df=prepped,
-                                      cuisine_name=cuisine_name,
-                                      ohe=ingred_ohe)
-
-    query_similar, ingreds_used, rec_weights = find_closest_recipes(filtered_ingred_word_matrix=query_matrix,
-                                                                    recipe_ohe=query_ohe,
-                                                                    X_df=prepped)
-
-    return query_similar.to_dict(orient='records'), ingreds_used, rec_weights
-
-
-    else:
-        return("Error, unable to retrieve. Server response code is: ",
-          resp.status_code)
+    prepped, ingred_ohe, ingred_word_matrix = import_stored_files()
+    # This function calls the Edamam API, stores the results as a JSON, and
+    # stores the timestamp, dish name, and cuisine name/classification in a
+    # separate csv.
+    now = datetime.now()
+    dt_string = now.strftime("%d_%m_%Y_%H_%M_%S")
+
+    api_base = "https://api.edamam.com/search?"
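+    # Illustrative example of the request assembled below (values are fake):
+    # https://api.edamam.com/search?q=tacos&app_id=<id>&app_key=<key>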
+
+    # Level up:
+    # Implement lemmatization on the input using the trained dataset so the
+    # future database is less likely to have redundant entries
+    # (e.g., taco vs tacos)
+
+    q = f"q={dish_name}"
+
+    # Level up:
+    # Check a database of dishes to see if this query has been asked for already
+    # If not, do an API call
+
+    # Currently this always makes a fresh API call, so it may hit the API
+    # rate limit if this version is kept
+    with open("secrets/edamam.json", "r") as f:
+        cred = json.load(f)
+
+    app_id = cred["id"]
+    app_id_s = f"&app_id={app_id}"
+
+    app_key = cred["key"]
+    app_key_s = f"&app_key={app_key}"
+
+    # Level up:
+    # Explicitly ask for a few recipes using limiter and make an "average version"
+    # of the input in order to get better results from the API call
+    # limiter = "&from=0&to=4"
+    # API currently defaults to returning 10
+
+    api_call = api_base + q + app_id_s + app_key_s  # + limiter
+
+    resp = requests.get(api_call)
+
+    if resp.status_code == 200:
+        response_dict = resp.json()
+        resp_dict_hits = response_dict["hits"]
+
+        # Store the API result into a JSON and the cuisine type and dish name
+        # into a csv
+        # Heroku does not save files to directory
+        # Can work with EC2
+        # with open(f"../write_data/{dt_string}_{dish_name}_edamam_api_return.json", "w") as f:
+        #     json.dump(resp_dict_hits, f)
+
+        # fields = [dt_string, dish_name, cuisine_name]
+        # with open("../write_data/user_requests.csv", "a", newline='') as f:
+        #     writer = csv.writer(f)
+        #     writer.writerow(fields)
+
+        urls = []
+        labels = []
+        sources = []
+        ingreds = []
+
+        for recipe in resp_dict_hits:
+            recipe_path = recipe["recipe"]
+            urls.append(recipe_path["url"])
+            labels.append(recipe_path["label"])
+            sources.append(recipe_path["source"])
+            ingreds.append([item["text"] for item in recipe_path["ingredients"]])
+
+        all_recipes = {
+            "url": urls,
+            "label": labels,
+            "source": sources,
+            "ingredients": ingreds,
+        }
+
+        recipe_df = pd.DataFrame(all_recipes)
+
+        one_recipe = []
+
+        for listing in recipe_df["ingredients"]:
+            for ingred in listing:
+                one_recipe.append(ingred.lower())
+
+        one_recipe = list(set(one_recipe))
+
+        query_df = pd.DataFrame(
+            data={
+                "name": dish_name,
+                "ingredients": [one_recipe],
+                "cuisine": cuisine_name,
+            }
+        )
+
+        query_ohe = transform_ohe(ingred_ohe=ingred_ohe, recipe=query_df)
+        query_matrix = filter_out_cuisine(
+            ingred_word_matrix=ingred_word_matrix,
+            X_df=prepped,
+            cuisine_name=cuisine_name,
+            ohe=ingred_ohe,
+        )
+
+        query_similar, ingreds_used, rec_weights = find_closest_recipes(
+            filtered_ingred_word_matrix=query_matrix, recipe_ohe=query_ohe, X_df=prepped
+        )
+
+        return query_similar.to_dict(orient="records"), ingreds_used, rec_weights
+
+    else:
+        return (
+            f"Error, unable to retrieve. Server response code is: {resp.status_code}"
+        )
 
diff --git a/src/nltk/prepare_ohe_database.py b/src/nltk/prepare_ohe_database.py
index 7ba5dce..7db9b2f 100644
--- a/src/nltk/prepare_ohe_database.py
+++ b/src/nltk/prepare_ohe_database.py
@@ -12,9 +12,10 @@
 import pandas as pd
 import numpy as np
 import nltk
-nltk.download('wordnet')
-nltk.download('stopwords')
-nltk.download('punkt')
+
+nltk.download("wordnet")
+nltk.download("stopwords")
+nltk.download("punkt")
 from nltk.corpus import stopwords
 from nltk.stem.wordnet import WordNetLemmatizer
 from nltk.tokenize import sent_tokenize, word_tokenize
@@ -39,12 +40,12 @@
 
 # Define functions
 def cuisine_namer(text):
-    """This function converts redundant and/or rare categories into more common
-    ones/umbrella ones.
-
-    In the future, there's a hope that this renaming mechanism will not have
-    under sampled cuisine tags.
-    """
+    """This function converts redundant and/or rare categories into more common
+    umbrella categories.
+
+    In the future, the hope is that this renaming mechanism will no longer be
+    needed to compensate for undersampled cuisine tags.
+    """
     if text == "Central American/Caribbean":
         return "Caribbean"
     elif text == "Jewish":
@@ -74,8 +75,8 @@ def cuisine_namer(text):
 
 def load_data(filepath, test_size=0.1, random_state=10):
-    """ This function uses a filepath, test_size, and random_state
-    to load the Epicurious JSON into a dataframe and then split into
+    """This function uses a filepath, test_size, and random_state
+    to load the Epicurious JSON into a dataframe and then split into
     train/test sets."""
     with open(filepath, "r") as f:
         datastore = json.load(f)
@@ -87,121 +88,117 @@ def load_data(filepath, test_size=0.1, random_state=10):
 
 
 def prep_data(X):
-    """ This function takes a dataframe X, drops columns that will not be used,
-    expands the hierarchical column into the dataframe, renames the columns
-    to be more human-readable, and drops one column created during dataframe
-    expansion"""
-    X.drop(
-        [
-            "pubDate",
-            "author",
-            "type",
-            "aggregateRating",
-            "reviewsCount",
-            "willMakeAgainPct",
-            "dateCrawled",
-            'prepSteps'
-        ],
-        axis=1,
-        inplace=True,
-    )
-
-    X.rename({'url': 'recipe_url'}, axis=1, inplace=True)
-
-    concat = pd.concat([X.drop(["tag"], axis=1), X["tag"].apply(pd.Series)], axis=1)
-    concat.drop(
-        [
-            0,
-            "photosBadgeAltText",
-            "photosBadgeFileName",
-            "photosBadgeID",
-            "photosBadgeRelatedUri",
-            "url"
-        ],
-        axis=1,
-        inplace=True,
-    )
-    cuisine_only = concat[concat["category"] == "cuisine"]
-    cuisine_only.dropna(axis=0, inplace=True)
-    cuisine_only["imputed_label"] = cuisine_only["name"].apply(cuisine_namer)
-    cuisine_only.drop('name', axis=1, inplace=True)
-    return cuisine_only
+    """This function takes a dataframe X, drops columns that will not be used,
+    expands the hierarchical column into the dataframe, renames the columns
+    to be more human-readable, and drops one column created during dataframe
+    expansion"""
+    X.drop(
+        [
+            "pubDate",
+            "author",
+            "type",
+            "aggregateRating",
+            "reviewsCount",
+            "willMakeAgainPct",
+            "dateCrawled",
+            "prepSteps",
+        ],
+        axis=1,
+        inplace=True,
+    )
+
+    X.rename({"url": "recipe_url"}, axis=1, inplace=True)
+
+    concat = pd.concat([X.drop(["tag"], axis=1), X["tag"].apply(pd.Series)], axis=1)
+    concat.drop(
+        [
+            0,
+            "photosBadgeAltText",
+            "photosBadgeFileName",
+            "photosBadgeID",
+            "photosBadgeRelatedUri",
+            "url",
+        ],
+        axis=1,
+        inplace=True,
+    )
+    cuisine_only = concat[concat["category"] == "cuisine"]
+    cuisine_only.dropna(axis=0, inplace=True)
+    cuisine_only["imputed_label"] = cuisine_only["name"].apply(cuisine_namer)
+    cuisine_only.drop("name", axis=1, inplace=True)
+    return cuisine_only
 
 
 def fit_transform_ohe_matrix(X_df, stopwords_list):
-    ohe = CountVectorizer(
+    ohe = CountVectorizer(
         stop_words=stopwords_list,
         min_df=2,
         token_pattern=r"(?u)\b[a-zA-Z]{2,}\b",
         preprocessor=lemmatizer.lemmatize,
         binary=True,
     )
-    ingreds = X_df["ingredients"].apply(" ".join).str.lower()
-    ohe.fit(ingreds)
-    response = ohe.transform(ingreds)
-    ohe_matrix = pd.DataFrame(response.toarray(),
-                              columns=ohe.get_feature_names(),
-                              index=X_df.index
-                              )
-    return ohe, ohe_matrix
-
+    ingreds = X_df["ingredients"].apply(" ".join).str.lower()
+    ohe.fit(ingreds)
+    response = ohe.transform(ingreds)
+    ohe_matrix = pd.DataFrame(
+        response.toarray(), columns=ohe.get_feature_names(), index=X_df.index
+    )
+    return ohe, ohe_matrix
+
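+# Note: transform_ohe below must reuse the vectorizer fitted above; refitting
+# on a single query recipe would yield a different vocabulary, and the query
+# vector would no longer line up with the training matrix's columns.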
 
 def transform_ohe(ohe, recipe):
-    ingreds = recipe['ingredients'].apply(" ".join).str.lower()
-    response = ohe.transform(ingreds)
+    ingreds = recipe["ingredients"].apply(" ".join).str.lower()
+    response = ohe.transform(ingreds)
 
-    ohe_transformed_recipe = pd.DataFrame(
-        response.toarray(),
-        columns=ohe.get_feature_names(),
-        index=recipe.index
-    )
-    return ohe_transformed_recipe
+    ohe_transformed_recipe = pd.DataFrame(
+        response.toarray(), columns=ohe.get_feature_names(), index=recipe.index
+    )
+    return ohe_transformed_recipe
 
 
 def transform_from_test_ohe(ohe, df, idx):
-    recipe = df['ingredients'].iloc[idx].apply(' '.join).str.lower()
-    response = ohe.transform(recipe)
-    ohe_transformed_test_recipe = pd.DataFrame(
-        response.toarray(),
-        columns=ohe.get_feature_names()
-    )
+    recipe = df["ingredients"].iloc[idx].apply(" ".join).str.lower()
+    response = ohe.transform(recipe)
+    ohe_transformed_test_recipe = pd.DataFrame(
+        response.toarray(), columns=ohe.get_feature_names()
+    )
 
-    return ohe_transformed_test_recipe
+    return ohe_transformed_test_recipe
 
 
 def filter_out_cuisine(ingred_word_matrix, X_df, cuisine_name, ohe):
-    combo = pd.concat([ingred_word_matrix, X_df["imputed_label"]], axis=1)
-    filtered_ingred_word_matrix = combo[combo["imputed_label"] != cuisine_name].drop(
-        "imputed_label", axis=1
-    )
-    return filtered_ingred_word_matrix
+    combo = pd.concat([ingred_word_matrix, X_df["imputed_label"]], axis=1)
+    filtered_ingred_word_matrix = combo[combo["imputed_label"] != cuisine_name].drop(
+        "imputed_label", axis=1
+    )
+    return filtered_ingred_word_matrix
 
 
 def find_closest_recipes(filtered_ingred_word_matrix, recipe_ohe_transform, X_df):
-    search_vec = np.array(recipe_ohe_transform).reshape(1, -1)
-    res_cos_sim = cosine_similarity(filtered_ingred_word_matrix, search_vec)
-    top_five = np.argsort(res_cos_sim.flatten())[-5:][::-1]
-    proximity = res_cos_sim[top_five]
-    recipe_ids = [filtered_ingred_word_matrix.iloc[idx].name for idx in top_five]
-    suggest_df = X_df.loc[recipe_ids]
+    search_vec = np.array(recipe_ohe_transform).reshape(1, -1)
+    res_cos_sim = cosine_similarity(filtered_ingred_word_matrix, search_vec)
+    top_five = np.argsort(res_cos_sim.flatten())[-5:][::-1]
+    proximity = res_cos_sim[top_five]
+    recipe_ids = [filtered_ingred_word_matrix.iloc[idx].name for idx in top_five]
+    suggest_df = X_df.loc[recipe_ids]
 
-    return suggest_df, proximity
+    return suggest_df, proximity
 
 
 # Create the dataframe
 X_train, X_test = load_data(filename)
 
 with open("../joblib/ohe_test_subset.joblib", "wb") as fo:
-    joblib.dump(X_test, fo, compress=True)
+    joblib.dump(X_test, fo, compress=True)
 
 prepped = prep_data(X_train)
 with open("../joblib/ohe_recipe_dataframe.joblib", "wb") as fo:
-    joblib.dump(prepped, fo, compress=True)
+    joblib.dump(prepped, fo, compress=True)
 
 # Create the ingredients OHE matrix
 ingred_ohe, ingred_word_matrix = fit_transform_ohe_matrix(prepped, stopwords_list)
 with open("../joblib/recipe_ohe.joblib", "wb") as fo:
-    joblib.dump(ingred_ohe, fo, compress=True)
+    joblib.dump(ingred_ohe, fo, compress=True)
 with open("../joblib/recipe_word_matrix_ohe.joblib", "wb") as fo:
-    joblib.dump(ingred_word_matrix, fo, compress=True)
+    joblib.dump(ingred_word_matrix, fo, compress=True)
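+
+# Example of reloading the stored artifacts elsewhere (illustrative only):
+#     with open("../joblib/recipe_ohe.joblib", "rb") as f:
+#         ingred_ohe = joblib.load(f)
+#     with open("../joblib/recipe_word_matrix_ohe.joblib", "rb") as f:
+#         ingred_word_matrix = joblib.load(f)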