Skip to content

Commit

Permalink
Merge pull request #93 from AaronWChen/NGRAM-2/trying-BGEM3-embeddings
Browse files Browse the repository at this point in the history
Ngram 2/trying bgem3 embeddings
  • Loading branch information
AaronWChen authored Jul 30, 2024
2 parents 14e9838 + 89fc77a commit afdd06c
Show file tree
Hide file tree
Showing 11 changed files with 3,109 additions and 765 deletions.
5 changes: 0 additions & 5 deletions .dvc/config
Original file line number Diff line number Diff line change
@@ -1,8 +1,3 @@
[core]
remote = origin
['remote "origin"']
url = https://dagshub.com/AaronWChen/MeaLeon.dvc
read_timeout = 0
['remote "origin-s3"']
url = s3://dvc
endpointurl = https://dagshub.com/AaronWChen/MeaLeon.s3
11 changes: 11 additions & 0 deletions .gitconfig
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Generated by nbdev_install_hooks
#
# If you need to disable this instrumentation do:
# git config --local --unset include.path
#
# To restore:
# git config --local include.path ../.gitconfig
#
[merge "nbdev-merge"]
name = resolve conflicts with nbdev_fix
driver = nbdev_merge %O %A %B %P
6 changes: 3 additions & 3 deletions data.dvc
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
outs:
- md5: cf11c176411ce04792dbb83a6380b64b.dir
size: 74605953
nfiles: 1
- md5: 3f85b4e2df7b76c01bcb27989e564e36.dir
size: 163733640
nfiles: 6
path: data
8 changes: 7 additions & 1 deletion nbs/01_initial_eda.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -3059,7 +3059,13 @@
"source": []
}
],
"metadata": {},
"metadata": {
"kernelspec": {
"display_name": "python3",
"language": "python",
"name": "python3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
5 changes: 5 additions & 0 deletions nbs/04_word_lists_combine_features.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -1547,6 +1547,11 @@
"filtered_df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
Expand Down
47 changes: 3 additions & 44 deletions nbs/07_bertopic_testing.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -132,54 +132,13 @@
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"> <span style=\"font-weight: bold\">❗❗❗ AUTHORIZATION REQUIRED ❗❗❗</span> \n",
"</pre>\n"
],
"text/plain": [
" \u001b[1m❗❗❗ AUTHORIZATION REQUIRED ❗❗❗\u001b[0m \n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "084490be36a047529096776bcb49e6de",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Output()"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"name": "stderr",
"output_type": "stream",
"text": [
"\n",
"\n",
"Open the following link in your browser to authorize the client:\n",
"https://dagshub.com/login/oauth/authorize?state=1bf46b97-13cb-4873-9c57-af8141828675&client_id=32b60ba385aa7cecf24046d8195a71c07dd345d9657977863b52e7748e0f0f28&middleman_request_id=b534b316020d1f4e499a4a24567fc8ff36a49714fc1a4e01f004354ef9e20587\n",
"\n",
"\n"
"Token Dagshub OAuth token, valid until 2023-03-15 05:42:01.541960+00:00 does not exist in the storage\n",
"Token Dagshub OAuth token, valid until 2023-07-27 17:31:38.987842+00:00 does not exist in the storage\n"
]
},
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"></pre>\n"
],
"text/plain": []
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
Expand Down
151 changes: 8 additions & 143 deletions nbs/11_sklearn_mlfow_model_testing.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -242,149 +242,14 @@
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "5c7f9ca7c04d4d61a1d93d784e67a1e8",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.6.0.json: 0%| …"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"2024-02-13 11:12:24 INFO: Downloading default packages for language: en (English) ...\n",
"2024-02-13 11:12:24 INFO: File exists: /home/awchen/stanza_resources/en/default.zip\n",
"2024-02-13 11:12:27 INFO: Finished downloading models and saved to /home/awchen/stanza_resources.\n",
"2024-02-13 11:12:27 INFO: Checking for updates to resources.json in case models have been updated. Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "341eed1075774bfaa7c7873d53723222",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.6.0.json: 0%| …"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"2024-02-13 11:12:28 INFO: Loading these models for language: en (English):\n",
"======================================\n",
"| Processor | Package |\n",
"--------------------------------------\n",
"| tokenize | combined |\n",
"| pos | combined_charlm |\n",
"| lemma | combined_nocharlm |\n",
"| constituency | ptb3-revised_charlm |\n",
"| depparse | combined_charlm |\n",
"| sentiment | sstplus |\n",
"| ner | ontonotes_charlm |\n",
"======================================\n",
"\n",
"2024-02-13 11:12:28 INFO: Using device: cpu\n",
"2024-02-13 11:12:28 INFO: Loading: tokenize\n",
"2024-02-13 11:12:28 INFO: Loading: pos\n",
"2024-02-13 11:12:28 INFO: Loading: lemma\n",
"2024-02-13 11:12:29 INFO: Loading: constituency\n",
"2024-02-13 11:12:29 INFO: Loading: depparse\n",
"2024-02-13 11:12:29 INFO: Loading: sentiment\n",
"2024-02-13 11:12:29 INFO: Loading: ner\n",
"2024-02-13 11:12:29 INFO: Done loading processors!\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"\n",
"--------------\n",
"Raw Dataframe:\n",
" id \\\n",
"0 54a2b6b019925f464b373351 \n",
"1 54a408a019925f464b3733bc \n",
"2 54a408a26529d92b2c003631 \n",
"3 54a408a66529d92b2c003638 \n",
"4 54a408a719925f464b3733cc \n",
"\n",
" dek \\\n",
"0 How does fried chicken achieve No. 1 status? B... \n",
"1 Spinaci all'Ebraica \n",
"2 This majestic, moist, and richly spiced honey ... \n",
"3 The idea for this sandwich came to me when my ... \n",
"4 In 1930, Simon Agranat, the chief justice of t... \n",
"\n",
" hed pubDate \\\n",
"0 Pickle-Brined Fried Chicken 2014-08-19T04:00:00.000Z \n",
"1 Spinach Jewish Style 2008-09-09T04:00:00.000Z \n",
"2 New Year’s Honey Cake 2008-09-10T04:00:00.000Z \n",
"3 The B.L.A.—Bagel with Lox and Avocado 2008-09-08T04:00:00.000Z \n",
"4 Shakshuka a la Doktor Shakshuka 2008-09-09T04:00:00.000Z \n",
"\n",
" author type \\\n",
"0 [] recipe \n",
"1 [{'name': 'Edda Servi Machlin'}] recipe \n",
"2 [{'name': 'Marcy Goldman'}] recipe \n",
"3 [{'name': 'Faye Levy'}] recipe \n",
"4 [{'name': 'Joan Nathan'}] recipe \n",
"\n",
" url \\\n",
"0 /recipes/food/views/pickle-brined-fried-chicke... \n",
"1 /recipes/food/views/spinach-jewish-style-350152 \n",
"2 /recipes/food/views/majestic-and-moist-new-yea... \n",
"3 /recipes/food/views/the-b-l-a-bagel-with-lox-a... \n",
"4 /recipes/food/views/shakshuka-a-la-doktor-shak... \n",
"\n",
" photoData \\\n",
"0 {'id': '54a2b64a6529d92b2c003409', 'filename':... \n",
"1 {'id': '56746182accb4c9831e45e0a', 'filename':... \n",
"2 {'id': '55e85ba4cf90d6663f728014', 'filename':... \n",
"3 {'id': '5674617e47d1a28026045e4f', 'filename':... \n",
"4 {'id': '56746183b47c050a284a4e15', 'filename':... \n",
"\n",
" tag aggregateRating \\\n",
"0 {'category': 'ingredient', 'name': 'Chicken', ... 3.11 \n",
"1 {'category': 'cuisine', 'name': 'Italian', 'ur... 3.22 \n",
"2 {'category': 'cuisine', 'name': 'Jewish', 'url... 3.62 \n",
"3 {'category': 'cuisine', 'name': 'Jewish', 'url... 4.00 \n",
"4 {'category': 'cuisine', 'name': 'Jewish', 'url... 2.71 \n",
"\n",
" ingredients \\\n",
"0 [1 tablespoons yellow mustard seeds, 1 tablesp... \n",
"1 [3 pounds small-leaved bulk spinach, Salt, 1/2... \n",
"2 [3 1/2 cups all-purpose flour, 1 tablespoon ba... \n",
"3 [1 small ripe avocado, preferably Hass (see No... \n",
"4 [2 pounds fresh tomatoes, unpeeled and cut in ... \n",
"\n",
" prepSteps reviewsCount \\\n",
"0 [Toast mustard and coriander seeds in a dry me... 7 \n",
"1 [Remove the stems and roots from the spinach. ... 5 \n",
"2 [I like this cake best baked in a 9-inch angel... 105 \n",
"3 [A short time before serving, mash avocado and... 7 \n",
"4 [1. Place the tomatoes, garlic, salt, paprika,... 7 \n",
"\n",
" willMakeAgainPct dateCrawled \n",
"0 100 1498547035 \n",
"1 80 1498547740 \n",
"2 88 1498547738 \n",
"3 100 1498547740 \n",
"4 83 1498547740 \n",
"(34756, 15)\n"
"ename": "NameError",
"evalue": "name 'stanza' is not defined",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[1], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;66;03m# instantiate stanza pipeline\u001b[39;00m\n\u001b[0;32m----> 2\u001b[0m \u001b[43mstanza\u001b[49m\u001b[38;5;241m.\u001b[39mdownload(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124men\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[1;32m 3\u001b[0m nlp \u001b[38;5;241m=\u001b[39m stanza\u001b[38;5;241m.\u001b[39mPipeline(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124men\u001b[39m\u001b[38;5;124m'\u001b[39m, \n\u001b[1;32m 4\u001b[0m depparse_batch_size\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m50\u001b[39m, \n\u001b[1;32m 5\u001b[0m depparse_min_length_to_batch_separately\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m50\u001b[39m,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 8\u001b[0m batch_size\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m100\u001b[39m\n\u001b[1;32m 9\u001b[0m )\n\u001b[1;32m 11\u001b[0m \u001b[38;5;66;03m# load raw data and preprocess/clean\u001b[39;00m\n",
"\u001b[0;31mNameError\u001b[0m: name 'stanza' is not defined"
]
}
],
Expand Down
Loading

0 comments on commit afdd06c

Please sign in to comment.