From 0a59ddfc4367c6581d64626621803e7a060961b1 Mon Sep 17 00:00:00 2001
From: Ivan Nardini <88703814+inardini@users.noreply.github.com>
Date: Tue, 17 Sep 2024 14:38:05 +0200
Subject: [PATCH] feat: update embeddings tuning notebook (#1096)

# Description
Thank you for opening a Pull Request!
Before submitting your PR, there are a few things you can do to make
sure it goes smoothly:
- [x] Follow the [`CONTRIBUTING`
Guide](https://github.com/GoogleCloudPlatform/generative-ai/blob/main/CONTRIBUTING.md).
- [x] You are listed as the author in your notebook or README file.
- [x] Your account is listed in
[`CODEOWNERS`](https://github.com/GoogleCloudPlatform/generative-ai/blob/main/.github/CODEOWNERS)
for the file(s).
- [x] Make your Pull Request title in the
https://www.conventionalcommits.org/ specification.
- [x] Ensure the tests and linter pass (Run `nox -s format` from the
repository root to format).
- [x] Appropriate docs were updated (if necessary)
---------
Co-authored-by: Owl Bot <gcf-owl-bot[bot]@users.noreply.github.com>
---
embeddings/intro_embeddings_tuning.ipynb | 36 +++++++++++++-----------
1 file changed, 20 insertions(+), 16 deletions(-)
diff --git a/embeddings/intro_embeddings_tuning.ipynb b/embeddings/intro_embeddings_tuning.ipynb
index 467a5a2206e..ac6646abefe 100644
--- a/embeddings/intro_embeddings_tuning.ipynb
+++ b/embeddings/intro_embeddings_tuning.ipynb
@@ -156,8 +156,8 @@
},
"outputs": [],
"source": [
- "! pip3 install --upgrade --user google-cloud-aiplatform==1.48.0 google-cloud-documentai==2.26.0 google-cloud-documentai-toolbox==0.13.3a0\n",
- "! pip3 install --upgrade --user langchain==0.1.16 langchain-core==0.1.44 langchain-text-splitters==0.0.1 langchain-google-community==1.0.2 gcsfs==2024.3.1 etils==1.7.0"
+ "! pip3 install --upgrade --user google-cloud-aiplatform google-cloud-documentai google-cloud-documentai-toolbox --quiet\n",
+ "! pip3 install --upgrade --user langchain langchain-core langchain-text-splitters langchain-google-community gcsfs etils --quiet"
]
},
{
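The install step above drops the version pins and pulls the latest releases quietly. A minimal sketch, not part of the patch, of how the versions that pip actually resolved could be captured afterwards so the environment stays reproducible; the package list simply mirrors the install line:

```python
# Record the package versions that pip resolved after the unpinned install.
from importlib.metadata import PackageNotFoundError, version

packages = [
    "google-cloud-aiplatform",
    "google-cloud-documentai",
    "google-cloud-documentai-toolbox",
    "langchain",
    "langchain-core",
    "langchain-text-splitters",
    "langchain-google-community",
    "gcsfs",
    "etils",
]

for name in packages:
    try:
        # Print in requirements.txt form so the environment can be pinned again later.
        print(f"{name}=={version(name)}")
    except PackageNotFoundError:
        print(f"# {name} is not installed")
```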
@@ -402,10 +402,9 @@
"from google.api_core.client_options import ClientOptions\n",
"from google.cloud import aiplatform, documentai\n",
"from google.protobuf.json_format import MessageToDict\n",
- "from langchain_community.document_loaders.blob_loaders import Blob\n",
- "from langchain_community.document_loaders.parsers import DocAIParser\n",
"import langchain_core\n",
"from langchain_core.documents.base import Document\n",
+ "from langchain_google_community.docai import Blob, DocAIParser\n",
"from langchain_text_splitters import RecursiveCharacterTextSplitter\n",
"import numpy as np\n",
"import pandas as pd\n",
@@ -517,11 +516,11 @@
" }\n",
"\n",
" prompt_template = \"\"\"\n",
- " You are an examinator. Your task is to create one QUESTION for an exam using only.\n",
+ " You are an examinator. Your task is to create one QUESTION for an exam using only.\n",
"\n",
- " \n",
+ " \n",
" {chunk}\n",
- " \n",
+ " \n",
"\n",
" QUESTION:\n",
" \"\"\"\n",
@@ -568,7 +567,7 @@
" \"\"\"Get uploaded model from the pipeline job\"\"\"\n",
" evaluation_task = get_task_by_name(job, task_name)\n",
" upload_metadata = MessageToDict(evaluation_task.execution._pb)[\"metadata\"]\n",
- " return vertexai.Model(upload_metadata[\"output:model_resource_name\"])\n",
+ " return aiplatform.Model(upload_metadata[\"output:model_resource_name\"])\n",
"\n",
"\n",
"def get_training_output_dir(\n",
@@ -724,7 +723,7 @@
},
"outputs": [],
"source": [
- "blob = Blob(\n",
+ "blob = Blob.from_path(\n",
" path=f\"{RAW_DATA_URI}/goog-10-k-2023.pdf\",\n",
")\n",
"\n",
@@ -1382,28 +1381,33 @@
},
"outputs": [],
"source": [
- "import os\n",
+ "import shutil\n",
"\n",
"delete_endpoint = False\n",
"delete_model = False\n",
"delete_job = False\n",
"delete_bucket = False\n",
+ "delete_tutorial = False\n",
"\n",
"# Delete endpoint resource\n",
- "if delete_endpoint or os.getenv(\"IS_TESTING\"):\n",
- " endpoint.delete()\n",
+ "if delete_endpoint:\n",
+ " endpoint.delete(force=True)\n",
"\n",
"# Delete model resource\n",
- "if delete_model or os.getenv(\"IS_TESTING\"):\n",
+ "if delete_model:\n",
" model.delete()\n",
"\n",
"# Delete pipeline job\n",
- "if delete_job or os.getenv(\"IS_TESTING\"):\n",
+ "if delete_job:\n",
" job.delete()\n",
"\n",
"# Delete Cloud Storage objects that were created\n",
- "if delete_bucket or os.getenv(\"IS_TESTING\"):\n",
- " ! gsutil -m rm -r $BUCKET_URI"
+ "if delete_bucket:\n",
+ " ! gsutil -m rm -r $BUCKET_URI\n",
+ "\n",
+ "# Delete tutorial folder\n",
+ "if delete_tutorial:\n",
+ " shutil.rmtree(str(tutorial_path))"
]
}
],
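The cleanup cell now gates every deletion on an explicit flag, forces endpoint deletion, and adds a tutorial-folder removal. A minimal sketch that wraps the same steps in one guarded helper; the `google-cloud-storage` call standing in for the `gsutil` shell-out is an assumption, and the resource objects are expected to exist earlier in the notebook:

```python
import shutil
from pathlib import Path
from typing import Optional

from google.cloud import aiplatform, storage


def cleanup(
    endpoint: Optional[aiplatform.Endpoint] = None,
    model: Optional[aiplatform.Model] = None,
    job: Optional[aiplatform.PipelineJob] = None,
    bucket_uri: Optional[str] = None,
    tutorial_path: Optional[Path] = None,
) -> None:
    """Delete only the resources that are explicitly passed in."""
    if endpoint is not None:
        # force=True undeploys any deployed models before deleting the endpoint.
        endpoint.delete(force=True)
    if model is not None:
        model.delete()
    if job is not None:
        job.delete()
    if bucket_uri is not None:
        # Stand-in for `gsutil -m rm -r $BUCKET_URI`; force=True removes the
        # contained objects first but errors out on buckets with many objects.
        bucket_name = bucket_uri.removeprefix("gs://").split("/", 1)[0]
        storage.Client().bucket(bucket_name).delete(force=True)
    if tutorial_path is not None and tutorial_path.exists():
        shutil.rmtree(tutorial_path)


# Example usage: cleanup(endpoint=endpoint, bucket_uri=BUCKET_URI)
```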