Skip to content

Commit

Permalink
add document writer exercises
Browse files Browse the repository at this point in the history
  • Loading branch information
lfunderburk committed Nov 18, 2023
1 parent 08e991f commit 92cf834
Showing 1 changed file with 325 additions and 0 deletions.
325 changes: 325 additions & 0 deletions ch3/jupyter-notebooks/components.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -253,6 +253,331 @@
"result_text['metadata']"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### SentenceTransformersDocumentEmbedder"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "695d690b186d49d78023f05d8d6178e7",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Batches: 0%| | 0/1 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[-0.07804739475250244, 0.14989925920963287]\n"
]
}
],
"source": [
"from haystack.preview.components.embedders import SentenceTransformersDocumentEmbedder\n",
"from haystack.preview.dataclasses import Document\n",
"\n",
"# Initialize the document embedder with a model from the Sentence Transformers library,\n",
"# then warm it up before the first run() call.\n",
"doc_embedder = SentenceTransformersDocumentEmbedder(model_name_or_path=\"sentence-transformers/all-mpnet-base-v2\")\n",
"doc_embedder.warm_up()\n",
"\n",
"# Create a document to embed. `Document` is imported in this cell so the cell does not\n",
"# depend on another cell having brought the name into scope.\n",
"doc = Document(content=\"I love pizza!\")\n",
"\n",
"# run() takes a list of documents and returns a dict; the embedded documents are\n",
"# available under its 'documents' key.\n",
"result = doc_embedder.run([doc])\n",
"\n",
"# Print just the first two components of the embedding to keep the output short.\n",
"print(result['documents'][0].embedding[0:2])\n"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Document(id='ac2bc369f8115bb5bdee26d31f642520041e731da70d578ef116d3f67ad50c69', content='I love pizza!', dataframe=None, blob=None, meta={}, score=None)"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Display the first document returned by the embedder. The saved repr lists\n",
"# id/content/meta/score but not the embedding vector itself.\n",
"result[\"documents\"][0]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### SentenceTransformersTextEmbedder"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "f0b7466f1f2f4b75b120bb2880ef81e8",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Batches: 0%| | 0/1 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[-0.07804739475250244, 0.14989925920963287]\n"
]
}
],
"source": [
"from haystack.preview.components.embedders import SentenceTransformersTextEmbedder\n",
"\n",
"# The plain string we want an embedding for\n",
"text_to_embed = \"I love pizza!\"\n",
"\n",
"# Build the text embedder on a Sentence Transformers model and warm it up before running it\n",
"text_embedder = SentenceTransformersTextEmbedder(\n",
"    model_name_or_path=\"sentence-transformers/all-mpnet-base-v2\"\n",
")\n",
"text_embedder.warm_up()\n",
"\n",
"# run() returns a dict; the vector is read from its 'embedding' key\n",
"result = text_embedder.run(text_to_embed)\n",
"\n",
"# Print the first two floats of the embedding vector\n",
"print(result['embedding'][:2])\n"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"dict_keys(['embedding'])"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# List the keys of the dict returned by the embedder's run() call\n",
"result.keys()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Writing content into a Document Store"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### DocumentWriter"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Writing regular documents."
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Documents written: 2\n"
]
}
],
"source": [
"from haystack.preview.components.writers import DocumentWriter\n",
"from haystack.preview.dataclasses import Document\n",
"from haystack.preview.document_stores import InMemoryDocumentStore\n",
"\n",
"# Fresh in-memory store, plus the writer component that targets it\n",
"doc_store = InMemoryDocumentStore()\n",
"document_writer = DocumentWriter(document_store=doc_store)\n",
"\n",
"# Two small documents to persist\n",
"documents_to_write = [\n",
"    Document(content=text)\n",
"    for text in (\"Document 1 content\", \"Document 2 content\")\n",
"]\n",
"\n",
"# run() returns a dict; the count is read from its 'documents_written' key\n",
"result = document_writer.run(documents=documents_to_write)\n",
"print(f\"Documents written: {result['documents_written']}\")\n"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"2"
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Ask the store how many documents it currently holds\n",
"doc_store.count_documents()"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[Document(id='10b329f15a2de8355bd9d538c759c45eb6193f51c7576f310400d14a9475deb8', content='Document 1 content', dataframe=None, blob=None, meta={}, score=None),\n",
" Document(id='8d5435d9fd98ef235133c6c0bf4977595b69f10683fcc27a31e56fb15a024ff7', content='Document 2 content', dataframe=None, blob=None, meta={}, score=None)]"
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Called without any filter arguments; the saved output lists both stored documents\n",
"doc_store.filter_documents()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Writing embedded documents."
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "4306d3edbac04c60b39d2481bd71fc45",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Batches: 0%| | 0/1 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"{'documents_written': 2}"
]
},
"execution_count": 32,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from haystack.preview.components.embedders import SentenceTransformersDocumentEmbedder\n",
"from haystack.preview.components.writers import DocumentWriter\n",
"from haystack.preview.dataclasses import Document\n",
"from haystack.preview.document_stores import InMemoryDocumentStore\n",
"\n",
"# Example documents to embed and store\n",
"documents = [\n",
"    Document(content=text)\n",
"    for text in (\n",
"        \"The quick brown fox jumps over the lazy dog.\",\n",
"        \"When it comes to natural language processing, context is key.\",\n",
"    )\n",
"]\n",
"\n",
"# New in-memory store and the writer component bound to it\n",
"doc_store = InMemoryDocumentStore()\n",
"document_writer = DocumentWriter(document_store=doc_store)\n",
"\n",
"# Build the embedder, warm it up, then compute the embeddings\n",
"doc_embedder = SentenceTransformersDocumentEmbedder(\n",
"    model_name_or_path=\"sentence-transformers/all-mpnet-base-v2\"\n",
")\n",
"doc_embedder.warm_up()\n",
"embedded_docs = doc_embedder.run(documents)['documents']\n",
"\n",
"# Write the embedded documents; the dict returned by run() is the cell's displayed result\n",
"document_writer.run(documents=embedded_docs)\n"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[Document(id='2e3218009b01cfc57f865bbf81fa70de81b5ebae02c4cc7092e46ffde03f3c49', content='The quick brown fox jumps over the lazy dog.', dataframe=None, blob=None, meta={}, score=None),\n",
" Document(id='8baba41960a8807c42da6783a39dbbf50873f9700ff861844ec8ccce65d4f50e', content='When it comes to natural language processing, context is key.', dataframe=None, blob=None, meta={}, score=None)]"
]
},
"execution_count": 35,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# List what the store holds. The saved repr shows id/content/meta/score only,\n",
"# so read a document's `embedding` attribute to inspect its vector.\n",
"doc_store.filter_documents()"
]
},
{
"cell_type": "code",
"execution_count": null,
Expand Down

0 comments on commit 92cf834

Please sign in to comment.