diff --git a/docs/extras/guides/privacy/presidio_data_anonymization.ipynb b/docs/extras/guides/privacy/presidio_data_anonymization.ipynb index faa99292594da..4b4b718e29b97 100644 --- a/docs/extras/guides/privacy/presidio_data_anonymization.ipynb +++ b/docs/extras/guides/privacy/presidio_data_anonymization.ipynb @@ -28,7 +28,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -47,16 +47,16 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "'My name is Mrs. Rachel Chen DDS, call me at 849-829-7628x073 or email me at christopherfrey@example.org'" + "'My name is Laura Ruiz, call me at +1-412-982-8374x13414 or email me at javierwatkins@example.net'" ] }, - "execution_count": 14, + "execution_count": 2, "metadata": {}, "output_type": "execute_result" } @@ -82,7 +82,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -94,35 +94,53 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "text = f\"\"\"Slim Shady recently lost his wallet. \n", + "Inside is some cash and his credit card with the number 4916 0387 9536 0861. 
\n", + "If you would find it, please call at 313-666-7440 or write an email here: real.slim.shady@gmail.com.\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": 5, "metadata": {}, "outputs": [ { - "data": { - "text/plain": [ - "AIMessage(content='You can find our super secret data at https://www.ross.com/', additional_kwargs={}, example=False)" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" + "name": "stdout", + "output_type": "stream", + "text": [ + "Dear Sir/Madam,\n", + "\n", + "We regret to inform you that Richard Fields has recently misplaced his wallet, which contains a sum of cash and his credit card bearing the number 30479847307774. \n", + "\n", + "Should you happen to come across it, we kindly request that you contact us immediately at 6439182672 or via email at frank45@example.com.\n", + "\n", + "Thank you for your attention to this matter.\n", + "\n", + "Yours faithfully,\n", + "\n", + "[Your Name]\n" + ] } ], "source": [ "from langchain.prompts.prompt import PromptTemplate\n", "from langchain.chat_models import ChatOpenAI\n", - "from langchain.schema.runnable import RunnablePassthrough\n", "\n", - "template = \"\"\"According to this text, where can you find our super secret data?\n", + "anonymizer = PresidioAnonymizer()\n", "\n", - "{anonymized_text}\n", + "template = \"\"\"Rewrite this text into an official, short email:\n", "\n", - "Answer:\"\"\"\n", + "{anonymized_text}\"\"\"\n", "prompt = PromptTemplate.from_template(template)\n", - "llm = ChatOpenAI()\n", + "llm = ChatOpenAI(temperature=0)\n", "\n", "chain = {\"anonymized_text\": anonymizer.anonymize} | prompt | llm\n", - "chain.invoke(\"You can find our super secret data at https://supersecretdata.com\")" + "response = chain.invoke(text)\n", + "print(response.content)" ] }, { @@ -135,16 +153,16 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "'My name is 
Gabrielle Edwards, call me at 313-666-7440 or email me at real.slim.shady@gmail.com'" + "'My name is Adrian Fleming, call me at 313-666-7440 or email me at real.slim.shady@gmail.com'" ] }, - "execution_count": 18, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -166,16 +184,16 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "'My name is Victoria Mckinney, call me at 713-549-8623 or email me at real.slim.shady@gmail.com'" + "'My name is Justin Miller, call me at 761-824-1889 or email me at real.slim.shady@gmail.com'" ] }, - "execution_count": 3, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -201,16 +219,16 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "'My name is Billy Russo, call me at 970-996-9453x038 or email me at jamie80@example.org'" + "'My name is Dr. 
Jennifer Baker, call me at (508)839-9329x232 or email me at ehamilton@example.com'" ] }, - "execution_count": 4, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -232,16 +250,16 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "'My polish phone number is EVIA70648911396944'" + "'My polish phone number is NRGN41434238921378'" ] }, - "execution_count": 5, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -261,7 +279,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ @@ -291,7 +309,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ @@ -308,7 +326,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 12, "metadata": {}, "outputs": [ { @@ -337,16 +355,16 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "'+48 533 220 543'" + "'511 622 683'" ] }, - "execution_count": 9, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -374,7 +392,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 14, "metadata": {}, "outputs": [], "source": [ @@ -389,7 +407,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 15, "metadata": {}, "outputs": [], "source": [ @@ -398,16 +416,16 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "'My polish phone number is +48 692 715 636'" + "'My polish phone number is +48 734 630 977'" ] }, - "execution_count": 12, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -443,7 +461,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.1" 
+ "version": "3.11.4" } }, "nbformat": 4, diff --git a/docs/extras/guides/privacy/presidio_reversible_anonymization.ipynb b/docs/extras/guides/privacy/presidio_reversible_anonymization.ipynb new file mode 100644 index 0000000000000..be5c397ecfaea --- /dev/null +++ b/docs/extras/guides/privacy/presidio_reversible_anonymization.ipynb @@ -0,0 +1,461 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Reversible data anonymization with Microsoft Presidio\n", + "\n", + "[![Open In Collab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/langchain-ai/langchain/blob/master/docs/extras/guides/privacy/reversible_anonymization.ipynb)\n", + "\n", + "\n", + "## Use case\n", + "\n", + "We have already written about the importance of anonymizing sensitive data in the previous section. **Reversible Anonymization** is an equally essential technology while sharing information with language models, as it balances data protection with data usability. This technique involves masking sensitive personally identifiable information (PII), yet it can be reversed and original data can be restored when authorized users need it. Its main advantage lies in the fact that while it conceals individual identities to prevent misuse, it also allows the concealed data to be accurately unmasked should it be necessary for legal or compliance purposes. \n", + "\n", + "## Overview\n", + "\n", + "We implemented the `PresidioReversibleAnonymizer`, which consists of two parts:\n", + "\n", + "1. anonymization - it works the same way as `PresidioAnonymizer`, plus the object itself stores a mapping of made-up values to original ones, for example:\n", + "```\n", + " {\n", + " \"PERSON\": {\n", + " \"\": \"\",\n", + " \"John Doe\": \"Slim Shady\"\n", + " },\n", + " \"PHONE_NUMBER\": {\n", + " \"111-111-1111\": \"555-555-5555\"\n", + " }\n", + " ...\n", + " }\n", + "```\n", + "\n", + "2. 
deanonymization - using the mapping described above, it matches fake data with original data and then substitutes it.\n", + "\n", + "Between anonymization and deanonymization user can perform different operations, for example, passing the output to LLM.\n", + "\n", + "## Quickstart\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "# Install necessary packages\n", + "# ! pip install langchain langchain-experimental openai presidio-analyzer presidio-anonymizer spacy Faker\n", + "# ! python -m spacy download en_core_web_lg" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "`PresidioReversibleAnonymizer` is not significantly different from its predecessor (`PresidioAnonymizer`) in terms of anonymization:" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'My name is Maria Lynch, call me at 7344131647 or email me at jamesmichael@example.com. By the way, my card number is: 4838637940262'" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from langchain_experimental.data_anonymizer import PresidioReversibleAnonymizer\n", + "\n", + "anonymizer = PresidioReversibleAnonymizer(\n", + " analyzed_fields=[\"PERSON\", \"PHONE_NUMBER\", \"EMAIL_ADDRESS\", \"CREDIT_CARD\"],\n", + " # Faker seed is used here to make sure the same fake data is generated for the test purposes\n", + " # In production, it is recommended to remove the faker_seed parameter (it will default to None)\n", + " faker_seed=42,\n", + ")\n", + "\n", + "anonymizer.anonymize(\n", + " \"My name is Slim Shady, call me at 313-666-7440 or email me at real.slim.shady@gmail.com. 
\"\n", + " \"By the way, my card number is: 4916 0387 9536 0861\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This is what the full string we want to deanonymize looks like:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Maria Lynch recently lost his wallet. \n", + "Inside is some cash and his credit card with the number 4838637940262. \n", + "If you would find it, please call at 7344131647 or write an email here: jamesmichael@example.com.\n", + "Maria Lynch would be very grateful!\n" + ] + } + ], + "source": [ + "# We know this data, as we set the faker_seed parameter\n", + "fake_name = \"Maria Lynch\"\n", + "fake_phone = \"7344131647\"\n", + "fake_email = \"jamesmichael@example.com\"\n", + "fake_credit_card = \"4838637940262\"\n", + "\n", + "anonymized_text = f\"\"\"{fake_name} recently lost his wallet. \n", + "Inside is some cash and his credit card with the number {fake_credit_card}. \n", + "If you would find it, please call at {fake_phone} or write an email here: {fake_email}.\n", + "{fake_name} would be very grateful!\"\"\"\n", + "\n", + "print(anonymized_text)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "And now, using the `deanonymize` method, we can reverse the process:" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Slim Shady recently lost his wallet. \n", + "Inside is some cash and his credit card with the number 4916 0387 9536 0861. 
\n", + "If you would find it, please call at 313-666-7440 or write an email here: real.slim.shady@gmail.com.\n", + "Slim Shady would be very grateful!\n" + ] + } + ], + "source": [ + "print(anonymizer.deanonymize(anonymized_text))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Using with LangChain Expression Language\n", + "\n", + "With LCEL we can easily chain together anonymization and deanonymization with the rest of our application. This is an example of using the anonymization mechanism with a query to LLM (without deanonymization for now):" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "text = f\"\"\"Slim Shady recently lost his wallet. \n", + "Inside is some cash and his credit card with the number 4916 0387 9536 0861. \n", + "If you would find it, please call at 313-666-7440 or write an email here: real.slim.shady@gmail.com.\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Dear Sir/Madam,\n", + "\n", + "We regret to inform you that Mr. Dana Rhodes has reported the loss of his wallet. The wallet contains a sum of cash and his credit card, bearing the number 4397528473885757. 
\n", + "\n", + "If you happen to come across the aforementioned wallet, we kindly request that you contact us immediately at 258-481-7074x714 or via email at laurengoodman@example.com.\n", + "\n", + "Your prompt assistance in this matter would be greatly appreciated.\n", + "\n", + "Yours faithfully,\n", + "\n", + "[Your Name]\n" + ] + } + ], + "source": [ + "from langchain.prompts.prompt import PromptTemplate\n", + "from langchain.chat_models import ChatOpenAI\n", + "\n", + "anonymizer = PresidioReversibleAnonymizer()\n", + "\n", + "template = \"\"\"Rewrite this text into an official, short email:\n", + "\n", + "{anonymized_text}\"\"\"\n", + "prompt = PromptTemplate.from_template(template)\n", + "llm = ChatOpenAI(temperature=0)\n", + "\n", + "chain = {\"anonymized_text\": anonymizer.anonymize} | prompt | llm\n", + "response = chain.invoke(text)\n", + "print(response.content)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, let's add **deanonymization step** to our sequence:" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Dear Sir/Madam,\n", + "\n", + "We regret to inform you that Mr. Slim Shady has recently misplaced his wallet. The wallet contains a sum of cash and his credit card, bearing the number 4916 0387 9536 0861. 
\n", + "\n", + "If by any chance you come across the lost wallet, kindly contact us immediately at 313-666-7440 or send an email to real.slim.shady@gmail.com.\n", + "\n", + "Your prompt assistance in this matter would be greatly appreciated.\n", + "\n", + "Yours faithfully,\n", + "\n", + "[Your Name]\n" + ] + } + ], + "source": [ + "chain = chain | (lambda ai_message: anonymizer.deanonymize(ai_message.content))\n", + "response = chain.invoke(text)\n", + "print(response)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Anonymized data was given to the model itself, and therefore it was protected from being leaked to the outside world. Then, the model's response was processed, and the factual value was replaced with the real one." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Extra knowledge" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "`PresidioReversibleAnonymizer` stores the mapping of the fake values to the original values in the `deanonymizer_mapping` parameter, where key is fake PII and value is the original one: " + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'PERSON': {'Maria Lynch': 'Slim Shady'},\n", + " 'PHONE_NUMBER': {'7344131647': '313-666-7440'},\n", + " 'EMAIL_ADDRESS': {'jamesmichael@example.com': 'real.slim.shady@gmail.com'},\n", + " 'CREDIT_CARD': {'4838637940262': '4916 0387 9536 0861'}}" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from langchain_experimental.data_anonymizer import PresidioReversibleAnonymizer\n", + "\n", + "anonymizer = PresidioReversibleAnonymizer(\n", + " analyzed_fields=[\"PERSON\", \"PHONE_NUMBER\", \"EMAIL_ADDRESS\", \"CREDIT_CARD\"],\n", + " # Faker seed is used here to make sure the same fake data is generated for the test purposes\n", + " # In production, it is recommended to remove 
the faker_seed parameter (it will default to None)\n", + " faker_seed=42,\n", + ")\n", + "\n", + "anonymizer.anonymize(\n", + " \"My name is Slim Shady, call me at 313-666-7440 or email me at real.slim.shady@gmail.com. \"\n", + " \"By the way, my card number is: 4916 0387 9536 0861\"\n", + ")\n", + "\n", + "anonymizer.deanonymizer_mapping" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Anonymizing more texts will result in new mapping entries:" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Do you have his VISA card number? Yep, it's 3537672423884966. I'm William Bowman by the way.\n" + ] + }, + { + "data": { + "text/plain": [ + "{'PERSON': {'Maria Lynch': 'Slim Shady', 'William Bowman': 'John Doe'},\n", + " 'PHONE_NUMBER': {'7344131647': '313-666-7440'},\n", + " 'EMAIL_ADDRESS': {'jamesmichael@example.com': 'real.slim.shady@gmail.com'},\n", + " 'CREDIT_CARD': {'4838637940262': '4916 0387 9536 0861',\n", + " '3537672423884966': '4001 9192 5753 7193'}}" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "print(\n", + " anonymizer.anonymize(\n", + " \"Do you have his VISA card number? Yep, it's 4001 9192 5753 7193. 
I'm John Doe by the way.\"\n", + " )\n", + ")\n", + "\n", + "anonymizer.deanonymizer_mapping" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can save the mapping itself to a file for future use: " + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "# We can save the deanonymizer mapping as a JSON or YAML file\n", + "\n", + "anonymizer.save_deanonymizer_mapping(\"deanonymizer_mapping.json\")\n", + "# anonymizer.save_deanonymizer_mapping(\"deanonymizer_mapping.yaml\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "And then, load it in another `PresidioReversibleAnonymizer` instance:" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{}" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "anonymizer = PresidioReversibleAnonymizer()\n", + "\n", + "anonymizer.deanonymizer_mapping" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'PERSON': {'Maria Lynch': 'Slim Shady', 'William Bowman': 'John Doe'},\n", + " 'PHONE_NUMBER': {'7344131647': '313-666-7440'},\n", + " 'EMAIL_ADDRESS': {'jamesmichael@example.com': 'real.slim.shady@gmail.com'},\n", + " 'CREDIT_CARD': {'4838637940262': '4916 0387 9536 0861',\n", + " '3537672423884966': '4001 9192 5753 7193'}}" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "anonymizer.load_deanonymizer_mapping(\"deanonymizer_mapping.json\")\n", + "\n", + "anonymizer.deanonymizer_mapping" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Future works\n", + "\n", + "- **instance anonymization** - at this point, each occurrence of PII is treated as a separate entity and separately anonymized. 
Therefore, two occurrences of the name John Doe in the text will be changed to two different names. It is therefore worth introducing support for full instance detection, so that repeated occurrences are treated as a single object.\n", + "- **better matching and substitution of fake values for real ones** - currently the strategy is based on matching full strings and then substituting them. Due to the indeterminism of language models, it may happen that the value in the answer is slightly changed (e.g. *John Doe* -> *John* or *Main St, New York* -> *New York*) and such a substitution is then no longer possible. Therefore, it is worth adjusting the matching for your needs." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.4" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/docs/extras/use_cases/qa_structured/integrations/myscale_vector_sql.ipynb b/docs/extras/use_cases/qa_structured/integrations/myscale_vector_sql.ipynb new file mode 100644 index 0000000000000..65bd8323ed068 --- /dev/null +++ b/docs/extras/use_cases/qa_structured/integrations/myscale_vector_sql.ipynb @@ -0,0 +1,200 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "245065c6", + "metadata": {}, + "source": [ + "# Vector SQL Retriever with MyScale\n", + "\n", + ">[MyScale](https://docs.myscale.com/en/) is an integrated vector database. You can access your database in SQL and also from here, LangChain. MyScale can make a use of [various data types and functions for filters](https://blog.myscale.com/2023/06/06/why-integrated-database-solution-can-boost-your-llm-apps/#filter-on-anything-without-constraints). 
It will boost up your LLM app no matter if you are scaling up your data or expand your system to broader application." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0246c5bf", + "metadata": {}, + "outputs": [], + "source": [ + "!pip3 install clickhouse-sqlalchemy InstructorEmbedding sentence_transformers openai langchain-experimental" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7585d2c3", + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "from os import environ\n", + "import getpass\n", + "from typing import Dict, Any\n", + "from langchain import OpenAI, SQLDatabase, LLMChain\n", + "from langchain_experimental.sql.vector_sql import VectorSQLDatabaseChain\n", + "from sqlalchemy import create_engine, Column, MetaData\n", + "from langchain import PromptTemplate\n", + "\n", + "\n", + "from sqlalchemy import create_engine\n", + "\n", + "MYSCALE_HOST = \"msc-1decbcc9.us-east-1.aws.staging.myscale.cloud\"\n", + "MYSCALE_PORT = 443\n", + "MYSCALE_USER = \"chatdata\"\n", + "MYSCALE_PASSWORD = \"myscale_rocks\"\n", + "OPENAI_API_KEY = getpass.getpass(\"OpenAI API Key:\")\n", + "\n", + "engine = create_engine(\n", + " f\"clickhouse://{MYSCALE_USER}:{MYSCALE_PASSWORD}@{MYSCALE_HOST}:{MYSCALE_PORT}/default?protocol=https\"\n", + ")\n", + "metadata = MetaData(bind=engine)\n", + "environ[\"OPENAI_API_KEY\"] = OPENAI_API_KEY" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e08d9ddc", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.embeddings import HuggingFaceInstructEmbeddings\n", + "from langchain_experimental.sql.vector_sql import VectorSQLOutputParser\n", + "\n", + "output_parser = VectorSQLOutputParser.from_embeddings(\n", + " model=HuggingFaceInstructEmbeddings(\n", + " model_name=\"hkunlp/instructor-xl\", model_kwargs={\"device\": \"cpu\"}\n", + " )\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "84b705b2", + "metadata": {}, + 
"outputs": [], + "source": [ + "\n", + "from langchain.llms import OpenAI\n", + "from langchain.callbacks import StdOutCallbackHandler\n", + "\n", + "from langchain.utilities.sql_database import SQLDatabase\n", + "from langchain_experimental.sql.prompt import MYSCALE_PROMPT\n", + "from langchain_experimental.sql.vector_sql import VectorSQLDatabaseChain\n", + "\n", + "chain = VectorSQLDatabaseChain(\n", + " llm_chain=LLMChain(\n", + " llm=OpenAI(openai_api_key=OPENAI_API_KEY, temperature=0),\n", + " prompt=MYSCALE_PROMPT,\n", + " ),\n", + " top_k=10,\n", + " return_direct=True,\n", + " sql_cmd_parser=output_parser,\n", + " database=SQLDatabase(engine, None, metadata),\n", + ")\n", + "\n", + "import pandas as pd\n", + "\n", + "pd.DataFrame(\n", + " chain.run(\n", + " \"Please give me 10 papers to ask what is PageRank?\",\n", + " callbacks=[StdOutCallbackHandler()],\n", + " )\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "6c09cda0", + "metadata": {}, + "source": [ + "## SQL Database as Retriever" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "734d7ff5", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.chat_models import ChatOpenAI\n", + "from langchain.chains.qa_with_sources.retrieval import RetrievalQAWithSourcesChain\n", + "\n", + "from langchain_experimental.sql.vector_sql import VectorSQLDatabaseChain\n", + "from langchain_experimental.retrievers.vector_sql_database \\\n", + " import VectorSQLDatabaseChainRetriever\n", + "from langchain_experimental.sql.prompt import MYSCALE_PROMPT\n", + "from langchain_experimental.sql.vector_sql import VectorSQLRetrieveAllOutputParser\n", + "\n", + "output_parser_retrieve_all = VectorSQLRetrieveAllOutputParser.from_embeddings(\n", + " output_parser.model\n", + ")\n", + "\n", + "chain = VectorSQLDatabaseChain.from_llm(\n", + " llm=OpenAI(openai_api_key=OPENAI_API_KEY, temperature=0),\n", + " prompt=MYSCALE_PROMPT,\n", + " top_k=10,\n", + " return_direct=True,\n", + " 
db=SQLDatabase(engine, None, metadata),\n", + " sql_cmd_parser=output_parser_retrieve_all,\n", + " native_format=True,\n", + ")\n", + "\n", + "# You need all those keys to get docs\n", + "retriever = VectorSQLDatabaseChainRetriever(sql_db_chain=chain, page_content_key=\"abstract\")\n", + "\n", + "document_with_metadata_prompt = PromptTemplate(\n", + " input_variables=[\"page_content\", \"id\", \"title\", \"authors\", \"pubdate\", \"categories\"],\n", + " template=\"Content:\\n\\tTitle: {title}\\n\\tAbstract: {page_content}\\n\\tAuthors: {authors}\\n\\tDate of Publication: {pubdate}\\n\\tCategories: {categories}\\nSOURCE: {id}\",\n", + ")\n", + "\n", + "chain = RetrievalQAWithSourcesChain.from_chain_type(\n", + " ChatOpenAI(\n", + " model_name=\"gpt-3.5-turbo-16k\", openai_api_key=OPENAI_API_KEY, temperature=0.6\n", + " ),\n", + " retriever=retriever,\n", + " chain_type=\"stuff\",\n", + " chain_type_kwargs={\n", + " \"document_prompt\": document_with_metadata_prompt,\n", + " },\n", + " return_source_documents=True,\n", + ")\n", + "ans = chain(\"Please give me 10 papers to ask what is PageRank?\",\n", + " callbacks=[StdOutCallbackHandler()])\n", + "print(ans[\"answer\"])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4948ff25", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/libs/experimental/langchain_experimental/data_anonymizer/__init__.py b/libs/experimental/langchain_experimental/data_anonymizer/__init__.py index 69babad859a01..f43d6d98df55c 100644 --- 
a/libs/experimental/langchain_experimental/data_anonymizer/__init__.py +++ b/libs/experimental/langchain_experimental/data_anonymizer/__init__.py @@ -1,4 +1,7 @@ """Data anonymizer package""" -from langchain_experimental.data_anonymizer.presidio import PresidioAnonymizer +from langchain_experimental.data_anonymizer.presidio import ( + PresidioAnonymizer, + PresidioReversibleAnonymizer, +) -__all__ = ["PresidioAnonymizer"] +__all__ = ["PresidioAnonymizer", "PresidioReversibleAnonymizer"] diff --git a/libs/experimental/langchain_experimental/data_anonymizer/base.py b/libs/experimental/langchain_experimental/data_anonymizer/base.py index 3f9905375e0ee..875032342a7a4 100644 --- a/libs/experimental/langchain_experimental/data_anonymizer/base.py +++ b/libs/experimental/langchain_experimental/data_anonymizer/base.py @@ -15,3 +15,17 @@ def anonymize(self, text: str) -> str: @abstractmethod def _anonymize(self, text: str) -> str: """Abstract method to anonymize text""" + + +class ReversibleAnonymizerBase(AnonymizerBase): + """ + Base abstract class for reversible anonymizers. 
+ """ + + def deanonymize(self, text: str) -> str: + """Deanonymize text""" + return self._deanonymize(text) + + @abstractmethod + def _deanonymize(self, text: str) -> str: + """Abstract method to deanonymize text""" diff --git a/libs/experimental/langchain_experimental/data_anonymizer/deanonymizer_mapping.py b/libs/experimental/langchain_experimental/data_anonymizer/deanonymizer_mapping.py new file mode 100644 index 0000000000000..2ee03eb208040 --- /dev/null +++ b/libs/experimental/langchain_experimental/data_anonymizer/deanonymizer_mapping.py @@ -0,0 +1,21 @@ +from collections import defaultdict +from dataclasses import dataclass, field +from typing import Dict + +MappingDataType = Dict[str, Dict[str, str]] + + +@dataclass +class DeanonymizerMapping: + mapping: MappingDataType = field( + default_factory=lambda: defaultdict(lambda: defaultdict(str)) + ) + + @property + def data(self) -> MappingDataType: + """Return the deanonymizer mapping""" + return {k: dict(v) for k, v in self.mapping.items()} + + def update(self, new_mapping: MappingDataType) -> None: + for entity_type, values in new_mapping.items(): + self.mapping[entity_type].update(values) diff --git a/libs/experimental/langchain_experimental/data_anonymizer/deanonymizer_matching_strategies.py b/libs/experimental/langchain_experimental/data_anonymizer/deanonymizer_matching_strategies.py new file mode 100644 index 0000000000000..e5d9e8581b6dc --- /dev/null +++ b/libs/experimental/langchain_experimental/data_anonymizer/deanonymizer_matching_strategies.py @@ -0,0 +1,17 @@ +from langchain_experimental.data_anonymizer.presidio import MappingDataType + + +def default_matching_strategy(text: str, deanonymizer_mapping: MappingDataType) -> str: + """ + Default matching strategy for deanonymization. + It replaces all the anonymized entities with the original ones. 
+ + Args: + text: text to deanonymize + deanonymizer_mapping: mapping between anonymized entities and original ones""" + + # Iterate over all the entities (PERSON, EMAIL_ADDRESS, etc.) + for entity_type in deanonymizer_mapping: + for anonymized, original in deanonymizer_mapping[entity_type].items(): + text = text.replace(anonymized, original) + return text diff --git a/libs/experimental/langchain_experimental/data_anonymizer/faker_presidio_mapping.py b/libs/experimental/langchain_experimental/data_anonymizer/faker_presidio_mapping.py index 8db4f94c2fd3c..c2a339088e99a 100644 --- a/libs/experimental/langchain_experimental/data_anonymizer/faker_presidio_mapping.py +++ b/libs/experimental/langchain_experimental/data_anonymizer/faker_presidio_mapping.py @@ -1,8 +1,8 @@ import string -from typing import Callable, Dict +from typing import Callable, Dict, Optional -def get_pseudoanonymizer_mapping() -> Dict[str, Callable]: +def get_pseudoanonymizer_mapping(seed: Optional[int] = None) -> Dict[str, Callable]: try: from faker import Faker except ImportError as e: @@ -11,6 +11,7 @@ def get_pseudoanonymizer_mapping() -> Dict[str, Callable]: ) from e fake = Faker() + fake.seed_instance(seed) # Listed entities supported by Microsoft Presidio (for now, global and US only) # Source: https://microsoft.github.io/presidio/supported_entities/ diff --git a/libs/experimental/langchain_experimental/data_anonymizer/presidio.py b/libs/experimental/langchain_experimental/data_anonymizer/presidio.py index 298e3de1d562b..d4886eb32c1c8 100644 --- a/libs/experimental/langchain_experimental/data_anonymizer/presidio.py +++ b/libs/experimental/langchain_experimental/data_anonymizer/presidio.py @@ -1,24 +1,56 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Dict, List, Optional +import json +from collections import defaultdict +from pathlib import Path +from typing import TYPE_CHECKING, Callable, Dict, List, Optional, Union -from langchain_experimental.data_anonymizer.base 
import AnonymizerBase +import yaml + +from langchain_experimental.data_anonymizer.base import ( + AnonymizerBase, + ReversibleAnonymizerBase, +) +from langchain_experimental.data_anonymizer.deanonymizer_mapping import ( + DeanonymizerMapping, + MappingDataType, +) +from langchain_experimental.data_anonymizer.deanonymizer_matching_strategies import ( + default_matching_strategy, +) from langchain_experimental.data_anonymizer.faker_presidio_mapping import ( get_pseudoanonymizer_mapping, ) -if TYPE_CHECKING: - from presidio_analyzer import EntityRecognizer +try: + from presidio_analyzer import AnalyzerEngine +except ImportError as e: + raise ImportError( + "Could not import presidio_analyzer, please install with " + "`pip install presidio-analyzer`. You will also need to download a " + "spaCy model to use the analyzer, e.g. " + "`python -m spacy download en_core_web_lg`." + ) from e +try: + from presidio_anonymizer import AnonymizerEngine from presidio_anonymizer.entities import OperatorConfig +except ImportError as e: + raise ImportError( + "Could not import presidio_anonymizer, please install with " + "`pip install presidio-anonymizer`." + ) from e +if TYPE_CHECKING: + from presidio_analyzer import EntityRecognizer, RecognizerResult + from presidio_anonymizer.entities import EngineResult -class PresidioAnonymizer(AnonymizerBase): - """Anonymizer using Microsoft Presidio.""" +class PresidioAnonymizerBase(AnonymizerBase): def __init__( self, analyzed_fields: Optional[List[str]] = None, operators: Optional[Dict[str, OperatorConfig]] = None, + faker_seed: Optional[int] = None, ): """ Args: @@ -28,25 +60,10 @@ def __init__( Operators allow for custom anonymization of detected PII. Learn more: https://microsoft.github.io/presidio/tutorial/10_simple_anonymization/ + faker_seed: Seed used to initialize faker. + Defaults to None, in which case faker will be seeded randomly + and provide random values. 
""" - try: - from presidio_analyzer import AnalyzerEngine - except ImportError as e: - raise ImportError( - "Could not import presidio_analyzer, please install with " - "`pip install presidio-analyzer`. You will also need to download a " - "spaCy model to use the analyzer, e.g. " - "`python -m spacy download en_core_web_lg`." - ) from e - try: - from presidio_anonymizer import AnonymizerEngine - from presidio_anonymizer.entities import OperatorConfig - except ImportError as e: - raise ImportError( - "Could not import presidio_anonymizer, please install with " - "`pip install presidio-anonymizer`." - ) from e - self.analyzed_fields = ( analyzed_fields if analyzed_fields is not None @@ -59,13 +76,41 @@ def __init__( field: OperatorConfig( operator_name="custom", params={"lambda": faker_function} ) - for field, faker_function in get_pseudoanonymizer_mapping().items() + for field, faker_function in get_pseudoanonymizer_mapping( + faker_seed + ).items() } ) self._analyzer = AnalyzerEngine() self._anonymizer = AnonymizerEngine() + def add_recognizer(self, recognizer: EntityRecognizer) -> None: + """Add a recognizer to the analyzer + + Args: + recognizer: Recognizer to add to the analyzer. + """ + self._analyzer.registry.add_recognizer(recognizer) + self.analyzed_fields.extend(recognizer.supported_entities) + + def add_operators(self, operators: Dict[str, OperatorConfig]) -> None: + """Add operators to the anonymizer + + Args: + operators: Operators to add to the anonymizer. + """ + self.operators.update(operators) + + +class PresidioAnonymizer(PresidioAnonymizerBase): def _anonymize(self, text: str) -> str: + """Anonymize text. + Each PII entity is replaced with a fake value. + Each time fake values will be different, as they are generated randomly. 
+ + Args: + text: text to anonymize + """ results = self._analyzer.analyze( text, entities=self.analyzed_fields, @@ -78,11 +123,185 @@ def _anonymize(self, text: str) -> str: operators=self.operators, ).text - def add_recognizer(self, recognizer: EntityRecognizer) -> None: - """Add a recognizer to the analyzer""" - self._analyzer.registry.add_recognizer(recognizer) - self.analyzed_fields.extend(recognizer.supported_entities) - def add_operators(self, operators: Dict[str, OperatorConfig]) -> None: - """Add operators to the anonymizer""" - self.operators.update(operators) +class PresidioReversibleAnonymizer(PresidioAnonymizerBase, ReversibleAnonymizerBase): + def __init__( + self, + analyzed_fields: Optional[List[str]] = None, + operators: Optional[Dict[str, OperatorConfig]] = None, + faker_seed: Optional[int] = None, + ): + super().__init__(analyzed_fields, operators, faker_seed) + self._deanonymizer_mapping = DeanonymizerMapping() + + @property + def deanonymizer_mapping(self) -> MappingDataType: + """Return the deanonymizer mapping""" + return self._deanonymizer_mapping.data + + def _update_deanonymizer_mapping( + self, + original_text: str, + analyzer_results: List[RecognizerResult], + anonymizer_results: EngineResult, + ) -> None: + """Creates or updates the mapping used to de-anonymize text. + + This method exploits the results returned by the + analysis and anonymization processes. + + It constructs a mapping from each anonymized entity + back to its original text value. + + Mapping will be stored as "deanonymizer_mapping" property. + + Example of "deanonymizer_mapping": + { + "PERSON": { + "": "", + "John Doe": "Slim Shady" + }, + "PHONE_NUMBER": { + "111-111-1111": "555-555-5555" + } + ... + } + """ + + # We are able to zip and loop through both lists because we expect + # them to return corresponding entities for each identified piece + # of analyzable data from our input. 
+ + # We sort them by their 'start' attribute because it allows us to + # match corresponding entities by their position in the input text. + analyzer_results = sorted(analyzer_results, key=lambda d: d.start) + anonymizer_results.items = sorted( + anonymizer_results.items, key=lambda d: d.start + ) + + new_deanonymizer_mapping: MappingDataType = defaultdict(dict) + + for analyzed_entity, anonymized_entity in zip( + analyzer_results, anonymizer_results.items + ): + original_value = original_text[analyzed_entity.start : analyzed_entity.end] + new_deanonymizer_mapping[anonymized_entity.entity_type][ + anonymized_entity.text + ] = original_value + + self._deanonymizer_mapping.update(new_deanonymizer_mapping) + + def _anonymize(self, text: str) -> str: + """Anonymize text. + Each PII entity is replaced with a fake value. + Each time fake values will be different, as they are generated randomly. + At the same time, we will create a mapping from each anonymized entity + back to its original text value. + + Args: + text: text to anonymize + """ + analyzer_results = self._analyzer.analyze( + text, + entities=self.analyzed_fields, + language="en", + ) + + filtered_analyzer_results = ( + self._anonymizer._remove_conflicts_and_get_text_manipulation_data( + analyzer_results + ) + ) + + anonymizer_results = self._anonymizer.anonymize( + text, + analyzer_results=analyzer_results, + operators=self.operators, + ) + + self._update_deanonymizer_mapping( + text, filtered_analyzer_results, anonymizer_results + ) + + return anonymizer_results.text + + def _deanonymize( + self, + text_to_deanonymize: str, + deanonymizer_matching_strategy: Callable[ + [str, MappingDataType], str + ] = default_matching_strategy, + ) -> str: + """Deanonymize text. + Each anonymized entity is replaced with its original value. + This method exploits the mapping created during the anonymization process. 
+ + Args: + text_to_deanonymize: text to deanonymize + deanonymizer_matching_strategy: function to use to match + anonymized entities with their original values and replace them. + """ + if not self._deanonymizer_mapping: + raise ValueError( + "Deanonymizer mapping is empty.", + "Please call anonymize() and anonymize some text first.", + ) + + text_to_deanonymize = deanonymizer_matching_strategy( + text_to_deanonymize, self.deanonymizer_mapping + ) + + return text_to_deanonymize + + def save_deanonymizer_mapping(self, file_path: Union[Path, str]) -> None: + """Save the deanonymizer mapping to a JSON or YAML file. + + Args: + file_path: Path to file to save the mapping to. + + Example: + .. code-block:: python + + anonymizer.save_deanonymizer_mapping(file_path="path/mapping.json") + """ + + save_path = Path(file_path) + + if save_path.suffix not in [".json", ".yaml"]: + raise ValueError(f"{save_path} must have an extension of .json or .yaml") + + # Make sure parent directories exist + save_path.parent.mkdir(parents=True, exist_ok=True) + + if save_path.suffix == ".json": + with open(save_path, "w") as f: + json.dump(self.deanonymizer_mapping, f, indent=2) + elif save_path.suffix == ".yaml": + with open(save_path, "w") as f: + yaml.dump(self.deanonymizer_mapping, f, default_flow_style=False) + + def load_deanonymizer_mapping(self, file_path: Union[Path, str]) -> None: + """Load the deanonymizer mapping from a JSON or YAML file. + + Args: + file_path: Path to file to load the mapping from. + + Example: + .. 
code-block:: python + + anonymizer.load_deanonymizer_mapping(file_path="path/mapping.json") + """ + + load_path = Path(file_path) + + if load_path.suffix not in [".json", ".yaml"]: + raise ValueError(f"{load_path} must have an extension of .json or .yaml") + + if load_path.suffix == ".json": + with open(load_path, "r") as f: + loaded_mapping = json.load(f) + elif load_path.suffix == ".yaml": + with open(load_path, "r") as f: + loaded_mapping = yaml.load(f, Loader=yaml.FullLoader) + + self._deanonymizer_mapping.update(loaded_mapping) diff --git a/libs/experimental/langchain_experimental/retrievers/vector_sql_database.py b/libs/experimental/langchain_experimental/retrievers/vector_sql_database.py new file mode 100644 index 0000000000000..1ec088dbc515f --- /dev/null +++ b/libs/experimental/langchain_experimental/retrievers/vector_sql_database.py @@ -0,0 +1,38 @@ +"""Vector SQL Database Chain Retriever""" +from typing import Any, Dict, List + +from langchain.callbacks.manager import ( + AsyncCallbackManagerForRetrieverRun, + CallbackManagerForRetrieverRun, +) +from langchain.schema import BaseRetriever, Document + +from langchain_experimental.sql.vector_sql import VectorSQLDatabaseChain + + +class VectorSQLDatabaseChainRetriever(BaseRetriever): + """Retriever that uses SQLDatabase as Retriever""" + + sql_db_chain: VectorSQLDatabaseChain + """SQL Database Chain""" + page_content_key: str = "content" + """column name for page content of documents""" + + def _get_relevant_documents( + self, + query: str, + *, + run_manager: CallbackManagerForRetrieverRun, + **kwargs: Any, + ) -> List[Document]: + ret: List[Dict[str, Any]] = self.sql_db_chain( + query, callbacks=run_manager.get_child(), **kwargs + )["result"] + return [ + Document(page_content=r[self.page_content_key], metadata=r) for r in ret + ] + + async def _aget_relevant_documents( + self, query: str, *, run_manager: AsyncCallbackManagerForRetrieverRun + ) -> List[Document]: + raise NotImplementedError diff --git 
a/libs/experimental/langchain_experimental/sql/prompt.py b/libs/experimental/langchain_experimental/sql/prompt.py new file mode 100644 index 0000000000000..5f4c9b8a4fd6f --- /dev/null +++ b/libs/experimental/langchain_experimental/sql/prompt.py @@ -0,0 +1,85 @@ +# flake8: noqa +from langchain.prompts.prompt import PromptTemplate + + +PROMPT_SUFFIX = """Only use the following tables: +{table_info} + +Question: {input}""" + +_VECTOR_SQL_DEFAULT_TEMPLATE = """You are a {dialect} expert. Given an input question, first create a syntactically correct {dialect} query to run, then look at the results of the query and return the answer to the input question. +{dialect} queries has a vector distance function called `DISTANCE(column, array)` to compute relevance to the user's question and sort the feature array column by the relevance. +When the query is asking for {top_k} closest row, you have to use this distance function to calculate distance to entity's array on vector column and order by the distance to retrieve relevant rows. + +*NOTICE*: `DISTANCE(column, array)` only accept an array column as its first argument and a `NeuralArray(entity)` as its second argument. You also need a user defined function called `NeuralArray(entity)` to retrieve the entity's array. + +Unless the user specifies in the question a specific number of examples to obtain, query for at most {top_k} results using the LIMIT clause as per {dialect}. You should only order according to the distance function. +Never query for all columns from a table. You must query only the columns that are needed to answer the question. Wrap each column name in double quotes (") to denote them as delimited identifiers. +Pay attention to use only the column names you can see in the tables below. Be careful to not query for columns that do not exist. Also, pay attention to which column is in which table. +Pay attention to use today() function to get the current date, if the question involves "today". 
`ORDER BY` clause should always be after `WHERE` clause. DO NOT add semicolon to the end of SQL. Pay attention to the comment in table schema. + +Use the following format: + +Question: "Question here" +SQLQuery: "SQL Query to run" +SQLResult: "Result of the SQLQuery" +Answer: "Final answer here" +""" + +VECTOR_SQL_PROMPT = PromptTemplate( + input_variables=["input", "table_info", "dialect", "top_k"], + template=_VECTOR_SQL_DEFAULT_TEMPLATE + PROMPT_SUFFIX, +) + + +_myscale_prompt = """You are a MyScale expert. Given an input question, first create a syntactically correct MyScale query to run, then look at the results of the query and return the answer to the input question. +MyScale queries has a vector distance function called `DISTANCE(column, array)` to compute relevance to the user's question and sort the feature array column by the relevance. +When the query is asking for {top_k} closest row, you have to use this distance function to calculate distance to entity's array on vector column and order by the distance to retrieve relevant rows. + +*NOTICE*: `DISTANCE(column, array)` only accept an array column as its first argument and a `NeuralArray(entity)` as its second argument. You also need a user defined function called `NeuralArray(entity)` to retrieve the entity's array. + +Unless the user specifies in the question a specific number of examples to obtain, query for at most {top_k} results using the LIMIT clause as per MyScale. You should only order according to the distance function. +Never query for all columns from a table. You must query only the columns that are needed to answer the question. Wrap each column name in double quotes (") to denote them as delimited identifiers. +Pay attention to use only the column names you can see in the tables below. Be careful to not query for columns that do not exist. Also, pay attention to which column is in which table. 
+Pay attention to use today() function to get the current date, if the question involves "today". `ORDER BY` clause should always be after `WHERE` clause. DO NOT add semicolon to the end of SQL. Pay attention to the comment in table schema. + +Use the following format: + +======== table info ======== + + +Question: "Question here" +SQLQuery: "SQL Query to run" + + +Here are some examples: + +======== table info ======== +CREATE TABLE "ChatPaper" ( + abstract String, + id String, + vector Array(Float32), +) ENGINE = ReplicatedReplacingMergeTree() + ORDER BY id + PRIMARY KEY id + +Question: What is Feature Pyramid Network? +SQLQuery: SELECT ChatPaper.title, ChatPaper.id, ChatPaper.authors FROM ChatPaper ORDER BY DISTANCE(vector, NeuralArray(PaperRank contribution)) LIMIT {top_k} + + +Let's begin: +======== table info ======== +{table_info} + +Question: {input} +SQLQuery: """ + +MYSCALE_PROMPT = PromptTemplate( + input_variables=["input", "table_info", "top_k"], + template=_myscale_prompt + PROMPT_SUFFIX, +) + + +VECTOR_SQL_PROMPTS = { + "myscale": MYSCALE_PROMPT, +} diff --git a/libs/experimental/langchain_experimental/sql/vector_sql.py b/libs/experimental/langchain_experimental/sql/vector_sql.py new file mode 100644 index 0000000000000..98f3c2dee0c18 --- /dev/null +++ b/libs/experimental/langchain_experimental/sql/vector_sql.py @@ -0,0 +1,237 @@ +"""Vector SQL Database Chain Retriever""" +from __future__ import annotations + +from typing import Any, Dict, List, Optional, Union + +from langchain.callbacks.manager import CallbackManagerForChainRun +from langchain.chains.llm import LLMChain +from langchain.chains.sql_database.prompt import PROMPT, SQL_PROMPTS +from langchain.embeddings.base import Embeddings +from langchain.prompts.prompt import PromptTemplate +from langchain.schema import BaseOutputParser, BasePromptTemplate +from langchain.schema.language_model import BaseLanguageModel +from langchain.tools.sql_database.prompt import QUERY_CHECKER +from
langchain.utilities.sql_database import SQLDatabase + +from langchain_experimental.sql.base import INTERMEDIATE_STEPS_KEY, SQLDatabaseChain + + +class VectorSQLOutputParser(BaseOutputParser[str]): + """Output Parser for Vector SQL + 1. searches for `NeuralArray()` and replaces it with the embedding + 2. searches for `DISTANCE()` and replaces it with the distance name in backend SQL + """ + + model: Embeddings + """Embedding model to extract embedding for entity""" + distance_func_name: str = "distance" + """Distance name for Vector SQL""" + + class Config: + arbitrary_types_allowed = 1 + + @property + def _type(self) -> str: + return "vector_sql_parser" + + @classmethod + def from_embeddings( + cls, model: Embeddings, distance_func_name: str = "distance", **kwargs: Any + ) -> BaseOutputParser: + return cls(model=model, distance_func_name=distance_func_name, **kwargs) + + def parse(self, text: str) -> str: + text = text.strip() + start = text.find("NeuralArray(") + _sql_str_compl = text + if start > 0: + _matched = text[text.find("NeuralArray(") + len("NeuralArray(") :] + end = _matched.find(")") + start + len("NeuralArray(") + 1 + entity = _matched[: _matched.find(")")] + vecs = self.model.embed_query(entity) + vecs_str = "[" + ",".join(map(str, vecs)) + "]" + _sql_str_compl = text.replace("DISTANCE", self.distance_func_name).replace( + text[start:end], vecs_str + ) + if _sql_str_compl[-1] == ";": + _sql_str_compl = _sql_str_compl[:-1] + return _sql_str_compl + + +class VectorSQLRetrieveAllOutputParser(VectorSQLOutputParser): + """Based on VectorSQLOutputParser + It also modifies the SQL to get all columns + """ + + @property + def _type(self) -> str: + return "vector_sql_retrieve_all_parser" + + def parse(self, text: str) -> str: + text = text.strip() + start = text.upper().find("SELECT") + if start >= 0: + end = text.upper().find("FROM") + text = text.replace(text[start + len("SELECT") + 1 : end - 1], "*") + return super().parse(text) + + +def _try_eval(x: Any) -> Any: +
try: + return eval(x) + except Exception: + return x + + +def get_result_from_sqldb( + db: SQLDatabase, cmd: str +) -> Union[str, List[Dict[str, Any]], Dict[str, Any]]: + result = db._execute(cmd, fetch="all") # type: ignore + if isinstance(result, list): + return [{k: _try_eval(v) for k, v in dict(d._asdict()).items()} for d in result] + else: + return { + k: _try_eval(v) for k, v in dict(result._asdict()).items() # type: ignore + } + + +class VectorSQLDatabaseChain(SQLDatabaseChain): + """Chain for interacting with Vector SQL Database. + + Example: + .. code-block:: python + + from langchain_experimental.sql import SQLDatabaseChain + from langchain import OpenAI, SQLDatabase, OpenAIEmbeddings + db = SQLDatabase(...) + db_chain = VectorSQLDatabaseChain.from_llm(OpenAI(), db, OpenAIEmbeddings()) + + *Security note*: Make sure that the database connection uses credentials + that are narrowly-scoped to only include the permissions this chain needs. + Failure to do so may result in data corruption or loss, since this chain may + attempt commands like `DROP TABLE` or `INSERT` if appropriately prompted. + The best way to guard against such negative outcomes is to (as appropriate) + limit the permissions granted to the credentials used with this chain. + This issue shows an example negative outcome if these steps are not taken: + https://github.com/langchain-ai/langchain/issues/5923 + """ + + sql_cmd_parser: VectorSQLOutputParser + """Parser for Vector SQL""" + native_format: bool = False + """If return_direct, controls whether to return in python native format""" + + def _call( + self, + inputs: Dict[str, Any], + run_manager: Optional[CallbackManagerForChainRun] = None, + ) -> Dict[str, Any]: + _run_manager = run_manager or CallbackManagerForChainRun.get_noop_manager() + input_text = f"{inputs[self.input_key]}\nSQLQuery:" + _run_manager.on_text(input_text, verbose=self.verbose) + # If not present, then defaults to None which is all tables. 
+ table_names_to_use = inputs.get("table_names_to_use") + table_info = self.database.get_table_info(table_names=table_names_to_use) + llm_inputs = { + "input": input_text, + "top_k": str(self.top_k), + "dialect": self.database.dialect, + "table_info": table_info, + "stop": ["\nSQLResult:"], + } + intermediate_steps: List = [] + try: + intermediate_steps.append(llm_inputs) # input: sql generation + llm_out = self.llm_chain.predict( + callbacks=_run_manager.get_child(), + **llm_inputs, + ) + sql_cmd = self.sql_cmd_parser.parse(llm_out) + if self.return_sql: + return {self.output_key: sql_cmd} + if not self.use_query_checker: + _run_manager.on_text(llm_out, color="green", verbose=self.verbose) + intermediate_steps.append( + llm_out + ) # output: sql generation (no checker) + intermediate_steps.append({"sql_cmd": llm_out}) # input: sql exec + result = get_result_from_sqldb(self.database, sql_cmd) + intermediate_steps.append(str(result)) # output: sql exec + else: + query_checker_prompt = self.query_checker_prompt or PromptTemplate( + template=QUERY_CHECKER, input_variables=["query", "dialect"] + ) + query_checker_chain = LLMChain( + llm=self.llm_chain.llm, + prompt=query_checker_prompt, + output_parser=self.llm_chain.output_parser, + ) + query_checker_inputs = { + "query": llm_out, + "dialect": self.database.dialect, + } + checked_llm_out = query_checker_chain.predict( + callbacks=_run_manager.get_child(), **query_checker_inputs + ) + checked_sql_command = self.sql_cmd_parser.parse(checked_llm_out) + intermediate_steps.append( + checked_llm_out + ) # output: sql generation (checker) + _run_manager.on_text( + checked_llm_out, color="green", verbose=self.verbose + ) + intermediate_steps.append( + {"sql_cmd": checked_llm_out} + ) # input: sql exec + result = get_result_from_sqldb(self.database, checked_sql_command) + intermediate_steps.append(str(result)) # output: sql exec + llm_out = checked_llm_out + sql_cmd = checked_sql_command + + _run_manager.on_text("\nSQLResult: 
", verbose=self.verbose) + _run_manager.on_text(str(result), color="yellow", verbose=self.verbose) + # If return direct, we just set the final result equal to + # the result of the sql query result, otherwise try to get a human readable + # final answer + if self.return_direct: + final_result = result + else: + _run_manager.on_text("\nAnswer:", verbose=self.verbose) + input_text += f"{llm_out}\nSQLResult: {result}\nAnswer:" + llm_inputs["input"] = input_text + intermediate_steps.append(llm_inputs) # input: final answer + final_result = self.llm_chain.predict( + callbacks=_run_manager.get_child(), + **llm_inputs, + ).strip() + intermediate_steps.append(final_result) # output: final answer + _run_manager.on_text(final_result, color="green", verbose=self.verbose) + chain_result: Dict[str, Any] = {self.output_key: final_result} + if self.return_intermediate_steps: + chain_result[INTERMEDIATE_STEPS_KEY] = intermediate_steps + return chain_result + except Exception as exc: + # Append intermediate steps to exception, to aid in logging and later + # improvement of few shot prompt seeds + exc.intermediate_steps = intermediate_steps # type: ignore + raise exc + + @property + def _chain_type(self) -> str: + return "vector_sql_database_chain" + + @classmethod + def from_llm( + cls, + llm: BaseLanguageModel, + db: SQLDatabase, + prompt: Optional[BasePromptTemplate] = None, + sql_cmd_parser: Optional[VectorSQLOutputParser] = None, + **kwargs: Any, + ) -> VectorSQLDatabaseChain: + assert sql_cmd_parser, "`sql_cmd_parser` must be set in VectorSQLDatabaseChain." 
+ prompt = prompt or SQL_PROMPTS.get(db.dialect, PROMPT) + llm_chain = LLMChain(llm=llm, prompt=prompt) + return cls( + llm_chain=llm_chain, database=db, sql_cmd_parser=sql_cmd_parser, **kwargs + ) diff --git a/libs/experimental/poetry.lock b/libs/experimental/poetry.lock index 620da0f99ae11..9e8cf9f1aff2d 100644 --- a/libs/experimental/poetry.lock +++ b/libs/experimental/poetry.lock @@ -1245,6 +1245,7 @@ optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*, !=3.6.*" files = [ {file = "jsonpointer-2.4-py2.py3-none-any.whl", hash = "sha256:15d51bba20eea3165644553647711d150376234112651b4f1811022aecad7d7a"}, + {file = "jsonpointer-2.4.tar.gz", hash = "sha256:585cee82b70211fa9e6043b7bb89db6e1aa49524340dde8ad6b63206ea689d88"}, ] [[package]] diff --git a/libs/experimental/tests/unit_tests/test_reversible_data_anonymizer.py b/libs/experimental/tests/unit_tests/test_reversible_data_anonymizer.py new file mode 100644 index 0000000000000..9484a0e9dca06 --- /dev/null +++ b/libs/experimental/tests/unit_tests/test_reversible_data_anonymizer.py @@ -0,0 +1,154 @@ +import os +from typing import Iterator, List + +import pytest + + +@pytest.fixture(scope="module", autouse=True) +def check_spacy_model() -> Iterator[None]: + import spacy + + if not spacy.util.is_package("en_core_web_lg"): + pytest.skip(reason="Spacy model 'en_core_web_lg' not installed") + yield + + +@pytest.mark.requires("presidio_analyzer", "presidio_anonymizer", "faker") +@pytest.mark.parametrize( + "analyzed_fields,should_contain", + [(["PERSON"], False), (["PHONE_NUMBER"], True), (None, False)], +) +def test_anonymize(analyzed_fields: List[str], should_contain: bool) -> None: + """Test anonymizing a name in a simple sentence""" + from langchain_experimental.data_anonymizer import PresidioReversibleAnonymizer + + text = "Hello, my name is John Doe." 
+ anonymizer = PresidioReversibleAnonymizer(analyzed_fields=analyzed_fields) + anonymized_text = anonymizer.anonymize(text) + assert ("John Doe" in anonymized_text) == should_contain + + +@pytest.mark.requires("presidio_analyzer", "presidio_anonymizer", "faker") +def test_anonymize_multiple() -> None: + """Test anonymizing multiple items in a sentence""" + from langchain_experimental.data_anonymizer import PresidioReversibleAnonymizer + + text = "John Smith's phone number is 313-666-7440 and email is johnsmith@gmail.com" + anonymizer = PresidioReversibleAnonymizer() + anonymized_text = anonymizer.anonymize(text) + for phrase in ["John Smith", "313-666-7440", "johnsmith@gmail.com"]: + assert phrase not in anonymized_text + + +@pytest.mark.requires("presidio_analyzer", "presidio_anonymizer", "faker") +def test_anonymize_with_custom_operator() -> None: + """Test anonymize a name with a custom operator""" + from presidio_anonymizer.entities import OperatorConfig + + from langchain_experimental.data_anonymizer import PresidioReversibleAnonymizer + + custom_operator = {"PERSON": OperatorConfig("replace", {"new_value": ""})} + anonymizer = PresidioReversibleAnonymizer(operators=custom_operator) + + text = "Jane Doe was here." + + anonymized_text = anonymizer.anonymize(text) + assert anonymized_text == " was here." 
+ + +@pytest.mark.requires("presidio_analyzer", "presidio_anonymizer", "faker") +def test_add_recognizer_operator() -> None: + """ + Test add recognizer and anonymize a new type of entity and with a custom operator + """ + from presidio_analyzer import PatternRecognizer + from presidio_anonymizer.entities import OperatorConfig + + from langchain_experimental.data_anonymizer import PresidioReversibleAnonymizer + + anonymizer = PresidioReversibleAnonymizer(analyzed_fields=[]) + titles_list = ["Sir", "Madam", "Professor"] + custom_recognizer = PatternRecognizer( + supported_entity="TITLE", deny_list=titles_list + ) + anonymizer.add_recognizer(custom_recognizer) + + # anonymizing with custom recognizer + text = "Madam Jane Doe was here." + anonymized_text = anonymizer.anonymize(text) + assert anonymized_text == " Jane Doe was here." + + # anonymizing with custom recognizer and operator + custom_operator = {"TITLE": OperatorConfig("replace", {"new_value": "Dear"})} + anonymizer.add_operators(custom_operator) + anonymized_text = anonymizer.anonymize(text) + assert anonymized_text == "Dear Jane Doe was here." 
+ + +@pytest.mark.requires("presidio_analyzer", "presidio_anonymizer", "faker") +def test_deanonymizer_mapping() -> None: + """Test if deanonymizer mapping is correctly populated""" + from langchain_experimental.data_anonymizer import PresidioReversibleAnonymizer + + anonymizer = PresidioReversibleAnonymizer( + analyzed_fields=["PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "CREDIT_CARD"] + ) + + anonymizer.anonymize("Hello, my name is John Doe and my number is 444 555 6666.") + + # ["PERSON", "PHONE_NUMBER"] + assert len(anonymizer.deanonymizer_mapping.keys()) == 2 + assert "John Doe" in anonymizer.deanonymizer_mapping.get("PERSON", {}).values() + assert ( + "444 555 6666" + in anonymizer.deanonymizer_mapping.get("PHONE_NUMBER", {}).values() + ) + + text_to_anonymize = ( + "And my name is Jane Doe, my email is jane@gmail.com and " + "my credit card is 4929 5319 6292 5362." + ) + anonymizer.anonymize(text_to_anonymize) + + # ["PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "CREDIT_CARD"] + assert len(anonymizer.deanonymizer_mapping.keys()) == 4 + assert "Jane Doe" in anonymizer.deanonymizer_mapping.get("PERSON", {}).values() + assert ( + "jane@gmail.com" + in anonymizer.deanonymizer_mapping.get("EMAIL_ADDRESS", {}).values() + ) + assert ( + "4929 5319 6292 5362" + in anonymizer.deanonymizer_mapping.get("CREDIT_CARD", {}).values() + ) + + +@pytest.mark.requires("presidio_analyzer", "presidio_anonymizer", "faker") +def test_deanonymize() -> None: + """Test deanonymizing a name in a simple sentence""" + from langchain_experimental.data_anonymizer import PresidioReversibleAnonymizer + + text = "Hello, my name is John Doe." 
+ anonymizer = PresidioReversibleAnonymizer(analyzed_fields=["PERSON"]) + anonymized_text = anonymizer.anonymize(text) + deanonymized_text = anonymizer.deanonymize(anonymized_text) + assert deanonymized_text == text + + +@pytest.mark.requires("presidio_analyzer", "presidio_anonymizer", "faker") +def test_save_load_deanonymizer_mapping() -> None: + from langchain_experimental.data_anonymizer import PresidioReversibleAnonymizer + + anonymizer = PresidioReversibleAnonymizer(analyzed_fields=["PERSON"]) + anonymizer.anonymize("Hello, my name is John Doe.") + try: + anonymizer.save_deanonymizer_mapping("test_file.json") + assert os.path.isfile("test_file.json") + + anonymizer = PresidioReversibleAnonymizer() + anonymizer.load_deanonymizer_mapping("test_file.json") + + assert "John Doe" in anonymizer.deanonymizer_mapping.get("PERSON", {}).values() + + finally: + os.remove("test_file.json") diff --git a/libs/langchain/langchain/utilities/sql_database.py b/libs/langchain/langchain/utilities/sql_database.py index e621ffd17bd26..13718c8c0c7f6 100644 --- a/libs/langchain/langchain/utilities/sql_database.py +++ b/libs/langchain/langchain/utilities/sql_database.py @@ -9,6 +9,7 @@ from sqlalchemy.engine import Engine from sqlalchemy.exc import ProgrammingError, SQLAlchemyError from sqlalchemy.schema import CreateTable +from sqlalchemy.types import NullType from langchain.utils import get_from_env @@ -314,6 +315,11 @@ def get_table_info(self, table_names: Optional[List[str]] = None) -> str: tables.append(self._custom_table_info[table.name]) continue + # Ignore JSON datatyped columns + for k, v in table.columns.items(): + if type(v.type) is NullType: + table._columns.remove(v) + # add create table command create_table = str(CreateTable(table).compile(self._engine)) table_info = f"{create_table.rstrip()}"