From 4765da0ee7c919d598d3c7d45a65679c78e78159 Mon Sep 17 00:00:00 2001 From: maks-operlejn-ds Date: Tue, 10 Oct 2023 20:11:21 +0000 Subject: [PATCH] WIP Add basic version of notebook --- .../how_to/qa_privacy_protection.ipynb | 842 ++++++++++++++++++ .../how_to/text_with_private_data.txt | 28 + 2 files changed, 870 insertions(+) create mode 100644 docs/docs_skeleton/docs/use_cases/question_answering/how_to/qa_privacy_protection.ipynb create mode 100644 docs/docs_skeleton/docs/use_cases/question_answering/how_to/text_with_private_data.txt diff --git a/docs/docs_skeleton/docs/use_cases/question_answering/how_to/qa_privacy_protection.ipynb b/docs/docs_skeleton/docs/use_cases/question_answering/how_to/qa_privacy_protection.ipynb new file mode 100644 index 0000000000000..de9b5b5eeffd2 --- /dev/null +++ b/docs/docs_skeleton/docs/use_cases/question_answering/how_to/qa_privacy_protection.ipynb @@ -0,0 +1,842 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# QA with private data protection\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/langchain-ai/langchain/blob/master/docs/extras/guides/privacy/presidio_data_anonymization/reversible.ipynb)\n", + "\n", + "\n", + "[TODO: description]\n", + "\n", + "\n", + "## Quickstart\n", + "\n", + "### Iterative process of upgrading the anonymizer" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "# Install necessary packages\n", + "# !pip install langchain langchain-experimental openai presidio-analyzer presidio-anonymizer spacy Faker faiss-cpu tiktoken\n", + "# ! 
python -m spacy download en_core_web_lg" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from langchain.document_loaders import TextLoader\n", + "\n", + "loader = TextLoader(\"text_with_private_data.txt\")\n", + "\n", + "documents = loader.load_and_split()\n", + "len(documents)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "document_content = documents[0].page_content" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Date: October 19, 2021\n", + "Witness: Maks Operlejn\n", + "Subject: Testimony Regarding the Loss of Wallet\n", + "\n", + "Testimony Content:\n", + "\n", + "Hello Officer,\n", + "\n", + "My name is Maks Operlejn and on October 19, 2021, my wallet was stolen in the vicinity of Kilmarnock during a bike trip. This wallet contains some very important things to me.\n", + "\n", + "Firstly, the wallet contains my credit card with number 5412 5412 5412 5412, which is registered under my name and linked to my bank account, PL61109010140000071219812874.\n", + "\n", + "Additionally, the wallet had a driver's license - DL No: 999000680 issued to my name. It also houses my Social Security Number, 602-76-4532. \n", + "\n", + "What's more, I had my polish identity card there, with the number ABC123456.\n", + "\n", + "I would like this data to be secured and protected in all possible ways. I believe It was stolen at 9:30 AM.\n", + "\n", + "In case any information arises regarding my wallet, please reach out to me on my phone number, 999-888-7777, or through my personal email, maksoperlejn@example.com.\n", + "\n", + "Please consider this information to be highly confidential and respect my privacy. 
\n", + "\n", + "The bank has been informed about the stolen credit card and necessary actions have been taken from their end. They will be reachable at their official email, support@bankname.com.\n", + "My representative there is Victoria Cherry (her business phone: 987-654-3210).\n", + "\n", + "Thank you for your assistance,\n", + "\n", + "Maks Operlejn\n" + ] + } + ], + "source": [ + "print(document_content)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Util function for coloring the PII markers\n", + "import re\n", + "\n", + "\n", + "def print_colored_pii(string):\n", + " colored_string = re.sub(\n", + " r\"(<[^>]*>)\", lambda m: \"\\033[31m\" + m.group(1) + \"\\033[0m\", string\n", + " )\n", + " print(colored_string)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Date: \u001b[31m\u001b[0m\n", + "Witness: \u001b[31m\u001b[0m\n", + "Subject: Testimony Regarding the Loss of Wallet\n", + "\n", + "Testimony Content:\n", + "\n", + "Hello Officer,\n", + "\n", + "My name is \u001b[31m\u001b[0m and on \u001b[31m\u001b[0m, my wallet was stolen in the vicinity of \u001b[31m\u001b[0m during a bike trip. This wallet contains some very important things to me.\n", + "\n", + "Firstly, the wallet contains my credit card with number 5412 5412 5412 5412, which is registered under my name and linked to my bank account, \u001b[31m\u001b[0m.\n", + "\n", + "Additionally, the wallet had a driver's license - DL No: \u001b[31m\u001b[0m issued to my name. It also houses my Social Security Number, \u001b[31m\u001b[0m. \n", + "\n", + "What's more, I had my polish identity card there, with the number ABC123456.\n", + "\n", + "I would like this data to be secured and protected in all possible ways. 
I believe It was stolen at \u001b[31m\u001b[0m.\n", + "\n", + "In case any information arises regarding my wallet, please reach out to me on my phone number, \u001b[31m\u001b[0m, or through my personal email, \u001b[31m\u001b[0m.\n", + "\n", + "Please consider this information to be highly confidential and respect my privacy. \n", + "\n", + "The bank has been informed about the stolen credit card and necessary actions have been taken from their end. They will be reachable at their official email, \u001b[31m\u001b[0m.\n", + "My representative there is \u001b[31m\u001b[0m (her business phone: \u001b[31m\u001b[0m).\n", + "\n", + "Thank you for your assistance,\n", + "\n", + "\u001b[31m\u001b[0m\n" + ] + } + ], + "source": [ + "from langchain_experimental.data_anonymizer import PresidioReversibleAnonymizer\n", + "\n", + "anonymizer = PresidioReversibleAnonymizer(\n", + " add_default_faker_operators=False,\n", + ")\n", + "\n", + "print_colored_pii(anonymizer.anonymize(document_content))" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'DATE_TIME': {'': 'October 19, 2021', '': '9:30 AM'},\n", + " 'EMAIL_ADDRESS': {'': 'maksoperlejn@example.com',\n", + " '': 'support@bankname.com'},\n", + " 'IBAN_CODE': {'': 'PL61109010140000071219812874'},\n", + " 'LOCATION': {'': 'Kilmarnock'},\n", + " 'PERSON': {'': 'Maks Operlejn', '': 'Victoria Cherry'},\n", + " 'PHONE_NUMBER': {'': '999-888-7777',\n", + " '': '987-654-3210'},\n", + " 'US_DRIVER_LICENSE': {'': '999000680'},\n", + " 'US_SSN': {'': '602-76-4532'}}\n" + ] + } + ], + "source": [ + "import pprint\n", + "\n", + "pprint.pprint(anonymizer.deanonymizer_mapping)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "# Define the regex pattern in a Presidio `Pattern` object:\n", + "from presidio_analyzer import Pattern, PatternRecognizer\n", + "\n", + "\n", + 
"polish_id_pattern = Pattern(\n", + " name=\"polish_id_pattern\",\n", + " regex=\"[A-Z]{3}\\d{6}\",\n", + " score=1,\n", + ")\n", + "time_pattern = Pattern(\n", + " name=\"time_pattern\",\n", + " regex=\"(1[0-2]|0?[1-9]):[0-5][0-9] (AM|PM)\",\n", + " score=1,\n", + ")\n", + "\n", + "# Define the recognizer with one or more patterns\n", + "polish_id_recognizer = PatternRecognizer(\n", + " supported_entity=\"POLISH_ID\", patterns=[polish_id_pattern]\n", + ")\n", + "time_recognizer = PatternRecognizer(supported_entity=\"TIME\", patterns=[time_pattern])" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "anonymizer.add_recognizer(polish_id_recognizer)\n", + "anonymizer.add_recognizer(time_recognizer)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "anonymizer.reset_deanonymizer_mapping()" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Date: \u001b[31m\u001b[0m\n", + "Witness: \u001b[31m\u001b[0m\n", + "Subject: Testimony Regarding the Loss of Wallet\n", + "\n", + "Testimony Content:\n", + "\n", + "Hello Officer,\n", + "\n", + "My name is \u001b[31m\u001b[0m and on \u001b[31m\u001b[0m, my wallet was stolen in the vicinity of \u001b[31m\u001b[0m during a bike trip. This wallet contains some very important things to me.\n", + "\n", + "Firstly, the wallet contains my credit card with number 5412 5412 5412 5412, which is registered under my name and linked to my bank account, \u001b[31m\u001b[0m.\n", + "\n", + "Additionally, the wallet had a driver's license - DL No: \u001b[31m\u001b[0m issued to my name. It also houses my Social Security Number, \u001b[31m\u001b[0m. 
\n", + "\n", + "What's more, I had my polish identity card there, with the number \u001b[31m\u001b[0m.\n", + "\n", + "I would like this data to be secured and protected in all possible ways. I believe It was stolen at \u001b[31m