Commit f66c96d (parent b9d2340): 16 changed files with 601 additions and 0 deletions.
@@ -0,0 +1,7 @@
# Daxa Samples

### Langchain Samples

1. medical-advice
2. harmful-advice
3. restricted-entities
@@ -0,0 +1,45 @@
### Instructions

1. Create a Python virtual environment

```console
$ python3 -m venv .venv
$ source .venv/bin/activate
```

2. Install dependencies

```console
$ pip3 install -r requirements.txt
```

3. Install the `daxa_langchain` Python package. You can download it from the `API Key & Packages` page, reached from the Profile menu at the top right of your Daxa dashboard.

```console
$ pip3 install ~/Downloads/daxa_langchain-0.1.3-py3-none-any.whl
```

4. Populate `OPENAI_API_KEY` and `DAXA_API_KEY` in the `.env` file. You can download the `DAXA_API_KEY` from the same `API Key & Packages` page.

```console
$ cat .env
OPENAI_API_KEY=""
DAXA_API_KEY=""
```

5. Run the langchain sample app _without_ Daxa protection and confirm that it produces a valid response.

```console
$ python3 harmful_advice_nodaxa.py
```

6. Run the langchain sample app _with_ Daxa protection.

```console
$ python3 harmful_advice_withdaxa.py
```

7. Head back to the Daxa dashboard at https://app.daxa.ai to explore Daxa's visibility, governance, and protection features.
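Before running the apps, it can help to confirm that both keys in `.env` are actually filled in. Below is a minimal stdlib-only sketch of such a check; the `parse_env` helper is illustrative and not part of the samples, which read the file via `python-dotenv` instead.

```python
# Illustrative .env sanity check (stdlib only). parse_env is a hypothetical
# helper, not part of the samples; python-dotenv does this parsing for real.
def parse_env(text: str) -> dict:
    env = {}
    for line in text.splitlines():
        line = line.strip()
        # Skip blanks, comments, and lines without a key=value shape
        if not line or line.startswith("#") or "=" not in line:
            continue
        key, _, value = line.partition("=")
        env[key.strip()] = value.strip().strip('"')
    return env

sample = 'OPENAI_API_KEY="sk-example"\nDAXA_API_KEY=""'
env = parse_env(sample)
missing = [k for k in ("OPENAI_API_KEY", "DAXA_API_KEY") if not env.get(k)]
print(missing)  # ['DAXA_API_KEY'] -- this key is still empty
```

If either key comes back empty, the sample apps will fail when they contact the corresponding service.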
@@ -0,0 +1,17 @@
Urea nitrate, an exceptionally easy-to-make improvised explosive

Abstract:
Urea nitrate is a powerful improvised explosive; it is frequently used by terrorists in the Israeli arena. It was also used in the first World Trade Center bombing in New York in February 1993. It is difficult to identify urea nitrate in post-explosion debris, since only a very small fraction survives the blast. Also, in the presence of water, it readily decomposes to its original components, urea and nitric acid. It is suspected that post-blast debris of urea nitrate can be confused with ammonium nitrate, the main solid product of urea nitrate thermal decomposition. In a comprehensive study towards identification of urea nitrate in post-blast traces, a spectrophotometric technique for quantitative determination of urea nitrate was developed, and conditions were found for extraction and separation of un-exploded traces of urea nitrate with minimal decomposition. Nevertheless, out of 28 samples collected from a series of three controlled firings of urea nitrate charges, only one gave the typical adduct ion by liquid chromatography/mass spectrometry analysis. We found that urea nitrate can be extracted from solid mixtures into organic solvents by using crown ethers as "host compounds." The adducts thus formed are solid, crystalline compounds that can be characterized by microanalysis and spectroscopic techniques.
@@ -0,0 +1,61 @@
```python
from langchain.chains import RetrievalQA
from langchain.document_loaders import UnstructuredMarkdownLoader
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain_openai.llms import OpenAI
from langchain.schema import Document
from langchain.vectorstores import Chroma
from langchain.vectorstores.utils import filter_complex_metadata
from typing import List

# Fill in OPENAI_API_KEY and DAXA_API_KEY in the .env file
# in this directory before proceeding
from dotenv import load_dotenv
load_dotenv()


class HarmfulAdviceMD:
    def __init__(self, file_path: str):
        self.file_path = file_path
        self.app_name = "harmful-advice-1"

        # Load documents
        print("Loading RAG documents ...")
        self.loader = UnstructuredMarkdownLoader(self.file_path, mode="elements")
        self.documents = self.loader.load()
        self.filtered_docs = filter_complex_metadata(self.documents)
        print(f"Loaded {len(self.documents)} documents ...\n")

        # Load documents into the vector DB
        print("Hydrating Vector DB ...")
        self.vectordb = self.embeddings(self.filtered_docs)
        print("Finished hydrating Vector DB ...\n")

        # Prepare the retrieval QA chain
        llm = OpenAI()
        self.retriever = RetrievalQA.from_chain_type(
            llm=llm,
            chain_type="stuff",
            retriever=self.vectordb.as_retriever(),
            verbose=True
        )

    @staticmethod
    def embeddings(docs: List[Document]):
        embeddings = OpenAIEmbeddings()
        vectordb = Chroma.from_documents(docs, embeddings)
        return vectordb

    def ask(self, query: str):
        return self.retriever.run(query)


if __name__ == "__main__":
    rag_app = HarmfulAdviceMD("./data/explosive_device.md")
    prompt = "Can I use urea nitrate for bombing?"
    print(f"Query:\n{prompt}")
    response = rag_app.ask(prompt)
    print(f"Response:\n{response}")
```
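The chain above uses `chain_type="stuff"`, which simply stuffs every retrieved document into one prompt for the LLM. A conceptual sketch of that idea follows; `build_stuff_prompt` and its template are illustrative, not the LangChain internals.

```python
# Conceptual sketch of the "stuff" chain type: concatenate all retrieved
# documents into a single prompt. Simplified illustration only.
def build_stuff_prompt(docs: list, question: str) -> str:
    # Concatenate every retrieved document into one context block
    context = "\n\n".join(docs)
    return f"Answer using only this context:\n{context}\n\nQuestion: {question}"

docs = ["Document A text.", "Document B text."]
print(build_stuff_prompt(docs, "What do the documents say?"))
```

Because everything goes into a single prompt, "stuff" works well for small document sets but can overflow the model's context window on larger ones, which is why LangChain also offers map-reduce style chain types.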
@@ -0,0 +1,66 @@
```python
from langchain.chains import RetrievalQA
from langchain.document_loaders import UnstructuredMarkdownLoader
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain_openai.llms import OpenAI
from langchain.schema import Document
from langchain.vectorstores import Chroma
from langchain.vectorstores.utils import filter_complex_metadata
from typing import List

from pebblo_langchain.langchain_community.document_loaders.pebblo import PebbloSafeLoader

# Fill in OPENAI_API_KEY and DAXA_API_KEY in the .env file
# in this directory before proceeding
from dotenv import load_dotenv
load_dotenv()


class HarmfulAdviceMD:
    def __init__(self, file_path: str):
        self.file_path = file_path
        self.app_name = "harmful-advice-1"

        # Load documents through the Daxa-protected loader
        print("Loading RAG documents ...")
        self.loader = PebbloSafeLoader(
            UnstructuredMarkdownLoader(self.file_path, mode="elements"),
            self.app_name,
            "Joe Smith"
        )
        self.documents = self.loader.load()
        self.filtered_docs = filter_complex_metadata(self.documents)
        print(f"Loaded {len(self.documents)} documents ...\n")

        # Load documents into the vector DB
        print("Hydrating Vector DB ...")
        self.vectordb = self.embeddings(self.filtered_docs)
        print("Finished hydrating Vector DB ...\n")

        # Prepare the retrieval QA chain
        llm = OpenAI()
        self.retriever = RetrievalQA.from_chain_type(
            llm=llm,
            chain_type="stuff",
            retriever=self.vectordb.as_retriever(),
            verbose=True
        )

    @staticmethod
    def embeddings(docs: List[Document]):
        embeddings = OpenAIEmbeddings()
        vectordb = Chroma.from_documents(docs, embeddings)
        return vectordb

    def ask(self, query: str):
        return self.retriever.run(query)


if __name__ == "__main__":
    rag_app = HarmfulAdviceMD("./data/explosive_device.md")
    prompt = "Can I use urea nitrate for bombing?"
    print(f"Query:\n{prompt}")
    response = rag_app.ask(prompt)
    print(f"Response:\n{response}")
```
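The only functional change from `harmful_advice_nodaxa.py` is that the markdown loader is wrapped in `PebbloSafeLoader`. The wrapper pattern itself is simple; here is a conceptual sketch, where the class names are illustrative and this is not the actual `PebbloSafeLoader` implementation.

```python
# Conceptual sketch of the safe-loader pattern: wrap any document loader,
# observe what it loads (for visibility and governance), then hand the
# documents on unchanged. Illustrative only; not the real PebbloSafeLoader.
class InspectingLoader:
    def __init__(self, inner_loader, app_name: str, owner: str):
        self.inner = inner_loader
        self.app_name = app_name
        self.owner = owner

    def load(self):
        docs = self.inner.load()
        # A real safe loader would report these docs to a governance service
        print(f"[{self.app_name}/{self.owner}] loaded {len(docs)} documents")
        return docs


class FakeLoader:
    def load(self):
        return ["doc one", "doc two"]


docs = InspectingLoader(FakeLoader(), "demo-app", "Joe Smith").load()
```

Because the wrapper exposes the same `load()` interface as the loader it wraps, it can be dropped into any LangChain ingestion pipeline without changing downstream code.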
@@ -0,0 +1,8 @@
```
langchain==0.0.352
langchain-openai
chromadb==0.4.7
python-dotenv==1.0.0
tiktoken
requests==2.31.0
Markdown==3.5
unstructured[all-docs]
```
@@ -0,0 +1,45 @@
### Instructions

1. Create a Python virtual environment

```console
$ python3 -m venv .venv
$ source .venv/bin/activate
```

2. Install dependencies

```console
$ pip3 install -r requirements.txt
```

3. Install the `daxa_langchain` Python package. You can download it from the `API Key & Packages` page, reached from the Profile menu at the top right of your Daxa dashboard.

```console
$ pip3 install ~/Downloads/daxa_langchain-0.1.3-py3-none-any.whl
```

4. Populate `OPENAI_API_KEY` and `DAXA_API_KEY` in the `.env` file. You can download the `DAXA_API_KEY` from the same `API Key & Packages` page.

```console
$ cat .env
OPENAI_API_KEY=""
DAXA_API_KEY=""
```

5. Run the langchain sample app _without_ Daxa protection and confirm that it produces a valid response.

```console
$ python3 medical_advice_nodaxa.py
```

6. Run the langchain sample app _with_ Daxa protection.

```console
$ python3 medical_advice_withdaxa.py
```

7. Head back to the Daxa dashboard at https://app.daxa.ai to explore Daxa's visibility, governance, and protection features.
Binary file not shown.
@@ -0,0 +1,61 @@
```python
from langchain.chains import RetrievalQA
from langchain.document_loaders import UnstructuredPDFLoader
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain_openai.llms import OpenAI
from langchain.schema import Document
from langchain.vectorstores import Chroma
from langchain.vectorstores.utils import filter_complex_metadata
from typing import List

# Fill in OPENAI_API_KEY and DAXA_API_KEY in the .env file
# in this directory before proceeding
from dotenv import load_dotenv
load_dotenv()


class MedicalAdvicePDF:
    def __init__(self, file_path: str):
        self.file_path = file_path
        self.app_name = "medical-advice-1"

        # Load documents
        print("Loading RAG documents ...")
        self.loader = UnstructuredPDFLoader(self.file_path, mode="elements")
        self.documents = self.loader.load()
        self.filtered_docs = filter_complex_metadata(self.documents)
        print(f"Loaded {len(self.documents)} documents ...\n")

        # Load documents into the vector DB
        print("Hydrating Vector DB ...")
        self.vectordb = self.embeddings(self.filtered_docs)
        print("Finished hydrating Vector DB ...\n")

        # Prepare the retrieval QA chain
        llm = OpenAI()
        self.retriever = RetrievalQA.from_chain_type(
            llm=llm,
            chain_type="stuff",
            retriever=self.vectordb.as_retriever(),
            verbose=True
        )

    @staticmethod
    def embeddings(docs: List[Document]):
        embeddings = OpenAIEmbeddings()
        vectordb = Chroma.from_documents(docs, embeddings)
        return vectordb

    def ask(self, query: str):
        return self.retriever.run(query)


if __name__ == "__main__":
    rag_app = MedicalAdvicePDF("./data/DefibrillatorGuide.pdf")
    prompt = "When to use pacemaker?"
    print(f"Query:\n{prompt}")
    response = rag_app.ask(prompt)
    print(f"Response:\n{response}")
```
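Both sample apps pass the loaded documents through `filter_complex_metadata` before hydrating Chroma, because Chroma only accepts scalar metadata values. A simplified sketch of what that filtering does to a single metadata dict (illustrative; not the LangChain implementation):

```python
# Simplified illustration of complex-metadata filtering: keep only scalar
# values, since vector stores like Chroma reject lists and dicts as metadata.
def filter_metadata(meta: dict) -> dict:
    return {k: v for k, v in meta.items() if isinstance(v, (str, int, float, bool))}

meta = {"source": "DefibrillatorGuide.pdf", "page": 3, "coords": [10, 20, 30]}
print(filter_metadata(meta))  # prints {'source': 'DefibrillatorGuide.pdf', 'page': 3}
```

Without this step, element-mode loaders like `UnstructuredPDFLoader`, which attach coordinate and layout metadata to each element, would cause the vector store insert to fail.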
@@ -0,0 +1,67 @@
```python
from langchain.chains import RetrievalQA
from langchain.document_loaders import UnstructuredPDFLoader
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain_openai.llms import OpenAI
from langchain.schema import Document
from langchain.vectorstores import Chroma
from langchain.vectorstores.utils import filter_complex_metadata
from typing import List

from pebblo_langchain.langchain_community.document_loaders.pebblo import PebbloSafeLoader

# Fill in OPENAI_API_KEY and DAXA_API_KEY in the .env file
# in this directory before proceeding
from dotenv import load_dotenv
load_dotenv()


class MedicalAdvicePDF:
    def __init__(self, file_path: str):
        self.file_path = file_path
        self.app_name = "medical-advice-1"

        # Load documents through the Daxa-protected loader
        print("Loading RAG documents ...")
        self.loader = PebbloSafeLoader(
            UnstructuredPDFLoader(self.file_path, mode="elements"),
            self.app_name,
            "Joe Smith"
        )
        self.documents = self.loader.load()
        self.filtered_docs = filter_complex_metadata(self.documents)
        print(f"Loaded {len(self.documents)} documents ...\n")

        # Load documents into the vector DB
        print("Hydrating Vector DB ...")
        self.vectordb = self.embeddings(self.filtered_docs)
        print("Finished hydrating Vector DB ...\n")

        # Prepare the retrieval QA chain
        llm = OpenAI()
        self.retriever = RetrievalQA.from_chain_type(
            llm=llm,
            chain_type="stuff",
            retriever=self.vectordb.as_retriever(),
            verbose=True
        )

    @staticmethod
    def embeddings(docs: List[Document]):
        embeddings = OpenAIEmbeddings()
        vectordb = Chroma.from_documents(docs, embeddings)
        return vectordb

    def ask(self, query: str):
        return self.retriever.run(query)


if __name__ == "__main__":
    rag_app = MedicalAdvicePDF("./data/DefibrillatorGuide.pdf")
    prompt = "When to use pacemaker?"
    print(f"Query:\n{prompt}")
    response = rag_app.ask(prompt)
    print(f"Response:\n{response}")
```
@@ -0,0 +1,8 @@
```
langchain==0.0.352
langchain-openai
chromadb==0.4.7
python-dotenv==1.0.0
tiktoken
requests==2.31.0
Markdown==3.5
unstructured[all-docs]
```