Add PDF deletion functionality and enhance embedding storage with metadata

Signed-off-by: samadpls <[email protected]>
samadpls · Nov 30, 2024 · 58cd88b · 58cd88b
1 parent 31884c0
commit 58cd88b
Show file tree

Hide file tree

Showing 4 changed files with 44 additions and 7 deletions.
diff --git a/README.md b/README.md
@@ -46,6 +46,9 @@ rag.store_pdf_embeddings("your_pdf_file.pdf")
 # Search using a query
 results = rag.search(query="your search query", limit=10)
 print(results)
+
+# Delete all embeddings stored under a given PDF name
+rag.delete_pdf_embeddings("home/notes.pdf")
 ```
 
 > **Note**: Qdrant offers a free tier with 1GB of storage. To generate your API key and endpoint, visit [Qdrant](https://qdrant.tech/).

diff --git a/bestrag/best_rag.py b/bestrag/best_rag.py
@@ -96,7 +96,7 @@ def _clean_text(self, text: str) -> str:
         text = re.sub(r'[^\x00-\x7F]+', '', text)
         text = re.sub(r'[\u200B-\u200D\uFEFF]', '', text)
         text = re.sub(r'\s+', ' ', text)
-    
+
         return text.strip()
 
     def _get_dense_embedding(self, text: str):
@@ -149,12 +149,16 @@ def _extract_pdf_text_per_page(self, pdf_path: str) -> List[str]:
             reader = PyPDF2.PdfReader(pdf_file)
             return [page.extract_text() for page in reader.pages]
 
-    def store_pdf_embeddings(self, pdf_path: str):
+    def store_pdf_embeddings(self, pdf_path: str,
+                             pdf_name: str,
+                             metadata: Optional[dict] = None):
         """
         Store the embeddings for each page of a PDF file in the Qdrant collection.
 
         Args:
             pdf_path (str): The path to the PDF file.
+            pdf_name (str): The name of the PDF file.
+            metadata (Optional[dict]): Additional metadata to store with each embedding.
         """
         texts = self._extract_pdf_text_per_page(pdf_path)
 
@@ -176,9 +180,13 @@ def store_pdf_embeddings(self, pdf_path: str):
 
             payload = {
                 "text": clean_text,
-                "page_number": page_num + 1
+                "page_number": page_num + 1,
+                "pdf_name": pdf_name
             }
 
+            if metadata:
+                payload.update(metadata)
+
             point = models.PointStruct(
                 id=str(uuid.uuid4()),
                 vector=hybrid_vector,
@@ -190,8 +198,34 @@ def store_pdf_embeddings(self, pdf_path: str):
                 points=[point]
             )
 
-            print(
-                f"Stored embedding for page {page_num + 1} in collection '{self.collection_name}'.")
+            print(f"Stored embedding for page {page_num + 1} \
+                of '{pdf_name}' in collection '{self.collection_name}'.")
+
+    def delete_pdf_embeddings(self, pdf_name: str):
+        """
+        Delete all points whose "pdf_name" payload field matches the given name.
+
+        Args:
+            pdf_name (str): The "pdf_name" payload value whose embeddings should be deleted.
+        """
+        # Select points by the "pdf_name" payload key.
+        filter_ = models.Filter(
+            must=[
+                models.FieldCondition(
+                    key="pdf_name",
+                    match=models.MatchValue(value=pdf_name)
+                )
+            ]
+        )
+
+        self.client.delete(
+            collection_name=self.collection_name,
+            points_selector=models.FilterSelector(filter=filter_)
+        )
+
+        # Adjacent f-strings: a backslash continuation would embed the indentation.
+        print(f"Deleted all embeddings for PDF '{pdf_name}' "
+              f"from collection '{self.collection_name}'.")
 
     def search(self, query: str, limit: int = 10):
         """

diff --git a/bestrag/tests/test_best_rag.py b/bestrag/tests/test_best_rag.py
@@ -103,7 +103,7 @@ def test_store_pdf_embeddings(best_rag_instance, tmp_path):
             patch.object(best_rag_instance.client, 'upsert',
                          return_value=None) as mock_upsert:
 
-        best_rag_instance.store_pdf_embeddings(str(pdf_path))
+        best_rag_instance.store_pdf_embeddings(str(pdf_path), "sample.pdf")
         assert mock_upsert.call_count == 2
 
 

diff --git a/setup.py b/setup.py
@@ -5,7 +5,7 @@
 
 setup(
     name="bestrag",
-    version="0.3.0",
+    version="0.3.1",
     description="bestrag: Library for storing and searching document embeddings in a Qdrant vector database using hybrid embedding techniques.",
     author="samadpls",
     author_email="[email protected]",