Skip to content

Commit

Permalink
Add PDF deletion functionality and enhance embedding storage with metadata
Browse files Browse the repository at this point in the history

Signed-off-by: samadpls <[email protected]>
  • Loading branch information
samadpls committed Nov 30, 2024
1 parent 31884c0 commit 58cd88b
Show file tree
Hide file tree
Showing 4 changed files with 44 additions and 7 deletions.
3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,9 @@ rag.store_pdf_embeddings("your_pdf_file.pdf")
# Search using a query
results = rag.search(query="your search query", limit=10)
print(results)

# Delete all embeddings stored for a particular PDF (pass the same pdf_name used when storing)
rag.delete_pdf_embeddings("home/notes.pdf")
```

> **Note**: Qdrant offers a free tier with 1GB of storage. To generate your API key and endpoint, visit [Qdrant](https://qdrant.tech/).
Expand Down
44 changes: 39 additions & 5 deletions bestrag/best_rag.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ def _clean_text(self, text: str) -> str:
text = re.sub(r'[^\x00-\x7F]+', '', text)
text = re.sub(r'[\u200B-\u200D\uFEFF]', '', text)
text = re.sub(r'\s+', ' ', text)

return text.strip()

def _get_dense_embedding(self, text: str):
Expand Down Expand Up @@ -149,12 +149,16 @@ def _extract_pdf_text_per_page(self, pdf_path: str) -> List[str]:
reader = PyPDF2.PdfReader(pdf_file)
return [page.extract_text() for page in reader.pages]

def store_pdf_embeddings(self, pdf_path: str):
def store_pdf_embeddings(self, pdf_path: str,
pdf_name: str,
metadata: Optional[dict] = None):
"""
Store the embeddings for each page of a PDF file in the Qdrant collection.
Args:
pdf_path (str): The path to the PDF file.
pdf_name (str): The name of the PDF file.
metadata (Optional[dict]): Additional metadata to store with each embedding.
"""
texts = self._extract_pdf_text_per_page(pdf_path)

Expand All @@ -176,9 +180,13 @@ def store_pdf_embeddings(self, pdf_path: str):

payload = {
"text": clean_text,
"page_number": page_num + 1
"page_number": page_num + 1,
"pdf_name": pdf_name
}

if metadata:
payload.update(metadata)

point = models.PointStruct(
id=str(uuid.uuid4()),
vector=hybrid_vector,
Expand All @@ -190,8 +198,34 @@ def store_pdf_embeddings(self, pdf_path: str):
points=[point]
)

print(
f"Stored embedding for page {page_num + 1} in collection '{self.collection_name}'.")
print(f"Stored embedding for page {page_num + 1} \
of '{pdf_name}' in collection '{self.collection_name}'.")

def delete_pdf_embeddings(self, pdf_name: str):
    """
    Delete all embeddings associated with a given PDF name from the Qdrant collection.

    Points are selected with a payload filter on the "pdf_name" key, so only
    points whose stored "pdf_name" matches `pdf_name` are targeted.

    Args:
        pdf_name (str): The name of the PDF file whose embeddings should be deleted.
    """
    # Select every point whose "pdf_name" payload field matches the given name.
    pdf_filter = models.Filter(
        must=[
            models.FieldCondition(
                key="pdf_name",
                match=models.MatchValue(value=pdf_name),
            )
        ]
    )

    self.client.delete(
        collection_name=self.collection_name,
        points_selector=models.FilterSelector(filter=pdf_filter),
    )

    # Adjacent f-string literals are joined at compile time. Unlike a
    # backslash continuation inside the literal, this keeps the source
    # indentation of the second line out of the printed message.
    print(
        f"Deleted all embeddings for PDF '{pdf_name}' "
        f"from collection '{self.collection_name}'."
    )

def search(self, query: str, limit: int = 10):
"""
Expand Down
2 changes: 1 addition & 1 deletion bestrag/tests/test_best_rag.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ def test_store_pdf_embeddings(best_rag_instance, tmp_path):
patch.object(best_rag_instance.client, 'upsert',
return_value=None) as mock_upsert:

best_rag_instance.store_pdf_embeddings(str(pdf_path))
best_rag_instance.store_pdf_embeddings(str(pdf_path), "sample.pdf")
assert mock_upsert.call_count == 2


Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

setup(
name="bestrag",
version="0.3.0",
version="0.3.1",
description="bestrag: Library for storing and searching document embeddings in a Qdrant vector database using hybrid embedding techniques.",
author="samadpls",
author_email="[email protected]",
Expand Down

0 comments on commit 58cd88b

Please sign in to comment.