Skip to content

Commit

Permalink
Merge pull request #3 from AlessandroSpallina/develop
Browse files Browse the repository at this point in the history
Welcome to v0.2.0
  • Loading branch information
AlessandroSpallina authored Dec 26, 2023
2 parents 8451ca4 + bb4360a commit 959e08a
Show file tree
Hide file tree
Showing 5 changed files with 124 additions and 56 deletions.
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -150,4 +150,4 @@ cython_debug/
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
.idea/
diet.db
dietician.db
20 changes: 12 additions & 8 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,17 +3,21 @@
[![Awesome plugin](https://custom-icon-badges.demolab.com/static/v1?label=&message=Awesome+plugin&color=000000&style=for-the-badge&logo=cheshire_cat_ai)](https://github.com/cheshire-cat-ai/awesome-plugins)


This plugin hooks into the `RabbitHole` to prevent multiple ingestions of the same file by using [LangChain Indexing](https://python.langchain.com/docs/modules/data_connection/indexing).
This plugin hooks into the `RabbitHole` to prevent multiple ingestions of the same document.

Using this plugin you can relax yourself and put into the RabbitHole all the files you want, the Dietician will only allow new files (or new versions of the same file, by updating only the modified chunks) for you.

If you like this plugin, please show appreciation by giving a star to the repository!
Using this plugin you can relax yourself and put into the RabbitHole all the files you want, the Dietician will only allow new documents (or newer versions of the same file, by updating only the modified chunks) for you.

If you like this plugin, please show appreciation by giving a star to the repository, otherwise a kitten will die!


## Usage

1. Install this plugin
2. Rebuild the cheshire-cat-ai container
3. Start the cheshire-cat-ai and enable the plugin
4. Relax and ingest all the files you want
1. Install the plugin BEFORE ingesting any document: Dietician tracks document ingestion only if activated
2. Ingest documents

## Notice
If you wipe the declarative memory, remember to delete the dietician.db file inside the plugin directory (e.g. core/cat/plugins/dietician.db)

## Under the hood

![Diagram flow](./img/ccat-dietician.png)
156 changes: 110 additions & 46 deletions dietician.py
Original file line number Diff line number Diff line change
@@ -1,70 +1,134 @@
import sqlite3
import os
import hashlib
from typing import List
from cat.log import log
from cat.mad_hatter.decorators import tool, hook
from pydantic import BaseModel
from langchain.indexes import SQLRecordManager, index
from langchain.docstore.document import Document
from langchain.vectorstores import Qdrant
from cat.mad_hatter.decorators import hook

# TODO: use settings instead of hard coded db path
# class DietSettings(BaseModel):
# sqlite_file_path: str = "/app/cat/plugins/ccat-dietician/diet.db"
from typing import List
from typing import Optional
from sqlalchemy import ForeignKey, MetaData
from sqlalchemy import String
from sqlalchemy.orm import DeclarativeBase
from sqlalchemy.orm import Mapped
from sqlalchemy.orm import mapped_column
from sqlalchemy.orm import relationship
from sqlalchemy import create_engine
from sqlalchemy.orm import Session


class Base(DeclarativeBase):
    """Declarative base shared by the plugin's ORM models."""
    pass


class DietDocument(Base):
    """A tracked document, identified by its unique name and SHA-256 content hash.

    One row per ingested document; `chunks` records every chunk count
    observed for it (different splittings of the same content).
    """
    __tablename__ = 'document'
    id: Mapped[int] = mapped_column(primary_key=True)
    name: Mapped[str] = mapped_column(String(256), unique=True)
    hash: Mapped[str] = mapped_column(String(64), unique=True)

    chunks: Mapped[List["Chunk"]] = relationship(back_populates="document")

    def __repr__(self) -> str:
        return f'DietDocument(name={self.name!r}, hash={self.hash!r})'


class Chunk(Base):
    """One observed chunk count for a parent DietDocument.

    Used to tell whether an incoming ingestion produced a splitting
    already seen for the same content.
    """
    __tablename__ = 'chunk'
    id: Mapped[int] = mapped_column(primary_key=True)
    chunk_count: Mapped[int]
    document_id: Mapped[int] = mapped_column(ForeignKey("document.id"))

    document: Mapped["DietDocument"] = relationship(back_populates="chunks")

    def __repr__(self) -> str:
        return f'Chunk(chunk_count={self.chunk_count!r})'

# Bookkeeping database kept next to the plugin code.
# NOTE(review): relative path — assumes the process cwd is the cat root
# (so this resolves to /app/cat/plugins/ccat-dietician/dietician.db); confirm.
engine = create_engine("sqlite:///cat/plugins/ccat-dietician/dietician.db")

# Create the tracking tables on plugin load; no-op when they already exist.
Base.metadata.create_all(engine, checkfirst=True)
@hook(priority=10)
def before_rabbithole_splits_text(doc, cat):
    """Record the incoming document's name and content hash in working memory.

    `doc` is always a single-element list. The stored name/hash pair is
    consumed later by `before_rabbithole_stores_documents` to detect
    duplicates and updated versions. The document itself is returned
    unchanged.
    """
    cat.working_memory['ccat-dietician'] = {
        'name': doc[0].metadata['source'],
        'hash': hashlib.sha256(doc[0].page_content.encode()).hexdigest(),
    }
    return doc

# Hook called when a list of Document is going to be inserted in memory from the rabbit hole.
# Here you can edit/summarize the documents before inserting them in memory
# Should return a list of documents (each is a langchain Document)
@hook(priority=10)
def before_rabbithole_stores_documents(docs: List[Document], cat) -> List[Document]:
    """Filter the chunks that are about to enter declarative memory.

    Uses the name/hash stored in working memory by
    `before_rabbithole_splits_text`, together with the bookkeeping DB, to
    classify the incoming document:

    - unknown name and hash -> new document, ingest everything;
    - known hash under another name -> duplicate content, block it (unless the
      chunk count differs, i.e. a new splitting, which is allowed);
    - known name with same hash -> already ingested, block it (same
      chunk-count rule as above);
    - known name with new hash -> updated document: delete stale chunks from
      declarative memory and ingest only the chunks not already stored.

    Returning an empty list blocks the ingestion entirely. On any error the
    transaction is rolled back and the ingestion is blocked.
    """
    wm = cat.working_memory['ccat-dietician']
    wm['chunk_count'] = len(docs)

    with Session(engine) as session:
        try:
            doc_by_name = session.query(DietDocument).filter_by(name=wm['name']).first()
            if doc_by_name is None:
                doc_by_hash = session.query(DietDocument).filter_by(hash=wm['hash']).first()
                if doc_by_hash is None:
                    # Never seen this name or content before: track it and allow everything.
                    db_doc = DietDocument(
                        name=wm['name'],
                        hash=wm['hash'],
                        chunks=[Chunk(chunk_count=wm['chunk_count'])],
                    )
                    session.add(db_doc)
                    session.commit()
                    log.info(f"Dietician is allowing the ingestion of a new document: {db_doc}")
                    return docs
                else:
                    # Same content already tracked under a different name.
                    if wm['chunk_count'] in [c.chunk_count for c in doc_by_hash.chunks]:
                        log.info(f"Dietician detected {wm['name']} as a duplicate of {doc_by_hash.name}, since the number of chunks ({wm['chunk_count']}) coincides to what is already in declarative memory, this ingestion is going to be avoided.")
                        return []
                    else:
                        # Same content but a new splitting: remember the new chunk
                        # count and let the ingestion through.
                        doc_by_hash.chunks.append(Chunk(chunk_count=wm['chunk_count']))
                        session.add(doc_by_hash)
                        session.commit()
                        log.info(f"Dietician detected {wm['name']} as a duplicate of {doc_by_hash.name}, since the number of chunks ({wm['chunk_count']}) produced now is different from what is already in declarative memory, this ingestion is going to be allowed.")
                        return docs
            else:
                if wm['hash'] == doc_by_name.hash:
                    # Identical name and content.
                    if wm['chunk_count'] in [c.chunk_count for c in doc_by_name.chunks]:
                        log.info(f"Dietician detected that {doc_by_name.name} was already ingested, since the number of chunks ({wm['chunk_count']}) coincides to what is already in declarative memory, this ingestion is going to be avoided.")
                        return []
                    else:
                        doc_by_name.chunks.append(Chunk(chunk_count=wm['chunk_count']))
                        session.add(doc_by_name)
                        session.commit()
                        log.info(f"Dietician detected that {doc_by_name.name} was already ingested, since the number of chunks ({wm['chunk_count']}) produced now is different from what is already in declarative memory, this ingestion is going to be allowed.")
                        return docs
                else:
                    # Same name, new hash: the document has been updated.
                    old_chunks, _ = cat.memory.vectors.declarative.client.scroll(
                        collection_name=cat.memory.vectors.declarative.collection_name,
                        scroll_filter=cat.memory.vectors.declarative._qdrant_filter_from_dict({'source': doc_by_name.name}),
                        with_payload=True
                    )
                    old_chunks_text = [c.payload['page_content'] for c in old_chunks]
                    new_chunks_text = [d.page_content for d in docs]

                    # we have to delete all chunks in declarative memory that are not in the new document because those chunks are related to an old version of the document
                    old_chunks_to_delete_ids = [c.id for c in old_chunks if c.payload['page_content'] not in new_chunks_text]

                    if len(old_chunks_to_delete_ids) > 0:
                        cat.memory.vectors.declarative.delete_points(old_chunks_to_delete_ids)

                    # Refresh the tracked state so re-ingesting this same version is
                    # recognized as "already ingested" instead of re-running the
                    # expensive scroll/delete path every time.
                    doc_by_name.hash = wm['hash']
                    doc_by_name.chunks.append(Chunk(chunk_count=wm['chunk_count']))
                    session.add(doc_by_name)
                    session.commit()

                    log.info(f"Dietician detected a hash change for the document {doc_by_name}, this means that the document has been updated. Allowing the ingestion of new chunks and deleting all the old chunks not any more present in the current document version.")

                    # docs contain only chunks never inserted in declarative memory, we keep into the vectordb any chunk previously inserted (to avoid unnecessary calls to the embedding model)
                    return [d for d in docs if d.page_content not in old_chunks_text]

        except Exception as e:
            session.rollback()
            log.error(f"Something weird happened: {str(e)}. Dietician is preventing the ingestion of {wm['name']}")
            return []

    # Defensive fallback: block the ingestion if no branch returned.
    return []
Binary file modified img/ccat-dietician.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
2 changes: 1 addition & 1 deletion plugin.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "Dietician",
"version": "0.1.1",
"version": "0.2.0",
"description": "Preventing CheshireCatAI to ingest the same file multiple times.",
"author_name": "Alessandro Spallina",
"author_url": "https://www.linkedin.com/in/alessandro-spallina/",
Expand Down

0 comments on commit 959e08a

Please sign in to comment.