From 6d9901a54b0785197bc13d932d9cf269a7f0afdc Mon Sep 17 00:00:00 2001
From: Himanshu Gupta <161701569+himanshugt16@users.noreply.github.com>
Date: Mon, 18 Nov 2024 10:09:52 +0530
Subject: [PATCH] Added API to update knowledge vault in real time (#1599)

* Added API to update Mongo and Qdrant in real time for changes in knowledge_vault, with corresponding test cases

* Added Unit Test Cases

* Updated Test Cases

* Updated LLM Test

---
 kairon/api/app/routers/bot/data.py         |  31 +
 kairon/shared/cognition/processor.py       | 158 ++++-
 kairon/shared/llm/processor.py             |  15 +
 tests/integration_test/services_test.py    | 307 ++++++++++
 .../data_processor/data_processor_test.py  | 551 +++++++++++++++++-
 tests/unit_test/llm_test.py                |  71 ++-
 6 files changed, 1129 insertions(+), 4 deletions(-)

diff --git a/kairon/api/app/routers/bot/data.py b/kairon/api/app/routers/bot/data.py
index f3075f0a8..42b79b217 100644
--- a/kairon/api/app/routers/bot/data.py
+++ b/kairon/api/app/routers/bot/data.py
@@ -326,3 +326,34 @@ async def download_error_csv(
         data=None,
         error_code=e.status_code
     )
+
+@router.post("/cognition/sync", response_model=Response)
+async def knowledge_vault_sync(
+        primary_key_col: str,
+        collection_name: str,
+        data: List[dict],
+        current_user: User = Security(Authentication.get_current_user_and_bot, scopes=DESIGNER_ACCESS),
+):
+    """
+    Validates and syncs data to the specified MongoDB collection and vector database.
+    """
+    data = [{key.lower(): value for key, value in row.items()} for row in data]
+
+    error_summary = cognition_processor.validate_data(primary_key_col.lower(), collection_name.lower(), data, current_user.get_bot())
+
+    if error_summary:
+        return Response(
+            success=False,
+            message="Validation failed",
+            data=error_summary,
+            error_code=400
+        )
+
+    await cognition_processor.upsert_data(primary_key_col.lower(), collection_name.lower(), data,
+                                          current_user.get_bot(), current_user.get_user())
+
+    return Response(
+        success=True,
+        message="Processing completed successfully",
+        data=None
+    )
\ No newline at end of file
diff --git a/kairon/shared/cognition/processor.py b/kairon/shared/cognition/processor.py
index 499a798da..4ac49c531 100644
--- a/kairon/shared/cognition/processor.py
+++ b/kairon/shared/cognition/processor.py
@@ -1,13 +1,15 @@
 from datetime import datetime
-from typing import Text, Dict, Any
+from typing import Text, Dict, Any, List
 
 from loguru import logger
 from mongoengine import DoesNotExist, Q
+from pydantic import constr, create_model, ValidationError
 
 from kairon import Utility
 from kairon.exceptions import AppException
 from kairon.shared.actions.data_objects import PromptAction, DatabaseAction
 from kairon.shared.cognition.data_objects import CognitionData, CognitionSchema, ColumnMetadata, CollectionData
+from kairon.shared.data.constant import DEFAULT_LLM
 from kairon.shared.data.processor import MongoProcessor
 from kairon.shared.models import CognitionDataType, CognitionMetadataType
 
@@ -414,3 +416,157 @@ def validate_collection_name(bot: Text, collection: Text):
             raise AppException(f'Cannot remove collection {collection} linked to action "{prompt_action[0].name}"!')
         if database_action:
             raise AppException(f'Cannot remove collection {collection} linked to action "{database_action[0].name}"!')
+
+    @staticmethod
+    def get_pydantic_type(data_type: str):
+        if data_type == 'str':
+            return (constr(strict=True, min_length=1), ...)
+        elif data_type == 'int':
+            return (int, ...)
+        elif data_type == 'float':
+            return (float, ...)
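+        # Only str, int and float schema types are supported; anything else is
+        # rejected explicitly below so schema errors surface early.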
+        else:
+            raise ValueError(f"Unsupported data type: {data_type}")
+
+    def validate_data(self, primary_key_col: str, collection_name: str, data: List[Dict], bot: str) -> Dict:
+        """
+        Validates each dictionary in the data list against the expected schema from column_dict.
+
+        Args:
+            primary_key_col: The primary key column used to identify rows.
+            collection_name: The name of the collection (table name).
+            data: List of dictionaries where each dictionary represents a row to be validated.
+            bot: The bot identifier.
+
+        Returns:
+            Dict: Summary of validation errors, if any.
+        """
+        # Scope the existence check to this bot's schema.
+        if not CognitionSchema.objects(bot=bot, collection_name=collection_name).first():
+            raise AppException(f"Collection '{collection_name}' does not exist.")
+
+        column_dict = MongoProcessor().get_column_datatype_dict(bot, collection_name)
+
+        error_summary = {}
+
+        model_fields = {
+            column_name: self.get_pydantic_type(data_type)
+            for column_name, data_type in column_dict.items()
+        }
+        DynamicModel = create_model('DynamicModel', **model_fields)
+
+        for row in data:
+            row_key = row.get(primary_key_col)
+            # Only a missing key is fatal; falsy values such as 0 are validated below.
+            if row_key is None:
+                raise AppException(f"Primary key '{primary_key_col}' must exist in each row.")
+
+            row_errors = []
+            if set(row.keys()) != set(column_dict.keys()):
+                row_errors.append({
+                    "status": "Column headers mismatch",
+                    "expected_columns": list(column_dict.keys()),
+                    "actual_columns": list(row.keys())
+                })
+            if row_errors:
+                error_summary[row_key] = row_errors
+                continue
+
+            try:
+                DynamicModel(**row)
+            except ValidationError as e:
+                error_details = []
+                for error in e.errors():
+                    column_name = error['loc'][0]
+                    input_value = row.get(column_name)
+                    status = "Required Field is Empty" if input_value == "" else "Invalid DataType"
+                    error_details.append({
+                        "column_name": column_name,
+                        "input": input_value,
+                        "status": status
+                    })
+                error_summary[row_key] = error_details
+
+        return error_summary
+
+    async def upsert_data(self, primary_key_col: str, collection_name: str, data: List[Dict], bot: str, user: Text):
+        """
+        Upserts data into the CognitionData collection.
+        If a document with the primary key exists, it is updated; otherwise it is inserted.
+
+        Args:
+            primary_key_col: The primary key column name to check for uniqueness.
+            collection_name: The collection name (table).
+            data: List of rows of data to upsert.
+            bot: The bot identifier associated with the data.
+            user: The user performing the operation.
+        """
+
+        from kairon.shared.llm.processor import LLMProcessor
+        llm_processor = LLMProcessor(bot, DEFAULT_LLM)
+        suffix = "_faq_embd"
+        qdrant_collection = f"{bot}_{collection_name}{suffix}" if collection_name else f"{bot}{suffix}"
+
+        if await llm_processor.__collection_exists__(qdrant_collection) is False:
+            await llm_processor.__create_collection__(qdrant_collection)
+
+        for row in data:
+            row = {str(key): str(value) for key, value in row.items()}
+            primary_key_value = row.get(primary_key_col)
+
+            payload = {
+                "data": row,
+                "content_type": CognitionDataType.json.value,
+                "collection": collection_name
+            }
+            existing_document = CognitionData.objects(
+                Q(bot=bot) &
+                Q(collection=collection_name) &
+                Q(**{f"data__{primary_key_col}": str(primary_key_value)})
+            ).first()
+
+            if existing_document:
+                if not isinstance(existing_document, dict):
+                    existing_document = existing_document.to_mongo().to_dict()
+                row_id = str(existing_document["_id"])
+                self.update_cognition_data(row_id, payload, user, bot)
+                updated_document = CognitionData.objects(id=row_id).first()
+                if not isinstance(updated_document, dict):
+                    updated_document = updated_document.to_mongo().to_dict()
+                logger.info(f"Row with {primary_key_col}: {primary_key_value} updated in MongoDB")
+                await self.sync_with_qdrant(llm_processor, qdrant_collection, bot, updated_document, user,
+                                            primary_key_col)
+            else:
+                row_id = self.save_cognition_data(payload, user, bot)
+                new_document = CognitionData.objects(id=row_id).first()
+                if not isinstance(new_document, dict):
+                    new_document = new_document.to_mongo().to_dict()
+                logger.info(f"Row with {primary_key_col}: {primary_key_value} inserted in MongoDB")
+                await self.sync_with_qdrant(llm_processor, qdrant_collection, bot, new_document, user, primary_key_col)
+
+        return {"message": "Upsert complete!"}
+
+    async def sync_with_qdrant(self, llm_processor, collection_name, bot, document, user, primary_key_col):
+        """
+        Syncs a document with the Qdrant vector database by generating embeddings and upserting them.
+
+        Args:
+            llm_processor (LLMProcessor): Instance of LLMProcessor for embedding and Qdrant operations.
+            collection_name (str): Name of the Qdrant collection.
+            bot (str): Bot identifier.
+            document (Dict): Document (converted to a dict) to sync with Qdrant.
+            user (Text): User performing the operation.
+            primary_key_col (str): Primary key column, used for logging.
+
+        Raises:
+            AppException: If the Qdrant upsert operation fails.
+        """
+        try:
+            metadata = self.find_matching_metadata(bot, document['data'], document.get('collection'))
+            search_payload, embedding_payload = Utility.retrieve_search_payload_and_embedding_payload(
+                document['data'], metadata)
+            embeddings = await llm_processor.get_embedding(embedding_payload, user, invocation='knowledge_vault_sync')
+            points = [{'id': document['vector_id'], 'vector': embeddings, 'payload': search_payload}]
+            await llm_processor.__collection_upsert__(collection_name, {'points': points},
+                                                      err_msg="Unable to train FAQ!
Contact support") + logger.info(f"Row with {primary_key_col}: {document['data'].get(primary_key_col)} upserted in Qdrant.") + except Exception as e: + raise AppException(f"Failed to sync document with Qdrant: {str(e)}") diff --git a/kairon/shared/llm/processor.py b/kairon/shared/llm/processor.py index dbaca51cc..168e7273d 100644 --- a/kairon/shared/llm/processor.py +++ b/kairon/shared/llm/processor.py @@ -290,6 +290,21 @@ async def __collection_upsert__(self, collection_name: Text, data: Dict, err_msg if raise_err: raise AppException(err_msg) + async def __collection_exists__(self, collection_name: Text) -> bool: + """Check if a collection exists.""" + try: + response = await AioRestClient().request( + http_url=urljoin(self.db_url, f"/collections/{collection_name}"), + request_method="GET", + headers=self.headers, + return_json=True, + timeout=5 + ) + return response.get('status') == "ok" + except Exception as e: + logging.info(e) + return False + async def __collection_search__(self, collection_name: Text, vector: List, limit: int, score_threshold: float): client = AioRestClient() response = await client.request( diff --git a/tests/integration_test/services_test.py b/tests/integration_test/services_test.py index 8a55c9879..98a37fc12 100644 --- a/tests/integration_test/services_test.py +++ b/tests/integration_test/services_test.py @@ -10,6 +10,7 @@ from unittest.mock import patch from urllib.parse import urljoin from zipfile import ZipFile +import litellm import pytest import responses @@ -31,6 +32,8 @@ from kairon.shared.callback.data_objects import CallbackLog, CallbackRecordStatusType from kairon.shared.content_importer.content_processor import ContentImporterLogProcessor from kairon.shared.utils import Utility, MailUtility +from kairon.shared.llm.processor import LLMProcessor +import numpy as np Utility.load_system_metadata() @@ -1398,6 +1401,310 @@ def test_default_values(): assert sorted(actual["data"]["default_names"]) == sorted(expected_default_names) +@pytest.mark.asyncio +@responses.activate +@mock.patch.object(LLMProcessor, "__collection_exists__", autospec=True) +@mock.patch.object(LLMProcessor, "__create_collection__", autospec=True) +@mock.patch.object(LLMProcessor, "__collection_upsert__", autospec=True) +@mock.patch.object(litellm, "aembedding", autospec=True) +def test_knowledge_vault_sync(mock_embedding, mock_collection_exists, mock_create_collection, mock_collection_upsert): + bot_settings = BotSettings.objects(bot=pytest.bot).get() + bot_settings.content_importer_limit_per_day = 10 + bot_settings.cognition_collections_limit = 10 + bot_settings.llm_settings['enable_faq'] = True + bot_settings.save() + + mock_collection_exists.return_value = False + mock_create_collection.return_value = None + mock_collection_upsert.return_value = None + + embedding = list(np.random.random(LLMProcessor.__embedding__)) + mock_embedding.return_value = litellm.EmbeddingResponse(**{'data': [{'embedding': embedding}]}) + + secrets = [ + { + "llm_type": "openai", + "api_key": "common_openai_key", + "models": ["common_openai_model1", "common_openai_model2"], + "user": "123", + "timestamp": datetime.utcnow() + }, + ] + + for secret in secrets: + LLMSecret(**secret).save() + + response = client.post( + url=f"/api/bot/{pytest.bot}/data/cognition/schema", + json={ + "metadata": [ + {"column_name": "id", "data_type": "int", "enable_search": True, "create_embeddings": True}, + {"column_name": "item", "data_type": "str", "enable_search": True, "create_embeddings": True}, + {"column_name": "price", 
"data_type": "float", "enable_search": True, "create_embeddings": True}, + {"column_name": "quantity", "data_type": "int", "enable_search": True, "create_embeddings": True}, + ], + "collection_name": "groceries" + }, + headers={"Authorization": pytest.token_type + " " + pytest.access_token} + ) + schema_response = response.json() + assert schema_response["message"] == "Schema saved!" + assert schema_response["error_code"] == 0 + + dummy_data = { + "id": "1", + "item": "Juice", + "price": "2.00", + "quantity": "9" + } + dummy_doc = CognitionData( + data=dummy_data, + content_type="json", + collection="groceries", + user="himanshu.gupta@digite.com", + bot=pytest.bot, + timestamp=datetime.utcnow() + ) + dummy_doc.save() + + cognition_data = CognitionData.objects(bot=pytest.bot, collection="groceries") + assert cognition_data.count() == 1 + + sync_data = [ + {"id": 1, "item": "Juice", "price": "2.50", "quantity": "10"}, + {"id": 2, "item": "Apples", "price": "1.20", "quantity": "20"} + ] + + response = client.post( + url=f"/api/bot/{pytest.bot}/data/cognition/sync?primary_key_col=id&collection_name=groceries", + json=sync_data, + headers={"Authorization": pytest.token_type + " " + pytest.access_token} + ) + + actual= response.json() + print(actual) + assert actual["success"] + assert actual["message"] == "Processing completed successfully" + assert actual["error_code"] == 0 + + cognition_data = CognitionData.objects(bot=pytest.bot, collection="groceries") + assert cognition_data.count() == 2 + + expected_data = [ + {"id": "1", "item": "Juice", "price": "2.50", "quantity": "10"}, + {"id": "2", "item": "Apples", "price": "1.20", "quantity": "20"} + ] + + for index, doc in enumerate(cognition_data): + doc_data = doc.to_mongo().to_dict()["data"] + assert doc_data == expected_data[index] + + expected_calls = [ + { + "model": "text-embedding-3-small", + "input": ['{"id":1,"item":"Juice","price":2.5,"quantity":10}'], # First input + "metadata": {'user': 'integration@demo.ai', 'bot': pytest.bot, 'invocation': 'knowledge_vault_sync'}, + "api_key": "common_openai_key", + "num_retries": 3 + }, + { + "model": "text-embedding-3-small", + "input": ['{"id":2,"item":"Apples","price":1.2,"quantity":20}'], # Second input + "metadata": {'user': 'integration@demo.ai', 'bot': pytest.bot, 'invocation': 'knowledge_vault_sync'}, + "api_key": "common_openai_key", + "num_retries": 3 + } + ] + + for i, expected in enumerate(expected_calls): + actual_call = mock_embedding.call_args_list[i].kwargs + assert actual_call == expected + + CognitionData.objects(bot=pytest.bot, collection="groceries").delete() + CognitionSchema.objects(bot=pytest.bot, collection_name="groceries").delete() + LLMSecret.objects.delete() + + +@pytest.mark.asyncio +@responses.activate +@mock.patch.object(litellm, "aembedding", autospec=True) +def test_knowledge_vault_sync_missing_collection(mock_embedding): + bot_settings = BotSettings.objects(bot=pytest.bot).get() + bot_settings.content_importer_limit_per_day = 10 + bot_settings.cognition_collections_limit = 10 + bot_settings.llm_settings['enable_faq'] = True + bot_settings.save() + + embedding = list(np.random.random(LLMProcessor.__embedding__)) + mock_embedding.return_value = litellm.EmbeddingResponse(**{'data': [{'embedding': embedding}]}) + + secrets = [ + { + "llm_type": "openai", + "api_key": "common_openai_key", + "models": ["common_openai_model1", "common_openai_model2"], + "user": "123", + "timestamp": datetime.utcnow() + } + ] + for secret in secrets: + LLMSecret(**secret).save() + + 
sync_data = [ + {"id": 1, "item": "Juice", "price": 2.50, "quantity": 10}, + {"id": 2, "item": "Apples", "price": 1.20, "quantity": 20} + ] + + response = client.post( + url=f"/api/bot/{pytest.bot}/data/cognition/sync?primary_key_col=id&collection_name=nonexistent_collection", + json=sync_data, + headers={"Authorization": pytest.token_type + " " + pytest.access_token} + ) + + actual = response.json() + assert not actual["success"] + assert actual["message"] == "Collection 'nonexistent_collection' does not exist." + assert actual["error_code"] == 422 + + cognition_data = CognitionData.objects(bot=pytest.bot, collection="nonexistent_collection") + assert cognition_data.count() == 0 + + LLMSecret.objects.delete() + + +@pytest.mark.asyncio +@responses.activate +@mock.patch.object(litellm, "aembedding", autospec=True) +def test_knowledge_vault_sync_column_header_mismatch(mock_embedding): + bot_settings = BotSettings.objects(bot=pytest.bot).get() + bot_settings.content_importer_limit_per_day = 10 + bot_settings.cognition_collections_limit = 10 + bot_settings.llm_settings['enable_faq'] = True + bot_settings.save() + + embedding = list(np.random.random(LLMProcessor.__embedding__)) + mock_embedding.return_value = litellm.EmbeddingResponse(**{'data': [{'embedding': embedding}]}) + + secrets = [ + { + "llm_type": "openai", + "api_key": "common_openai_key", + "models": ["common_openai_model1", "common_openai_model2"], + "user": "123", + "timestamp": datetime.utcnow() + } + ] + for secret in secrets: + LLMSecret(**secret).save() + + schema_response = client.post( + url=f"/api/bot/{pytest.bot}/data/cognition/schema", + json={ + "metadata": [ + {"column_name": "id", "data_type": "int", "enable_search": True, "create_embeddings": True}, + {"column_name": "item", "data_type": "str", "enable_search": True, "create_embeddings": True}, + {"column_name": "price", "data_type": "float", "enable_search": True, "create_embeddings": True}, + {"column_name": "quantity", "data_type": "int", "enable_search": True, "create_embeddings": True}, + ], + "collection_name": "groceries" + }, + headers={"Authorization": pytest.token_type + " " + pytest.access_token} + ) + + assert schema_response.status_code == 200 + assert schema_response.json()["message"] == "Schema saved!" 
+ assert schema_response.json()["error_code"] == 0 + + sync_data = [ + {"id": 1, "item": "Juice", "quantity": 10, "description": "Orange juice"}, + {"id": 2, "item": "Apples", "quantity": 20, "description": "Fresh apples"} + ] + + response = client.post( + url=f"/api/bot/{pytest.bot}/data/cognition/sync?primary_key_col=id&collection_name=groceries", + json=sync_data, + headers={"Authorization": pytest.token_type + " " + pytest.access_token} + ) + + actual = response.json() + print(actual) + assert not actual["success"] + assert actual["message"] == "Validation failed" + assert actual["error_code"] == 400 + assert actual["data"] == {'1': [{'status': 'Column headers mismatch', 'expected_columns': ['id', 'item', 'price', 'quantity'], 'actual_columns': ['id', 'item', 'quantity', 'description']}], '2': [{'status': 'Column headers mismatch', 'expected_columns': ['id', 'item', 'price', 'quantity'], 'actual_columns': ['id', 'item', 'quantity', 'description']}]} + + cognition_data = CognitionData.objects(bot=pytest.bot, collection="groceries") + assert cognition_data.count() == 0 + + CognitionSchema.objects(bot=pytest.bot, collection_name="groceries").delete() + LLMSecret.objects.delete() + +@pytest.mark.asyncio +@responses.activate +@mock.patch.object(litellm, "aembedding", autospec=True) +def test_knowledge_vault_sync_missing_primary_key(mock_embedding): + bot_settings = BotSettings.objects(bot=pytest.bot).get() + bot_settings.content_importer_limit_per_day = 10 + bot_settings.cognition_collections_limit = 10 + bot_settings.llm_settings['enable_faq'] = True + bot_settings.save() + + embedding = list(np.random.random(LLMProcessor.__embedding__)) + mock_embedding.return_value = litellm.EmbeddingResponse(**{'data': [{'embedding': embedding}]}) + + secrets = [ + { + "llm_type": "openai", + "api_key": "common_openai_key", + "models": ["common_openai_model1", "common_openai_model2"], + "user": "123", + "timestamp": datetime.utcnow() + } + ] + for secret in secrets: + LLMSecret(**secret).save() + + schema_response = client.post( + url=f"/api/bot/{pytest.bot}/data/cognition/schema", + json={ + "metadata": [ + {"column_name": "id", "data_type": "int", "enable_search": True, "create_embeddings": True}, + {"column_name": "item", "data_type": "str", "enable_search": True, "create_embeddings": True}, + {"column_name": "price", "data_type": "float", "enable_search": True, "create_embeddings": True}, + {"column_name": "quantity", "data_type": "int", "enable_search": True, "create_embeddings": True}, + ], + "collection_name": "groceries" + }, + headers={"Authorization": pytest.token_type + " " + pytest.access_token} + ) + + assert schema_response.status_code == 200 + assert schema_response.json()["message"] == "Schema saved!" + assert schema_response.json()["error_code"] == 0 + + sync_data = [ + {"item": "Juice", "price": 2.50, "quantity": 10}, + {"item": "Apples", "price": 1.20, "quantity": 20} + ] + + response = client.post( + url=f"/api/bot/{pytest.bot}/data/cognition/sync?primary_key_col=id&collection_name=groceries", + json=sync_data, + headers={"Authorization": pytest.token_type + " " + pytest.access_token} + ) + + actual = response.json() + print(actual) + assert not actual["success"] + assert actual["message"] == "Primary key 'id' must exist in each row." 
+ assert actual["error_code"] == 422 + + cognition_data = CognitionData.objects(bot=pytest.bot, collection="groceries") + assert cognition_data.count() == 0 + + CognitionSchema.objects(bot=pytest.bot, collection_name="groceries").delete() + LLMSecret.objects.delete() + @responses.activate def test_upload_doc_content(): bot_settings = BotSettings.objects(bot=pytest.bot).get() diff --git a/tests/unit_test/data_processor/data_processor_test.py b/tests/unit_test/data_processor/data_processor_test.py index 6ca7075f1..d4b87eac9 100644 --- a/tests/unit_test/data_processor/data_processor_test.py +++ b/tests/unit_test/data_processor/data_processor_test.py @@ -12,14 +12,16 @@ import yaml from kairon.shared.content_importer.data_objects import ContentValidationLogs +from kairon.shared.rest_client import AioRestClient from kairon.shared.utils import Utility +from kairon.shared.llm.processor import LLMProcessor os.environ["system_file"] = "./tests/testing_data/system.yaml" Utility.load_environment() Utility.load_system_metadata() -from unittest.mock import patch +from unittest.mock import patch, ANY import numpy as np import pandas as pd import pytest @@ -30,7 +32,7 @@ from mongoengine.errors import ValidationError from mongoengine.queryset.base import BaseQuerySet from pipedrive.exceptions import UnauthorizedError -from pydantic import SecretStr +from pydantic import SecretStr, constr from rasa.core.agent import Agent from rasa.shared.constants import DEFAULT_DOMAIN_PATH, DEFAULT_DATA_PATH, DEFAULT_CONFIG_PATH, \ DEFAULT_NLU_FALLBACK_INTENT_NAME @@ -1363,6 +1365,551 @@ def test_bot_id_change(self): bot_id = Slots.objects(bot="test_load_yml", user="testUser", influence_conversation=False, name='bot').get() assert bot_id['initial_value'] == "test_load_yml" + def test_validate_data_success(self): + bot = 'test_bot' + user = 'test_user' + collection_name = 'groceries' + primary_key_col = "id" + + metadata = [ + { + "column_name": "id", + "data_type": "int", + "enable_search": True, + "create_embeddings": True + }, + { + "column_name": "item", + "data_type": "str", + "enable_search": True, + "create_embeddings": True + }, + { + "column_name": "price", + "data_type": "float", + "enable_search": True, + "create_embeddings": True + }, + { + "column_name": "quantity", + "data_type": "int", + "enable_search": True, + "create_embeddings": True + } + ] + + cognition_schema = CognitionSchema( + metadata=[ColumnMetadata(**item) for item in metadata], + collection_name=collection_name, + user=user, + bot=bot, + timestamp=datetime.utcnow() + ) + cognition_schema.validate(clean=True) + cognition_schema.save() + + data = [ + {"id": 1, "item": "Juice", "price": 2.50, "quantity": 10}, + {"id": 2, "item": "Apples", "price": 1.20, "quantity": 20}, + {"id": 3, "item": "Bananas", "price": 0.50, "quantity": 15}, + ] + + processor = CognitionDataProcessor() + validation_summary = processor.validate_data( + primary_key_col=primary_key_col, + collection_name=collection_name, + data=data, + bot=bot + ) + + assert validation_summary == {} + CognitionSchema.objects(bot=bot, collection_name="groceries").delete() + + def test_validate_data_missing_collection(self): + bot = 'test_bot' + collection_name = 'nonexistent_collection' + primary_key_col = "id" + data = [{"id": 1, "item": "Juice", "price": 2.50, "quantity": 10}] + + processor = CognitionDataProcessor() + + with pytest.raises(AppException, match=f"Collection '{collection_name}' does not exist."): + processor.validate_data( + primary_key_col=primary_key_col, + 
collection_name=collection_name, + data=data, + bot=bot + ) + + def test_validate_data_missing_primary_key(self): + bot = 'test_bot' + user = 'test_user' + collection_name = 'groceries' + primary_key_col = "id" + + metadata = [ + {"column_name": "id", "data_type": "int", "enable_search": True, "create_embeddings": True}, + {"column_name": "item", "data_type": "str", "enable_search": True, "create_embeddings": True}, + {"column_name": "price", "data_type": "float", "enable_search": True, "create_embeddings": True}, + {"column_name": "quantity", "data_type": "int", "enable_search": True, "create_embeddings": True} + ] + + cognition_schema = CognitionSchema( + metadata=[ColumnMetadata(**item) for item in metadata], + collection_name=collection_name, + user=user, + bot=bot, + timestamp=datetime.utcnow() + ) + cognition_schema.validate(clean=True) + cognition_schema.save() + + data = [ + {"item": "Juice", "price": 2.50, "quantity": 10} + ] + + processor = CognitionDataProcessor() + + with pytest.raises(AppException, match=f"Primary key '{primary_key_col}' must exist in each row."): + processor.validate_data( + primary_key_col=primary_key_col, + collection_name=collection_name, + data=data, + bot=bot + ) + CognitionSchema.objects(bot=bot, collection_name="groceries").delete() + + def test_validate_data_column_header_mismatch(self): + bot = 'test_bot' + user = 'test_user' + collection_name = 'groceries' + primary_key_col = "id" + + metadata = [ + {"column_name": "id", "data_type": "int", "enable_search": True, "create_embeddings": True}, + {"column_name": "item", "data_type": "str", "enable_search": True, "create_embeddings": True}, + {"column_name": "price", "data_type": "float", "enable_search": True, "create_embeddings": True}, + {"column_name": "quantity", "data_type": "int", "enable_search": True, "create_embeddings": True} + ] + + cognition_schema = CognitionSchema( + metadata=[ColumnMetadata(**item) for item in metadata], + collection_name=collection_name, + user=user, + bot=bot, + timestamp=datetime.utcnow() + ) + cognition_schema.validate(clean=True) + cognition_schema.save() + + data = [ + {"id": "1", "item": "Juice", "quantity": 10, "discount": 0.10} + ] + + processor = CognitionDataProcessor() + validation_summary = processor.validate_data( + primary_key_col=primary_key_col, + collection_name=collection_name, + data=data, + bot=bot + ) + + assert "1" in validation_summary + assert validation_summary["1"][0]["status"] == "Column headers mismatch" + assert validation_summary["1"][0]["expected_columns"] == ["id", "item", "price", "quantity"] + assert validation_summary["1"][0]["actual_columns"] == ["id", "item", "quantity", "discount"] + CognitionSchema.objects(bot=bot, collection_name="groceries").delete() + + @pytest.mark.asyncio + @patch.object(LLMProcessor, "__collection_exists__", autospec=True) + @patch.object(LLMProcessor, "__create_collection__", autospec=True) + @patch.object(LLMProcessor, "__collection_upsert__", autospec=True) + @patch.object(litellm, "aembedding", autospec=True) + async def test_upsert_data_success(self, mock_embedding, mock_collection_upsert, mock_create_collection, + mock_collection_exists): + bot = 'test_bot' + user = 'test_user' + collection_name = 'groceries' + primary_key_col = 'id' + + metadata = [ + {"column_name": "id", "data_type": "int", "enable_search": True, "create_embeddings": True}, + {"column_name": "item", "data_type": "str", "enable_search": True, "create_embeddings": True}, + {"column_name": "price", "data_type": "float", "enable_search": 
True, "create_embeddings": True}, + {"column_name": "quantity", "data_type": "int", "enable_search": True, "create_embeddings": True}, + ] + + cognition_schema = CognitionSchema( + metadata=[ColumnMetadata(**item) for item in metadata], + collection_name=collection_name, + user=user, + bot=bot, + timestamp=datetime.utcnow() + ) + cognition_schema.validate(clean=True) + cognition_schema.save() + + dummy_data = { + "id": "2", + "item": "Milk", + "price": "2.80", + "quantity": "5" + } + existing_document = CognitionData( + data=dummy_data, + content_type="json", + collection=collection_name, + user=user, + bot=bot, + timestamp=datetime.utcnow() + ) + existing_document.save() + + upsert_data = [ + {"id": 1, "item": "Juice", "price": "2.50", "quantity": "10"}, # New entry + {"id": 2, "item": "Milk", "price": "3.00", "quantity": "5"} # Existing entry to be updated + ] + + llm_secret = LLMSecret( + llm_type="openai", + api_key="openai_key", + models=["model1", "model2"], + api_base_url="https://api.example.com", + bot=bot, + user=user + ) + llm_secret.save() + + mock_collection_exists.return_value = False + mock_create_collection.return_value = None + mock_collection_upsert.return_value = None + + embedding = list(np.random.random(1532)) + mock_embedding.return_value = {'data': [{'embedding': embedding}, {'embedding': embedding}]} + + processor = CognitionDataProcessor() + + result = await processor.upsert_data( + primary_key_col=primary_key_col, + collection_name=collection_name, + data=upsert_data, + bot=bot, + user=user + ) + + upserted_data = list(CognitionData.objects(bot=bot, collection=collection_name)) + + assert result["message"] == "Upsert complete!" + assert len(upserted_data) == 2 + + inserted_record = next((item for item in upserted_data if item.data["id"] == "1"), None) + assert inserted_record is not None + assert inserted_record.data["item"] == "Juice" + assert inserted_record.data["price"] == "2.50" + assert inserted_record.data["quantity"] == "10" + + updated_record = next((item for item in upserted_data if item.data["id"] == "2"), None) + assert updated_record is not None + assert updated_record.data["item"] == "Milk" + assert updated_record.data["price"] == "3.00" # Updated price + assert updated_record.data["quantity"] == "5" + + CognitionSchema.objects(bot=bot, collection_name="groceries").delete() + CognitionData.objects(bot=bot, collection="groceries").delete() + LLMSecret.objects.delete() + + @pytest.mark.asyncio + @patch.object(LLMProcessor, "__collection_exists__", autospec=True) + @patch.object(LLMProcessor, "__create_collection__", autospec=True) + @patch.object(LLMProcessor, "__collection_upsert__", autospec=True) + @patch.object(litellm, "aembedding", autospec=True) + async def test_upsert_data_empty_data_list(self, mock_embedding, mock_collection_upsert, mock_create_collection, + mock_collection_exists): + bot = 'test_bot' + user = 'test_user' + collection_name = 'groceries' + primary_key_col = 'id' + + metadata = [ + {"column_name": "id", "data_type": "int", "enable_search": True, "create_embeddings": True}, + {"column_name": "item", "data_type": "str", "enable_search": True, "create_embeddings": True}, + {"column_name": "price", "data_type": "float", "enable_search": True, "create_embeddings": True}, + {"column_name": "quantity", "data_type": "int", "enable_search": True, "create_embeddings": True}, + ] + + cognition_schema = CognitionSchema( + metadata=[ColumnMetadata(**item) for item in metadata], + collection_name=collection_name, + user=user, + bot=bot, + 
timestamp=datetime.utcnow() + ) + cognition_schema.validate(clean=True) + cognition_schema.save() + + dummy_data = { + "id": "2", + "item": "Milk", + "price": "2.80", + "quantity": "5" + } + existing_document = CognitionData( + data=dummy_data, + content_type="json", + collection=collection_name, + user=user, + bot=bot, + timestamp=datetime.utcnow() + ) + existing_document.save() + + upsert_data = [] + + llm_secret = LLMSecret( + llm_type="openai", + api_key="openai_key", + models=["model1", "model2"], + api_base_url="https://api.example.com", + bot=bot, + user=user + ) + llm_secret.save() + + mock_collection_exists.return_value = False + mock_create_collection.return_value = None + mock_collection_upsert.return_value = None + + embedding = list(np.random.random(1532)) + mock_embedding.return_value = {'data': [{'embedding': embedding}, {'embedding': embedding}]} + + processor = CognitionDataProcessor() + result = await processor.upsert_data( + primary_key_col=primary_key_col, + collection_name=collection_name, + data=upsert_data, + bot=bot, + user=user + ) + + data = list(CognitionData.objects(bot=bot, collection=collection_name)) + + assert result["message"] == "Upsert complete!" + assert len(data) == 1 + + existing_record = data[0] + assert existing_record.data["id"] == "2" + assert existing_record.data["item"] == "Milk" + assert existing_record.data["price"] == "2.80" + assert existing_record.data["quantity"] == "5" + + CognitionSchema.objects(bot=bot, collection_name=collection_name).delete() + CognitionData.objects(bot=bot, collection=collection_name).delete() + LLMSecret.objects.delete() + + @pytest.mark.asyncio + @patch.object(litellm, "aembedding", autospec=True) + @patch.object(LLMProcessor, "__collection_upsert__", autospec=True) + async def test_sync_with_qdrant_success(self, mock_collection_upsert, mock_embedding): + bot = "test_bot" + user = "test_user" + collection_name = "groceries" + primary_key_col = "id" + + metadata = [ + {"column_name": "id", "data_type": "int", "enable_search": True, "create_embeddings": True}, + {"column_name": "item", "data_type": "str", "enable_search": True, "create_embeddings": True}, + {"column_name": "price", "data_type": "float", "enable_search": True, "create_embeddings": True}, + {"column_name": "quantity", "data_type": "int", "enable_search": True, "create_embeddings": True}, + ] + + cognition_schema = CognitionSchema( + metadata=[ColumnMetadata(**item) for item in metadata], + collection_name=collection_name, + user=user, + bot=bot, + timestamp=datetime.utcnow() + ) + cognition_schema.validate(clean=True) + cognition_schema.save() + + document_data = { + "id": "2", + "item": "Milk", + "price": "2.80", + "quantity": "5" + } + document = CognitionData( + data=document_data, + content_type="json", + collection=collection_name, + user=user, + bot=bot, + timestamp=datetime.utcnow() + ) + document.save() + + saved_document = None + for doc in CognitionData.objects(bot=bot, collection=collection_name): + doc_dict = doc.to_mongo().to_dict() + if doc_dict.get("data", {}).get("id") == "2": # Match based on `data.id` + saved_document = doc_dict + break + assert saved_document, "Saved CognitionData document not found" + vector_id = saved_document["vector_id"] + + if not isinstance(document, dict): + document = document.to_mongo().to_dict() + + embedding = list(np.random.random(1532)) + mock_embedding.return_value = {'data': [{'embedding': embedding}, {'embedding': embedding}]} + + mock_collection_upsert.return_value = None + + llm_secret = LLMSecret( + 
llm_type="openai", + api_key="openai_key", + models=["model1", "model2"], + api_base_url="https://api.example.com", + bot=bot, + user=user + ) + llm_secret.save() + + processor = CognitionDataProcessor() + llm_processor = LLMProcessor(bot, DEFAULT_LLM) + await processor.sync_with_qdrant( + llm_processor=llm_processor, + collection_name=collection_name, + bot=bot, + document=document, + user=user, + primary_key_col=primary_key_col + ) + + mock_embedding.assert_called_once_with( + model="text-embedding-3-small", + input=['{"id":2,"item":"Milk","price":2.8,"quantity":5}'], + metadata={'user': user, 'bot': bot, 'invocation': 'knowledge_vault_sync'}, + api_key="openai_key", + num_retries=3 + ) + mock_collection_upsert.assert_called_once_with( + llm_processor, + collection_name, + { + "points": [ + { + "id": vector_id, + "vector": embedding, + "payload": {'id': 2, 'item': 'Milk', 'price': 2.8, 'quantity': 5} + } + ] + }, + err_msg="Unable to train FAQ! Contact support" + ) + + CognitionSchema.objects(bot=bot, collection_name="groceries").delete() + CognitionData.objects(bot=bot, collection="groceries").delete() + LLMSecret.objects.delete() + + @pytest.mark.asyncio + @patch.object(litellm, "aembedding", autospec=True) + @patch.object(AioRestClient, "request", autospec=True) + async def test_sync_with_qdrant_upsert_failure(self, mock_request, mock_embedding): + bot = "test_bot" + user = "test_user" + collection_name = "groceries" + primary_key_col = "id" + + metadata = [ + {"column_name": "id", "data_type": "int", "enable_search": True, "create_embeddings": True}, + {"column_name": "item", "data_type": "str", "enable_search": True, "create_embeddings": True}, + {"column_name": "price", "data_type": "float", "enable_search": True, "create_embeddings": True}, + {"column_name": "quantity", "data_type": "int", "enable_search": True, "create_embeddings": True}, + ] + + cognition_schema = CognitionSchema( + metadata=[ColumnMetadata(**item) for item in metadata], + collection_name=collection_name, + user=user, + bot=bot, + timestamp=datetime.utcnow() + ) + cognition_schema.validate(clean=True) + cognition_schema.save() + + document_data = { + "id": "2", + "item": "Milk", + "price": "2.80", + "quantity": "5" + } + document = CognitionData( + data=document_data, + content_type="json", + collection=collection_name, + user=user, + bot=bot, + timestamp=datetime.utcnow() + ) + document.save() + if not isinstance(document, dict): + document = document.to_mongo().to_dict() + + embedding = list(np.random.random(1532)) + mock_embedding.return_value = {'data': [{'embedding': embedding}, {'embedding': embedding}]} + + mock_request.side_effect = ConnectionError("Failed to connect to Qdrant") + + llm_secret = LLMSecret( + llm_type="openai", + api_key="openai_key", + models=["model1", "model2"], + api_base_url="https://api.example.com", + bot=bot, + user=user + ) + llm_secret.save() + + processor = CognitionDataProcessor() + llm_processor = LLMProcessor(bot, DEFAULT_LLM) + + with pytest.raises(AppException, match="Failed to sync document with Qdrant: Failed to connect to Qdrant"): + await processor.sync_with_qdrant( + llm_processor=llm_processor, + collection_name=collection_name, + bot=bot, + document=document, + user=user, + primary_key_col=primary_key_col + ) + + mock_embedding.assert_called_once_with( + model="text-embedding-3-small", + input=['{"id":2,"item":"Milk","price":2.8,"quantity":5}'], + metadata={'user': user, 'bot': bot, 'invocation': 'knowledge_vault_sync'}, + api_key="openai_key", + num_retries=3 + ) + 
+ CognitionSchema.objects(bot=bot, collection_name="groceries").delete() + CognitionData.objects(bot=bot, collection="groceries").delete() + LLMSecret.objects.delete() + + def test_get_pydantic_type_int(self): + result = CognitionDataProcessor().get_pydantic_type('int') + expected = (int, ...) + assert result == expected + + def test_get_pydantic_type_float(self): + result = CognitionDataProcessor.get_pydantic_type('float') + expected = (float, ...) + assert result == expected + + def test_get_pydantic_type_invalid(self): + with pytest.raises(ValueError, match="Unsupported data type: unknown"): + CognitionDataProcessor.get_pydantic_type('unknown') + def test_save_and_validate_success(self): bot = 'test_bot' user = 'test_user' diff --git a/tests/unit_test/llm_test.py b/tests/unit_test/llm_test.py index c754845d3..34e39e99c 100644 --- a/tests/unit_test/llm_test.py +++ b/tests/unit_test/llm_test.py @@ -9,6 +9,7 @@ from aiohttp import ClientConnectionError from mongoengine import connect +from kairon.shared.rest_client import AioRestClient from kairon.shared.utils import Utility Utility.load_system_metadata() @@ -1287,4 +1288,72 @@ async def test_gpt3_faq_embedding_predict_with_query_prompt(self, mock_embedding "input": [query], 'metadata': {'user': user, 'bot': bot, 'invocation': None}, "api_key": key, "num_retries": 3} - assert not DeepDiff(mock_embedding.call_args[1], expected, ignore_order=True) \ No newline at end of file + assert not DeepDiff(mock_embedding.call_args[1], expected, ignore_order=True) + + @pytest.mark.asyncio + @mock.patch.object(AioRestClient, "request", autospec=True) + async def test_collection_exists_success(self, mock_request): + collection_name = "test_collection" + bot = "test_collection_exists_success" + user = "test_new" + + llm_secret = LLMSecret( + llm_type="openai", + api_key="openai_key", + models=["model1", "model2"], + api_base_url="https://api.example.com", + bot=bot, + user=user + ) + llm_secret.save() + + mock_request.return_value = {"status": "ok"} + + llm_processor = LLMProcessor(bot, DEFAULT_LLM) + + result = await llm_processor.__collection_exists__(collection_name) + + mock_request.assert_called_once_with( + mock.ANY, + http_url=f"{llm_processor.db_url}/collections/{collection_name}", + request_method="GET", + headers=llm_processor.headers, + return_json=True, + timeout=5 + ) + assert result is True + LLMSecret.objects.delete() + + @pytest.mark.asyncio + @mock.patch.object(AioRestClient, "request", autospec=True) + async def test_collection_exists_failure(self, mock_request): + collection_name = "test_collection" + bot = "test_collection_exists_failure" + user = "test_new" + + llm_secret = LLMSecret( + llm_type="openai", + api_key="openai_key", + models=["model1", "model2"], + api_base_url="https://api.example.com", + bot=bot, + user=user + ) + llm_secret.save() + + mock_request.side_effect = Exception("Connection error") + + llm_processor = LLMProcessor(bot, DEFAULT_LLM) + + result = await llm_processor.__collection_exists__(collection_name) + + mock_request.assert_called_once_with( + mock.ANY, + http_url=f"{llm_processor.db_url}/collections/{collection_name}", + request_method="GET", + headers=llm_processor.headers, + return_json=True, + timeout=5 + ) + assert result is False + LLMSecret.objects.delete() \ No newline at end of file
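Reviewer note: below is a minimal, untested sketch of how a client might call the new sync endpoint once this patch is deployed. The host, bot id, and access token are placeholders; the query parameters and payload mirror the integration tests above. Any HTTP client would do; requests is assumed here.

import requests  # assumed available in the client environment

BOT_ID = "<bot_id>"        # placeholder
TOKEN = "<access_token>"   # placeholder bearer token from login
URL = f"https://<kairon-host>/api/bot/{BOT_ID}/data/cognition/sync"

# Rows must match the collection's schema exactly; the primary key column
# ("id" here) must be present in every row.
payload = [
    {"id": 1, "item": "Juice", "price": "2.50", "quantity": "10"},
    {"id": 2, "item": "Apples", "price": "1.20", "quantity": "20"},
]

resp = requests.post(
    URL,
    params={"primary_key_col": "id", "collection_name": "groceries"},
    json=payload,
    headers={"Authorization": f"Bearer {TOKEN}"},
    timeout=30,
)

# Success: {"success": true, "message": "Processing completed successfully", ...}
# Validation failure: success=false, error_code=400, per-row errors in "data".
print(resp.json())

Note that primary_key_col and collection_name travel as query parameters (FastAPI treats the endpoint's bare str arguments that way), while the rows go in the JSON body.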