Commit: Feature/include vectors option document chunks (#1419)
* Fix async JSON parsing (#1408)

* Fix async JSON parsing

* Remove score completion from js

* clean up js

* lockfile

* Feature/build custom logger (#1409)

* building a custom logger for r2r

* fix log

* maintain bkwd compat

* Feature/add kg description prompt (#1411)

* add kg desc prompt

* add kg desc prompt

* add kg desc prompt

* fix prompt name

* separate test run freq

* task_id check fix

* add ingestion docs

* update

* add

* rm old prompts

* rm old prompts

* rm old prompts

* rm old prompts

* add option to include vectors in document chunks

* checkin

* update vector

---------

Co-authored-by: Nolan Tremelling <[email protected]>
emrgnt-cmplxty and NolanTrem authored Oct 16, 2024
1 parent 21686b3 commit 6a3d460
Showing 10 changed files with 81 additions and 13 deletions.
2 changes: 1 addition & 1 deletion docs/api-reference/openapi.json

Large diffs are not rendered by default.

5 changes: 5 additions & 0 deletions docs/documentation/cli/management.mdx
@@ -119,5 +119,10 @@ r2r document-chunks --document-id doc1 --offset 0 --limit 10
<ParamField path="--limit" type="int">
The maximum number of chunks to return. Defaults to 100.
</ParamField>

<ParamField path="--include-vectors" type="bool">
An optional flag to include the vector associated with each chunk in the output. Defaults to `False`.
</ParamField>

</Accordion>
</AccordionGroup>
20 changes: 17 additions & 3 deletions docs/documentation/js-sdk/ingestion.mdx
@@ -57,7 +57,7 @@ const ingestResponse = await client.ingestFiles(files, {
</ParamField>

<ParamField path="ingestion_config" type="Optional[Union[dict, ChunkingConfig]]">
The ingestion config override parameter enables developers to customize their R2R chunking strategy at runtime.
The ingestion config override parameter enables developers to customize their R2R chunking strategy at runtime. Learn more about [configuration here](/documentation/configuration/ingestion/parsing_and_chunking).
<Expandable title="properties">
<ParamField path="provider" type="str" default="unstructured_local">
Which chunking provider to use. Options are "r2r", "unstructured_local", or "unstructured_api".
@@ -273,7 +273,7 @@ const updateResponse = await client.updateFiles(files, {
</ParamField>

<ParamField path="ingestion_config" type="Record<string, any>">
The ingestion config override parameter enables developers to customize their R2R chunking strategy at runtime.
The ingestion config override parameter enables developers to customize their R2R chunking strategy at runtime. Learn more about [configuration here](/documentation/configuration/ingestion/parsing_and_chunking).
<Expandable title="properties">
<ParamField path="provider" type="str" default="r2r">
Which chunking provider to use, `r2r` or `unstructured`. Selecting `unstructured` is generally recommended when parsing with `unstructured` or `unstructured_api`.
@@ -335,6 +335,12 @@ const documentsOverview = await client.documentsOverview();
<ParamField path="document_ids" type="Array<string>">
An optional array of document IDs to filter the overview.
</ParamField>
<ParamField path="offset" type="number">
An optional value to offset the starting point of fetched results. Defaults to `0`.
</ParamField>
<ParamField path="limit" type="number">
An optional value to limit the number of fetched results. Defaults to `100`.
</ParamField>


### Document Chunks
@@ -368,7 +374,15 @@ const chunks = await client.documentChunks(documentId);
<ParamField path="document_id" type="string" required>
The ID of the document to retrieve chunks for.
</ParamField>

<ParamField path="offset" type="number">
An optional value to offset the starting point of fetched results. Defaults to `0`.
</ParamField>
<ParamField path="limit" type="number">
An optional value to limit the number of fetched results. Defaults to `100`.
</ParamField>
<ParamField path="include_vectors" type="boolean">
An optional flag to include the vector associated with each chunk. Defaults to `false`.
</ParamField>

### Delete Documents

21 changes: 19 additions & 2 deletions docs/documentation/python-sdk/ingestion.mdx
@@ -67,7 +67,7 @@ Refer to the [ingestion configuration](/documentation/configuration/ingestion/pa


<ParamField path="ingestion_config" type="Optional[Union[dict, IngestionConfig]]">
The ingestion config override parameter enables developers to customize their R2R chunking strategy at runtime.
The ingestion config override parameter enables developers to customize their R2R chunking strategy at runtime. Learn more about [configuration here](/documentation/configuration/ingestion/parsing_and_chunking).
<Expandable title="Other Provider Options">
<ParamField path="provider" type="str" default="r2r">
Which R2R ingestion provider to use. Options are "r2r".
@@ -287,7 +287,7 @@ The ingestion configuration can be customized analogously to the ingest files en


<ParamField path="ingestion_config" type="Optional[Union[dict, IngestionConfig]]">
The ingestion config override parameter enables developers to customize their R2R chunking strategy at runtime.
The ingestion config override parameter enables developers to customize their R2R chunking strategy at runtime. Learn more about [configuration here](/documentation/configuration/ingestion/parsing_and_chunking).
<Expandable title="Other Provider Options">
<ParamField path="provider" type="str" default="r2r">
Which R2R ingestion provider to use. Options are "r2r".
@@ -458,6 +458,13 @@ documents_overview = client.documents_overview()
<ParamField path="document_ids" type="Optional[list[Union[UUID, str]]]">
An optional list of document IDs to filter the overview.
</ParamField>
<ParamField path="offset" type="Optional[int]">
An optional value to offset the starting point of fetched results. Defaults to `0`.
</ParamField>
<ParamField path="limit" type="Optional[int]">
An optional value to limit the number of fetched results. Defaults to `100`.
</ParamField>


### Document Chunks

@@ -493,6 +500,16 @@ chunks = client.document_chunks(document_id)
<ParamField path="document_id" type="str" required>
The ID of the document to retrieve chunks for.
</ParamField>
<ParamField path="offset" type="Optional[int]">
An optional value to offset the starting point of fetched results. Defaults to `0`.
</ParamField>
<ParamField path="limit" type="Optional[int]">
An optional value to limit the number of fetched results. Defaults to `100`.
</ParamField>
<ParamField path="include_vectors" type="Optional[bool]">
An optional flag to include the vector associated with each chunk. Defaults to `False`.
</ParamField>


### Delete Documents

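A minimal sketch of how a caller might consume the payload once vectors are requested. The response shape follows the row mapping added in this commit (`extraction_id`, `text`, `metadata`, and a `vector` that is `None` unless `include_vectors=True` was passed); the sample data and the `vector_dim` helper are hypothetical:

```python
# Hypothetical chunks mirroring the response shape in this commit;
# "vector" is None when include_vectors was not requested.
chunks = [
    {
        "extraction_id": "e1",
        "document_id": "doc1",
        "text": "First chunk of text...",
        "metadata": {"chunk_order": 0},
        "vector": [0.12, -0.34, 0.56],
    },
    {
        "extraction_id": "e2",
        "document_id": "doc1",
        "text": "Second chunk of text...",
        "metadata": {"chunk_order": 1},
        "vector": None,  # e.g. fetched without include_vectors
    },
]

def vector_dim(chunk: dict) -> int:
    """Return the embedding dimensionality, or 0 when no vector was returned."""
    vec = chunk.get("vector")
    return len(vec) if vec is not None else 0

dims = [vector_dim(c) for c in chunks]
print(dims)  # [3, 0]
```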
14 changes: 12 additions & 2 deletions py/cli/commands/management.py
@@ -126,16 +126,24 @@ def documents_overview(ctx, document_ids, offset, limit):
default=None,
help="The maximum number of nodes to return. Defaults to 100.",
)
@click.option(
"--include-vectors",
is_flag=True,
default=False,
help="Include the vector for each chunk in the response.",
)
@pass_context
def document_chunks(ctx, document_id, offset, limit):
def document_chunks(ctx, document_id, offset, limit, include_vectors):
"""Get chunks of a specific document."""
client = ctx.obj
if not document_id:
click.echo("Error: Document ID is required.")
return

with timer():
chunks_data = client.document_chunks(document_id, offset, limit)
chunks_data = client.document_chunks(
document_id, offset, limit, include_vectors
)

chunks = chunks_data["results"]
if not chunks:
@@ -150,5 +158,7 @@ def document_chunks(ctx, document_id, offset, limit):
click.echo(f"Extraction ID: {chunk.get('id', 'N/A')}")
click.echo(f"Text: {chunk.get('text', '')[:100]}...")
click.echo(f"Metadata: {chunk.get('metadata', {})}")
if include_vectors:
click.echo(f"Vector: {chunk.get('vector', 'N/A')}")
else:
click.echo(f"Unexpected chunk format: {chunk}")
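The CLI's per-chunk printing logic can be isolated into a small, testable helper. This is a hypothetical refactor of the `click.echo` lines above, not code from the commit:

```python
def format_chunk(chunk, include_vectors: bool = False) -> list:
    """Build the output lines the CLI echoes for one chunk: id, truncated
    text, metadata, and (only when requested) the vector."""
    if not isinstance(chunk, dict):
        return [f"Unexpected chunk format: {chunk}"]
    lines = [
        f"Extraction ID: {chunk.get('id', 'N/A')}",
        f"Text: {chunk.get('text', '')[:100]}...",
        f"Metadata: {chunk.get('metadata', {})}",
    ]
    if include_vectors:
        lines.append(f"Vector: {chunk.get('vector', 'N/A')}")
    return lines

out = format_chunk({"id": "abc", "text": "hello", "metadata": {}, "vector": [0.1]},
                   include_vectors=True)
print(out[-1])  # Vector: [0.1]
```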
3 changes: 2 additions & 1 deletion py/core/main/api/management_router.py
@@ -367,12 +367,13 @@ async def document_chunks_app(
document_id: str = Path(...),
offset: Optional[int] = Query(0, ge=0),
limit: Optional[int] = Query(100, ge=0),
include_vectors: Optional[bool] = Query(False),
auth_user=Depends(self.service.providers.auth.auth_wrapper),
) -> WrappedDocumentChunkResponse:
document_uuid = UUID(document_id)

document_chunks = await self.service.document_chunks(
document_uuid, offset, limit
document_uuid, offset, limit, include_vectors
)

document_chunks_result = document_chunks["results"]
6 changes: 5 additions & 1 deletion py/core/main/services/management_service.py
@@ -365,11 +365,15 @@ async def document_chunks(
document_id: UUID,
offset: int = 0,
limit: int = 100,
include_vectors: bool = False,
*args,
**kwargs,
):
return self.providers.database.vector.get_document_chunks(
document_id, offset=offset, limit=limit
document_id,
offset=offset,
limit=limit,
include_vectors=include_vectors,
)

@telemetry_event("AssignDocumentToCollection")
19 changes: 16 additions & 3 deletions py/core/providers/database/vector.py
@@ -1,5 +1,6 @@
import concurrent.futures
import copy
import json
import logging
import time
from concurrent.futures import ThreadPoolExecutor
@@ -490,16 +491,25 @@ def delete_collection(self, collection_id: str) -> None:
raise

def get_document_chunks(
self, document_id: str, offset: int = 0, limit: int = -1
self,
document_id: str,
offset: int = 0,
limit: int = -1,
include_vectors: bool = False,
) -> dict[str, Any]:
if not self.collection:
raise ValueError("Collection is not initialized.")

limit_clause = f"LIMIT {limit}" if limit != -1 else ""
table_name = self.collection.table.name

select_clause = "SELECT extraction_id, document_id, user_id, collection_ids, text, metadata"
if include_vectors:
select_clause += ", vec"

query = text(
f"""
SELECT extraction_id, document_id, user_id, collection_ids, text, metadata, COUNT(*) OVER() AS total
{select_clause}, COUNT(*) OVER() AS total
FROM {self.project_name}."{table_name}"
WHERE document_id = :document_id
ORDER BY CAST(metadata->>'chunk_order' AS INTEGER)
Expand All @@ -518,7 +528,7 @@ def get_document_chunks(
total = 0

if results:
total = results[0][6]
total = results[0][-1] # Get the total count from the last column
chunks = [
{
"extraction_id": result[0],
@@ -527,6 +537,9 @@
"collection_ids": result[3],
"text": result[4],
"metadata": result[5],
"vector": (
json.loads(result[6]) if include_vectors else None
),
}
for result in results
]
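The two moving parts of the `vector.py` change — the dynamic SELECT clause and the row-to-dict mapping — can be sketched in isolation. Column names follow the diff; the sample row is fabricated, and the assumption that the vector column arrives as a JSON-encoded string (hence `json.loads`) mirrors the commit:

```python
import json

BASE_COLUMNS = "extraction_id, document_id, user_id, collection_ids, text, metadata"

def build_select(include_vectors: bool) -> str:
    """Append the vec column only on request; total is always last,
    which is why the commit reads the count from result[-1]."""
    select_clause = f"SELECT {BASE_COLUMNS}"
    if include_vectors:
        select_clause += ", vec"
    return f"{select_clause}, COUNT(*) OVER() AS total"

def map_row(result: tuple, include_vectors: bool) -> dict:
    """Mirror the chunk dict built in get_document_chunks."""
    return {
        "extraction_id": result[0],
        "document_id": result[1],
        "user_id": result[2],
        "collection_ids": result[3],
        "text": result[4],
        "metadata": result[5],
        # When vectors are included, vec occupies index 6 and is JSON-decoded.
        "vector": json.loads(result[6]) if include_vectors else None,
    }

row = ("e1", "d1", "u1", ["c1"], "some text", {}, "[0.1, 0.2]", 1)
chunk = map_row(row, include_vectors=True)
print(chunk["vector"])  # [0.1, 0.2]
```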
3 changes: 3 additions & 0 deletions py/sdk/management.py
@@ -248,6 +248,7 @@ async def document_chunks(
document_id: str,
offset: Optional[int] = None,
limit: Optional[int] = None,
include_vectors: Optional[bool] = False,
) -> dict:
"""
Get the chunks for a document.
@@ -263,6 +264,8 @@
params["offset"] = offset
if limit is not None:
params["limit"] = limit
if include_vectors:
params["include_vectors"] = include_vectors
if not params:
return await client._make_request(
"GET", f"document_chunks/{document_id}"
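The SDK's conditional query-string assembly can be sketched on its own. This hypothetical helper mirrors the logic above: unset values are omitted, and `include_vectors` is only sent when truthy, so the server default of `False` applies otherwise:

```python
def build_params(offset=None, limit=None, include_vectors=False) -> dict:
    """Assemble query params for the document_chunks request, sending
    only the values the caller actually set."""
    params = {}
    if offset is not None:
        params["offset"] = offset
    if limit is not None:
        params["limit"] = limit
    if include_vectors:
        params["include_vectors"] = include_vectors
    return params

print(build_params(limit=10, include_vectors=True))
# {'limit': 10, 'include_vectors': True}
```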
1 change: 1 addition & 0 deletions py/shared/api/models/management/responses.py
@@ -106,6 +106,7 @@ class DocumentChunkResponse(BaseModel):
collection_ids: list[UUID]
text: str
metadata: dict[str, Any]
vector: Optional[list[float]] = None


KnowledgeGraphResponse = str
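The response-model change amounts to one new optional field. A stdlib dataclass stand-in for the pydantic `BaseModel` (field names follow the diff context and the row mapping in `vector.py`) shows the backward-compatible default:

```python
from dataclasses import dataclass
from typing import Any, Optional
from uuid import UUID

@dataclass
class DocumentChunkResponse:
    extraction_id: UUID
    document_id: UUID
    user_id: UUID
    collection_ids: list
    text: str
    metadata: dict
    # New in this commit: populated only when include_vectors=True,
    # so existing callers that never request vectors see None.
    vector: Optional[list] = None

chunk = DocumentChunkResponse(
    extraction_id=UUID(int=1),
    document_id=UUID(int=2),
    user_id=UUID(int=3),
    collection_ids=[],
    text="hello",
    metadata={},
)
print(chunk.vector)  # None
```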
