-
Notifications
You must be signed in to change notification settings - Fork 11
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
refactor: switch shared as to use a Llama Index pipeline implementati…
…on and adopt HyDE. (#259) * refactor(query): switch to implementation that uses Llama Index pipelines. Switch to HyDE rather than a query rewrite technique. Still running BM25 retrieval also and using reciprocal rank fusion as reranker. * refactor: fix some undefined type errors * build: update to Ruff v0.2.2 -> v0.4.4 * build: update llama-index-core from v0.10.21.post1 to v0.10.39 * refactor: split things into files a bit more to avoid circular deps. * docs(readme): update 2nd level tag line to make it clearer. * tests: fix failing test from moving some indexing and storage functions to a different file. * feat: add GPT-4o support * refactor: move azure env var strings to config file and reference by py variable in code * update azure_openai_latest setting to use gpt-4o
- Loading branch information
Showing
24 changed files
with
1,060 additions
and
390 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,6 @@ | ||
[tool.poetry] | ||
name = "docq" | ||
version = "0.10.6" | ||
version = "0.11.0" | ||
description = "Docq.AI - Your private ChatGPT alternative. Securely unlock knowledge from confidential documents." | ||
authors = ["Docq.AI Team <[email protected]>"] | ||
maintainers = ["Docq.AI Team <[email protected]>"] | ||
|
@@ -59,11 +59,12 @@ llama-index-llms-litellm = "^0.1.3" | |
llama-index-embeddings-azure-openai = "^0.1.6" | ||
jwt = "^1.3.1" | ||
llama-index-embeddings-huggingface-optimum = "^0.1.5" | ||
llama-index-core = "^0.10.21.post1" | ||
llama-index-core = "^0.10.39" | ||
llama-index-readers-file = "^0.1.12" | ||
slack-bolt = "^1.18.1" | ||
llama-index-retrievers-bm25 = "^0.1.3" | ||
sentence-transformers = "^2.6.1" | ||
llama-index-postprocessor-colbert-rerank = "^0.1.2" | ||
|
||
[tool.poetry.group.dev.dependencies] | ||
pre-commit = "^2.18.1" | ||
|
@@ -83,7 +84,7 @@ mkapi = "^1.0.14" | |
pillow = "^10.2.0" | ||
cairosvg = "^2.7.0" | ||
python-dotenv = "^1.0.0" | ||
ruff = "^0.2.2" | ||
ruff = "^0.4.4" | ||
mkdocs = "^1.5.3" | ||
|
||
[tool.poetry.scripts] | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,89 @@ | ||
"""Functions to manage indices.""" | ||
|
||
import logging as log | ||
from typing import List | ||
|
||
from llama_index.core.indices import DocumentSummaryIndex, VectorStoreIndex | ||
from llama_index.core.indices.base import BaseIndex | ||
from llama_index.core.indices.loading import load_index_from_storage | ||
from llama_index.core.schema import Document | ||
from opentelemetry import trace | ||
from opentelemetry.trace import Status, StatusCode | ||
|
||
import docq | ||
|
||
from .domain import SpaceKey | ||
from .model_selection.main import LlmUsageSettingsCollection, ModelCapability, _get_service_context | ||
from .support.store import _get_default_storage_context, _get_storage_context, get_index_dir | ||
|
||
tracer = trace.get_tracer(__name__, docq.__version_str__) | ||
|
||
|
||
@tracer.start_as_current_span("manage_spaces._create_vector_index")
def _create_vector_index(
    documents: List[Document], model_settings_collection: LlmUsageSettingsCollection
) -> VectorStoreIndex:
    """Build a `VectorStoreIndex` over the given documents.

    Uses the default storage and service contexts purely so the index can be
    persisted afterwards (see `_persist_index`).
    """
    chat_model_args = model_settings_collection.model_usage_settings[ModelCapability.CHAT].additional_args
    # NOTE(review): `kwargs=` passes a single keyword literally named "kwargs" —
    # confirm `from_documents` expects this rather than `**chat_model_args`.
    return VectorStoreIndex.from_documents(
        documents,
        storage_context=_get_default_storage_context(),
        service_context=_get_service_context(model_settings_collection),
        kwargs=chat_model_args,
    )
|
||
|
||
@tracer.start_as_current_span("manage_spaces._create_document_summary_index")
def _create_document_summary_index(
    documents: List[Document], model_settings_collection: LlmUsageSettingsCollection
) -> DocumentSummaryIndex:
    """Create an index of summaries for each document. This doesn't create an embedding for each node.

    Args:
        documents: The documents to summarise and index.
        model_settings_collection: Model settings used to build the service context.

    Returns:
        The in-memory `DocumentSummaryIndex`; persist separately via `_persist_index`.
    """
    # BUG FIX: `from_documents` is a classmethod, so the previous form
    # `DocumentSummaryIndex(embed_summaries=True).from_documents(...)` built a
    # throwaway instance and discarded `embed_summaries=True`. Pass the flag
    # through `from_documents` so it reaches the constructor of the real index.
    return DocumentSummaryIndex.from_documents(
        documents,
        storage_context=_get_default_storage_context(),
        service_context=_get_service_context(model_settings_collection),
        embed_summaries=True,
        kwargs=model_settings_collection.model_usage_settings[ModelCapability.CHAT].additional_args,
    )
|
||
|
||
@tracer.start_as_current_span("manage_spaces._persist_index")
def _persist_index(index: BaseIndex, space: SpaceKey) -> None:
    """Persist a Space datasource index to disk, under the space's index directory."""
    index.storage_context.persist(persist_dir=get_index_dir(space))
|
||
|
||
@tracer.start_as_current_span(name="_load_index_from_storage")
def _load_index_from_storage(space: SpaceKey, model_settings_collection: LlmUsageSettingsCollection) -> BaseIndex:
    """Load a single space's persisted index from its storage directory.

    The service context is set explicitly for multi-model compatibility.
    """
    service_context = _get_service_context(model_settings_collection)
    storage_context = _get_storage_context(space)
    return load_index_from_storage(
        storage_context=storage_context,
        service_context=service_context,
        callback_manager=service_context.callback_manager,
    )
|
||
|
||
def load_indices_from_storage(
    spaces: List[SpaceKey], model_settings_collection: LlmUsageSettingsCollection
) -> List[BaseIndex]:
    """Return a list of indices for the given list of spaces.

    Spaces whose index fails to load (e.g. the space hasn't been indexed yet)
    are skipped with a warning, so the returned list may be shorter than `spaces`.
    """
    with tracer.start_as_current_span("indices_from_spaces") as span:
        indices = []
        for space in spaces:
            try:
                index_ = _load_index_from_storage(space, model_settings_collection)
                # FIX: the log message previously said "run_chat()" — a stale
                # copy-paste from another function that made debug traces misleading.
                log.debug("load_indices_from_storage(): %s, %s", index_.index_id, space.summary)
                indices.append(index_)
                span.add_event(
                    name="index_appended",
                    attributes={"index_id": index_.index_id, "index_struct_cls": index_.index_struct_cls.__name__},
                )
            except Exception as e:
                # Best-effort: record the failure on the span and keep loading the rest.
                span.set_status(status=Status(StatusCode.ERROR))
                span.record_exception(e)
                log.warning(
                    "Index for space '%s' failed to load, skipping. Maybe the index isn't created yet. Error message: %s",
                    space,
                    e,
                )
                continue
        span.add_event("indices_loaded", {"num_indices_loaded": len(indices), "num_spaces_given": len(spaces)})
        return indices
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.