Skip to content

Commit

Permalink
Merge pull request #268 from docqai/build/update-streamlit-package
Browse files Browse the repository at this point in the history
refactor: update streamlit version plus improvements to API need by Chrome Ext
  • Loading branch information
janaka authored Sep 7, 2024
2 parents 104782d + 79acfce commit 2e81525
Show file tree
Hide file tree
Showing 57 changed files with 2,653 additions and 2,206 deletions.
3,502 changes: 1,704 additions & 1,798 deletions poetry.lock

Large diffs are not rendered by default.

14 changes: 7 additions & 7 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "docq"
version = "0.12.1"
version = "0.13.1"
description = "Docq.AI - Your private ChatGPT alternative. Securely unlock knowledge from confidential documents."
authors = ["Docq.AI Team <[email protected]>"]
maintainers = ["Docq.AI Team <[email protected]>"]
Expand All @@ -13,16 +13,16 @@ packages = [{ include = "docq", from = "source" }]

[tool.poetry.dependencies]
python = ">=3.10,<3.12"
streamlit = "1.30.0"
st-pages = "^0.4.1"
streamlit = "^1.38.0"
st-pages = "^1.0.0"
pypdf = "^4.1.0"
docx2txt = "^0.8"
argon2-cffi = "^21.3.0"
azure-core = "^1.27.1"
opendal = "^0.41.0"
transformers = "^4.38.2"
optimum = {extras = ["exporters"], version = "^1.17.1"}
torch = "^2.0.0, !=2.0.1, !=2.1.0"
torch = "2.2.0"
cryptography = "^42.0.4"
span-marker = "^1.3.0"
honeycomb-opentelemetry = "^0.2.3b0"
Expand All @@ -47,7 +47,7 @@ google-auth-oauthlib = "^1.1.0"
google-api-python-client = "^2.104.0"
google-auth-httplib2 = "^0.1.1"
microsoftgraph-python = "^1.1.6"
pydantic = "^2.5.2"
pydantic = "2.8.2"
mkdocs-material = "^9.5.13"
pyautogen = "^0.2.2"
termcolor = "^2.4.0"
Expand All @@ -57,14 +57,14 @@ semantic-kernel = "0.4.3.dev0"
imap-tools = "^1.5.0"
llama-index-llms-litellm = "^0.1.3"
llama-index-embeddings-azure-openai = "^0.1.6"
jwt = "^1.3.1"
llama-index-embeddings-huggingface-optimum = "^0.1.5"
llama-index-core = "^0.10.39"
llama-index-readers-file = "^0.1.12"
slack-bolt = "^1.18.1"
llama-index-retrievers-bm25 = "^0.1.3"
sentence-transformers = "^2.6.1"
llama-index-postprocessor-colbert-rerank = "^0.1.2"
jwt = "^1.3.1"
llama-index-core = "0.10.39"

[tool.poetry.group.dev.dependencies]
pre-commit = "^2.18.1"
Expand Down
6 changes: 3 additions & 3 deletions source/docq/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,10 +28,10 @@
class SpaceType(Enum):
"""Space types. These reflect scope of data access."""

PERSONAL = "personal"
PERSONAL = "personal" # DEPRECATED. Personal spaces are now shared spaces in the users personal org.
SHARED = "shared"
PUBLIC = "public"
THREAD = "thread"
PUBLIC = "public" # public spaces are accessible to all users and anonymous users such as via widgets for chat bots
THREAD = "thread" # a space that belongs to a thread used for adhoc uploads.


class SystemFeatureType(Enum):
Expand Down
30 changes: 21 additions & 9 deletions source/docq/manage_assistants.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@
from llama_index.core.base.llms.types import ChatMessage, MessageRole
from llama_index.core.prompts import ChatPromptTemplate

from .domain import Assistant, AssistantType
from .support.store import (
from docq.domain import Assistant, AssistantType
from docq.support.store import (
get_sqlite_global_system_file,
get_sqlite_org_system_file,
)
Expand Down Expand Up @@ -126,7 +126,7 @@
updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
)
"""

# id, name, type, archived, system_prompt_template, user_prompt_template, llm_settings_collection_key, created_at, updated_at, scoped_id
ASSISTANT = tuple[int, str, str, bool, str, str, str, datetime, datetime, str]


Expand Down Expand Up @@ -196,10 +196,17 @@ def get_assistant_fixed(
return result


def get_assistant_or_default(assistant_scoped_id: Optional[int] = None, org_id: Optional[int] = None) -> Assistant:
"""Get the persona."""
def get_assistant_or_default(assistant_scoped_id: Optional[str] = None, org_id: Optional[int] = None) -> Assistant:
"""Get the persona.
Args:
assistant_scoped_id (Optional[str]): The assistant scoped ID. A composite ID <scope>_<id>.
scope is either 'org' or 'global'. id from the respective table.
org_id (Optional[int]): The org ID.
"""
if assistant_scoped_id:
assistant_data = get_assistant(assistant_scoped_id=str(assistant_scoped_id), org_id=org_id)
assistant_data = get_assistant(assistant_scoped_id=assistant_scoped_id, org_id=org_id)
return Assistant(
key=str(assistant_data[0]),
name=assistant_data[1],
Expand All @@ -209,7 +216,11 @@ def get_assistant_or_default(assistant_scoped_id: Optional[int] = None, org_id:
)
else:
key = "default"
return Assistant(key=key, **SIMPLE_CHAT_PERSONAS[key])
return Assistant(
key=key,
llm_settings_collection_key="azure_openai_with_local_embedding",
**SIMPLE_CHAT_PERSONAS[key],
)


def list_assistants(org_id: Optional[int] = None, assistant_type: Optional[AssistantType] = None) -> list[ASSISTANT]:
Expand Down Expand Up @@ -258,6 +269,7 @@ def get_assistant(assistant_scoped_id: str, org_id: Optional[int]) -> ASSISTANT:
if scope == "org" and org_id:
path = __get_assistants_sqlite_file(org_id=org_id)
else:
# global scope
path = __get_assistants_sqlite_file(org_id=None)

with closing(sqlite3.connect(path, detect_types=sqlite3.PARSE_DECLTYPES)) as connection, closing(
Expand All @@ -271,10 +283,10 @@ def get_assistant(assistant_scoped_id: str, org_id: Optional[int]) -> ASSISTANT:
if row is None:
if org_id and scope == "org":
raise ValueError(
f"No Persona with: id = '{assistant_scoped_id}' that belongs to org org_id= '{org_id}', scope= '{scope}'"
f"No Assistant with: id = '{id_}' that belongs to org org_id= '{org_id}', scope= '{scope}'"
)
else:
raise ValueError(f"No Persona with: id = '{assistant_scoped_id}' in global scope. scope= '{scope}'")
raise ValueError(f"No Assistant with: id = '{id_}' in global scope. scope= '{scope}'")
return (row[0], row[1], row[2], row[3], row[4], row[5], row[6], row[7], row[8], assistant_scoped_id)


Expand Down
19 changes: 9 additions & 10 deletions source/docq/manage_spaces.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,14 +11,13 @@
from opentelemetry import trace

import docq

from .access_control.main import SpaceAccessor, SpaceAccessType
from .config import SpaceType
from .data_source.list import SpaceDataSources
from .domain import DocumentListItem, SpaceKey
from .manage_indices import _create_vector_index, _persist_index
from .model_selection.main import get_saved_model_settings_collection
from .support.store import get_sqlite_shared_system_file
from docq.access_control.main import SpaceAccessor, SpaceAccessType
from docq.config import SpaceType
from docq.data_source.list import SpaceDataSources
from docq.domain import DocumentListItem, SpaceKey
from docq.manage_indices import _create_vector_index, _persist_index
from docq.model_selection.main import get_saved_model_settings_collection
from docq.support.store import get_sqlite_shared_system_file

tracer = trace.get_tracer(__name__, docq.__version_str__)

Expand Down Expand Up @@ -151,6 +150,7 @@ def list_space(org_id: int, space_type: Optional[str] = None) -> list[SPACE]:
)

rows = cursor.fetchall()
print("spaces:", rows)
return [_format_space(row) for row in rows]


Expand Down Expand Up @@ -354,8 +354,7 @@ def get_thread_space(org_id: int, thread_id: int) -> SpaceKey | None:
with closing(
sqlite3.connect(get_sqlite_shared_system_file(), detect_types=sqlite3.PARSE_DECLTYPES)
) as connection, closing(connection.cursor()) as cursor:

name = f"Thread-{thread_id} %"
name = f"Thread-{thread_id} %" # FIXME: urg this is nasty.
cursor.execute(
"SELECT id FROM spaces WHERE org_id = ? AND name LIKE ? AND space_type = ?",
(org_id, name, SpaceType.THREAD.name),
Expand Down
39 changes: 21 additions & 18 deletions source/docq/model_selection/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,19 @@
from typing import Any, Dict, Mapping, Optional

import docq
from docq.config import (
ENV_VAR_DOCQ_AZURE_OPENAI_API_BASE1,
ENV_VAR_DOCQ_AZURE_OPENAI_API_BASE2,
ENV_VAR_DOCQ_AZURE_OPENAI_API_KEY1,
ENV_VAR_DOCQ_AZURE_OPENAI_API_KEY2,
ENV_VAR_DOCQ_AZURE_OPENAI_API_VERSION,
ENV_VAR_DOCQ_GROQ_API_KEY,
EXPERIMENTS,
OrganisationSettingsKey,
)
from docq.manage_settings import get_organisation_settings
from docq.support.llama_index.callbackhandlers import OtelCallbackHandler
from docq.support.store import get_models_dir
from llama_index.core.callbacks.base import CallbackManager
from llama_index.core.embeddings import BaseEmbedding
from llama_index.core.llms import LLM
Expand All @@ -25,20 +38,6 @@
from opentelemetry import trace
from vertexai.preview.generative_models import HarmBlockThreshold, HarmCategory

from ..config import (
ENV_VAR_DOCQ_AZURE_OPENAI_API_BASE1,
ENV_VAR_DOCQ_AZURE_OPENAI_API_BASE2,
ENV_VAR_DOCQ_AZURE_OPENAI_API_KEY1,
ENV_VAR_DOCQ_AZURE_OPENAI_API_KEY2,
ENV_VAR_DOCQ_AZURE_OPENAI_API_VERSION,
ENV_VAR_DOCQ_GROQ_API_KEY,
EXPERIMENTS,
OrganisationSettingsKey,
)
from ..manage_settings import get_organisation_settings
from ..support.llama_index.callbackhandlers import OtelCallbackHandler
from ..support.store import get_models_dir

tracer = trace.get_tracer(__name__, docq.__version_str__)


Expand Down Expand Up @@ -165,9 +164,10 @@ class LlmUsageSettingsCollection:
provider=ModelProvider.AZURE_OPENAI,
model_name="gpt-4o",
model_deployment_name="gpt-4o-2024-05-13",
api_base=os.getenv(ENV_VAR_DOCQ_AZURE_OPENAI_API_BASE2) or "",
api_key=os.getenv(ENV_VAR_DOCQ_AZURE_OPENAI_API_KEY2) or "",
api_version=os.environ.get(ENV_VAR_DOCQ_AZURE_OPENAI_API_VERSION, "2023-05-15"),
api_base=os.getenv(ENV_VAR_DOCQ_AZURE_OPENAI_API_BASE2) or "base url missing",
api_key=os.getenv(ENV_VAR_DOCQ_AZURE_OPENAI_API_KEY2) or "api key missing",
# api_version=os.environ.get(ENV_VAR_DOCQ_AZURE_OPENAI_API_VERSION, "2023-05-15"),
api_version=os.environ.get(ENV_VAR_DOCQ_AZURE_OPENAI_API_VERSION, "2024-07-01-preview"),
license_="Commercial",
),
"azure-openai-ada-002": LlmServiceInstanceConfig(
Expand Down Expand Up @@ -282,7 +282,7 @@ class LlmUsageSettingsCollection:
ModelCapability.CHAT: LlmUsageSettings(
model_capability=ModelCapability.CHAT,
temperature=0.7,
service_instance_config=LLM_SERVICE_INSTANCES["azure-openai-gpt35turbo"],
service_instance_config=LLM_SERVICE_INSTANCES["azure-openai-gpt4o-2024-05-13"],
),
ModelCapability.EMBEDDING: LlmUsageSettings(
model_capability=ModelCapability.EMBEDDING,
Expand Down Expand Up @@ -412,6 +412,9 @@ class LlmUsageSettingsCollection:
def get_model_settings_collection(model_settings_collection_key: str) -> LlmUsageSettingsCollection:
"""Get the settings for the model."""
try:
x = os.getenv(ENV_VAR_DOCQ_AZURE_OPENAI_API_BASE2)
if not x:
raise ValueError("Azure OpenAI API base 2 is missing")
return LLM_MODEL_COLLECTIONS[model_settings_collection_key]
except KeyError as e:
log.error(
Expand Down
52 changes: 41 additions & 11 deletions source/docq/run_queries.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,28 +2,28 @@

import logging as log
import sqlite3
from concurrent.futures import thread
from contextlib import closing
from datetime import datetime
from typing import Literal, Optional

from llama_index.core.llms import ChatMessage, MessageRole
from numpy import int32

from docq.config import OrganisationFeatureType
from docq.domain import FeatureKey, SpaceKey
from docq.manage_assistants import Assistant
from docq.manage_documents import format_document_sources
from docq.model_selection.main import LlmUsageSettingsCollection

from .config import OrganisationFeatureType
from .domain import FeatureKey, SpaceKey
from .manage_assistants import Assistant
from .manage_documents import format_document_sources
from .support.llm import query_error, run_ask, run_chat
from .support.store import (
from docq.support.llm import query_error, run_ask, run_chat
from docq.support.store import (
get_history_table_name,
get_history_thread_table_name,
get_public_sqlite_usage_file,
get_sqlite_usage_file,
)

# TODO: add thread_space_id to hold the space that's hard attached to a thread for adhoc uploads
# add space_ids dict / array to loosely persist space ids that are selected by a user.
# add assistant_scoped_id to hold the assistant that's attached to the thread.
SQL_CREATE_THREAD_TABLE = """
CREATE TABLE IF NOT EXISTS {table} (
id INTEGER PRIMARY KEY,
Expand All @@ -32,6 +32,7 @@
)
"""


SQL_CREATE_MESSAGE_TABLE = """
CREATE TABLE IF NOT EXISTS {table} (
id INTEGER PRIMARY KEY,
Expand All @@ -52,6 +53,7 @@


def _save_messages(data: list[tuple[str, bool, datetime, int]], feature: FeatureKey) -> list:
"""feature.id_ needs to be the user_id."""
rows = []
tablename = get_history_table_name(feature.type_)
thread_tablename = get_history_thread_table_name(feature.type_)
Expand Down Expand Up @@ -124,7 +126,7 @@ def _retrieve_messages(


def list_thread_history(feature: FeatureKey, id_: Optional[int] = None) -> list[tuple[int, str, int]]:
"""List the history of threads."""
"""List threads or a thread if id_ is provided."""
tablename = get_history_thread_table_name(feature.type_)
rows = None
with closing(
Expand Down Expand Up @@ -206,7 +208,7 @@ def get_history_as_chat_messages(
return history_chat_message


def create_history_thread(topic: str, feature: FeatureKey) -> int:
def create_history_thread(topic: str, feature: FeatureKey) -> int | None:
"""Create a new thread for the history i.e a new chat session."""
tablename = get_history_thread_table_name(feature.type_)
with closing(
Expand All @@ -225,6 +227,34 @@ def create_history_thread(topic: str, feature: FeatureKey) -> int:

return id_

def delete_thread(thread_id: int, feature: FeatureKey) -> bool:
    """Delete a thread and its associated messages.

    Args:
        thread_id (int): The id of the thread to delete.
        feature (FeatureKey): The feature the thread belongs to. `feature.id_` needs to be the user_id.

    Returns:
        bool: True when both DELETE statements were committed without a SQLite error
            (this is also the case when no row matched `thread_id`). False when a
            `sqlite3.Error` was raised, in which case the transaction is rolled back.
    """
    thread_tablename = get_history_thread_table_name(feature.type_)
    message_tablename = get_history_table_name(feature.type_)
    usage_file = (
        get_sqlite_usage_file(feature.id_)
        if feature.type_ != OrganisationFeatureType.ASK_PUBLIC
        else get_public_sqlite_usage_file(str(feature.id_))
    )
    with closing(sqlite3.connect(usage_file, detect_types=sqlite3.PARSE_DECLTYPES)) as connection, closing(
        connection.cursor()
    ) as cursor:
        cursor.execute("PRAGMA foreign_keys = ON;")
        try:
            # Remove the messages first, then the thread row they belong to.
            cursor.execute(f"DELETE FROM {message_tablename} WHERE thread_id = ?", (thread_id,))  # noqa: S608
            cursor.execute(f"DELETE FROM {thread_tablename} WHERE id = ?", (thread_id,))  # noqa: S608
            connection.commit()
        except sqlite3.Error:
            connection.rollback()
            # Callers only get a bool back, so record the cause here instead of dropping it silently.
            log.exception(
                "Failed to delete thread id='%s' from table '%s'. Transaction rolled back.",
                thread_id,
                thread_tablename,
            )
            return False
    return True


def get_latest_thread(feature: FeatureKey) -> tuple[int, str, int] | None:
"""Retrieve the most recently created thread.
Expand Down
4 changes: 2 additions & 2 deletions source/docq/support/auth_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,11 @@
from typing import Dict, Optional

import docq
import streamlit as st
from cachetools import TTLCache
from cryptography.fernet import Fernet
from opentelemetry import trace
from streamlit.components.v1 import html
from streamlit.web.server.websocket_headers import _get_websocket_headers

from ..config import ENV_VAR_DOCQ_COOKIE_HMAC_SECRET_KEY, SESSION_COOKIE_NAME

Expand Down Expand Up @@ -80,7 +80,7 @@ def _clear_cookie(cookie_name: str) -> None:
def _get_cookies() -> Optional[Dict[str, str]]:
"""Return client cookies."""
try:
headers = _get_websocket_headers()
headers = st.context.headers # _get_websocket_headers()
if headers is None:
return None
cookie_str = str(headers.get("Cookie"))
Expand Down
Loading

0 comments on commit 2e81525

Please sign in to comment.