Skip to content

Commit

Permalink
chore: Add File storage service handlers.
Browse files Browse the repository at this point in the history
  • Loading branch information
osala-eng committed Nov 9, 2023
1 parent d633f40 commit 3dbb31d
Show file tree
Hide file tree
Showing 2 changed files with 63 additions and 23 deletions.
24 changes: 11 additions & 13 deletions source/docq/data_source/googledrive.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,9 @@

from llama_index import Document

from ..config import ConfigKeyHandlers, ConfigKeyOptions
from ..domain import ConfigKey, SpaceKey
from ..support.store import get_index_dir
from .main import DocumentMetadata, SpaceDataSourceFileBased
from .main import DocumentMetadata, FileStorageServiceKeys, SpaceDataSourceFileBased
from .support.opendal_reader.base import GoogleDriveReader, OpendalReader


Expand All @@ -24,15 +23,14 @@ def get_config_keys(self: Self) -> List[ConfigKey]:
"""Get the config keys for google drive."""
return [
ConfigKey(
ConfigKeyHandlers.GET_GDRIVE_CREDENTIAL.name,
f"{FileStorageServiceKeys.GOOGLE_DRIVE}-credential",
"Credential",
is_hidden=True,
input_element="credential_request",
is_secret=True,
ref_link="https://docqai.github.io/docq/user-guide/config-spaces/#data-source-google-drive",
),
ConfigKey(
ConfigKeyOptions.GET_GDRIVE_OPTIONS.name,
f"{FileStorageServiceKeys.GOOGLE_DRIVE}-root_path",
"Select a folder",
input_element="selectbox",
ref_link="https://docqai.github.io/docq/user-guide/config-spaces/#data-source-google-drive",
),
]
Expand All @@ -51,11 +49,11 @@ def lambda_metadata(x: str) -> dict:
str(DocumentMetadata.INDEXED_ON.name).lower(): datetime.timestamp(datetime.now().utcnow()),
}

drive_options = configs[ConfigKeyOptions.GET_GDRIVE_OPTIONS.name]
root_path = configs[f"{FileStorageServiceKeys.GOOGLE_DRIVE}-root_path"]

options = {
"root": drive_options["name"],
"access_token": json.dumps(configs[ConfigKeyHandlers.GET_GDRIVE_CREDENTIAL.name]),
"root": root_path["name"],
"access_token": json.dumps(configs[f"{FileStorageServiceKeys.GOOGLE_DRIVE}-credential"]),
}

try:
Expand All @@ -68,9 +66,9 @@ def lambda_metadata(x: str) -> dict:
log.error("Failed to load google drive with opendal reader: %s", e)
loader = GoogleDriveReader(
file_metadata=lambda_metadata,
root=drive_options["name"],
access_token=configs[ConfigKeyHandlers.GET_GDRIVE_CREDENTIAL.name],
selected_folder_id=drive_options["id"]
root=root_path["name"],
access_token=configs[f"{FileStorageServiceKeys.GOOGLE_DRIVE}-credential"],
selected_folder_id=root_path["id"]
)

documents = loader.load_data()
Expand Down
62 changes: 52 additions & 10 deletions source/docq/data_source/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from abc import ABC, abstractmethod
from dataclasses import asdict
from enum import Enum
from typing import List
from typing import Any, Callable, List, Literal, Self

from llama_index import Document

Expand All @@ -26,29 +26,38 @@ class DocumentMetadata(Enum):
SOURCE_URI = "Source URI"


class FileStorageServiceKeys(Enum):
    """Identifiers for the file storage services known to the data source layer.

    Members are used as keys of the file storage handler registry and are
    interpolated into config key names (e.g. the Google Drive data source builds
    ``f"{FileStorageServiceKeys.GOOGLE_DRIVE}-credential"``).

    NOTE(review): this is a plain ``Enum``, so formatting a member in an f-string
    yields ``"FileStorageServiceKeys.GOOGLE_DRIVE"``, not the value
    ``"Google Drive"`` - confirm that is the intended config key prefix.
    """

    # Values look like human-readable service names - presumably for display; verify against callers.
    GOOGLE_DRIVE = "Google Drive"
    ONEDRIVE = "OneDrive"
    DROPBOX = "Dropbox"
    BOX = "Box"


class SpaceDataSource(ABC):
"""Abstract definition of the data source for a space. To be extended by concrete data sources."""

def __init__(self, name: str) -> None:
def __init__(self: Self, name: str) -> None:
"""Initialize the data source."""
self.name = name

def get_name(self) -> str:
def get_name(self: Self) -> str:
"""Get the name of the data source."""
return self.name

@abstractmethod
def get_config_keys(self) -> List[ConfigKey]:
def get_config_keys(self: Self) -> List[ConfigKey]:
"""Get the list of config keys."""
pass

@abstractmethod
def load(self, space: SpaceKey, configs: dict) -> List[Document]:
def load(self: Self, space: SpaceKey, configs: dict) -> List[Document]:
"""Load the documents from the data source."""
pass

@abstractmethod
def get_document_list(self, space: SpaceKey, configs: dict) -> List[DocumentListItem]:
def get_document_list(self: Self, space: SpaceKey, configs: dict) -> List[DocumentListItem]:
"""Returns a list of tuples containing the name, creation time, and size (Mb) of each document in the specified space's cnfigured data source.
Args:
Expand All @@ -66,12 +75,12 @@ class SpaceDataSourceFileBased(SpaceDataSource):

_DOCUMENT_LIST_FILENAME = "document_list.json"

def get_document_list(self, space: SpaceKey, configs: dict) -> List[DocumentListItem]:
def get_document_list(self: Self, space: SpaceKey, configs: dict) -> List[DocumentListItem]:
"""Get the list of documents."""
persist_path = get_index_dir(space)
return self._load_document_list(persist_path, self._DOCUMENT_LIST_FILENAME)

def _save_document_list(self, document_list: List[DocumentListItem], persist_path: str, filename: str) -> None:
def _save_document_list(self: Self, document_list: List[DocumentListItem], persist_path: str, filename: str) -> None:
path = os.path.join(persist_path, filename)
try:
data = [asdict(item) for item in document_list]
Expand All @@ -86,7 +95,7 @@ def _save_document_list(self, document_list: List[DocumentListItem], persist_pat
except Exception as e:
log.error("Failed to save space index document list to '%s': %s", path, e, stack_info=True)

def _load_document_list(self, persist_path: str, filename: str) -> List[DocumentListItem]:
def _load_document_list(self: Self, persist_path: str, filename: str) -> List[DocumentListItem]:
path = os.path.join(persist_path, filename)
with open(path, "r") as f:
data = json.load(f)
Expand All @@ -95,7 +104,7 @@ def _load_document_list(self, persist_path: str, filename: str) -> List[Document
return document_list

def _add_exclude_metadata_keys(
self, documents: List[Document], embed_keys: List[str], llm_keys: List[str]
self: Self, documents: List[Document], embed_keys: List[str], llm_keys: List[str]
) -> List[Document]:
"""Exclude metadata keys from embedding and LLM."""
if documents is None:
Expand All @@ -109,3 +118,36 @@ def _add_exclude_metadata_keys(

class SpaceDataSourceWebBased(SpaceDataSourceFileBased):
    """Abstract definition of a web-based data source for a space.

    Adds no members of its own here; it inherits everything from
    ``SpaceDataSourceFileBased`` and exists as a distinct base type to be
    extended by concrete web-based data sources.
    """


class SetFSHandler:
"""Set file storage handler."""

__handlers: dict[Literal["credential", "root_path"], Callable] = {}
__supported_keys: list[Literal["credential", "root_path"]] = ["credential", "root_path"]

def __init__(self: Self, credential: Callable, root_path: Callable) -> None:
"""Initialize the handler."""
self.__handlers = {"credential": credential, "root_path": root_path}

def __call__(self: Self, key: str, *args: Any, **kwargs: Any) -> Callable | None:
"""Call the handler."""
if key in self.__supported_keys:
return self.__handlers[key](*args, **kwargs)


class FileStorageServiceHandlers:
    """Registry mapping file storage service keys to their ``SetFSHandler``.

    Supports ``registry[key]`` lookup; a key with no registered handler yields
    ``None`` instead of raising ``KeyError``.
    """

    def __init__(self: Self, handlers: dict[FileStorageServiceKeys, SetFSHandler]) -> None:
        """Build the registry from ``handlers``.

        Args:
            handlers: Mapping of service key to handler. Only entries whose key
                passes the ``in FileStorageServiceKeys`` membership test are kept.
        """
        # The registry must be per-instance. Writing into a class-level dict
        # would make every instance share (and overwrite) one another's handlers.
        registry: dict[FileStorageServiceKeys, SetFSHandler] = {}
        for key, handler in handlers.items():
            # NOTE(review): ``in`` on an Enum class is version dependent for
            # non-member keys (TypeError before Python 3.12, value lookup from
            # 3.12 on) - callers should pass enum members.
            if key in FileStorageServiceKeys:
                registry[key] = handler
        self.__handlers = registry

    def __getitem__(self: Self, key: FileStorageServiceKeys | str) -> SetFSHandler | None:
        """Return the handler registered under ``key``, or ``None`` if absent.

        Args:
            key: The key the handler was registered under (normally a
                ``FileStorageServiceKeys`` member).
        """
        return self.__handlers.get(key)

0 comments on commit 3dbb31d

Please sign in to comment.