Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/cleanup refactor vector collection #1432

Merged
merged 31 commits into from
Oct 21, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
6a3d460
Feature/include vectors option document chunks (#1419)
emrgnt-cmplxty Oct 16, 2024
0840cf6
Allow env var to set the default R2R deployment for the dashboard (#1…
NolanTrem Oct 16, 2024
c5c0835
Feature/various documentation tweaks (#1422)
emrgnt-cmplxty Oct 17, 2024
a38f916
Graphrag tests (#1418)
shreyaspimpalgaonkar Oct 17, 2024
03cd278
Modify graphrag tests timeouts (#1416)
shreyaspimpalgaonkar Oct 17, 2024
f33fccd
feat: Make prompt provider methods asynchronous (comments below) (#1415)
shreyaspimpalgaonkar Oct 17, 2024
11ae42e
bump pyproject version
emrgnt-cmplxty Oct 17, 2024
785c72e
first commit
emrgnt-cmplxty Oct 17, 2024
ab14a05
towards slimmer vector implementation logic
emrgnt-cmplxty Oct 18, 2024
c8849bf
up
emrgnt-cmplxty Oct 18, 2024
58b1222
up
emrgnt-cmplxty Oct 18, 2024
c731df9
iterate
emrgnt-cmplxty Oct 18, 2024
a32505e
up
emrgnt-cmplxty Oct 18, 2024
f057907
checkin
emrgnt-cmplxty Oct 18, 2024
681d276
up
emrgnt-cmplxty Oct 18, 2024
8bca097
work doc chunks
emrgnt-cmplxty Oct 18, 2024
7d86e4e
working vector search
emrgnt-cmplxty Oct 18, 2024
0eb5b96
working full text search
emrgnt-cmplxty Oct 18, 2024
ab0558b
remove asyncpg
emrgnt-cmplxty Oct 18, 2024
ac9c016
up
emrgnt-cmplxty Oct 18, 2024
5ab2712
passing vector tests
emrgnt-cmplxty Oct 18, 2024
177db8d
merge
emrgnt-cmplxty Oct 20, 2024
5516373
up
emrgnt-cmplxty Oct 20, 2024
1e20324
merge
emrgnt-cmplxty Oct 20, 2024
43a32ea
merge
emrgnt-cmplxty Oct 21, 2024
2b24e83
rm pytest
emrgnt-cmplxty Oct 21, 2024
e368bcc
up
emrgnt-cmplxty Oct 21, 2024
8884405
up
emrgnt-cmplxty Oct 21, 2024
e279de4
fix delete
emrgnt-cmplxty Oct 21, 2024
94290d5
up
emrgnt-cmplxty Oct 21, 2024
1ec7e74
up
emrgnt-cmplxty Oct 21, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/r2r-light-py-integration-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ jobs:

strategy:
matrix:
os: [windows-latest]
os: [ubuntu-latest]
test_category:
- cli-ingestion
- cli-retrieval
Expand Down
9 changes: 1 addition & 8 deletions py/cli/main.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,5 @@
from cli.command_group import cli
from cli.commands import (
auth,
ingestion,
kg,
management,
retrieval,
server,
)
from cli.commands import auth, ingestion, kg, management, retrieval, server
from cli.utils.telemetry import posthog, telemetry


Expand Down
3 changes: 1 addition & 2 deletions py/core/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@
"KGSearchSettings",
"VectorSearchResult",
"VectorSearchSettings",
"HybridSearchSettings",
# User abstractions
"Token",
"TokenData",
Expand Down Expand Up @@ -139,8 +140,6 @@
# Database providers
"DatabaseConfig",
"DatabaseProvider",
"RelationalDBProvider",
"VectorDBProvider",
# Embedding provider
"EmbeddingConfig",
"EmbeddingProvider",
Expand Down
3 changes: 1 addition & 2 deletions py/core/base/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@
"KGSearchSettings",
"VectorSearchResult",
"VectorSearchSettings",
"HybridSearchSettings",
# KG abstractions
"KGCreationSettings",
"KGEnrichmentSettings",
Expand Down Expand Up @@ -112,8 +113,6 @@
# Database providers
"DatabaseConfig",
"DatabaseProvider",
"RelationalDBProvider",
"VectorDBProvider",
"PostgresConfigurationSettings",
# Embedding provider
"EmbeddingConfig",
Expand Down
2 changes: 1 addition & 1 deletion py/core/base/abstractions/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
from shared.abstractions.base import AsyncSyncMeta, R2RSerializable, syncable
from shared.abstractions.llm import MessageType
from shared.abstractions.document import (
DataType,
Document,
Expand Down Expand Up @@ -40,6 +39,7 @@
LLMChatCompletion,
LLMChatCompletionChunk,
Message,
MessageType,
RAGCompletion,
)
from shared.abstractions.prompt import Prompt
Expand Down
6 changes: 3 additions & 3 deletions py/core/base/api/models/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,29 +21,30 @@
WrappedKGCreationResponse,
WrappedKGEnrichmentResponse,
WrappedKGEntitiesResponse,
WrappedKGTriplesResponse,
WrappedKGEntityDeduplicationResponse,
WrappedKGTriplesResponse,
)
from shared.api.models.management.responses import (
AnalyticsResponse,
AppSettingsResponse,
CollectionOverviewResponse,
CollectionResponse,
ConversationOverviewResponse,
DocumentChunkResponse,
DocumentOverviewResponse,
LogResponse,
PromptResponse,
ScoreCompletionResponse,
ServerStats,
UserOverviewResponse,
ConversationOverviewResponse,
WrappedAddUserResponse,
WrappedAnalyticsResponse,
WrappedAppSettingsResponse,
WrappedCollectionListResponse,
WrappedCollectionOverviewResponse,
WrappedCollectionResponse,
WrappedConversationResponse,
WrappedConversationsOverviewResponse,
WrappedDeleteResponse,
WrappedDocumentChunkResponse,
WrappedDocumentOverviewResponse,
Expand All @@ -54,7 +55,6 @@
WrappedUserCollectionResponse,
WrappedUserOverviewResponse,
WrappedUsersInCollectionResponse,
WrappedConversationsOverviewResponse,
)
from shared.api.models.retrieval.responses import (
RAGAgentResponse,
Expand Down
6 changes: 2 additions & 4 deletions py/core/base/providers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,7 @@
DatabaseConfig,
DatabaseProvider,
PostgresConfigurationSettings,
RelationalDBProvider,
VectorDBProvider,
VectorQuantizationType,
)
from .embedding import EmbeddingConfig, EmbeddingProvider
from .file import FileConfig, FileProvider
Expand Down Expand Up @@ -35,8 +34,7 @@
"DatabaseConfig",
"PostgresConfigurationSettings",
"DatabaseProvider",
"RelationalDBProvider",
"VectorDBProvider",
"VectorQuantizationType",
# Embedding provider
"EmbeddingConfig",
"EmbeddingProvider",
Expand Down
53 changes: 27 additions & 26 deletions py/core/base/providers/database.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
import logging
from abc import ABC, abstractmethod
from typing import Any, Optional
from typing import Any, Optional, Union, Sequence

from pydantic import BaseModel

from uuid import UUID
from shared.abstractions.vector import VectorQuantizationType

from .base import Provider, ProviderConfig
Expand Down Expand Up @@ -53,6 +53,7 @@ class DatabaseConfig(ProviderConfig):
] = None
default_collection_name: str = "Default"
default_collection_description: str = "Your default collection."
enable_fts: bool = False

def __post_init__(self):
self.validate_config()
Expand All @@ -69,41 +70,41 @@ def supported_providers(self) -> list[str]:
return ["postgres"]


class VectorDBProvider(Provider, ABC):
@abstractmethod
def _initialize_vector_db(
self, dimension: int, quantization_type: VectorQuantizationType
) -> None:
pass


class RelationalDBProvider(Provider, ABC):
@abstractmethod
async def _initialize_relational_db(self) -> None:
pass


class DatabaseProvider(Provider):
def __init__(self, config: DatabaseConfig):
if not isinstance(config, DatabaseConfig):
raise ValueError(
"DatabaseProvider must be initialized with a `DatabaseConfig`."
)
logger.info(f"Initializing DatabaseProvider with config {config}.")

super().__init__(config)

# remove later to re-introduce typing...
self.vector: Any = None
self.relational: Any = None
@abstractmethod
def _get_table_name(self, base_name: str) -> str:
pass

@abstractmethod
def _initialize_vector_db(self) -> VectorDBProvider:
def execute_query(
self,
query: str,
params: Optional[Union[dict[str, Any], Sequence[Any]]] = None,
isolation_level: Optional[str] = None,
):
pass

@abstractmethod
async def _initialize_relational_db(self) -> RelationalDBProvider:
async def execute_many(self, query, params=None, batch_size=1000):
pass

@abstractmethod
def _get_table_name(self, base_name: str) -> str:
def fetch_query(
self,
query: str,
params: Optional[Union[dict[str, Any], Sequence[Any]]] = None,
):
pass

@abstractmethod
def fetchrow_query(
self,
query: str,
params: Optional[Union[dict[str, Any], Sequence[Any]]] = None,
):
pass
4 changes: 3 additions & 1 deletion py/core/base/providers/ingestion.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
import logging
from abc import ABC
from enum import Enum
from .base import Provider, ProviderConfig

from shared.abstractions.ingestion import ChunkEnrichmentSettings

from .base import Provider, ProviderConfig

logger = logging.getLogger()


Expand Down
2 changes: 1 addition & 1 deletion py/core/base/providers/kg.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,11 @@
Entity,
KGCreationSettings,
KGEnrichmentSettings,
KGEntityDeduplicationSettings,
KGExtraction,
KGSearchSettings,
RelationshipType,
Triple,
KGEntityDeduplicationSettings,
)
from .base import ProviderConfig

Expand Down
1 change: 0 additions & 1 deletion py/core/examples/scripts/upload_hf_textbooks_ex.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@ def remove_file(file_path):
async def process_batch(client, batch):
results = await client.ingest_files(batch)
print(f"Submitted {len(results['results'])} files for processing")
print("results = ", results["results"])
# Remove the processed files
for file_path in batch:
remove_file(file_path)
Expand Down
11 changes: 5 additions & 6 deletions py/core/main/api/ingestion_router.py
Original file line number Diff line number Diff line change
Expand Up @@ -336,7 +336,7 @@ async def ingest_chunks_app(
@self.base_endpoint
async def create_vector_index_app(
table_name: Optional[VectorTableName] = Body(
default=VectorTableName.CHUNKS,
default=VectorTableName.RAW_CHUNKS,
description="The name of the vector table to create.",
),
index_method: IndexMethod = Body(
Expand All @@ -353,9 +353,9 @@ async def create_vector_index_app(
None,
description="The arguments for the index method.",
),
replace: bool = Body(
default=True,
description="Whether to replace an existing index.",
index_name: Optional[str] = Body(
None,
description="The name of the index to create.",
),
concurrently: bool = Body(
default=True,
Expand All @@ -365,7 +365,7 @@ async def create_vector_index_app(
) -> WrappedCreateVectorIndexResponse:

logger.info(
f"Creating vector index for {table_name} with method {index_method}, measure {measure}, replace {replace}, concurrently {concurrently}"
f"Creating vector index for {table_name} with method {index_method}, measure {measure}, concurrently {concurrently}"
)

raw_message = await self.orchestration_provider.run_workflow(
Expand All @@ -376,7 +376,6 @@ async def create_vector_index_app(
"index_method": index_method,
"measure": measure,
"index_arguments": index_arguments,
"replace": replace,
"concurrently": concurrently,
},
},
Expand Down
2 changes: 1 addition & 1 deletion py/core/main/api/kg_router.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@
WrappedKGCreationResponse,
WrappedKGEnrichmentResponse,
WrappedKGEntitiesResponse,
WrappedKGTriplesResponse,
WrappedKGEntityDeduplicationResponse,
WrappedKGTriplesResponse,
)
from core.base.providers import OrchestrationProvider, Workflow
from core.utils import generate_default_user_collection_id
Expand Down
4 changes: 2 additions & 2 deletions py/core/main/api/management_router.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from fastapi.responses import StreamingResponse
from pydantic import Json

from core.base import R2RException, Message
from core.base import Message, R2RException
from core.base.api.models import (
WrappedAddUserResponse,
WrappedAnalyticsResponse,
Expand All @@ -19,6 +19,7 @@
WrappedCollectionOverviewResponse,
WrappedCollectionResponse,
WrappedConversationResponse,
WrappedConversationsOverviewResponse,
WrappedDeleteResponse,
WrappedDocumentChunkResponse,
WrappedDocumentOverviewResponse,
Expand All @@ -29,7 +30,6 @@
WrappedUserCollectionResponse,
WrappedUserOverviewResponse,
WrappedUsersInCollectionResponse,
WrappedConversationsOverviewResponse,
)
from core.base.logging import AnalysisTypes, LogFilterCriteria
from core.base.providers import OrchestrationProvider
Expand Down
4 changes: 2 additions & 2 deletions py/core/main/assembly/factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,7 +147,7 @@ async def create_database_provider(
"Embedding config must have a base dimension to initialize database."
)

vector_db_dimension = self.config.embedding.base_dimension
dimension = self.config.embedding.base_dimension
quantization_type = (
self.config.embedding.quantization_settings.quantization_type
)
Expand All @@ -156,7 +156,7 @@ async def create_database_provider(

database_provider = PostgresDBProvider(
db_config,
vector_db_dimension,
dimension,
crypto_provider=crypto_provider,
quantization_type=quantization_type,
)
Expand Down
Loading
Loading