From e47fcef857e8d64359e9f39298deeeecdf769f4b Mon Sep 17 00:00:00 2001
From: emrgnt-cmplxty
Date: Mon, 21 Oct 2024 16:44:36 -0700
Subject: [PATCH] split graphrag actions

---
Reviewer revision of the original commit: the `index` blob ids below are
carried over unchanged and no longer match the revised file contents.

 ...r2r-full-py-integration-tests-graphrag.yml |  58 +++++
 .../r2r-full-py-integration-tests.yml         |   2 -
 ...2r-light-py-integration-tests-graphrag.yml |  61 +++++
 .../r2r-light-py-integration-tests.yml        |   2 -
 py/core/providers/database/vector.py          | 215 ++++++++++++++++++
 5 files changed, 334 insertions(+), 4 deletions(-)
 create mode 100644 .github/workflows/r2r-full-py-integration-tests-graphrag.yml
 create mode 100644 .github/workflows/r2r-light-py-integration-tests-graphrag.yml

diff --git a/.github/workflows/r2r-full-py-integration-tests-graphrag.yml b/.github/workflows/r2r-full-py-integration-tests-graphrag.yml
new file mode 100644
index 000000000..b0b879528
--- /dev/null
+++ b/.github/workflows/r2r-full-py-integration-tests-graphrag.yml
@@ -0,0 +1,58 @@
+# GraphRAG-only subset of the R2R Full integration tests; the remaining
+# categories run from r2r-full-py-integration-tests.yml.
+name: R2R Full Python Integration Test - GraphRAG (ubuntu)
+
+on:
+  push:
+    branches:
+      - dev
+      - dev-minor
+  pull_request:
+    branches:
+      - dev
+      - dev-minor
+  workflow_dispatch:
+
+jobs:
+  test:
+    runs-on: ${{ matrix.os }}
+
+    strategy:
+      matrix:
+        os: [ubuntu-latest]
+        test_category:
+          - cli-graphrag
+          - sdk-graphrag
+    env:
+      OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+      TELEMETRY_ENABLED: 'false'
+      R2R_PROJECT_NAME: r2r_default
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Set up Python and install dependencies
+        uses: ./.github/actions/setup-python-full
+        with:
+          os: ${{ matrix.os }}
+
+      - name: Setup and start Docker
+        uses: ./.github/actions/setup-docker
+
+      - name: Login Docker
+        uses: ./.github/actions/login-docker
+        with:
+          docker_username: ${{ secrets.RAGTORICHES_DOCKER_UNAME }}
+          docker_password: ${{ secrets.RAGTORICHES_DOCKER_TOKEN }}
+
+      - name: Start R2R Full server
+        uses: ./.github/actions/start-r2r-full
+
+      - name: Run CLI GraphRAG Tests
+        if: matrix.test_category == 'cli-graphrag'
+        uses: ./.github/actions/run-cli-graphrag-tests
+
+      - name: Run SDK GraphRAG Tests
+        if: matrix.test_category == 'sdk-graphrag'
+        uses: ./.github/actions/run-sdk-graphrag-tests
diff --git a/.github/workflows/r2r-full-py-integration-tests.yml b/.github/workflows/r2r-full-py-integration-tests.yml
index 4c37e2c6c..1e2aa6679 100644
--- a/.github/workflows/r2r-full-py-integration-tests.yml
+++ b/.github/workflows/r2r-full-py-integration-tests.yml
@@ -21,12 +21,10 @@ jobs:
         test_category:
           - cli-ingestion
           - cli-retrieval
-          - cli-graphrag
           - sdk-ingestion
           - sdk-retrieval
           - sdk-auth
           - sdk-collections
-          - sdk-graphrag
     env:
       OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
       TELEMETRY_ENABLED: 'false'
diff --git a/.github/workflows/r2r-light-py-integration-tests-graphrag.yml b/.github/workflows/r2r-light-py-integration-tests-graphrag.yml
new file mode 100644
index 000000000..ed37f8e26
--- /dev/null
+++ b/.github/workflows/r2r-light-py-integration-tests-graphrag.yml
@@ -0,0 +1,61 @@
+# yaml-language-server: $schema=https://json.schemastore.org/github-workflow.json
+
+# GraphRAG-only subset of the R2R Light integration tests; the remaining
+# categories run from r2r-light-py-integration-tests.yml.
+name: R2R Light Python Integration Test - GraphRAG (ubuntu)
+
+on:
+  push:
+    branches:
+      - dev
+      - dev-minor
+  pull_request:
+    branches:
+      - dev
+      - dev-minor
+  workflow_dispatch:
+
+jobs:
+  test:
+    runs-on: ${{ matrix.os }}
+
+    strategy:
+      matrix:
+        os: [ubuntu-latest]
+        test_category:
+          - cli-graphrag
+          - sdk-graphrag
+    env:
+      OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+      TELEMETRY_ENABLED: 'false'
+      R2R_POSTGRES_HOST: localhost
+      R2R_POSTGRES_DBNAME: postgres
+      R2R_POSTGRES_PORT: '5432'
+      R2R_POSTGRES_PASSWORD: postgres
+      R2R_POSTGRES_USER: postgres
+      R2R_PROJECT_NAME: r2r_default
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Set up Python and install dependencies
+        uses: ./.github/actions/setup-python-light
+        with:
+          os: ${{ matrix.os }}
+
+      - name: Setup and start PostgreSQL
+        uses: ./.github/actions/setup-postgres-ext
+        with:
+          os: ${{ matrix.os }}
+
+      - name: Start R2R Light server
+        uses: ./.github/actions/start-r2r-light
+
+      - name: Run CLI GraphRAG Tests
+        if: matrix.test_category == 'cli-graphrag'
+        uses: ./.github/actions/run-cli-graphrag-tests
+
+      - name: Run SDK GraphRAG Tests
+        if: matrix.test_category == 'sdk-graphrag'
+        uses: ./.github/actions/run-sdk-graphrag-tests
diff --git a/.github/workflows/r2r-light-py-integration-tests.yml b/.github/workflows/r2r-light-py-integration-tests.yml
index 8f44684ca..dd094b933 100644
--- a/.github/workflows/r2r-light-py-integration-tests.yml
+++ b/.github/workflows/r2r-light-py-integration-tests.yml
@@ -23,12 +23,10 @@ jobs:
         test_category:
           - cli-ingestion
           - cli-retrieval
-          - cli-graphrag
           - sdk-ingestion
           - sdk-retrieval
           - sdk-auth
           - sdk-collections
-          - sdk-graphrag
     env:
       OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
       TELEMETRY_ENABLED: 'false'
diff --git a/py/core/providers/database/vector.py b/py/core/providers/database/vector.py
index 2fbe80438..844659d83 100644
--- a/py/core/providers/database/vector.py
+++ b/py/core/providers/database/vector.py
@@ -739,6 +739,221 @@ def parse_filter(filter_dict: dict) -> str:
 
         return where_clause
 
+    @staticmethod
+    def _quote_identifier(identifier: str) -> str:
+        """Double-quotes a SQL identifier, escaping embedded quotes."""
+        return '"' + identifier.replace('"', '""') + '"'
+
+    def _resolve_vector_table(
+        self, table_name: Optional[VectorTableName]
+    ) -> tuple[str, str, str]:
+        """
+        Maps a vector table to the names used when inspecting its indices.
+
+        Args:
+            table_name (VectorTableName, optional): The table to resolve.
+
+        Returns:
+            tuple[str, str, str]: The schema-qualified table name, the
+                unqualified table name, and the vector column name.
+
+        Raises:
+            ArgError: If an invalid table name is provided
+        """
+        vector_columns = (
+            (VectorTableName.RAW_CHUNKS, "vec"),
+            (VectorTableName.ENTITIES, "description_embedding"),
+            (VectorTableName.COMMUNITIES, "embedding"),
+        )
+        for table, col_name in vector_columns:
+            if table_name == table:
+                bare_name = f"{table}"
+                return (
+                    f"{self.project_name}.{bare_name}",
+                    bare_name,
+                    col_name,
+                )
+        raise ArgError("invalid table name")
+
+    async def _require_vector_index(
+        self, index_name: str, table_name: Optional[VectorTableName]
+    ) -> tuple[str, str]:
+        """
+        Checks that a vector index exists on the given table.
+
+        Args:
+            index_name (str): Name of the index to look up
+            table_name (VectorTableName, optional): Table the index belongs to
+
+        Returns:
+            tuple[str, str]: The schema-qualified table name and the vector
+                column name.
+
+        Raises:
+            ArgError: If table name is invalid or index doesn't exist
+        """
+        table_name_str, bare_name, col_name = self._resolve_vector_table(
+            table_name
+        )
+
+        # pg_indexes stores schema and table in separate columns, so the
+        # schema-qualified name can never equal tablename on its own.
+        query = """
+            SELECT indexdef
+            FROM pg_indexes
+            WHERE indexname = $1
+            AND schemaname = $2
+            AND tablename = $3
+            AND indexdef LIKE $4
+        """
+
+        result = await self.connection_manager.fetchrow_query(
+            query,
+            (index_name, self.project_name, bare_name, f"%({col_name}%"),
+        )
+
+        if not result:
+            raise ArgError(
+                f"Vector index '{index_name}' does not exist on table {table_name_str}"
+            )
+
+        return table_name_str, col_name
+
+    async def list_indices(
+        self, table_name: Optional[VectorTableName] = None
+    ) -> list[dict]:
+        """
+        Lists all vector indices for the specified table.
+
+        Args:
+            table_name (VectorTableName, optional): The table to list indices for.
+                If None, defaults to RAW_CHUNKS table.
+
+        Returns:
+            List[dict]: List of indices with their properties
+
+        Raises:
+            ArgError: If an invalid table name is provided
+        """
+        # Honor the documented default instead of rejecting None.
+        if table_name is None:
+            table_name = VectorTableName.RAW_CHUNKS
+        _, bare_name, col_name = self._resolve_vector_table(table_name)
+
+        # pg_indexes exposes no OID column, so size and scan counters are
+        # looked up through the index's pg_class row; the pg_namespace join
+        # keeps same-named indexes in other schemas out of the result.
+        query = """
+            SELECT
+                i.indexname as name,
+                i.indexdef as definition,
+                am.amname as method,
+                pg_relation_size(c.oid) as size_in_bytes,
+                s.idx_scan as number_of_scans,
+                s.idx_tup_read as tuples_read,
+                s.idx_tup_fetch as tuples_fetched
+            FROM pg_indexes i
+            JOIN pg_namespace n ON n.nspname = i.schemaname
+            JOIN pg_class c ON c.relname = i.indexname
+                AND c.relnamespace = n.oid
+            JOIN pg_am am ON c.relam = am.oid
+            LEFT JOIN pg_stat_all_indexes s ON s.indexrelid = c.oid
+            WHERE i.schemaname = $1
+            AND i.tablename = $2
+            AND i.indexdef LIKE $3
+        """
+
+        # Look for indices on the vector column
+        results = await self.connection_manager.fetch_query(
+            query, (self.project_name, bare_name, f"%({col_name}%")
+        )
+
+        return [
+            {
+                "name": result["name"],
+                "definition": result["definition"],
+                "method": result["method"],
+                "size_in_bytes": result["size_in_bytes"],
+                "number_of_scans": result["number_of_scans"],
+                "tuples_read": result["tuples_read"],
+                "tuples_fetched": result["tuples_fetched"],
+            }
+            for result in results
+        ]
+
+    async def delete_index(
+        self,
+        index_name: str,
+        table_name: Optional[VectorTableName] = None,
+        concurrently: bool = True,
+    ) -> None:
+        """
+        Deletes a vector index.
+
+        Args:
+            index_name (str): Name of the index to delete
+            table_name (VectorTableName, optional): Table the index belongs to
+            concurrently (bool): Whether to drop the index concurrently
+
+        Raises:
+            ArgError: If table name is invalid or index doesn't exist
+            Exception: If index deletion fails
+        """
+        # Verify index exists and is a vector index
+        await self._require_vector_index(index_name, table_name)
+
+        # Identifiers cannot be bound as parameters; quote them and qualify
+        # the index with its schema so the drop does not rely on search_path.
+        qualified_index = ".".join(
+            self._quote_identifier(part)
+            for part in (self.project_name, index_name)
+        )
+        concurrently_sql = "CONCURRENTLY " if concurrently else ""
+        drop_query = f"DROP INDEX {concurrently_sql}{qualified_index}"
+
+        try:
+            if concurrently:
+                await self.connection_manager.execute_query(
+                    drop_query, isolation_level="AUTOCOMMIT"
+                )
+            else:
+                await self.connection_manager.execute_query(drop_query)
+        except Exception as e:
+            raise Exception(f"Failed to delete index: {e}") from e
+
+    async def select_index(
+        self, index_name: str, table_name: Optional[VectorTableName] = None
+    ) -> None:
+        """
+        Updates planner statistics to prefer using the specified index.
+        Note: This is a best-effort operation as PostgreSQL's query planner
+        ultimately decides which index to use.
+
+        Args:
+            index_name (str): Name of the index to prefer
+            table_name (VectorTableName, optional): Table the index belongs to
+
+        Raises:
+            ArgError: If table name is invalid or index doesn't exist
+        """
+        # Verify index exists and is a vector index
+        table_name_str, col_name = await self._require_vector_index(
+            index_name, table_name
+        )
+
+        # ALTER INDEX can only set a statistics target on expression columns,
+        # so raise the target on the indexed table column instead.
+        # Note: This doesn't guarantee the index will be used
+        await self.connection_manager.execute_query(
+            f"ALTER TABLE {table_name_str} "
+            f"ALTER COLUMN {col_name} SET STATISTICS 1000;"
+        )
+
+        # Analyze the table to update planner statistics
+        await self.connection_manager.execute_query(
+            f"ANALYZE {table_name_str};"
+        )
+
     async def get_semantic_neighbors(
         self,
         document_id: UUID,