From e47fcef857e8d64359e9f39298deeeecdf769f4b Mon Sep 17 00:00:00 2001
From: emrgnt-cmplxty <owen@algofi.org>
Date: Mon, 21 Oct 2024 16:44:36 -0700
Subject: [PATCH] split graphrag actions

---
 ...r2r-full-py-integration-tests-graphrag.yml |  80 +++++++
 .../r2r-full-py-integration-tests.yml         |   2 -
 ...2r-light-py-integration-tests-graphrag.yml |  83 ++++++++
 .../r2r-light-py-integration-tests.yml        |   2 -
 py/core/providers/database/vector.py          | 195 ++++++++++++++++++
 5 files changed, 358 insertions(+), 4 deletions(-)
 create mode 100644 .github/workflows/r2r-full-py-integration-tests-graphrag.yml
 create mode 100644 .github/workflows/r2r-light-py-integration-tests-graphrag.yml

diff --git a/.github/workflows/r2r-full-py-integration-tests-graphrag.yml b/.github/workflows/r2r-full-py-integration-tests-graphrag.yml
new file mode 100644
index 000000000..b0b879528
--- /dev/null
+++ b/.github/workflows/r2r-full-py-integration-tests-graphrag.yml
@@ -0,0 +1,61 @@
+# GraphRAG slice of the R2R "full" integration suite, split out of
+# r2r-full-py-integration-tests.yml so it runs as its own workflow.
+
+name: R2R Full Python GraphRAG Integration Test (ubuntu)
+
+on:
+  push:
+    branches:
+      - dev
+      - dev-minor
+  pull_request:
+    branches:
+      - dev
+      - dev-minor
+  workflow_dispatch:
+
+jobs:
+  test:
+    runs-on: ${{ matrix.os }}
+
+    strategy:
+      matrix:
+        os: [ubuntu-latest]
+        # Only the GraphRAG categories run here; each one selects exactly one
+        # of the test steps below through its `if:` guard.
+        test_category:
+          - cli-graphrag
+          - sdk-graphrag
+    env:
+      OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+      TELEMETRY_ENABLED: 'false'
+      R2R_PROJECT_NAME: r2r_default
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Set up Python and install dependencies
+        uses: ./.github/actions/setup-python-full
+        with:
+          os: ${{ matrix.os }}
+
+      - name: Setup and start Docker
+        uses: ./.github/actions/setup-docker
+
+      - name: Login Docker
+        uses: ./.github/actions/login-docker
+        with:
+          docker_username: ${{ secrets.RAGTORICHES_DOCKER_UNAME }}
+          docker_password: ${{ secrets.RAGTORICHES_DOCKER_TOKEN }}
+
+      - name: Start R2R Full server
+        uses: ./.github/actions/start-r2r-full
+
+      - name: Run CLI GraphRAG Tests
+        if: matrix.test_category == 'cli-graphrag'
+        uses: ./.github/actions/run-cli-graphrag-tests
+
+      - name: Run SDK GraphRAG Tests
+        if: matrix.test_category == 'sdk-graphrag'
+        uses: ./.github/actions/run-sdk-graphrag-tests
diff --git a/.github/workflows/r2r-full-py-integration-tests.yml b/.github/workflows/r2r-full-py-integration-tests.yml
index 4c37e2c6c..1e2aa6679 100644
--- a/.github/workflows/r2r-full-py-integration-tests.yml
+++ b/.github/workflows/r2r-full-py-integration-tests.yml
@@ -21,12 +21,10 @@ jobs:
         test_category:
           - cli-ingestion
           - cli-retrieval
-          - cli-graphrag
           - sdk-ingestion
           - sdk-retrieval
           - sdk-auth
           - sdk-collections
-          - sdk-graphrag
     env:
       OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
       TELEMETRY_ENABLED: 'false'
diff --git a/.github/workflows/r2r-light-py-integration-tests-graphrag.yml b/.github/workflows/r2r-light-py-integration-tests-graphrag.yml
new file mode 100644
index 000000000..ed37f8e26
--- /dev/null
+++ b/.github/workflows/r2r-light-py-integration-tests-graphrag.yml
@@ -0,0 +1,64 @@
+# yaml-language-server: $schema=https://json.schemastore.org/github-workflow.json
+
+# GraphRAG slice of the R2R "light" integration suite, split out of
+# r2r-light-py-integration-tests.yml so it runs as its own workflow.
+
+name: R2R Light Python GraphRAG Integration Test (ubuntu)
+
+on:
+  push:
+    branches:
+      - dev
+      - dev-minor
+  pull_request:
+    branches:
+      - dev
+      - dev-minor
+  workflow_dispatch:
+
+jobs:
+  test:
+    runs-on: ${{ matrix.os }}
+
+    strategy:
+      matrix:
+        os: [ubuntu-latest]
+        # Only the GraphRAG categories run here; each one selects exactly one
+        # of the test steps below through its `if:` guard.
+        test_category:
+          - cli-graphrag
+          - sdk-graphrag
+    env:
+      OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+      TELEMETRY_ENABLED: 'false'
+      R2R_POSTGRES_HOST: localhost
+      R2R_POSTGRES_DBNAME: postgres
+      R2R_POSTGRES_PORT: '5432'
+      R2R_POSTGRES_PASSWORD: postgres
+      R2R_POSTGRES_USER: postgres
+      R2R_PROJECT_NAME: r2r_default
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Set up Python and install dependencies
+        uses: ./.github/actions/setup-python-light
+        with:
+          os: ${{ matrix.os }}
+
+      - name: Setup and start PostgreSQL
+        uses: ./.github/actions/setup-postgres-ext
+        with:
+          os: ${{ matrix.os }}
+
+      - name: Start R2R Light server
+        uses: ./.github/actions/start-r2r-light
+
+      - name: Run CLI GraphRAG Tests
+        if: matrix.test_category == 'cli-graphrag'
+        uses: ./.github/actions/run-cli-graphrag-tests
+
+      - name: Run SDK GraphRAG Tests
+        if: matrix.test_category == 'sdk-graphrag'
+        uses: ./.github/actions/run-sdk-graphrag-tests
diff --git a/.github/workflows/r2r-light-py-integration-tests.yml b/.github/workflows/r2r-light-py-integration-tests.yml
index 8f44684ca..dd094b933 100644
--- a/.github/workflows/r2r-light-py-integration-tests.yml
+++ b/.github/workflows/r2r-light-py-integration-tests.yml
@@ -23,12 +23,10 @@ jobs:
         test_category:
           - cli-ingestion
           - cli-retrieval
-          - cli-graphrag
           - sdk-ingestion
           - sdk-retrieval
           - sdk-auth
           - sdk-collections
-          - sdk-graphrag
     env:
       OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
       TELEMETRY_ENABLED: 'false'
diff --git a/py/core/providers/database/vector.py b/py/core/providers/database/vector.py
index 2fbe80438..844659d83 100644
--- a/py/core/providers/database/vector.py
+++ b/py/core/providers/database/vector.py
@@ -739,6 +739,201 @@ def parse_filter(filter_dict: dict) -> str:
 
         return where_clause
 
+    async def list_indices(
+        self, table_name: Optional[VectorTableName] = None
+    ) -> list[dict]:
+        """
+        Lists all vector indices for the specified table.
+
+        Args:
+            table_name (VectorTableName, optional): The table to list indices for.
+                If None, defaults to RAW_CHUNKS table.
+
+        Returns:
+            List[dict]: List of indices with their properties
+
+        Raises:
+            ArgError: If an invalid table name is provided
+        """
+        # Honor the documented default instead of rejecting None.
+        if table_name is None:
+            table_name = VectorTableName.RAW_CHUNKS
+
+        # Supported tables paired with the column holding their embedding.
+        supported = (
+            (VectorTableName.RAW_CHUNKS, "vec"),
+            (VectorTableName.ENTITIES, "description_embedding"),
+            (VectorTableName.COMMUNITIES, "embedding"),
+        )
+        found = next((p for p in supported if table_name == p[0]), None)
+        if found is None:
+            raise ArgError("invalid table name")
+        table, col_name = found
+
+        # pg_indexes stores schema and table name separately and has no index
+        # OID (taken from pg_class). Assumes project_name is the schema - confirm.
+        query = """
+        SELECT
+            i.indexname as name,
+            i.indexdef as definition,
+            am.amname as method,
+            pg_relation_size(c.oid) as size_in_bytes,
+            s.idx_scan as number_of_scans,
+            s.idx_tup_read as tuples_read,
+            s.idx_tup_fetch as tuples_fetched
+        FROM pg_indexes i
+        JOIN pg_namespace n ON n.nspname = i.schemaname
+        JOIN pg_class c ON c.relname = i.indexname AND c.relnamespace = n.oid
+        JOIN pg_am am ON c.relam = am.oid
+        LEFT JOIN pg_stat_all_indexes s ON s.indexrelid = c.oid
+        WHERE i.schemaname = $1
+        AND i.tablename = $2
+        AND i.indexdef LIKE $3
+        """
+
+        results = await self.connection_manager.fetch_query(
+            query, (self.project_name, f"{table}", f"%({col_name}%")
+        )
+
+        keys = (
+            "name",
+            "definition",
+            "method",
+            "size_in_bytes",
+            "number_of_scans",
+            "tuples_read",
+            "tuples_fetched",
+        )
+        return [{key: row[key] for key in keys} for row in results]
+
+    async def delete_index(
+        self,
+        index_name: str,
+        table_name: Optional[VectorTableName] = None,
+        concurrently: bool = True,
+    ) -> None:
+        """
+        Deletes a vector index.
+
+        Args:
+            index_name (str): Name of the index to delete
+            table_name (VectorTableName, optional): Table the index belongs to
+            concurrently (bool): Whether to drop the index concurrently
+
+        Raises:
+            ArgError: If table name is invalid or index doesn't exist
+            Exception: If index deletion fails
+        """
+        # Supported tables paired with the column holding their embedding.
+        supported = (
+            (VectorTableName.RAW_CHUNKS, "vec"),
+            (VectorTableName.ENTITIES, "description_embedding"),
+            (VectorTableName.COMMUNITIES, "embedding"),
+        )
+        found = next((p for p in supported if table_name == p[0]), None)
+        if found is None:
+            raise ArgError("invalid table name")
+        table, col_name = found
+        table_name_str = f"{self.project_name}.{table}"
+
+        # Verify index exists and is a vector index; pg_indexes stores schema and
+        # table name separately. Assumes project_name is the schema - confirm.
+        query = """
+        SELECT indexdef
+        FROM pg_indexes
+        WHERE indexname = $1
+        AND schemaname = $2
+        AND tablename = $3
+        AND indexdef LIKE $4
+        """
+
+        result = await self.connection_manager.fetchrow_query(
+            query,
+            (index_name, self.project_name, f"{table}", f"%({col_name}%"),
+        )
+
+        if not result:
+            raise ArgError(
+                f"Vector index '{index_name}' does not exist on table {table_name_str}"
+            )
+
+        # Drop the index. The name is schema-qualified and quoted because an
+        # identifier cannot be passed as a bind parameter.
+        qualified = ".".join(
+            '"' + f"{part}".replace('"', '""') + '"'
+            for part in (self.project_name, index_name)
+        )
+        concurrently_sql = "CONCURRENTLY " if concurrently else ""
+        drop_query = f"DROP INDEX {concurrently_sql}{qualified}"
+
+        try:
+            # DROP INDEX CONCURRENTLY is rejected inside a transaction block.
+            options = {"isolation_level": "AUTOCOMMIT"} if concurrently else {}
+            await self.connection_manager.execute_query(drop_query, **options)
+        except Exception as e:
+            raise Exception(f"Failed to delete index: {e}") from e
+
+    async def select_index(
+        self, index_name: str, table_name: Optional[VectorTableName] = None
+    ) -> None:
+        """
+        Refreshes planner statistics for the table that owns the specified
+        index, after verifying that the index exists on that table.
+        Note: This is a best-effort operation as PostgreSQL's query planner
+        ultimately decides which index to use.
+
+        Args:
+            index_name (str): Name of the index to prefer
+            table_name (VectorTableName, optional): Table the index belongs to
+
+        Raises:
+            ArgError: If table name is invalid or index doesn't exist
+        """
+        # Supported tables paired with the column holding their embedding.
+        supported = (
+            (VectorTableName.RAW_CHUNKS, "vec"),
+            (VectorTableName.ENTITIES, "description_embedding"),
+            (VectorTableName.COMMUNITIES, "embedding"),
+        )
+        found = next((p for p in supported if table_name == p[0]), None)
+        if found is None:
+            raise ArgError("invalid table name")
+        table, col_name = found
+        table_name_str = f"{self.project_name}.{table}"
+
+        # Verify index exists and is a vector index. pg_indexes stores the
+        # schema and the bare table name in separate columns.
+        # NOTE(review): assumes self.project_name is the schema name - confirm.
+        query = """
+        SELECT indexdef
+        FROM pg_indexes
+        WHERE indexname = $1
+        AND schemaname = $2
+        AND tablename = $3
+        AND indexdef LIKE $4
+        """
+
+        result = await self.connection_manager.fetchrow_query(
+            query,
+            (index_name, self.project_name, f"{table}", f"%({col_name}%"),
+        )
+
+        if not result:
+            raise ArgError(
+                f"Vector index '{index_name}' does not exist on table {table_name_str}"
+            )
+
+        # PostgreSQL has no per-index statistics target for plain columns:
+        # `ALTER INDEX ... ALTER COLUMN n SET STATISTICS` only applies to
+        # expression columns and `ALTER INDEX ... SET STATISTICS` is a syntax
+        # error, so no per-index statement is issued here.
+
+        # Analyze the table to update planner statistics.
+        # Note: This doesn't guarantee the index will be used
+        await self.connection_manager.execute_query(
+            f"ANALYZE {table_name_str};"
+        )
+
     async def get_semantic_neighbors(
         self,
         document_id: UUID,