diff --git a/.github/workflows/python-ci.yml b/.github/workflows/python-ci.yml index 533598f950..b7760aebcd 100644 --- a/.github/workflows/python-ci.yml +++ b/.github/workflows/python-ci.yml @@ -2,8 +2,8 @@ name: Python CI on: push: branches: - - "**/main" # Matches branches like feature/main - - "main" # Matches the main branch + - "**/main" # match branches like feature/main + - "main" # match the main branch pull_request: types: - opened @@ -13,6 +13,9 @@ on: branches: - "**/main" - "main" + paths-ignore: + - "**/*.md" + - ".semversioner/**" permissions: contents: read diff --git a/.github/workflows/python-integration-tests.yml b/.github/workflows/python-integration-tests.yml index 157127a086..ffdbbef760 100644 --- a/.github/workflows/python-integration-tests.yml +++ b/.github/workflows/python-integration-tests.yml @@ -2,8 +2,8 @@ name: Python Integration Tests on: push: branches: - - "**/main" # Matches branches like feature/main - - "main" # Matches the main branch + - "**/main" # match branches like feature/main + - "main" # match the main branch pull_request: types: - opened @@ -13,6 +13,9 @@ on: branches: - "**/main" - "main" + paths-ignore: + - "**/*.md" + - ".semversioner/**" permissions: contents: read diff --git a/.github/workflows/python-notebook-tests.yml b/.github/workflows/python-notebook-tests.yml index 95d2ae2e73..8134b748d9 100644 --- a/.github/workflows/python-notebook-tests.yml +++ b/.github/workflows/python-notebook-tests.yml @@ -2,8 +2,8 @@ name: Python Notebook Tests on: push: branches: - - "**/main" # Matches branches like feature/main - - "main" # Matches the main branch + - "**/main" # match branches like feature/main + - "main" # match the main branch pull_request: types: - opened @@ -13,6 +13,9 @@ on: branches: - "**/main" - "main" + paths-ignore: + - "**/*.md" + - ".semversioner/**" permissions: contents: read diff --git a/.github/workflows/python-smoke-tests.yml b/.github/workflows/python-smoke-tests.yml index 6a57ac2e85..fafab1891f 
100644 --- a/.github/workflows/python-smoke-tests.yml +++ b/.github/workflows/python-smoke-tests.yml @@ -2,8 +2,8 @@ name: Python Smoke Tests on: push: branches: - - "**/main" # Matches branches like feature/main - - "main" # Matches the main branch + - "**/main" # match branches like feature/main + - "main" # match the main branch pull_request: types: - opened @@ -13,6 +13,9 @@ on: branches: - "**/main" - "main" + paths-ignore: + - "**/*.md" + - ".semversioner/**" permissions: contents: read diff --git a/.semversioner/next-release/patch-20241031001404444046.json b/.semversioner/next-release/patch-20241031001404444046.json new file mode 100644 index 0000000000..10725e9652 --- /dev/null +++ b/.semversioner/next-release/patch-20241031001404444046.json @@ -0,0 +1,4 @@ +{ + "type": "patch", + "description": "add visualization guide to doc site" +} diff --git a/.semversioner/next-release/patch-20241106094228896260.json b/.semversioner/next-release/patch-20241106094228896260.json new file mode 100644 index 0000000000..e9c07c612b --- /dev/null +++ b/.semversioner/next-release/patch-20241106094228896260.json @@ -0,0 +1,4 @@ +{ + "type": "patch", + "description": "fix streaming output error" +} diff --git a/.semversioner/next-release/patch-20241106184714830526.json b/.semversioner/next-release/patch-20241106184714830526.json new file mode 100644 index 0000000000..e1d29c8c3a --- /dev/null +++ b/.semversioner/next-release/patch-20241106184714830526.json @@ -0,0 +1,4 @@ +{ + "type": "patch", + "description": "Allow some CI/CD jobs to skip PRs dedicated to doc updates only." +} diff --git a/.semversioner/next-release/patch-20241106193551070554.json b/.semversioner/next-release/patch-20241106193551070554.json new file mode 100644 index 0000000000..48e7ce9a52 --- /dev/null +++ b/.semversioner/next-release/patch-20241106193551070554.json @@ -0,0 +1,4 @@ +{ + "type": "patch", + "description": "Fix a file path issue in the viz guide."
+} diff --git a/.semversioner/next-release/patch-20241106225803494336.json b/.semversioner/next-release/patch-20241106225803494336.json new file mode 100644 index 0000000000..e61c26301e --- /dev/null +++ b/.semversioner/next-release/patch-20241106225803494336.json @@ -0,0 +1,4 @@ +{ + "type": "patch", + "description": "Fix optional covariates update in incremental indexing" +} diff --git a/.semversioner/next-release/patch-20241106232311738461.json b/.semversioner/next-release/patch-20241106232311738461.json new file mode 100644 index 0000000000..7fb27d13e0 --- /dev/null +++ b/.semversioner/next-release/patch-20241106232311738461.json @@ -0,0 +1,4 @@ +{ + "type": "patch", + "description": "Raise error on empty deltas for inc indexing" +} diff --git a/dictionary.txt b/dictionary.txt index 7ea41bd295..0288faac9a 100644 --- a/dictionary.txt +++ b/dictionary.txt @@ -96,6 +96,9 @@ onclick pymdownx linenums twemoji +Gephi +gephi +Gephi's # Verbs binarize @@ -183,4 +186,4 @@ kwds astrotechnician epitheg unspooled -unnavigated \ No newline at end of file +unnavigated diff --git a/docs/blog_posts.md b/docs/blog_posts.md index 741066e3bb..750c0afebb 100644 --- a/docs/blog_posts.md +++ b/docs/blog_posts.md @@ -1,35 +1,34 @@ -
-
-Figure 1. An entire DRIFT search hierarchy highlighting the three core phases of the DRIFT search process. A (Primer): DRIFT compares the user’s query with the top K most semantically relevant community reports, generating a broad initial answer and follow-up questions to steer further exploration. B (Follow-Up): DRIFT uses local search to refine queries, producing additional intermediate answers and follow-up questions that enhance specificity, guiding the engine towards context-rich information. A glyph on each node in the diagram shows the confidence the algorithm has to continue the query expansion step. C (Output Hierarchy): The final output is a hierarchical structure of questions and answers ranked by relevance, reflecting a balanced mix of global insights and local refinements, making the results adaptable and comprehensive.
-+
+Figure 1. An entire DRIFT search hierarchy highlighting the three core phases of the DRIFT search process. A (Primer): DRIFT compares the user’s query with the top K most semantically relevant community reports, generating a broad initial answer and follow-up questions to steer further exploration. B (Follow-Up): DRIFT uses local search to refine queries, producing additional intermediate answers and follow-up questions that enhance specificity, guiding the engine towards context-rich information. A glyph on each node in the diagram shows the confidence the algorithm has to continue the query expansion step. C (Output Hierarchy): The final output is a hierarchical structure of questions and answers ranked by relevance, reflecting a balanced mix of global insights and local refinements, making the results adaptable and comprehensive.
+ DRIFT Search introduces a new approach to local search queries by including community information in the search process. This greatly expands the breadth of the query’s starting point and leads to retrieval and usage of a far higher variety of facts in the final answer. This addition expands the GraphRAG query engine by providing a more comprehensive option for local search, which uses community insights to refine a query into detailed follow-up questions. diff --git a/docs/visualization_guide.md b/docs/visualization_guide.md new file mode 100644 index 0000000000..539dd509c2 --- /dev/null +++ b/docs/visualization_guide.md @@ -0,0 +1,100 @@ +# Visualizing and Debugging Your Knowledge Graph + +The following step-by-step guide walks through the process to visualize a knowledge graph after it's been constructed by graphrag. Note that some of the settings recommended below are based on our own experience of what works well. Feel free to change and explore other settings for a better visualization experience! + +## 1. Run the Pipeline +Before building an index, please review your `settings.yaml` configuration file and ensure that graphml snapshots is enabled. +```yaml +snapshots: + graphml: true +``` +(Optional) To support other visualization tools and exploration, additional parameters can be enabled that provide access to vector embeddings. +```yaml +embed_graph: + enabled: true # will generate node2vec embeddings for nodes +umap: + enabled: true # will generate UMAP embeddings for nodes +``` +After running the indexing pipeline over your data, there will be an output folder (defined by the `storage.base_dir` setting). + +- **Output Folder**: Contains artifacts from the LLM’s indexing pass. + +## 2. Locate the Knowledge Graph +In the output folder, look for a file named `merged_graph.graphml`. graphml is a standard [file format](http://graphml.graphdrawing.org) supported by many visualization tools. We recommend trying [Gephi](https://gephi.org). + +## 3. 
Open the Graph in Gephi +1. Install and open Gephi +2. Navigate to the `output` folder containing the various parquet files. +3. Import the `merged_graph.graphml` file into Gephi. This will result in a fairly plain view of the undirected graph nodes and edges. + ++ +
+ +## 4. Install the Leiden Algorithm Plugin +1. Go to `Tools` -> `Plugins`. +2. Search for "Leiden Algorithm". +3. Click `Install` and restart Gephi. + +## 5. Run Statistics +1. In the `Statistics` tab on the right, click `Run` for `Average Degree` and `Leiden Algorithm`. + ++ +
+ +2. For the Leiden Algorithm, adjust the settings: + - **Quality function**: Modularity + - **Resolution**: 1 + +## 6. Color the Graph by Clusters +1. Go to the `Appearance` pane in the upper left side of Gephi. + ++ +
+ +2. Select `Nodes`, then `Partition`, and click the color palette icon in the upper right. +3. Choose `Cluster` from the dropdown. +4. Click the `Palette...` hyperlink, then `Generate...`. +5. Uncheck `Limit number of colors`, click `Generate`, and then `Ok`. +6. Click `Apply` to color the graph. This will color the graph based on the partitions discovered by Leiden. + +## 7. Resize Nodes by Degree Centrality +1. In the `Appearance` pane in the upper left, select `Nodes` -> `Ranking` +2. Select the `Sizing` icon in the upper right. +2. Choose `Degree` and set: + - **Min**: 10 + - **Max**: 150 +3. Click `Apply`. + +## 8. Layout the Graph +1. In the `Layout` tab in the lower left, select `OpenORD`. + ++ +
+ +2. Set `Liquid` and `Expansion` stages to 50, and everything else to 0. +3. Click `Run` and monitor the progress. + +## 9. Run ForceAtlas2 +1. Select `Force Atlas 2` in the layout options. + ++ +
+ +2. Adjust the settings: + - **Scaling**: 15 + - **Dissuade Hubs**: checked + - **LinLog mode**: uncheck + - **Prevent Overlap**: checked +3. Click `Run` and wait. +4. Press `Stop` when it looks like the graph nodes have settled and no longer change position significantly. + +## 10. Add Text Labels (Optional) +1. Turn on text labels in the appropriate section. +2. Configure and resize them as needed. + +Your final graph should now be visually organized and ready for analysis! diff --git a/graphrag/api/query.py b/graphrag/api/query.py index b1e20409e5..60691c7108 100644 --- a/graphrag/api/query.py +++ b/graphrag/api/query.py @@ -276,7 +276,7 @@ async def local_search_streaming( reporter.info(f"Vector Store Args: {redact(vector_store_args)}") # type: ignore description_embedding_store = _get_embedding_store( - conf_args=vector_store_args, # type: ignore + config_args=vector_store_args, # type: ignore container_suffix="entity-description", ) diff --git a/graphrag/index/run/run.py b/graphrag/index/run/run.py index f6c8ebc141..8db214bf93 100644 --- a/graphrag/index/run/run.py +++ b/graphrag/index/run/run.py @@ -133,6 +133,11 @@ async def run_pipeline_with_config( if is_update_run and update_index_storage: delta_dataset = await get_delta_docs(dataset, storage) + # Fail on empty delta dataset + if delta_dataset.new_inputs.empty: + error_msg = "Incremental Indexing Error: No new documents to process." 
+ raise ValueError(error_msg) + delta_storage = update_index_storage.child("delta") # Run the pipeline on the new documents diff --git a/graphrag/index/update/incremental_index.py b/graphrag/index/update/incremental_index.py index e947fb92f4..57d3cd2b43 100644 --- a/graphrag/index/update/incremental_index.py +++ b/graphrag/index/update/incremental_index.py @@ -120,8 +120,12 @@ async def update_dataframe_outputs( ) # Merge final covariates - progress_reporter.info("Updating Final Covariates") - await _update_covariates(dataframe_dict, storage, update_storage) + if ( + await storage.has("create_final_covariates.parquet") + and "create_final_covariates" in dataframe_dict + ): + progress_reporter.info("Updating Final Covariates") + await _update_covariates(dataframe_dict, storage, update_storage) # Merge final nodes and update community ids progress_reporter.info("Updating Final Nodes") diff --git a/mkdocs.yaml b/mkdocs.yaml index 5f30e7fa13..c2a04c081a 100644 --- a/mkdocs.yaml +++ b/mkdocs.yaml @@ -54,6 +54,7 @@ nav: - Microsoft Research Blog: "blog_posts.md" - Extras: - CLI: "cli.md" + - Visualization Guide: "visualization_guide.md" - Operation Dulce: - About: "data/operation_dulce/ABOUT.md" - Document: "data/operation_dulce/Operation Dulce v2 1 1.md"