Merge branch 'microsoft:main' into master

KylinMountain · Nov 8, 2024 · feb9196 · feb9196
2 parents 1f1f7c8 + 20c1202
commit feb9196
Show file tree

Hide file tree

Showing 5 changed files with 138 additions and 2 deletions.
diff --git a/.semversioner/next-release/patch-20241107010037320137.json b/.semversioner/next-release/patch-20241107010037320137.json
@@ -0,0 +1,4 @@
+{
+  "type": "patch",
+  "description": "Add update cli entrypoint for incremental indexing"
+}
diff --git a/.vscode/settings.json b/.vscode/settings.json
@@ -3,6 +3,7 @@
     "**/.yarn": true,
     "**/.pnp.*": true
   },
+  "editor.formatOnSave": false,
   "eslint.nodePath": ".yarn/sdks",
   "typescript.tsdk": ".yarn/sdks/typescript/lib",
   "typescript.enablePromptUseWorkspaceTsdk": true,

diff --git a/graphrag/cli/index.py b/graphrag/cli/index.py
@@ -79,11 +79,76 @@ def index_cli(
     output_dir: Path | None,
 ):
     """Run the pipeline with the given config."""
+    config = load_config(root_dir, config_filepath)
+
+    _run_index(
+        config=config,
+        verbose=verbose,
+        resume=resume,
+        memprofile=memprofile,
+        cache=cache,
+        reporter=reporter,
+        emit=emit,
+        dry_run=dry_run,
+        skip_validation=skip_validation,
+        output_dir=output_dir,
+    )
+
+
+def update_cli(
+    root_dir: Path,
+    verbose: bool,
+    memprofile: bool,
+    cache: bool,
+    reporter: ReporterType,
+    config_filepath: Path | None,
+    emit: list[TableEmitterType],
+    skip_validation: bool,
+    output_dir: Path | None,
+):
+    """Run the update (incremental indexing) pipeline with the given config."""
+    config = load_config(root_dir, config_filepath)
+
+    # Check whether update storage is configured; if not, configure it with default values
+    if not config.update_index_storage:
+        from graphrag.config.defaults import STORAGE_TYPE, UPDATE_STORAGE_BASE_DIR
+        from graphrag.config.models.storage_config import StorageConfig
+
+        config.update_index_storage = StorageConfig(
+            type=STORAGE_TYPE,
+            base_dir=UPDATE_STORAGE_BASE_DIR,
+        )
+
+    _run_index(
+        config=config,
+        verbose=verbose,
+        resume=False,
+        memprofile=memprofile,
+        cache=cache,
+        reporter=reporter,
+        emit=emit,
+        dry_run=False,
+        skip_validation=skip_validation,
+        output_dir=output_dir,
+    )
+
+
+def _run_index(
+    config,
+    verbose,
+    resume,
+    memprofile,
+    cache,
+    reporter,
+    emit,
+    dry_run,
+    skip_validation,
+    output_dir,
+):
     progress_reporter = create_progress_reporter(reporter)
     info, error, success = _logger(progress_reporter)
     run_id = resume or time.strftime("%Y%m%d-%H%M%S")
 
-    config = load_config(root_dir, config_filepath)
     config.storage.base_dir = str(output_dir) if output_dir else config.storage.base_dir
     config.reporting.base_dir = (
         str(output_dir) if output_dir else config.reporting.base_dir

diff --git a/graphrag/cli/main.py b/graphrag/cli/main.py
@@ -16,7 +16,7 @@
 from graphrag.prompt_tune.generator import MAX_TOKEN_COUNT
 from graphrag.prompt_tune.loader import MIN_CHUNK_SIZE
 
-from .index import index_cli
+from .index import index_cli, update_cli
 from .initialize import initialize_project_at
 from .prompt_tune import prompt_tune
 from .query import run_drift_search, run_global_search, run_local_search
@@ -129,6 +129,71 @@ def _index_cli(
     )
 
 
+@app.command("update")
+def _update_cli(
+    config: Annotated[
+        Path | None,
+        typer.Option(
+            help="The configuration to use.", exists=True, file_okay=True, readable=True
+        ),
+    ] = None,
+    root: Annotated[
+        Path,
+        typer.Option(
+            help="The project root directory.",
+            exists=True,
+            dir_okay=True,
+            writable=True,
+            resolve_path=True,
+        ),
+    ] = Path(),  # set default to current directory
+    verbose: Annotated[
+        bool, typer.Option(help="Run the indexing pipeline with verbose logging")
+    ] = False,
+    memprofile: Annotated[
+        bool, typer.Option(help="Run the indexing pipeline with memory profiling")
+    ] = False,
+    reporter: Annotated[
+        ReporterType, typer.Option(help="The progress reporter to use.")
+    ] = ReporterType.RICH,
+    emit: Annotated[
+        str, typer.Option(help="The data formats to emit, comma-separated.")
+    ] = TableEmitterType.Parquet.value,
+    cache: Annotated[bool, typer.Option(help="Use LLM cache.")] = True,
+    skip_validation: Annotated[
+        bool,
+        typer.Option(
+            help="Skip any preflight validation. Useful when running no LLM steps."
+        ),
+    ] = False,
+    output: Annotated[
+        Path | None,
+        typer.Option(
+            help="Indexing pipeline output directory. Overrides storage.base_dir in the configuration file.",
+            dir_okay=True,
+            writable=True,
+            resolve_path=True,
+        ),
+    ] = None,
+):
+    """
+    Update an existing knowledge graph index.
+
+    Applies a default storage configuration (if not provided by config), saving the new index to the local file system in the `update_output` folder.
+    """
+    update_cli(
+        root_dir=root,
+        verbose=verbose,
+        memprofile=memprofile,
+        cache=cache,
+        reporter=ReporterType(reporter),
+        config_filepath=config,
+        emit=[TableEmitterType(value.strip()) for value in emit.split(",")],
+        skip_validation=skip_validation,
+        output_dir=output,
+    )
+
+
 @app.command("prompt-tune")
 def _prompt_tune_cli(
     root: Annotated[

diff --git a/pyproject.toml b/pyproject.toml
@@ -139,6 +139,7 @@ test_smoke = "pytest ./tests/smoke"
 test_notebook = "pytest ./tests/notebook"
 test_verbs = "pytest ./tests/verbs"
 index = "python -m graphrag index"
+update = "python -m graphrag update"
 init = "python -m graphrag init"
 query = "python -m graphrag query"
 prompt_tune = "python -m graphrag prompt-tune"