add encoding model to text-chunking config (microsoft#743)
* add encoding model to text-chunking config

* revert groupby fix, handled in other pr

* revert environment reader update for other pr
darthtrevino authored Jul 26, 2024
1 parent 971e7d9 commit 4c229af
Showing 8 changed files with 30 additions and 3 deletions.
4 changes: 4 additions & 0 deletions .semversioner/next-release/patch-20240726205654788488.json
@@ -0,0 +1,4 @@
+{
+  "type": "patch",
+  "description": "add encoding-model to text chunking config"
+}
1 change: 1 addition & 0 deletions docsite/posts/config/env_vars.md
@@ -137,6 +137,7 @@ These settings control the data input used by the pipeline. Any settings with a
 | `GRAPHRAG_CHUNK_SIZE` | The chunk size in tokens for text-chunk analysis windows. | `str` | optional | 1200 |
 | `GRAPHRAG_CHUNK_OVERLAP` | The chunk overlap in tokens for text-chunk analysis windows. | `str` | optional | 100 |
 | `GRAPHRAG_CHUNK_BY_COLUMNS` | A comma-separated list of document attributes to group by when performing TextUnit chunking. | `str` | optional | `id` |
+| `GRAPHRAG_CHUNK_ENCODING_MODEL` | The encoding model to use for chunking. | `str` | optional | `None` |
 
 ## Prompting Overrides
 
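As a hypothetical illustration of the table above (example values only; `cl100k_base` is one tiktoken encoding name, and the new variable defaults to `None`, inheriting the top-level encoding model):

```python
import os

# Example values only; these are not mandated defaults.
os.environ["GRAPHRAG_CHUNK_SIZE"] = "1200"
os.environ["GRAPHRAG_CHUNK_OVERLAP"] = "100"
os.environ["GRAPHRAG_CHUNK_BY_COLUMNS"] = "id"
os.environ["GRAPHRAG_CHUNK_ENCODING_MODEL"] = "cl100k_base"  # new in this change
```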
1 change: 1 addition & 0 deletions docsite/posts/config/json_yaml.md
@@ -102,6 +102,7 @@ This is the base LLM configuration section. Other steps may override this config
 - `size` **int** - The max chunk size in tokens.
 - `overlap` **int** - The chunk overlap in tokens.
 - `group_by_columns` **list[str]** - Group documents by fields before chunking.
+- `encoding_model` **str** - The text encoding model to use. Default is to use the top-level encoding model.
 - `strategy` **dict** - Fully override the chunking strategy.
 
 ## cache
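For illustration, a minimal sketch of the documented `chunks` keys expressed through the Pydantic model changed below (values are hypothetical):

```python
from graphrag.config.models.chunking_config import ChunkingConfig

# Hypothetical values mirroring the `chunks` keys documented above.
chunks = ChunkingConfig(
    size=1200,
    overlap=100,
    group_by_columns=["id"],
    encoding_model="cl100k_base",  # leave as None to inherit the top-level model
)
```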
1 change: 1 addition & 0 deletions graphrag/config/create_graphrag_config.py
@@ -387,6 +387,7 @@ def hydrate_parallelization_params(
             overlap=reader.int("overlap") or defs.CHUNK_OVERLAP,
             group_by_columns=reader.list("group_by_columns", "BY_COLUMNS")
             or defs.CHUNK_GROUP_BY_COLUMNS,
+            encoding_model=reader.str(Fragment.encoding_model),
         )
         with (
             reader.envvar_prefix(Section.snapshot),
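A sketch of the round trip this wires up, assuming `create_graphrag_config` is exported from `graphrag.config` and that a placeholder `GRAPHRAG_LLM_API_KEY` satisfies the required LLM settings:

```python
import os

from graphrag.config import create_graphrag_config

os.environ["GRAPHRAG_LLM_API_KEY"] = "test"  # placeholder key for illustration
os.environ["GRAPHRAG_CHUNK_ENCODING_MODEL"] = "cl100k_base"

# The reader picks up GRAPHRAG_CHUNK_ENCODING_MODEL via Fragment.encoding_model.
parameters = create_graphrag_config()
assert parameters.chunks.encoding_model == "cl100k_base"
```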
6 changes: 5 additions & 1 deletion graphrag/config/models/chunking_config.py
@@ -23,8 +23,11 @@ class ChunkingConfig(BaseModel):
         description="The chunk strategy to use, overriding the default tokenization strategy",
         default=None,
     )
+    encoding_model: str | None = Field(
+        default=None, description="The encoding model to use."
+    )
 
-    def resolved_strategy(self) -> dict:
+    def resolved_strategy(self, encoding_model: str) -> dict:
         """Get the resolved chunking strategy."""
         from graphrag.index.verbs.text.chunk import ChunkStrategyType
 
@@ -33,4 +36,5 @@ def resolved_strategy(self) -> dict:
"chunk_size": self.size,
"chunk_overlap": self.overlap,
"group_by_columns": self.group_by_columns,
"encoding_name": self.encoding_model or encoding_model,
}
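A small sketch of the new fallback, assuming `ChunkingConfig`'s other fields have usable defaults: a per-section `encoding_model` wins, otherwise the argument passed to `resolved_strategy` is used.

```python
from graphrag.config.models.chunking_config import ChunkingConfig

# Per-section override takes precedence over the top-level fallback.
override = ChunkingConfig(encoding_model="o200k_base")
assert override.resolved_strategy("cl100k_base")["encoding_name"] == "o200k_base"

# With encoding_model left as None, the fallback argument is used.
inherit = ChunkingConfig()
assert inherit.resolved_strategy("cl100k_base")["encoding_name"] == "cl100k_base"
```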
6 changes: 5 additions & 1 deletion graphrag/index/create_pipeline_config.py
@@ -217,7 +217,11 @@ def _text_unit_workflows(
             name=create_base_text_units,
             config={
                 "chunk_by": settings.chunks.group_by_columns,
-                "text_chunk": {"strategy": settings.chunks.resolved_strategy()},
+                "text_chunk": {
+                    "strategy": settings.chunks.resolved_strategy(
+                        settings.encoding_model
+                    )
+                },
             },
         ),
         PipelineWorkflowReference(
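For orientation, a hypothetical shape of the resolved workflow entry this produces, assuming package defaults (`tokens` strategy, size 1200, overlap 100, `cl100k_base`):

```python
# Hypothetical resolved config for the create_base_text_units workflow.
workflow_config = {
    "chunk_by": ["id"],
    "text_chunk": {
        "strategy": {
            "type": "tokens",
            "chunk_size": 1200,
            "chunk_overlap": 100,
            "group_by_columns": ["id"],
            "encoding_name": "cl100k_base",
        }
    },
}
```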
3 changes: 2 additions & 1 deletion graphrag/prompt_tune/loader/input.py
@@ -9,6 +9,7 @@
 import pandas as pd
 from datashaper import NoopVerbCallbacks, TableContainer, VerbInput
 
+import graphrag.config.defaults as defs
 from graphrag.config.models.graph_rag_config import GraphRagConfig
 from graphrag.index.input import load_input
 from graphrag.index.llm import load_llm_embeddings
@@ -61,7 +62,7 @@ async def load_docs_in_chunks(

     # convert to text units
     input = VerbInput(input=TableContainer(table=dataset))
-    chunk_strategy = config.chunks.resolved_strategy()
+    chunk_strategy = config.chunks.resolved_strategy(defs.ENCODING_MODEL)
 
     # Use smaller chunks, to avoid huge prompts
     chunk_strategy["chunk_size"] = chunk_size
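A self-contained sketch of what this pins down, assuming `ChunkingConfig` is default-constructible: prompt tuning resolves the strategy against the package default encoding model (`defs.ENCODING_MODEL`) rather than a user override, then shrinks the chunk size.

```python
import graphrag.config.defaults as defs
from graphrag.config.models.chunking_config import ChunkingConfig

# The package default is passed as the fallback encoding model.
chunk_strategy = ChunkingConfig().resolved_strategy(defs.ENCODING_MODEL)
chunk_strategy["chunk_size"] = 256  # hypothetical smaller size to keep prompts small
```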
11 changes: 11 additions & 0 deletions tests/unit/config/test_default_config.py
@@ -88,6 +88,7 @@
"GRAPHRAG_CHUNK_BY_COLUMNS": "a,b",
"GRAPHRAG_CHUNK_OVERLAP": "12",
"GRAPHRAG_CHUNK_SIZE": "500",
"GRAPHRAG_CHUNK_ENCODING_MODEL": "encoding-c",
"GRAPHRAG_CLAIM_EXTRACTION_ENABLED": "True",
"GRAPHRAG_CLAIM_EXTRACTION_DESCRIPTION": "test 123",
"GRAPHRAG_CLAIM_EXTRACTION_MAX_GLEANINGS": "5000",
@@ -468,6 +469,15 @@ def test_can_set_gleanings_to_zero(self):
         assert parameters.claim_extraction.max_gleanings == 0
         assert parameters.entity_extraction.max_gleanings == 0
 
+    @mock.patch.dict(
+        os.environ,
+        {"GRAPHRAG_LLM_API_KEY": "test", "GRAPHRAG_CHUNK_BY_COLUMNS": ""},
+        clear=True,
+    )
+    def test_can_set_no_chunk_by_columns(self):
+        parameters = create_graphrag_config()
+        assert parameters.chunks.group_by_columns == []
+
def test_all_env_vars_is_accurate(self):
env_var_docs_path = Path("docsite/posts/config/env_vars.md")
query_docs_path = Path("docsite/posts/query/3-cli.md")
@@ -528,6 +538,7 @@ def test_create_parameters_from_env_vars(self) -> None:
         assert parameters.chunks.group_by_columns == ["a", "b"]
         assert parameters.chunks.overlap == 12
         assert parameters.chunks.size == 500
+        assert parameters.chunks.encoding_model == "encoding-c"
         assert parameters.claim_extraction.enabled
         assert parameters.claim_extraction.description == "test 123"
         assert parameters.claim_extraction.max_gleanings == 5000
