add encoding model to text-chunking config (microsoft#743)
* add encoding model to text-chunking config

* revert groupby fix, handled in other pr

* revert environment reader update for other pr
darthtrevino authored Jul 26, 2024
1 parent 971e7d9 commit 4c229af
Showing 8 changed files with 30 additions and 3 deletions.
4 changes: 4 additions & 0 deletions .semversioner/next-release/patch-20240726205654788488.json
@@ -0,0 +1,4 @@
+{
+  "type": "patch",
+  "description": "add encoding-model to text chunking config"
+}
1 change: 1 addition & 0 deletions docsite/posts/config/env_vars.md
@@ -137,6 +137,7 @@ These settings control the data input used by the pipeline. Any settings with a
 | `GRAPHRAG_CHUNK_SIZE` | The chunk size in tokens for text-chunk analysis windows. | `str` | optional | 1200 |
 | `GRAPHRAG_CHUNK_OVERLAP` | The chunk overlap in tokens for text-chunk analysis windows. | `str` | optional | 100 |
 | `GRAPHRAG_CHUNK_BY_COLUMNS` | A comma-separated list of document attributes to group by when performing TextUnit chunking. | `str` | optional | `id` |
+| `GRAPHRAG_CHUNK_ENCODING_MODEL` | The encoding model to use for chunking. | `str` | optional | `None` |
 
 ## Prompting Overrides
 
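As a hypothetical illustration of the table above (example values only; `cl100k_base` is one tiktoken encoding name, and the new variable defaults to `None`, inheriting the top-level encoding model):

```python
import os

# Example values only; these are not mandated defaults.
os.environ["GRAPHRAG_CHUNK_SIZE"] = "1200"
os.environ["GRAPHRAG_CHUNK_OVERLAP"] = "100"
os.environ["GRAPHRAG_CHUNK_BY_COLUMNS"] = "id"
os.environ["GRAPHRAG_CHUNK_ENCODING_MODEL"] = "cl100k_base"  # new in this change
```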
1 change: 1 addition & 0 deletions docsite/posts/config/json_yaml.md
@@ -102,6 +102,7 @@ This is the base LLM configuration section. Other steps may override this config
 - `size` **int** - The max chunk size in tokens.
 - `overlap` **int** - The chunk overlap in tokens.
 - `group_by_columns` **list[str]** - Group documents by fields before chunking.
+- `encoding_model` **str** - The text encoding model to use. Default is to use the top-level encoding model.
 - `strategy` **dict** - Fully override the chunking strategy.
 
 ## cache
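For illustration, a minimal sketch of the documented `chunks` keys expressed through the Pydantic model changed below (values are hypothetical):

```python
from graphrag.config.models.chunking_config import ChunkingConfig

# Hypothetical values mirroring the `chunks` keys documented above.
chunks = ChunkingConfig(
    size=1200,
    overlap=100,
    group_by_columns=["id"],
    encoding_model="cl100k_base",  # leave as None to inherit the top-level model
)
```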
1 change: 1 addition & 0 deletions graphrag/config/create_graphrag_config.py
@@ -387,6 +387,7 @@ def hydrate_parallelization_params(
             overlap=reader.int("overlap") or defs.CHUNK_OVERLAP,
             group_by_columns=reader.list("group_by_columns", "BY_COLUMNS")
             or defs.CHUNK_GROUP_BY_COLUMNS,
+            encoding_model=reader.str(Fragment.encoding_model),
         )
         with (
             reader.envvar_prefix(Section.snapshot),
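A sketch of the round trip this wires up, assuming `create_graphrag_config` is exported from `graphrag.config` and that a placeholder `GRAPHRAG_LLM_API_KEY` satisfies the required LLM settings:

```python
import os

from graphrag.config import create_graphrag_config

os.environ["GRAPHRAG_LLM_API_KEY"] = "test"  # placeholder key for illustration
os.environ["GRAPHRAG_CHUNK_ENCODING_MODEL"] = "cl100k_base"

# The reader picks up GRAPHRAG_CHUNK_ENCODING_MODEL via Fragment.encoding_model.
parameters = create_graphrag_config()
assert parameters.chunks.encoding_model == "cl100k_base"
```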
6 changes: 5 additions & 1 deletion graphrag/config/models/chunking_config.py
@@ -23,8 +23,11 @@ class ChunkingConfig(BaseModel):
         description="The chunk strategy to use, overriding the default tokenization strategy",
         default=None,
     )
+    encoding_model: str | None = Field(
+        default=None, description="The encoding model to use."
+    )
 
-    def resolved_strategy(self) -> dict:
+    def resolved_strategy(self, encoding_model: str) -> dict:
         """Get the resolved chunking strategy."""
         from graphrag.index.verbs.text.chunk import ChunkStrategyType
 
@@ -33,4 +36,5 @@ def resolved_strategy(self) -> dict:
"chunk_size": self.size,
"chunk_overlap": self.overlap,
"group_by_columns": self.group_by_columns,
"encoding_name": self.encoding_model or encoding_model,
}
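A small sketch of the new fallback, assuming `ChunkingConfig`'s other fields have usable defaults: a per-section `encoding_model` wins, otherwise the argument passed to `resolved_strategy` is used.

```python
from graphrag.config.models.chunking_config import ChunkingConfig

# Per-section override takes precedence over the top-level fallback.
override = ChunkingConfig(encoding_model="o200k_base")
assert override.resolved_strategy("cl100k_base")["encoding_name"] == "o200k_base"

# With encoding_model left as None, the fallback argument is used.
inherit = ChunkingConfig()
assert inherit.resolved_strategy("cl100k_base")["encoding_name"] == "cl100k_base"
```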
6 changes: 5 additions & 1 deletion graphrag/index/create_pipeline_config.py
@@ -217,7 +217,11 @@ def _text_unit_workflows(
             name=create_base_text_units,
             config={
                 "chunk_by": settings.chunks.group_by_columns,
-                "text_chunk": {"strategy": settings.chunks.resolved_strategy()},
+                "text_chunk": {
+                    "strategy": settings.chunks.resolved_strategy(
+                        settings.encoding_model
+                    )
+                },
             },
         ),
         PipelineWorkflowReference(
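For orientation, a hypothetical shape of the resolved workflow entry this produces, assuming package defaults (`tokens` strategy, size 1200, overlap 100, `cl100k_base`):

```python
# Hypothetical resolved config for the create_base_text_units workflow.
workflow_config = {
    "chunk_by": ["id"],
    "text_chunk": {
        "strategy": {
            "type": "tokens",
            "chunk_size": 1200,
            "chunk_overlap": 100,
            "group_by_columns": ["id"],
            "encoding_name": "cl100k_base",
        }
    },
}
```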
3 changes: 2 additions & 1 deletion graphrag/prompt_tune/loader/input.py
@@ -9,6 +9,7 @@
 import pandas as pd
 from datashaper import NoopVerbCallbacks, TableContainer, VerbInput
 
+import graphrag.config.defaults as defs
 from graphrag.config.models.graph_rag_config import GraphRagConfig
 from graphrag.index.input import load_input
 from graphrag.index.llm import load_llm_embeddings
@@ -61,7 +62,7 @@ async def load_docs_in_chunks(

     # convert to text units
     input = VerbInput(input=TableContainer(table=dataset))
-    chunk_strategy = config.chunks.resolved_strategy()
+    chunk_strategy = config.chunks.resolved_strategy(defs.ENCODING_MODEL)
 
     # Use smaller chunks, to avoid huge prompts
     chunk_strategy["chunk_size"] = chunk_size
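A self-contained sketch of what this pins down, assuming `ChunkingConfig` is default-constructible: prompt tuning resolves the strategy against the package default encoding model (`defs.ENCODING_MODEL`) rather than a user override, then shrinks the chunk size.

```python
import graphrag.config.defaults as defs
from graphrag.config.models.chunking_config import ChunkingConfig

# The package default is passed as the fallback encoding model.
chunk_strategy = ChunkingConfig().resolved_strategy(defs.ENCODING_MODEL)
chunk_strategy["chunk_size"] = 256  # hypothetical smaller size to keep prompts small
```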
11 changes: 11 additions & 0 deletions tests/unit/config/test_default_config.py
@@ -88,6 +88,7 @@
"GRAPHRAG_CHUNK_BY_COLUMNS": "a,b",
"GRAPHRAG_CHUNK_OVERLAP": "12",
"GRAPHRAG_CHUNK_SIZE": "500",
"GRAPHRAG_CHUNK_ENCODING_MODEL": "encoding-c",
"GRAPHRAG_CLAIM_EXTRACTION_ENABLED": "True",
"GRAPHRAG_CLAIM_EXTRACTION_DESCRIPTION": "test 123",
"GRAPHRAG_CLAIM_EXTRACTION_MAX_GLEANINGS": "5000",
@@ -468,6 +469,15 @@ def test_can_set_gleanings_to_zero(self):
         assert parameters.claim_extraction.max_gleanings == 0
         assert parameters.entity_extraction.max_gleanings == 0
 
+    @mock.patch.dict(
+        os.environ,
+        {"GRAPHRAG_LLM_API_KEY": "test", "GRAPHRAG_CHUNK_BY_COLUMNS": ""},
+        clear=True,
+    )
+    def test_can_set_no_chunk_by_columns(self):
+        parameters = create_graphrag_config()
+        assert parameters.chunks.group_by_columns == []
+
def test_all_env_vars_is_accurate(self):
env_var_docs_path = Path("docsite/posts/config/env_vars.md")
query_docs_path = Path("docsite/posts/query/3-cli.md")
@@ -528,6 +538,7 @@ def test_create_parameters_from_env_vars(self) -> None:
         assert parameters.chunks.group_by_columns == ["a", "b"]
         assert parameters.chunks.overlap == 12
         assert parameters.chunks.size == 500
+        assert parameters.chunks.encoding_model == "encoding-c"
         assert parameters.claim_extraction.enabled
         assert parameters.claim_extraction.description == "test 123"
         assert parameters.claim_extraction.max_gleanings == 5000
