From 918740b6345bfde01406fa40eb0d3d82af7666ec Mon Sep 17 00:00:00 2001 From: ChrisJar Date: Thu, 12 Dec 2024 21:33:10 -0800 Subject: [PATCH 01/18] Switch split task to token based splitting --- .../primitives/tasks/split.py | 51 ++------ .../modules/transforms/nemo_doc_splitter.py | 119 +++++------------- src/nv_ingest/schemas/ingest_job_schema.py | 13 +- .../schemas/nemo_doc_splitter_schema.py | 17 +-- 4 files changed, 50 insertions(+), 150 deletions(-) diff --git a/client/src/nv_ingest_client/primitives/tasks/split.py b/client/src/nv_ingest_client/primitives/tasks/split.py index ddbae624..c9ac86cc 100644 --- a/client/src/nv_ingest_client/primitives/tasks/split.py +++ b/client/src/nv_ingest_client/primitives/tasks/split.py @@ -20,18 +20,8 @@ class SplitTaskSchema(BaseModel): - split_by: Optional[str] = "sentence" - split_length: Optional[int] = 10 - split_overlap: Optional[int] = 0 - max_character_length: Optional[int] = 1024 - sentence_window_size: Optional[int] = 0 - - @validator("split_by") - def split_by_must_be_valid(cls, v): - valid_criteria = ["page", "size", "word", "sentence"] - if v not in valid_criteria: - raise ValueError(f"split_by must be one of {valid_criteria}") - return v + split_by: str = "intfloat/e5-large-unsupervised" + chunk_size: int = 300 class Config: extra = "forbid" @@ -42,25 +32,17 @@ class SplitTask(Task): Object for document splitting task """ - _TypeSplitBy = Literal["word", "sentence", "passage"] - def __init__( self, - split_by: _TypeSplitBy = None, - split_length: int = None, - split_overlap: int = None, - max_character_length: int = None, - sentence_window_size: int = None, + tokenizer: str = None, + chunk_size: int = None, ) -> None: """ Setup Split Task Config """ super().__init__() - self._split_by = split_by - self._split_length = split_length - self._split_overlap = split_overlap - self._max_character_length = max_character_length - self._sentence_window_size = sentence_window_size + self._tokenizer = tokenizer + self._chunk_size = chunk_size def __str__(self) -> str: """ @@ -68,11 +50,8 @@ def __str__(self) -> str: """ info = "" info += "Split Task:\n" - info += f" split_by: {self._split_by}\n" - info += f" split_length: {self._split_length}\n" - info += f" split_overlap: {self._split_overlap}\n" - info += f" split_max_character_length: {self._max_character_length}\n" - info += f" split_sentence_window_size: {self._sentence_window_size}\n" + info += f" tokenizer: {self._tokenizer}\n" + info += f" chunk_size: {self._chunk_size}\n" return info def to_dict(self) -> Dict: @@ -81,15 +60,9 @@ def to_dict(self) -> Dict: """ split_params = {} - if self._split_by is not None: - split_params["split_by"] = self._split_by - if self._split_length is not None: - split_params["split_length"] = self._split_length - if self._split_overlap is not None: - split_params["split_overlap"] = self._split_overlap - if self._max_character_length is not None: - split_params["max_character_length"] = self._max_character_length - if self._sentence_window_size is not None: - split_params["sentence_window_size"] = self._sentence_window_size + if self._tokenizer is not None: + split_params["tokenizer"] = self._tokenizer + if self._chunk_size is not None: + split_params["chunk_size"] = self._chunk_size return {"type": "split", "task_properties": split_params} diff --git a/src/nv_ingest/modules/transforms/nemo_doc_splitter.py b/src/nv_ingest/modules/transforms/nemo_doc_splitter.py index 647d9bf1..b5685d8f 100644 --- a/src/nv_ingest/modules/transforms/nemo_doc_splitter.py +++ 
b/src/nv_ingest/modules/transforms/nemo_doc_splitter.py @@ -3,6 +3,7 @@ # SPDX-License-Identifier: Apache-2.0 +import os import copy import logging import traceback @@ -13,6 +14,7 @@ import mrc import pandas as pd +from transformers import AutoTokenizer from more_itertools import windowed from morpheus.messages import ControlMessage from morpheus.messages import MessageMeta @@ -33,24 +35,16 @@ logger = logging.getLogger(__name__) -def _build_split_documents(row, text_splits: List[str], sentence_window_size: int) -> List[dict[str, Any]]: - """Build documents from text splits with window text.""" +def _build_split_documents(row, chunks: List[str]) -> List[dict[str, Any]]: + """Build documents from text chunks""" documents: List[dict] = [] - window_size = sentence_window_size - for i, text in enumerate(text_splits): + for i, text in enumerate(chunks): if text is None or not text.strip(): continue metadata = row.metadata if hasattr(row, "metadata") and isinstance(row.metadata, dict) else {} metadata = copy.deepcopy(metadata) - if window_size > 0: - window_text = "".join( - text_splits[max(0, i - window_size) : min(i + 1 + window_size, len(text_splits))] # noqa: E203 - ) - - metadata["window"] = window_text - metadata["original_text"] = text metadata["content"] = text @@ -59,70 +53,33 @@ def _build_split_documents(row, text_splits: List[str], sentence_window_size: in return documents -def _split_into_units(text: str, split_by: Literal["word", "sentence", "passage"]) -> List[str]: - if split_by == "passage": - split_at = "\n\n" - elif split_by == "sentence": - split_at = "." # why not ?,!, etc..? - elif split_by == "word": - split_at = " " - else: - raise NotImplementedError("DocumentSplitter only supports 'passage', 'sentence'" " or 'word' split_by options.") - units = text.split(split_at) - # Add the delimiter back to all units except the last one - for i in range(len(units) - 1): - units[i] += split_at +def _split_into_chunks(text, tokenizer, chunk_size=300): + # Tokenize the text into token IDs + encoding = tokenizer.encode_plus(text, add_special_tokens=False, return_offsets_mapping=True) - return units + # Get the token IDs and offsets for splitting + tokens = encoding['input_ids'] + offsets = encoding['offset_mapping'] + # Split the tokens into chunks of the desired size + chunks = [tokens[i:i + chunk_size] for i in range(0, len(tokens), chunk_size)] -def _concatenate_units(units: List[str], split_length: int, split_overlap: int, max_character_length: int) -> List[str]: - text_splits = [] - segments = windowed(units, n=split_length, step=split_length - split_overlap) - for seg in segments: - current_units = [unit for unit in seg if unit is not None] - txt = "".join(current_units) - if max_character_length and len(txt) > max_character_length: - text_splits.extend(_split_long_text(txt, max_character_length)) - elif len(txt) > 0: - text_splits.append(txt) + # Convert token chunks back to text while preserving original spacing and case + text_chunks = [] + for chunk in chunks: + # Find the start and end offsets for the current chunk + chunk_offsets = offsets[:len(chunk)] + start_offset = chunk_offsets[0][0] + end_offset = chunk_offsets[-1][1] - return text_splits + # Extract the original text for this chunk based on offsets + text_chunk = text[start_offset:end_offset] + text_chunks.append(text_chunk) + # Remove processed offsets for the next iteration + offsets = offsets[len(chunk):] -def _split_long_text(text: str, max_character_length: int) -> List[str]: - """ - Splits a long text into 
smaller segments that - do not exceed max_character_length. - """ - split_texts = [] - while text: - # Take the maximum possible substring without exceeding max_character_length - segment = text[:max_character_length] - split_texts.append(segment) - text = text[max_character_length:] # noqa: E203 - - return split_texts - - -def _process_content(row, validated_config): - content = row["metadata"]["content"] - - if content is None: - raise ValueError( - "DocumentSplitter only works with text documents but one or more 'content' " "values are None." - ) - - units = _split_into_units(content, validated_config.split_by) - text_splits = _concatenate_units( - units, - validated_config.split_length, - validated_config.split_overlap, - max_character_length=validated_config.max_character_length, - ) - split_docs = _build_split_documents(row, text_splits, sentence_window_size=validated_config.sentence_window_size) - - return split_docs + return text_chunks MODULE_NAME = "nemo_document_splitter" @@ -167,16 +124,11 @@ def split_and_forward(message: ControlMessage): return message # Override parameters if set - split_by = task_props.get("split_by", validated_config.split_by) - split_length = task_props.get("split_length", validated_config.split_length) - split_overlap = task_props.get("split_overlap", validated_config.split_overlap) - max_character_length = task_props.get("max_character_length", validated_config.max_character_length) - sentence_window_size = task_props.get("sentence_window_size", validated_config.sentence_window_size) + tokenizer = task_props.get("tokenizer", validated_config.tokenizer) + chunk_size = task_props.get("chunk_size", validated_config.chunk_size) logger.info( - f"Splitting documents with split_by: {split_by}, split_length: {split_length}, " - f"split_overlap: {split_overlap}, max_character_length: {max_character_length}, " - f"sentence_window_size: {sentence_window_size}" + f"Splitting documents with tokenizer: {tokenizer}, chunk_size: {chunk_size} tokens" ) split_docs = [] @@ -188,14 +140,11 @@ def split_and_forward(message: ControlMessage): "DocumentSplitter only works with text documents but one or more " "'content' values are None." 
) - units = _split_into_units(content, split_by) - text_splits = _concatenate_units( - units, - split_length, - split_overlap, - max_character_length=max_character_length, - ) - split_docs.extend(_build_split_documents(row, text_splits, sentence_window_size=sentence_window_size)) + os.environ['TOKENIZERS_PARALLELISM'] = "False" + tokenizer_model = AutoTokenizer.from_pretrained(tokenizer) + + chunks = _split_into_chunks(content, tokenizer_model, chunk_size) + split_docs.extend(_build_split_documents(row, chunks)) split_docs_df = pd.DataFrame(split_docs) diff --git a/src/nv_ingest/schemas/ingest_job_schema.py b/src/nv_ingest/schemas/ingest_job_schema.py index 83c987ff..e3a9255c 100644 --- a/src/nv_ingest/schemas/ingest_job_schema.py +++ b/src/nv_ingest/schemas/ingest_job_schema.py @@ -61,17 +61,8 @@ class TracingOptionsSchema(BaseModelNoExt): class IngestTaskSplitSchema(BaseModelNoExt): - split_by: Literal["word", "sentence", "passage"] - split_length: conint(gt=0) - split_overlap: conint(ge=0) - max_character_length: Optional[conint(gt=0)] - sentence_window_size: Optional[conint(ge=0)] - - @validator("sentence_window_size") - def check_sentence_window_size(cls, v, values, **kwargs): - if v is not None and v > 0 and values["split_by"] != "sentence": - raise ValueError("When using sentence_window_size, split_by must be 'sentence'.") - return v + tokenizer: str + chunk_size: conint(gt=0) class IngestTaskExtractSchema(BaseModelNoExt): diff --git a/src/nv_ingest/schemas/nemo_doc_splitter_schema.py b/src/nv_ingest/schemas/nemo_doc_splitter_schema.py index 9c392459..9231eb4f 100644 --- a/src/nv_ingest/schemas/nemo_doc_splitter_schema.py +++ b/src/nv_ingest/schemas/nemo_doc_splitter_schema.py @@ -3,24 +3,11 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Literal -from typing import Optional - from pydantic import BaseModel from pydantic import conint -from pydantic import validator class DocumentSplitterSchema(BaseModel): - split_by: Literal["word", "sentence", "passage"] = "word" - split_length: conint(gt=0) = 60 - split_overlap: conint(ge=0) = 10 - max_character_length: Optional[conint(gt=0)] = 450 - sentence_window_size: Optional[conint(ge=0)] = 0 + tokenizer: str = "intfloat/e5-large-unsupervised" + chunk_size: conint(gt=0) = 300 raise_on_failure: bool = False - - @validator("sentence_window_size") - def check_sentence_window_size(cls, v, values, **kwargs): - if v is not None and v > 0 and values["split_by"] != "sentence": - raise ValueError("When using sentence_window_size, split_by must be 'sentence'.") - return v From c12df54eea864fd3ec6908cd87faa65cb7a1c839 Mon Sep 17 00:00:00 2001 From: Chris Jarrett Date: Wed, 18 Dec 2024 14:18:53 -0800 Subject: [PATCH 02/18] Move tokenizer out of loop --- src/nv_ingest/modules/transforms/nemo_doc_splitter.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/nv_ingest/modules/transforms/nemo_doc_splitter.py b/src/nv_ingest/modules/transforms/nemo_doc_splitter.py index b5685d8f..f168432d 100644 --- a/src/nv_ingest/modules/transforms/nemo_doc_splitter.py +++ b/src/nv_ingest/modules/transforms/nemo_doc_splitter.py @@ -131,6 +131,8 @@ def split_and_forward(message: ControlMessage): f"Splitting documents with tokenizer: {tokenizer}, chunk_size: {chunk_size} tokens" ) + tokenizer_model = AutoTokenizer.from_pretrained(tokenizer, token="") + split_docs = [] for _, row in df_filtered.iterrows(): content = row["metadata"]["content"] @@ -140,9 +142,6 @@ def split_and_forward(message: ControlMessage): "DocumentSplitter only works 
with text documents but one or more " "'content' values are None." ) - os.environ['TOKENIZERS_PARALLELISM'] = "False" - tokenizer_model = AutoTokenizer.from_pretrained(tokenizer) - chunks = _split_into_chunks(content, tokenizer_model, chunk_size) split_docs.extend(_build_split_documents(row, chunks)) From c88a4a283824ebe617b6f5f62b1c4c29ad8b7827 Mon Sep 17 00:00:00 2001 From: Chris Jarrett Date: Wed, 18 Dec 2024 15:17:03 -0800 Subject: [PATCH 03/18] Fix CLI --- client/src/nv_ingest_client/primitives/tasks/split.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/client/src/nv_ingest_client/primitives/tasks/split.py b/client/src/nv_ingest_client/primitives/tasks/split.py index c9ac86cc..24e7f956 100644 --- a/client/src/nv_ingest_client/primitives/tasks/split.py +++ b/client/src/nv_ingest_client/primitives/tasks/split.py @@ -20,7 +20,7 @@ class SplitTaskSchema(BaseModel): - split_by: str = "intfloat/e5-large-unsupervised" + tokenizer: str = "intfloat/e5-large-unsupervised" chunk_size: int = 300 class Config: From 0cacc9eba501a5878578098742e1fd2847ab702c Mon Sep 17 00:00:00 2001 From: Chris Jarrett Date: Wed, 8 Jan 2025 10:00:59 -0800 Subject: [PATCH 04/18] Add chunk_overlap parameter --- .../nv_ingest_client/primitives/tasks/split.py | 10 ++++++++-- .../modules/transforms/nemo_doc_splitter.py | 15 ++++++++------- src/nv_ingest/schemas/ingest_job_schema.py | 1 + src/nv_ingest/schemas/nemo_doc_splitter_schema.py | 1 + 4 files changed, 18 insertions(+), 9 deletions(-) diff --git a/client/src/nv_ingest_client/primitives/tasks/split.py b/client/src/nv_ingest_client/primitives/tasks/split.py index 24e7f956..d60f067d 100644 --- a/client/src/nv_ingest_client/primitives/tasks/split.py +++ b/client/src/nv_ingest_client/primitives/tasks/split.py @@ -22,6 +22,7 @@ class SplitTaskSchema(BaseModel): tokenizer: str = "intfloat/e5-large-unsupervised" chunk_size: int = 300 + chunk_overlap: int = 0 class Config: extra = "forbid" @@ -34,8 +35,9 @@ class SplitTask(Task): def __init__( self, - tokenizer: str = None, - chunk_size: int = None, + tokenizer: str = "intfloat/e5-large-unsupervised", + chunk_size: int = 300, + chunk_overlap: int = 0, ) -> None: """ Setup Split Task Config @@ -43,6 +45,7 @@ def __init__( super().__init__() self._tokenizer = tokenizer self._chunk_size = chunk_size + self._chunk_overlap = chunk_overlap def __str__(self) -> str: """ @@ -52,6 +55,7 @@ def __str__(self) -> str: info += "Split Task:\n" info += f" tokenizer: {self._tokenizer}\n" info += f" chunk_size: {self._chunk_size}\n" + info += f" chunk_overlap: {self._chunk_overlap}\n" return info def to_dict(self) -> Dict: @@ -64,5 +68,7 @@ def to_dict(self) -> Dict: split_params["tokenizer"] = self._tokenizer if self._chunk_size is not None: split_params["chunk_size"] = self._chunk_size + if self._chunk_overlap is not None: + split_params["chunk_overlap"] = self._chunk_overlap return {"type": "split", "task_properties": split_params} diff --git a/src/nv_ingest/modules/transforms/nemo_doc_splitter.py b/src/nv_ingest/modules/transforms/nemo_doc_splitter.py index f168432d..b3fc2fa6 100644 --- a/src/nv_ingest/modules/transforms/nemo_doc_splitter.py +++ b/src/nv_ingest/modules/transforms/nemo_doc_splitter.py @@ -58,17 +58,17 @@ def _split_into_chunks(text, tokenizer, chunk_size=300): encoding = tokenizer.encode_plus(text, add_special_tokens=False, return_offsets_mapping=True) # Get the token IDs and offsets for splitting - tokens = encoding['input_ids'] - offsets = encoding['offset_mapping'] + tokens = encoding["input_ids"] 
+ offsets = encoding["offset_mapping"] # Split the tokens into chunks of the desired size - chunks = [tokens[i:i + chunk_size] for i in range(0, len(tokens), chunk_size)] + chunks = [tokens[i : i + chunk_size] for i in range(0, len(tokens), chunk_size)] # Convert token chunks back to text while preserving original spacing and case text_chunks = [] for chunk in chunks: # Find the start and end offsets for the current chunk - chunk_offsets = offsets[:len(chunk)] + chunk_offsets = offsets[: len(chunk)] start_offset = chunk_offsets[0][0] end_offset = chunk_offsets[-1][1] @@ -77,7 +77,7 @@ def _split_into_chunks(text, tokenizer, chunk_size=300): text_chunks.append(text_chunk) # Remove processed offsets for the next iteration - offsets = offsets[len(chunk):] + offsets = offsets[len(chunk) :] return text_chunks @@ -124,11 +124,12 @@ def split_and_forward(message: ControlMessage): return message # Override parameters if set - tokenizer = task_props.get("tokenizer", validated_config.tokenizer) + tokenizer = task_props.get("tokenizer", validated_config.tokenizer) chunk_size = task_props.get("chunk_size", validated_config.chunk_size) + chunk_overlap = task_props.get("chunk_overlap", validated_config.chunk_overlap) logger.info( - f"Splitting documents with tokenizer: {tokenizer}, chunk_size: {chunk_size} tokens" + f"Splitting text with tokenizer: {tokenizer}, chunk_size: {chunk_size} tokens, chunk_overlap: {chunk_overlap}" ) tokenizer_model = AutoTokenizer.from_pretrained(tokenizer, token="") diff --git a/src/nv_ingest/schemas/ingest_job_schema.py b/src/nv_ingest/schemas/ingest_job_schema.py index e3a9255c..b7a97f19 100644 --- a/src/nv_ingest/schemas/ingest_job_schema.py +++ b/src/nv_ingest/schemas/ingest_job_schema.py @@ -63,6 +63,7 @@ class TracingOptionsSchema(BaseModelNoExt): class IngestTaskSplitSchema(BaseModelNoExt): tokenizer: str chunk_size: conint(gt=0) + chunk_overlap: conint(ge=0) class IngestTaskExtractSchema(BaseModelNoExt): diff --git a/src/nv_ingest/schemas/nemo_doc_splitter_schema.py b/src/nv_ingest/schemas/nemo_doc_splitter_schema.py index 9231eb4f..86a13d6e 100644 --- a/src/nv_ingest/schemas/nemo_doc_splitter_schema.py +++ b/src/nv_ingest/schemas/nemo_doc_splitter_schema.py @@ -10,4 +10,5 @@ class DocumentSplitterSchema(BaseModel): tokenizer: str = "intfloat/e5-large-unsupervised" chunk_size: conint(gt=0) = 300 + chunk_overlap: conint(ge=0) = 0 raise_on_failure: bool = False From d8a2de9a80cfdaf5044d01ebdb41873f29054443 Mon Sep 17 00:00:00 2001 From: Chris Jarrett Date: Wed, 8 Jan 2025 15:02:09 -0800 Subject: [PATCH 05/18] Fix broken tests --- .../primitives/tasks/split.py | 4 +- .../modules/transforms/nemo_doc_splitter.py | 7 +- src/nv_ingest/schemas/ingest_job_schema.py | 5 +- .../schemas/nemo_doc_splitter_schema.py | 9 ++- .../test_message_broker_task_source.py | 6 +- .../schemas/test_ingest_job_schema.py | 45 +++-------- .../schemas/test_nemo_doc_splitter_schema.py | 51 ++---------- tests/nv_ingest_client/cli/util/test_click.py | 2 +- tests/nv_ingest_client/client/test_client.py | 14 ++-- .../nv_ingest_client/client/test_interface.py | 6 +- .../primitives/tasks/test_split.py | 79 ++++++++----------- 11 files changed, 73 insertions(+), 155 deletions(-) diff --git a/client/src/nv_ingest_client/primitives/tasks/split.py b/client/src/nv_ingest_client/primitives/tasks/split.py index b0178885..5f45e806 100644 --- a/client/src/nv_ingest_client/primitives/tasks/split.py +++ b/client/src/nv_ingest_client/primitives/tasks/split.py @@ -8,10 +8,8 @@ import logging from typing import Dict 
-from typing import Literal -from typing import Optional -from pydantic import BaseModel, field_validator +from pydantic import BaseModel from .task_base import Task diff --git a/src/nv_ingest/modules/transforms/nemo_doc_splitter.py b/src/nv_ingest/modules/transforms/nemo_doc_splitter.py index 3ed12355..79c315b2 100644 --- a/src/nv_ingest/modules/transforms/nemo_doc_splitter.py +++ b/src/nv_ingest/modules/transforms/nemo_doc_splitter.py @@ -3,19 +3,16 @@ # SPDX-License-Identifier: Apache-2.0 -import os import copy import logging import traceback import uuid from typing import Any from typing import List -from typing import Literal import mrc import pandas as pd from transformers import AutoTokenizer -from more_itertools import windowed from morpheus.messages import ControlMessage from morpheus.messages import MessageMeta from morpheus.utils.control_message_utils import cm_skip_processing_if_failed @@ -133,7 +130,9 @@ def split_and_forward(message: ControlMessage): chunk_overlap = task_props.get("chunk_overlap", validated_config.chunk_overlap) logger.info( - f"Splitting text with tokenizer: {tokenizer}, chunk_size: {chunk_size} tokens, chunk_overlap: {chunk_overlap}" + f"Splitting text with tokenizer: {tokenizer}, " + f"chunk_size: {chunk_size} tokens, " + f"chunk_overlap: {chunk_overlap}" ) tokenizer_model = AutoTokenizer.from_pretrained(tokenizer, token="") diff --git a/src/nv_ingest/schemas/ingest_job_schema.py b/src/nv_ingest/schemas/ingest_job_schema.py index a11f03cb..e179894b 100644 --- a/src/nv_ingest/schemas/ingest_job_schema.py +++ b/src/nv_ingest/schemas/ingest_job_schema.py @@ -8,7 +8,6 @@ from typing import Any from typing import Dict from typing import List -from typing import Literal from typing import Optional from typing import Union @@ -61,8 +60,8 @@ class TracingOptionsSchema(BaseModelNoExt): class IngestTaskSplitSchema(BaseModelNoExt): tokenizer: str - chunk_size: conint(gt=0) - chunk_overlap: conint(ge=0) + chunk_size: Annotated[int, Field(gt=0)] + chunk_overlap: Annotated[int, Field(ge=0)] class IngestTaskExtractSchema(BaseModelNoExt): diff --git a/src/nv_ingest/schemas/nemo_doc_splitter_schema.py b/src/nv_ingest/schemas/nemo_doc_splitter_schema.py index 86a13d6e..ad34e583 100644 --- a/src/nv_ingest/schemas/nemo_doc_splitter_schema.py +++ b/src/nv_ingest/schemas/nemo_doc_splitter_schema.py @@ -3,12 +3,13 @@ # SPDX-License-Identifier: Apache-2.0 -from pydantic import BaseModel -from pydantic import conint +from pydantic import Field, BaseModel + +from typing_extensions import Annotated class DocumentSplitterSchema(BaseModel): tokenizer: str = "intfloat/e5-large-unsupervised" - chunk_size: conint(gt=0) = 300 - chunk_overlap: conint(ge=0) = 0 + chunk_size: Annotated[int, Field(gt=0)] = 300 + chunk_overlap: Annotated[int, Field(ge=0)] = 0 raise_on_failure: bool = False diff --git a/tests/nv_ingest/modules/sources/test_message_broker_task_source.py b/tests/nv_ingest/modules/sources/test_message_broker_task_source.py index a824f60d..08cd3a01 100644 --- a/tests/nv_ingest/modules/sources/test_message_broker_task_source.py +++ b/tests/nv_ingest/modules/sources/test_message_broker_task_source.py @@ -39,9 +39,9 @@ def job_payload(): { "type": "split", "task_properties": { - "split_by": "word", - "split_length": 100, - "split_overlap": 0, + "tokenizer": "intfloat/e5-large-unsupervised", + "chunk_size": 100, + "chunk_overlap": 0, }, }, { diff --git a/tests/nv_ingest/schemas/test_ingest_job_schema.py b/tests/nv_ingest/schemas/test_ingest_job_schema.py index 97045fbb..e9fb7f22 100644 
--- a/tests/nv_ingest/schemas/test_ingest_job_schema.py +++ b/tests/nv_ingest/schemas/test_ingest_job_schema.py @@ -26,11 +26,9 @@ def valid_task_properties(task_type): """Returns valid task properties based on the task type.""" if task_type == TaskTypeEnum.split: return { - "split_by": "sentence", - "split_length": 10, - "split_overlap": 0, - "max_character_length": 100, - "sentence_window_size": None, # This is valid when not required + "tokenizer": "intfloat/e5-large-unsupervised", + "chunk_size": 300, + "chunk_overlap": 0, } elif task_type == TaskTypeEnum.extract: return {"document_type": "pdf", "method": "OCR", "params": {"language": "en"}} @@ -119,27 +117,6 @@ def test_field_type_correctness(): validate_ingest_job(job_data) -def test_custom_validator_logic_for_sentence_window_size(): - """Tests custom validator logic related to sentence_window_size in split tasks.""" - task = { - "type": "split", - "task_properties": { - "split_by": "word", # Incorrect usage of sentence_window_size - "split_length": 10, - "split_overlap": 5, - "sentence_window_size": 5, # Should not be set when split_by is not 'sentence' - }, - } - job_data = { - "job_payload": valid_job_payload(), - "job_id": "123", - "tasks": [task], - } - with pytest.raises(ValidationError) as exc_info: - validate_ingest_job(job_data) - assert "sentence_window_size" in str(exc_info.value) and "must be 'sentence'" in str(exc_info.value) - - def test_multiple_task_types(): job_data = { "job_payload": { @@ -153,9 +130,9 @@ def test_multiple_task_types(): { "type": "split", "task_properties": { - "split_by": "word", - "split_length": 100, - "split_overlap": 0, + "tokenizer": "intfloat/e5-large-unsupervised", + "chunk_size": 100, + "chunk_overlap": 0, }, }, { @@ -250,9 +227,9 @@ def test_incorrect_property_types(): { "type": "split", "task_properties": { - "split_by": "word", - "split_length": {"not an int": 123}, # Incorrect type (should be int) - "split_overlap": 0, + "tokenizer": "intfloat/e5-large-unsupervised", + "chunk_size": {"not an int": 123}, # Incorrect type (should be int) + "chunk_overlap": 0, }, } ], @@ -269,8 +246,8 @@ def test_missing_required_fields(): { "type": "split", "task_properties": { - "split_by": "sentence", # Missing split_length - "split_overlap": 0, + "tokenizer": "intfloat/e5-large-unsupervised", # Missing chunk_size + "chunk_overlap": 0, }, } ], diff --git a/tests/nv_ingest/schemas/test_nemo_doc_splitter_schema.py b/tests/nv_ingest/schemas/test_nemo_doc_splitter_schema.py index 00f4e606..73636ccf 100644 --- a/tests/nv_ingest/schemas/test_nemo_doc_splitter_schema.py +++ b/tests/nv_ingest/schemas/test_nemo_doc_splitter_schema.py @@ -13,57 +13,16 @@ def test_document_splitter_schema_defaults(): Test the DocumentSplitterSchema with default values. """ schema = DocumentSplitterSchema() - assert schema.split_by == "word" - assert schema.split_length == 60 - assert schema.split_overlap == 10 - assert schema.max_character_length == 450 - assert schema.sentence_window_size == 0 + assert schema.tokenizer == "intfloat/e5-large-unsupervised" + assert schema.chunk_size == 300 + assert schema.chunk_overlap == 0 assert schema.raise_on_failure is False @pytest.mark.parametrize("invalid_value", [-1, 0]) def test_document_splitter_schema_invalid_split_length(invalid_value): """ - Test DocumentSplitterSchema with invalid split_length values. + Test DocumentSplitterSchema with invalid chunk_size values. 
""" with pytest.raises(ValidationError): - DocumentSplitterSchema(split_length=invalid_value) - - -@pytest.mark.parametrize( - "split_by, sentence_window_size, is_valid", - [ - ("sentence", 5, True), # Valid use of sentence_window_size - ( - "word", - 0, - True, - ), # Valid when split_by is not 'sentence' but sentence_window_size is 0 - ( - "word", - 5, - False, - ), # Invalid because sentence_window_size > 0 requires split_by to be 'sentence' - ], -) -def test_document_splitter_schema_sentence_window_size_validation(split_by, sentence_window_size, is_valid): - """ - Parametrized test for validating the sentence_window_size logic in DocumentSplitterSchema. - """ - if is_valid: - schema = DocumentSplitterSchema(split_by=split_by, sentence_window_size=sentence_window_size) - assert schema.sentence_window_size == sentence_window_size - assert schema.split_by == split_by - else: - with pytest.raises(ValidationError) as excinfo: - DocumentSplitterSchema(split_by=split_by, sentence_window_size=sentence_window_size) - assert "split_by must be 'sentence'" in str(excinfo.value) - - -def test_document_splitter_schema_optional_fields_none(): - """ - Test DocumentSplitterSchema with optional fields set to None. - """ - schema = DocumentSplitterSchema(max_character_length=None, sentence_window_size=None) - assert schema.max_character_length is None - assert schema.sentence_window_size is None + DocumentSplitterSchema(chunk_size=invalid_value) diff --git a/tests/nv_ingest_client/cli/util/test_click.py b/tests/nv_ingest_client/cli/util/test_click.py index f4f15ebd..a8ed1b0a 100644 --- a/tests/nv_ingest_client/cli/util/test_click.py +++ b/tests/nv_ingest_client/cli/util/test_click.py @@ -93,7 +93,7 @@ def test_debug_print_click_options(mock_pprint): def test_validate_task_with_valid_split(): """Test with valid split task options.""" - value = ['split:{"split_by": "page", "split_length": 10}'] + value = ['split:{"tokenizer": "intfloat/e5-large-unsupervised", "chunk_size": 300}'] result = click_validate_task(None, None, value) assert "split" in result diff --git a/tests/nv_ingest_client/client/test_client.py b/tests/nv_ingest_client/client/test_client.py index 05c90425..5a45936c 100644 --- a/tests/nv_ingest_client/client/test_client.py +++ b/tests/nv_ingest_client/client/test_client.py @@ -276,7 +276,7 @@ def test_correct_storage_of_job_details(nv_ingest_client): def test_successful_task_creation(nv_ingest_client_with_jobs): job_id = "12345678-1234-5678-1234-567812345678" task_type = TaskType.SPLIT - task_params = {"split_by": "sentence"} + task_params = {"tokenizer": "intfloat/e5-large-unsupervised"} # Assuming task_factory and task creation are implemented nv_ingest_client_with_jobs.create_task(job_id, task_type, task_params) @@ -288,7 +288,9 @@ def test_successful_task_creation(nv_ingest_client_with_jobs): def test_non_existent_job(nv_ingest_client): with pytest.raises(ValueError): - nv_ingest_client.create_task("nonexistent_job_id", TaskType.SPLIT, {"split_by": "sentence"}) + nv_ingest_client.create_task( + "nonexistent_job_id", TaskType.SPLIT, {"tokenizer": "intfloat/e5-large-unsupervised"} + ) def test_add_task_post_submission(nv_ingest_client_with_jobs): @@ -297,13 +299,13 @@ def test_add_task_post_submission(nv_ingest_client_with_jobs): nv_ingest_client_with_jobs._job_states[job_id].state = JobStateEnum.PROCESSING with pytest.raises(ValueError): - nv_ingest_client_with_jobs.create_task(job_id, TaskType.SPLIT, {"split_by": "sentence"}) + nv_ingest_client_with_jobs.create_task(job_id, TaskType.SPLIT, 
{"tokenizer": "intfloat/e5-large-unsupervised"}) def test_parameter_validation(nv_ingest_client_with_jobs): job_id = "12345678-1234-5678-1234-567812345678" task_type = TaskType.SPLIT - task_params = {"split_by": "sentence", "split_length": 128} + task_params = {"tokenizer": "intfloat/e5-large-unsupervised", "chunk_size": 128} nv_ingest_client_with_jobs.create_task(job_id, task_type, task_params) job_state = nv_ingest_client_with_jobs._job_states[job_id] @@ -580,8 +582,8 @@ def test_create_jobs_for_batch_duplicate_task(nv_ingest_client, mock_create_job_ files = ["file1.pdf"] duplicate_tasks = { - "split": SplitTask(split_by="sentence"), - "store": SplitTask(split_by="sentence"), # Duplicate task + "split": SplitTask(tokenizer="intfloat/e5-large-unsupervised"), + "store": SplitTask(tokenizer="intfloat/e5-large-unsupervised"), # Duplicate task } with pytest.raises(ValueError, match="Duplicate task detected"): diff --git a/tests/nv_ingest_client/client/test_interface.py b/tests/nv_ingest_client/client/test_interface.py index 64b9dc63..786b1f31 100644 --- a/tests/nv_ingest_client/client/test_interface.py +++ b/tests/nv_ingest_client/client/test_interface.py @@ -153,12 +153,12 @@ def test_split_task_no_args(ingestor): def test_split_task_some_args(ingestor): - ingestor.split(split_by="word", split_length=42) + ingestor.split(tokenizer="intfloat/e5-large-unsupervised", chunk_size=42) task = ingestor._job_specs.job_specs["pdf"][0]._tasks[0] assert isinstance(task, SplitTask) - assert task._split_by == "word" - assert task._split_length == 42 + assert task._tokenizer == "intfloat/e5-large-unsupervised" + assert task._chunk_size == 42 def test_store_task_no_args(ingestor): diff --git a/tests/nv_ingest_client/primitives/tasks/test_split.py b/tests/nv_ingest_client/primitives/tasks/test_split.py index 3fb0dbeb..a4531002 100644 --- a/tests/nv_ingest_client/primitives/tasks/test_split.py +++ b/tests/nv_ingest_client/primitives/tasks/test_split.py @@ -10,31 +10,22 @@ def test_split_task_initialization(): task = SplitTask( - split_by="word", - split_length=100, - split_overlap=10, - max_character_length=1000, - sentence_window_size=5, + tokenizer="intfloat/e5-large-unsupervised", + chunk_size=300, + chunk_overlap=0, ) - assert task._split_by == "word" - assert task._split_length == 100 - assert task._split_overlap == 10 - assert task._max_character_length == 1000 - assert task._sentence_window_size == 5 + assert task._tokenizer == "intfloat/e5-large-unsupervised" + assert task._chunk_size == 300 + assert task._chunk_overlap == 0 # String Representation Tests def test_split_task_str_representation(): - task = SplitTask(split_by="sentence", split_length=50, split_overlap=5) + task = SplitTask(tokenizer="intfloat/e5-large-unsupervised", chunk_size=50, chunk_overlap=5) expected_str = ( - "Split Task:\n" - " split_by: sentence\n" - " split_length: 50\n" - " split_overlap: 5\n" - " split_max_character_length: None\n" - " split_sentence_window_size: None\n" + "Split Task:\n" " tokenizer: intfloat/e5-large-unsupervised\n" " chunk_size: 50\n" " chunk_overlap: 5\n" ) assert str(task) == expected_str @@ -43,42 +34,33 @@ def test_split_task_str_representation(): @pytest.mark.parametrize( - "split_by, split_length, split_overlap, max_character_length, sentence_window_size", + "tokenizer, chunk_size, chunk_overlap", [ - ("word", 100, 10, 1000, 5), - ("sentence", 50, 5, None, None), - ("passage", None, None, 1500, 3), - (None, None, None, None, None), # Test default parameters + ("intfloat/e5-large-unsupervised", 100, 
10), + ("microsoft/deberta-large", 50, 5), + ("meta-llama/Llama-3.2-1B", 300, 0), ], ) def test_split_task_to_dict( - split_by, - split_length, - split_overlap, - max_character_length, - sentence_window_size, + tokenizer, + chunk_size, + chunk_overlap, ): task = SplitTask( - split_by=split_by, - split_length=split_length, - split_overlap=split_overlap, - max_character_length=max_character_length, - sentence_window_size=sentence_window_size, + tokenizer=tokenizer, + chunk_size=chunk_size, + chunk_overlap=chunk_overlap, ) expected_dict = {"type": "split", "task_properties": {}} # Only add properties to expected_dict if they are not None - if split_by is not None: - expected_dict["task_properties"]["split_by"] = split_by - if split_length is not None: - expected_dict["task_properties"]["split_length"] = split_length - if split_overlap is not None: - expected_dict["task_properties"]["split_overlap"] = split_overlap - if max_character_length is not None: - expected_dict["task_properties"]["max_character_length"] = max_character_length - if sentence_window_size is not None: - expected_dict["task_properties"]["sentence_window_size"] = sentence_window_size + if tokenizer is not None: + expected_dict["task_properties"]["tokenizer"] = tokenizer + if chunk_size is not None: + expected_dict["task_properties"]["chunk_size"] = chunk_size + if chunk_overlap is not None: + expected_dict["task_properties"]["chunk_overlap"] = chunk_overlap assert task.to_dict() == expected_dict, "The to_dict method did not return the expected dictionary representation" @@ -89,14 +71,15 @@ def test_split_task_to_dict( def test_split_task_default_params(): task = SplitTask() expected_str_contains = [ - "split_by: None", - "split_length: None", - "split_overlap: None", - "split_max_character_length: None", - "split_sentence_window_size: None", + "tokenizer: intfloat/e5-large-unsupervised", + "chunk_size: 300", + "chunk_overlap: 0", ] for expected_part in expected_str_contains: assert expected_part in str(task) - expected_dict = {"type": "split", "task_properties": {}} + expected_dict = { + "type": "split", + "task_properties": {"tokenizer": "intfloat/e5-large-unsupervised", "chunk_size": 300, "chunk_overlap": 0}, + } assert task.to_dict() == expected_dict From a21d0658017610eaa810f800eabc0fed802c603e Mon Sep 17 00:00:00 2001 From: Chris Jarrett Date: Wed, 8 Jan 2025 15:16:07 -0800 Subject: [PATCH 06/18] Add chunk overlap --- src/nv_ingest/modules/transforms/nemo_doc_splitter.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/nv_ingest/modules/transforms/nemo_doc_splitter.py b/src/nv_ingest/modules/transforms/nemo_doc_splitter.py index 79c315b2..e730f819 100644 --- a/src/nv_ingest/modules/transforms/nemo_doc_splitter.py +++ b/src/nv_ingest/modules/transforms/nemo_doc_splitter.py @@ -51,7 +51,7 @@ def _build_split_documents(row, chunks: List[str]) -> List[dict[str, Any]]: return documents -def _split_into_chunks(text, tokenizer, chunk_size=300): +def _split_into_chunks(text, tokenizer, chunk_size=300, chunk_overlap=0): # Tokenize the text into token IDs encoding = tokenizer.encode_plus(text, add_special_tokens=False, return_offsets_mapping=True) @@ -60,7 +60,7 @@ def _split_into_chunks(text, tokenizer, chunk_size=300): offsets = encoding["offset_mapping"] # Split the tokens into chunks of the desired size - chunks = [tokens[i : i + chunk_size] for i in range(0, len(tokens), chunk_size)] + chunks = [tokens[i : i + chunk_size] for i in range(0, len(tokens), chunk_size - chunk_overlap)] # Convert token 
chunks back to text while preserving original spacing and case text_chunks = [] @@ -146,7 +146,7 @@ def split_and_forward(message: ControlMessage): "DocumentSplitter only works with text documents but one or more " "'content' values are None." ) - chunks = _split_into_chunks(content, tokenizer_model, chunk_size) + chunks = _split_into_chunks(content, tokenizer_model, chunk_size, chunk_overlap) split_docs.extend(_build_split_documents(row, chunks)) split_docs_df = pd.DataFrame(split_docs) From bc2bf48dfc47ca016150b595b8d6e7ae3a45704b Mon Sep 17 00:00:00 2001 From: Chris Jarrett Date: Wed, 8 Jan 2025 17:19:47 -0800 Subject: [PATCH 07/18] Rename nemo document splitter to text splitter --- src/nv_ingest/modules/transforms/__init__.py | 4 ++-- .../{nemo_doc_splitter.py => text_splitter.py} | 12 ++++++------ src/nv_ingest/schemas/__init__.py | 4 ++-- .../schemas/ingest_pipeline_config_schema.py | 4 ++-- ..._splitter_schema.py => text_splitter_schema.py} | 2 +- src/nv_ingest/util/pipeline/__init__.py | 4 ++-- src/nv_ingest/util/pipeline/pipeline_builders.py | 6 +++--- src/nv_ingest/util/pipeline/stage_builders.py | 14 +++++++------- ...tter_schema.py => test_text_splitter_schema.py} | 14 +++++++------- 9 files changed, 32 insertions(+), 32 deletions(-) rename src/nv_ingest/modules/transforms/{nemo_doc_splitter.py => text_splitter.py} (93%) rename src/nv_ingest/schemas/{nemo_doc_splitter_schema.py => text_splitter_schema.py} (90%) rename tests/nv_ingest/schemas/{test_nemo_doc_splitter_schema.py => test_text_splitter_schema.py} (56%) diff --git a/src/nv_ingest/modules/transforms/__init__.py b/src/nv_ingest/modules/transforms/__init__.py index 4a39c32a..941cd120 100644 --- a/src/nv_ingest/modules/transforms/__init__.py +++ b/src/nv_ingest/modules/transforms/__init__.py @@ -3,6 +3,6 @@ # SPDX-License-Identifier: Apache-2.0 from .associate_nearby_text import AssociateNearbyTextLoaderFactory -from .nemo_doc_splitter import NemoDocSplitterLoaderFactory +from .text_splitter import TextSplitterLoaderFactory -__all__ = ["NemoDocSplitterLoaderFactory", "AssociateNearbyTextLoaderFactory"] +__all__ = ["TextSplitterLoaderFactory", "AssociateNearbyTextLoaderFactory"] diff --git a/src/nv_ingest/modules/transforms/nemo_doc_splitter.py b/src/nv_ingest/modules/transforms/text_splitter.py similarity index 93% rename from src/nv_ingest/modules/transforms/nemo_doc_splitter.py rename to src/nv_ingest/modules/transforms/text_splitter.py index e730f819..eb3ec875 100644 --- a/src/nv_ingest/modules/transforms/nemo_doc_splitter.py +++ b/src/nv_ingest/modules/transforms/text_splitter.py @@ -24,7 +24,7 @@ import cudf from nv_ingest.schemas.metadata_schema import ContentTypeEnum -from nv_ingest.schemas.nemo_doc_splitter_schema import DocumentSplitterSchema +from nv_ingest.schemas.text_splitter_schema import TextSplitterSchema from nv_ingest.util.exception_handlers.decorators import nv_ingest_node_failure_context_manager from nv_ingest.util.flow_control import filter_by_task from nv_ingest.util.modules.config_validator import fetch_and_validate_module_config @@ -80,19 +80,19 @@ def _split_into_chunks(text, tokenizer, chunk_size=300, chunk_overlap=0): return text_chunks -MODULE_NAME = "nemo_document_splitter" +MODULE_NAME = "text_splitter" MODULE_NAMESPACE = "nv_ingest" -NemoDocSplitterLoaderFactory = ModuleLoaderFactory(MODULE_NAME, MODULE_NAMESPACE, DocumentSplitterSchema) +TextSplitterLoaderFactory = ModuleLoaderFactory(MODULE_NAME, MODULE_NAMESPACE, TextSplitterSchema) @register_module(MODULE_NAME, MODULE_NAMESPACE) -def 
_nemo_document_splitter(builder: mrc.Builder): +def _text_splitter(builder: mrc.Builder): """ A pipeline module that splits documents into smaller parts based on the specified criteria. """ - validated_config = fetch_and_validate_module_config(builder, DocumentSplitterSchema) + validated_config = fetch_and_validate_module_config(builder, TextSplitterSchema) @filter_by_task(["split"]) @traceable(MODULE_NAME) @@ -143,7 +143,7 @@ def split_and_forward(message: ControlMessage): if content is None: raise ValueError( - "DocumentSplitter only works with text documents but one or more " "'content' values are None." + "TextSplitter only works with text documents but one or more " "'content' values are None." ) chunks = _split_into_chunks(content, tokenizer_model, chunk_size, chunk_overlap) diff --git a/src/nv_ingest/schemas/__init__.py b/src/nv_ingest/schemas/__init__.py index 43a94055..f3ff4659 100644 --- a/src/nv_ingest/schemas/__init__.py +++ b/src/nv_ingest/schemas/__init__.py @@ -13,13 +13,13 @@ from .message_broker_source_schema import MessageBrokerTaskSourceSchema from .metadata_injector_schema import MetadataInjectorSchema from .metadata_schema import validate_metadata -from .nemo_doc_splitter_schema import DocumentSplitterSchema +from .text_splitter_schema import TextSplitterSchema from .pdf_extractor_schema import PDFExtractorSchema from .task_injection_schema import TaskInjectionSchema from .vdb_task_sink_schema import VdbTaskSinkSchema __all__ = [ - "DocumentSplitterSchema", + "TextSplitterSchema", "ImageCaptionExtractionSchema", "ImageStorageModuleSchema", "IngestJobSchema", diff --git a/src/nv_ingest/schemas/ingest_pipeline_config_schema.py b/src/nv_ingest/schemas/ingest_pipeline_config_schema.py index 1471a338..b3c82f42 100644 --- a/src/nv_ingest/schemas/ingest_pipeline_config_schema.py +++ b/src/nv_ingest/schemas/ingest_pipeline_config_schema.py @@ -18,7 +18,7 @@ from nv_ingest.schemas.message_broker_sink_schema import MessageBrokerTaskSinkSchema from nv_ingest.schemas.message_broker_source_schema import MessageBrokerTaskSourceSchema from nv_ingest.schemas.metadata_injector_schema import MetadataInjectorSchema -from nv_ingest.schemas.nemo_doc_splitter_schema import DocumentSplitterSchema +from nv_ingest.schemas.text_splitter_schema import TextSplitterSchema from nv_ingest.schemas.otel_meter_schema import OpenTelemetryMeterSchema from nv_ingest.schemas.otel_tracer_schema import OpenTelemetryTracerSchema from nv_ingest.schemas.pdf_extractor_schema import PDFExtractorSchema @@ -30,7 +30,7 @@ class PipelineConfigSchema(BaseModel): chart_extractor_module: ChartExtractorSchema = ChartExtractorSchema() - document_splitter_module: DocumentSplitterSchema = DocumentSplitterSchema() + text_splitter_module: TextSplitterSchema = TextSplitterSchema() embedding_storage_module: EmbeddingStorageModuleSchema = EmbeddingStorageModuleSchema() embed_extractions_module: EmbedExtractionsSchema = EmbedExtractionsSchema() image_caption_extraction_module: ImageCaptionExtractionSchema = ImageCaptionExtractionSchema() diff --git a/src/nv_ingest/schemas/nemo_doc_splitter_schema.py b/src/nv_ingest/schemas/text_splitter_schema.py similarity index 90% rename from src/nv_ingest/schemas/nemo_doc_splitter_schema.py rename to src/nv_ingest/schemas/text_splitter_schema.py index ad34e583..db9b1508 100644 --- a/src/nv_ingest/schemas/nemo_doc_splitter_schema.py +++ b/src/nv_ingest/schemas/text_splitter_schema.py @@ -8,7 +8,7 @@ from typing_extensions import Annotated -class DocumentSplitterSchema(BaseModel): +class 
TextSplitterSchema(BaseModel): tokenizer: str = "intfloat/e5-large-unsupervised" chunk_size: Annotated[int, Field(gt=0)] = 300 chunk_overlap: Annotated[int, Field(ge=0)] = 0 diff --git a/src/nv_ingest/util/pipeline/__init__.py b/src/nv_ingest/util/pipeline/__init__.py index 5754c667..6957b2ea 100644 --- a/src/nv_ingest/util/pipeline/__init__.py +++ b/src/nv_ingest/util/pipeline/__init__.py @@ -17,7 +17,7 @@ add_table_extractor_stage, add_chart_extractor_stage, add_image_caption_stage, - add_nemo_splitter_stage, + add_text_splitter_stage, add_embed_extractions_stage, add_embedding_storage_stage, add_image_storage_stage, @@ -39,7 +39,7 @@ "add_table_extractor_stage", "add_chart_extractor_stage", "add_image_caption_stage", - "add_nemo_splitter_stage", + "add_text_splitter_stage", "add_embed_extractions_stage", "add_embedding_storage_stage", "add_image_storage_stage", diff --git a/src/nv_ingest/util/pipeline/pipeline_builders.py b/src/nv_ingest/util/pipeline/pipeline_builders.py index 842682f0..b431778c 100644 --- a/src/nv_ingest/util/pipeline/pipeline_builders.py +++ b/src/nv_ingest/util/pipeline/pipeline_builders.py @@ -47,7 +47,7 @@ def setup_ingestion_pipeline( ######################################################################################################## ## Transforms and data synthesis ######################################################################################################## - nemo_splitter_stage = add_nemo_splitter_stage(pipe, morpheus_pipeline_config, ingest_config) + text_splitter_stage = add_text_splitter_stage(pipe, morpheus_pipeline_config, ingest_config) embed_extractions_stage = add_embed_extractions_stage(pipe, morpheus_pipeline_config, ingest_config) ######################################################################################################## ## Storage and output @@ -80,8 +80,8 @@ def setup_ingestion_pipeline( pipe.add_edge(image_dedup_stage, image_filter_stage) pipe.add_edge(image_filter_stage, table_extraction_stage) pipe.add_edge(table_extraction_stage, chart_extraction_stage) - pipe.add_edge(chart_extraction_stage, nemo_splitter_stage) - pipe.add_edge(nemo_splitter_stage, image_caption_stage) + pipe.add_edge(chart_extraction_stage, text_splitter_stage) + pipe.add_edge(text_splitter_stage, image_caption_stage) pipe.add_edge(image_caption_stage, embed_extractions_stage) pipe.add_edge(embed_extractions_stage, image_storage_stage) pipe.add_edge(image_storage_stage, embedding_storage_stage) diff --git a/src/nv_ingest/util/pipeline/stage_builders.py b/src/nv_ingest/util/pipeline/stage_builders.py index b01338de..94e1aa94 100644 --- a/src/nv_ingest/util/pipeline/stage_builders.py +++ b/src/nv_ingest/util/pipeline/stage_builders.py @@ -21,7 +21,7 @@ from nv_ingest.modules.telemetry.otel_meter import OpenTelemetryMeterLoaderFactory from nv_ingest.modules.telemetry.otel_tracer import OpenTelemetryTracerLoaderFactory from nv_ingest.modules.transforms.embed_extractions import EmbedExtractionsLoaderFactory -from nv_ingest.modules.transforms.nemo_doc_splitter import NemoDocSplitterLoaderFactory +from nv_ingest.modules.transforms.text_splitter import TextSplitterLoaderFactory from nv_ingest.stages.docx_extractor_stage import generate_docx_extractor_stage from nv_ingest.stages.extractors.image_extractor_stage import generate_image_extractor_stage from nv_ingest.stages.filters import generate_dedup_stage @@ -331,15 +331,15 @@ def add_image_filter_stage(pipe, morpheus_pipeline_config, ingest_config, defaul return image_filter_stage -def 
add_nemo_splitter_stage(pipe, morpheus_pipeline_config, ingest_config): - nemo_splitter_loader = NemoDocSplitterLoaderFactory.get_instance( - module_name="nemo_doc_splitter", +def add_text_splitter_stage(pipe, morpheus_pipeline_config, ingest_config): + text_splitter_loader = TextSplitterLoaderFactory.get_instance( + module_name="text_splitter", module_config=ingest_config.get("text_splitting_module", {}), ) - nemo_splitter_stage = pipe.add_stage( + text_splitter_stage = pipe.add_stage( LinearModulesStage( morpheus_pipeline_config, - nemo_splitter_loader, + text_splitter_loader, input_type=ControlMessage, output_type=ControlMessage, input_port_name="input", @@ -347,7 +347,7 @@ def add_nemo_splitter_stage(pipe, morpheus_pipeline_config, ingest_config): ) ) - return nemo_splitter_stage + return text_splitter_stage def add_image_caption_stage(pipe, morpheus_pipeline_config, ingest_config, default_cpu_count): diff --git a/tests/nv_ingest/schemas/test_nemo_doc_splitter_schema.py b/tests/nv_ingest/schemas/test_text_splitter_schema.py similarity index 56% rename from tests/nv_ingest/schemas/test_nemo_doc_splitter_schema.py rename to tests/nv_ingest/schemas/test_text_splitter_schema.py index 73636ccf..5e2c2a80 100644 --- a/tests/nv_ingest/schemas/test_nemo_doc_splitter_schema.py +++ b/tests/nv_ingest/schemas/test_text_splitter_schema.py @@ -5,14 +5,14 @@ import pytest from pydantic import ValidationError -from nv_ingest.schemas import DocumentSplitterSchema +from nv_ingest.schemas import TextSplitterSchema -def test_document_splitter_schema_defaults(): +def test_text_splitter_schema_defaults(): """ - Test the DocumentSplitterSchema with default values. + Test the TextSplitterSchema with default values. """ - schema = DocumentSplitterSchema() + schema = TextSplitterSchema() assert schema.tokenizer == "intfloat/e5-large-unsupervised" assert schema.chunk_size == 300 assert schema.chunk_overlap == 0 @@ -20,9 +20,9 @@ def test_document_splitter_schema_defaults(): @pytest.mark.parametrize("invalid_value", [-1, 0]) -def test_document_splitter_schema_invalid_split_length(invalid_value): +def test_text_splitter_schema_invalid_split_length(invalid_value): """ - Test DocumentSplitterSchema with invalid chunk_size values. + Test TextSplitterSchema with invalid chunk_size values. 
""" with pytest.raises(ValidationError): - DocumentSplitterSchema(chunk_size=invalid_value) + TextSplitterSchema(chunk_size=invalid_value) From cd18083efdbe85d36cb8aa4fafd6e6788e9772b5 Mon Sep 17 00:00:00 2001 From: Chris Jarrett Date: Wed, 8 Jan 2025 17:27:08 -0800 Subject: [PATCH 08/18] Temp fix --- src/nv_ingest/modules/transforms/text_splitter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/nv_ingest/modules/transforms/text_splitter.py b/src/nv_ingest/modules/transforms/text_splitter.py index eb3ec875..1ba95d4b 100644 --- a/src/nv_ingest/modules/transforms/text_splitter.py +++ b/src/nv_ingest/modules/transforms/text_splitter.py @@ -135,7 +135,7 @@ def split_and_forward(message: ControlMessage): f"chunk_overlap: {chunk_overlap}" ) - tokenizer_model = AutoTokenizer.from_pretrained(tokenizer, token="") + tokenizer_model = AutoTokenizer.from_pretrained(tokenizer) split_docs = [] for _, row in df_filtered.iterrows(): From bab7f3d0fd71b6b54aad747ef34989c5724880a1 Mon Sep 17 00:00:00 2001 From: Chris Jarrett Date: Mon, 13 Jan 2025 15:02:07 -0800 Subject: [PATCH 09/18] Address reviews --- .../modules/transforms/text_splitter.py | 11 +--- src/nv_ingest/schemas/ingest_job_schema.py | 6 ++ src/nv_ingest/schemas/text_splitter_schema.py | 8 ++- .../schemas/test_ingest_job_schema.py | 20 +++++++ .../schemas/test_text_splitter_schema.py | 59 ++++++++++++++++++- 5 files changed, 92 insertions(+), 12 deletions(-) diff --git a/src/nv_ingest/modules/transforms/text_splitter.py b/src/nv_ingest/modules/transforms/text_splitter.py index 1ba95d4b..20210659 100644 --- a/src/nv_ingest/modules/transforms/text_splitter.py +++ b/src/nv_ingest/modules/transforms/text_splitter.py @@ -118,10 +118,6 @@ def split_and_forward(message: ControlMessage): df_filtered = df.loc[bool_index] if df_filtered.empty: - gdf = cudf.from_pandas(df) - message_meta = MessageMeta(df=gdf) - message.payload(message_meta) - return message # Override parameters if set @@ -139,12 +135,7 @@ def split_and_forward(message: ControlMessage): split_docs = [] for _, row in df_filtered.iterrows(): - content = row["metadata"]["content"] - - if content is None: - raise ValueError( - "TextSplitter only works with text documents but one or more " "'content' values are None." 
-                )
+            content = row["metadata"]["content"] if row["metadata"]["content"] is not None else ""
 
             chunks = _split_into_chunks(content, tokenizer_model, chunk_size, chunk_overlap)
             split_docs.extend(_build_split_documents(row, chunks))
diff --git a/src/nv_ingest/schemas/ingest_job_schema.py b/src/nv_ingest/schemas/ingest_job_schema.py
index f78cadd9..3decf647 100644
--- a/src/nv_ingest/schemas/ingest_job_schema.py
+++ b/src/nv_ingest/schemas/ingest_job_schema.py
@@ -63,6 +63,12 @@ class IngestTaskSplitSchema(BaseModelNoExt):
     chunk_size: Annotated[int, Field(gt=0)]
     chunk_overlap: Annotated[int, Field(ge=0)]
 
+    @field_validator("chunk_overlap")
+    def check_chunk_overlap(cls, v, values, **kwargs):
+        if v is not None and "chunk_size" in values.data and v >= values.data["chunk_size"]:
+            raise ValueError("chunk_overlap must be less than chunk_size")
+        return v
+
 
 class IngestTaskExtractSchema(BaseModelNoExt):
     document_type: DocumentTypeEnum
diff --git a/src/nv_ingest/schemas/text_splitter_schema.py b/src/nv_ingest/schemas/text_splitter_schema.py
index db9b1508..3fe639ca 100644
--- a/src/nv_ingest/schemas/text_splitter_schema.py
+++ b/src/nv_ingest/schemas/text_splitter_schema.py
@@ -3,7 +3,7 @@
 # SPDX-License-Identifier: Apache-2.0
 
 
-from pydantic import Field, BaseModel
+from pydantic import Field, BaseModel, field_validator
 
 from typing_extensions import Annotated
 
@@ -13,3 +13,9 @@ class TextSplitterSchema(BaseModel):
     chunk_size: Annotated[int, Field(gt=0)] = 300
     chunk_overlap: Annotated[int, Field(ge=0)] = 0
     raise_on_failure: bool = False
+
+    @field_validator("chunk_overlap")
+    def check_chunk_overlap(cls, v, values, **kwargs):
+        if v is not None and "chunk_size" in values.data and v >= values.data["chunk_size"]:
+            raise ValueError("chunk_overlap must be less than chunk_size")
+        return v
diff --git a/tests/nv_ingest/schemas/test_ingest_job_schema.py b/tests/nv_ingest/schemas/test_ingest_job_schema.py
index e9fb7f22..386e756d 100644
--- a/tests/nv_ingest/schemas/test_ingest_job_schema.py
+++ b/tests/nv_ingest/schemas/test_ingest_job_schema.py
@@ -117,6 +117,26 @@ def test_field_type_correctness():
     validate_ingest_job(job_data)
 
 
+def test_custom_validator_logic_for_sentence_window_size():
+    """Tests custom validator logic related to chunk_size and chunk_overlap in split tasks."""
+    task = {
+        "type": "split",
+        "task_properties": {
+            "tokenizer": "intfloat/e5-large-unsupervised",
+            "chunk_size": 200,
+            "chunk_overlap": 250,  # chunk_overlap should always be less than chunk_size
+        },
+    }
+    job_data = {
+        "job_payload": valid_job_payload(),
+        "job_id": "123",
+        "tasks": [task],
+    }
+    with pytest.raises(ValidationError) as exc_info:
+        validate_ingest_job(job_data)
+    assert "chunk_overlap must be less than chunk_size" in str(exc_info.value)
+
+
 def test_multiple_task_types():
     job_data = {
         "job_payload": {
diff --git a/tests/nv_ingest/schemas/test_text_splitter_schema.py b/tests/nv_ingest/schemas/test_text_splitter_schema.py
index 5e2c2a80..86488105 100644
--- a/tests/nv_ingest/schemas/test_text_splitter_schema.py
+++ b/tests/nv_ingest/schemas/test_text_splitter_schema.py
@@ -19,10 +19,67 @@ def test_text_splitter_schema_defaults():
     assert schema.raise_on_failure is False
 
 
+def test_text_splitter_schema_custom_values():
+    """
+    Test the TextSplitterSchema with custom values.
+ """ + tokenizer = "meta-llama/Llama-3.2-1B" + chunk_size = 500 + chunk_overlap = 10 + schema = TextSplitterSchema( + tokenizer=tokenizer, chunk_size=chunk_size, chunk_overlap=chunk_overlap, raise_on_failure=True + ) + assert schema.tokenizer == tokenizer + assert schema.chunk_size == chunk_size + assert schema.chunk_overlap == chunk_overlap + assert schema.raise_on_failure is True + + +@pytest.mark.parametrize("invalid_value", [50, 5.5]) +def test_text_splitter_schema_invalid_tokenizer(invalid_value): + """ + Test TextSplitterSchema with invalid tokenizer values. + """ + with pytest.raises(ValidationError): + TextSplitterSchema(tokenizer=invalid_value) + + @pytest.mark.parametrize("invalid_value", [-1, 0]) -def test_text_splitter_schema_invalid_split_length(invalid_value): +def test_text_splitter_schema_invalid_chunk_size(invalid_value): """ Test TextSplitterSchema with invalid chunk_size values. """ with pytest.raises(ValidationError): TextSplitterSchema(chunk_size=invalid_value) + + +@pytest.mark.parametrize("invalid_value", [-1, "a"]) +def test_text_splitter_schema_invalid_chunk_overlap(invalid_value): + """ + Test TextSplitterSchema with invalid chunk_overlap values. + """ + with pytest.raises(ValidationError): + TextSplitterSchema(chunk_overlap=invalid_value) + + +@pytest.mark.parametrize( + "chunk_size, chunk_overlap, is_valid", + [ + (300, 50, True), + (150, 0, True), + (100, 100, False), + (50, 200, False), + ], +) +def test_text_splitter_schema_chunk_overlap_validation(chunk_size, chunk_overlap, is_valid): + """ + Parametrized test for validating the chunk_overlap logic in TextSplitterSchema. + """ + if is_valid: + schema = TextSplitterSchema(chunk_size=chunk_size, chunk_overlap=chunk_overlap) + assert schema.chunk_size == chunk_size + assert schema.chunk_overlap == chunk_overlap + else: + with pytest.raises(ValidationError) as excinfo: + TextSplitterSchema(chunk_size=chunk_size, chunk_overlap=chunk_overlap) + assert "chunk_overlap must be less than chunk_size" in str(excinfo.value) From 2f4b979c1a114b6c6559673136ce90a412d57ccc Mon Sep 17 00:00:00 2001 From: Chris Jarrett Date: Mon, 10 Feb 2025 13:42:09 -0800 Subject: [PATCH 10/18] Change default chunk_size to 1024 --- client/src/nv_ingest_client/primitives/tasks/split.py | 4 ++-- src/nv_ingest/modules/transforms/text_splitter.py | 2 +- src/nv_ingest/schemas/text_splitter_schema.py | 2 +- tests/nv_ingest/schemas/test_text_splitter_schema.py | 2 +- tests/nv_ingest_client/primitives/tasks/test_split.py | 10 +++++----- 5 files changed, 10 insertions(+), 10 deletions(-) diff --git a/client/src/nv_ingest_client/primitives/tasks/split.py b/client/src/nv_ingest_client/primitives/tasks/split.py index 5f45e806..86094701 100644 --- a/client/src/nv_ingest_client/primitives/tasks/split.py +++ b/client/src/nv_ingest_client/primitives/tasks/split.py @@ -18,7 +18,7 @@ class SplitTaskSchema(BaseModel): tokenizer: str = "intfloat/e5-large-unsupervised" - chunk_size: int = 300 + chunk_size: int = 1024 chunk_overlap: int = 0 class Config: @@ -33,7 +33,7 @@ class SplitTask(Task): def __init__( self, tokenizer: str = "intfloat/e5-large-unsupervised", - chunk_size: int = 300, + chunk_size: int = 1024, chunk_overlap: int = 0, ) -> None: """ diff --git a/src/nv_ingest/modules/transforms/text_splitter.py b/src/nv_ingest/modules/transforms/text_splitter.py index 20210659..45b46ccd 100644 --- a/src/nv_ingest/modules/transforms/text_splitter.py +++ b/src/nv_ingest/modules/transforms/text_splitter.py @@ -51,7 +51,7 @@ def _build_split_documents(row, chunks: 
List[str]) -> List[dict[str, Any]]: return documents -def _split_into_chunks(text, tokenizer, chunk_size=300, chunk_overlap=0): +def _split_into_chunks(text, tokenizer, chunk_size=1024, chunk_overlap=0): # Tokenize the text into token IDs encoding = tokenizer.encode_plus(text, add_special_tokens=False, return_offsets_mapping=True) diff --git a/src/nv_ingest/schemas/text_splitter_schema.py b/src/nv_ingest/schemas/text_splitter_schema.py index 3fe639ca..46c7ba64 100644 --- a/src/nv_ingest/schemas/text_splitter_schema.py +++ b/src/nv_ingest/schemas/text_splitter_schema.py @@ -10,7 +10,7 @@ class TextSplitterSchema(BaseModel): tokenizer: str = "intfloat/e5-large-unsupervised" - chunk_size: Annotated[int, Field(gt=0)] = 300 + chunk_size: Annotated[int, Field(gt=0)] = 1024 chunk_overlap: Annotated[int, Field(ge=0)] = 0 raise_on_failure: bool = False diff --git a/tests/nv_ingest/schemas/test_text_splitter_schema.py b/tests/nv_ingest/schemas/test_text_splitter_schema.py index 86488105..e9406d8d 100644 --- a/tests/nv_ingest/schemas/test_text_splitter_schema.py +++ b/tests/nv_ingest/schemas/test_text_splitter_schema.py @@ -14,7 +14,7 @@ def test_text_splitter_schema_defaults(): """ schema = TextSplitterSchema() assert schema.tokenizer == "intfloat/e5-large-unsupervised" - assert schema.chunk_size == 300 + assert schema.chunk_size == 1024 assert schema.chunk_overlap == 0 assert schema.raise_on_failure is False diff --git a/tests/nv_ingest_client/primitives/tasks/test_split.py b/tests/nv_ingest_client/primitives/tasks/test_split.py index a4531002..2976d2ee 100644 --- a/tests/nv_ingest_client/primitives/tasks/test_split.py +++ b/tests/nv_ingest_client/primitives/tasks/test_split.py @@ -11,11 +11,11 @@ def test_split_task_initialization(): task = SplitTask( tokenizer="intfloat/e5-large-unsupervised", - chunk_size=300, + chunk_size=1024, chunk_overlap=0, ) assert task._tokenizer == "intfloat/e5-large-unsupervised" - assert task._chunk_size == 300 + assert task._chunk_size == 1024 assert task._chunk_overlap == 0 @@ -38,7 +38,7 @@ def test_split_task_str_representation(): [ ("intfloat/e5-large-unsupervised", 100, 10), ("microsoft/deberta-large", 50, 5), - ("meta-llama/Llama-3.2-1B", 300, 0), + ("meta-llama/Llama-3.2-1B", 1024, 0), ], ) def test_split_task_to_dict( @@ -72,7 +72,7 @@ def test_split_task_default_params(): task = SplitTask() expected_str_contains = [ "tokenizer: intfloat/e5-large-unsupervised", - "chunk_size: 300", + "chunk_size: 1024", "chunk_overlap: 0", ] for expected_part in expected_str_contains: @@ -80,6 +80,6 @@ def test_split_task_default_params(): expected_dict = { "type": "split", - "task_properties": {"tokenizer": "intfloat/e5-large-unsupervised", "chunk_size": 300, "chunk_overlap": 0}, + "task_properties": {"tokenizer": "intfloat/e5-large-unsupervised", "chunk_size": 1024, "chunk_overlap": 0}, } assert task.to_dict() == expected_dict From b0529427072855ba720c74965fc147c1cd3aa1ca Mon Sep 17 00:00:00 2001 From: Chris Jarrett Date: Mon, 10 Feb 2025 21:29:59 -0800 Subject: [PATCH 11/18] Change default chunk_overlap to 20 --- client/src/nv_ingest_client/primitives/tasks/split.py | 4 ++-- src/nv_ingest/modules/transforms/text_splitter.py | 2 +- src/nv_ingest/schemas/text_splitter_schema.py | 2 +- tests/nv_ingest/schemas/test_text_splitter_schema.py | 2 +- tests/nv_ingest_client/primitives/tasks/test_split.py | 4 ++-- 5 files changed, 7 insertions(+), 7 deletions(-) diff --git a/client/src/nv_ingest_client/primitives/tasks/split.py b/client/src/nv_ingest_client/primitives/tasks/split.py 
index 86094701..ece29605 100644 --- a/client/src/nv_ingest_client/primitives/tasks/split.py +++ b/client/src/nv_ingest_client/primitives/tasks/split.py @@ -19,7 +19,7 @@ class SplitTaskSchema(BaseModel): tokenizer: str = "intfloat/e5-large-unsupervised" chunk_size: int = 1024 - chunk_overlap: int = 0 + chunk_overlap: int = 20 class Config: extra = "forbid" @@ -34,7 +34,7 @@ def __init__( self, tokenizer: str = "intfloat/e5-large-unsupervised", chunk_size: int = 1024, - chunk_overlap: int = 0, + chunk_overlap: int = 20, ) -> None: """ Setup Split Task Config diff --git a/src/nv_ingest/modules/transforms/text_splitter.py b/src/nv_ingest/modules/transforms/text_splitter.py index 45b46ccd..694335ec 100644 --- a/src/nv_ingest/modules/transforms/text_splitter.py +++ b/src/nv_ingest/modules/transforms/text_splitter.py @@ -51,7 +51,7 @@ def _build_split_documents(row, chunks: List[str]) -> List[dict[str, Any]]: return documents -def _split_into_chunks(text, tokenizer, chunk_size=1024, chunk_overlap=0): +def _split_into_chunks(text, tokenizer, chunk_size=1024, chunk_overlap=20): # Tokenize the text into token IDs encoding = tokenizer.encode_plus(text, add_special_tokens=False, return_offsets_mapping=True) diff --git a/src/nv_ingest/schemas/text_splitter_schema.py b/src/nv_ingest/schemas/text_splitter_schema.py index 46c7ba64..ef191749 100644 --- a/src/nv_ingest/schemas/text_splitter_schema.py +++ b/src/nv_ingest/schemas/text_splitter_schema.py @@ -11,7 +11,7 @@ class TextSplitterSchema(BaseModel): tokenizer: str = "intfloat/e5-large-unsupervised" chunk_size: Annotated[int, Field(gt=0)] = 1024 - chunk_overlap: Annotated[int, Field(ge=0)] = 0 + chunk_overlap: Annotated[int, Field(ge=0)] = 20 raise_on_failure: bool = False @field_validator("chunk_overlap") diff --git a/tests/nv_ingest/schemas/test_text_splitter_schema.py b/tests/nv_ingest/schemas/test_text_splitter_schema.py index e9406d8d..66a725c2 100644 --- a/tests/nv_ingest/schemas/test_text_splitter_schema.py +++ b/tests/nv_ingest/schemas/test_text_splitter_schema.py @@ -15,7 +15,7 @@ def test_text_splitter_schema_defaults(): schema = TextSplitterSchema() assert schema.tokenizer == "intfloat/e5-large-unsupervised" assert schema.chunk_size == 1024 - assert schema.chunk_overlap == 0 + assert schema.chunk_overlap == 20 assert schema.raise_on_failure is False diff --git a/tests/nv_ingest_client/primitives/tasks/test_split.py b/tests/nv_ingest_client/primitives/tasks/test_split.py index 2976d2ee..eedadf61 100644 --- a/tests/nv_ingest_client/primitives/tasks/test_split.py +++ b/tests/nv_ingest_client/primitives/tasks/test_split.py @@ -73,13 +73,13 @@ def test_split_task_default_params(): expected_str_contains = [ "tokenizer: intfloat/e5-large-unsupervised", "chunk_size: 1024", - "chunk_overlap: 0", + "chunk_overlap: 20", ] for expected_part in expected_str_contains: assert expected_part in str(task) expected_dict = { "type": "split", - "task_properties": {"tokenizer": "intfloat/e5-large-unsupervised", "chunk_size": 1024, "chunk_overlap": 0}, + "task_properties": {"tokenizer": "intfloat/e5-large-unsupervised", "chunk_size": 1024, "chunk_overlap": 20}, } assert task.to_dict() == expected_dict From 317a42621f6a4dfc0041ad03037707b419325c93 Mon Sep 17 00:00:00 2001 From: Chris Jarrett Date: Tue, 11 Feb 2025 16:32:31 -0800 Subject: [PATCH 12/18] Pass huggingface access token as param --- .../primitives/tasks/split.py | 13 +++++++++-- .../modules/transforms/text_splitter.py | 16 ++++++++++---- src/nv_ingest/schemas/ingest_job_schema.py | 1 + 
src/nv_ingest/schemas/text_splitter_schema.py | 3 ++- .../test_message_broker_task_source.py | 1 + .../schemas/test_ingest_job_schema.py | 5 +++++ .../schemas/test_text_splitter_schema.py | 2 +- .../primitives/tasks/test_split.py | 22 ++++++++++++------- 8 files changed, 47 insertions(+), 16 deletions(-) diff --git a/client/src/nv_ingest_client/primitives/tasks/split.py b/client/src/nv_ingest_client/primitives/tasks/split.py index ece29605..cfada11f 100644 --- a/client/src/nv_ingest_client/primitives/tasks/split.py +++ b/client/src/nv_ingest_client/primitives/tasks/split.py @@ -6,6 +6,8 @@ # pylint: disable=too-few-public-methods # pylint: disable=too-many-arguments +import os + import logging from typing import Dict @@ -17,9 +19,10 @@ class SplitTaskSchema(BaseModel): - tokenizer: str = "intfloat/e5-large-unsupervised" + tokenizer: str = "meta-llama/Llama-3.2-1B" chunk_size: int = 1024 chunk_overlap: int = 20 + params: dict = {} class Config: extra = "forbid" @@ -32,9 +35,10 @@ class SplitTask(Task): def __init__( self, - tokenizer: str = "intfloat/e5-large-unsupervised", + tokenizer: str = "meta-llama/Llama-3.2-1B", chunk_size: int = 1024, chunk_overlap: int = 20, + params: dict = {}, ) -> None: """ Setup Split Task Config @@ -43,6 +47,7 @@ def __init__( self._tokenizer = tokenizer self._chunk_size = chunk_size self._chunk_overlap = chunk_overlap + self._params = params def __str__(self) -> str: """ @@ -53,6 +58,8 @@ def __str__(self) -> str: info += f" tokenizer: {self._tokenizer}\n" info += f" chunk_size: {self._chunk_size}\n" info += f" chunk_overlap: {self._chunk_overlap}\n" + for key, value in self._params.items(): + info += f" {key}: {value}\n" return info def to_dict(self) -> Dict: @@ -67,5 +74,7 @@ def to_dict(self) -> Dict: split_params["chunk_size"] = self._chunk_size if self._chunk_overlap is not None: split_params["chunk_overlap"] = self._chunk_overlap + if self._params is not None: + split_params["params"] = self._params return {"type": "split", "task_properties": split_params} diff --git a/src/nv_ingest/modules/transforms/text_splitter.py b/src/nv_ingest/modules/transforms/text_splitter.py index 694335ec..496ac094 100644 --- a/src/nv_ingest/modules/transforms/text_splitter.py +++ b/src/nv_ingest/modules/transforms/text_splitter.py @@ -59,7 +59,7 @@ def _split_into_chunks(text, tokenizer, chunk_size=1024, chunk_overlap=20): tokens = encoding["input_ids"] offsets = encoding["offset_mapping"] - # Split the tokens into chunks of the desired size + # Split the tokens into chunks of the desired size with the desired overlap chunks = [tokens[i : i + chunk_size] for i in range(0, len(tokens), chunk_size - chunk_overlap)] # Convert token chunks back to text while preserving original spacing and case @@ -113,7 +113,12 @@ def split_and_forward(message: ControlMessage): with message.payload().mutable_dataframe() as mdf: df = mdf.to_pandas() - # Filter to text only + # Filter to txt files only + # bool_index = df["document_type"] == ContentTypeEnum.TEXT + # df_filtered = df.loc[bool_index] + logger.info(df.columns) + + # Filter to document type bool_index = df["document_type"] == ContentTypeEnum.TEXT df_filtered = df.loc[bool_index] @@ -124,14 +129,17 @@ def split_and_forward(message: ControlMessage): tokenizer = task_props.get("tokenizer", validated_config.tokenizer) chunk_size = task_props.get("chunk_size", validated_config.chunk_size) chunk_overlap = task_props.get("chunk_overlap", validated_config.chunk_overlap) + params = task_props.get("params", {}) + + hf_access_token = 
params.get("hf_access_token", None) - logger.info( + logger.debug( f"Splitting text with tokenizer: {tokenizer}, " f"chunk_size: {chunk_size} tokens, " f"chunk_overlap: {chunk_overlap}" ) - tokenizer_model = AutoTokenizer.from_pretrained(tokenizer) + tokenizer_model = AutoTokenizer.from_pretrained(tokenizer, token=hf_access_token) split_docs = [] for _, row in df_filtered.iterrows(): diff --git a/src/nv_ingest/schemas/ingest_job_schema.py b/src/nv_ingest/schemas/ingest_job_schema.py index 17a92903..0341394f 100644 --- a/src/nv_ingest/schemas/ingest_job_schema.py +++ b/src/nv_ingest/schemas/ingest_job_schema.py @@ -62,6 +62,7 @@ class IngestTaskSplitSchema(BaseModelNoExt): tokenizer: str chunk_size: Annotated[int, Field(gt=0)] chunk_overlap: Annotated[int, Field(ge=0)] + params: dict @field_validator("chunk_overlap") def check_chunk_overlap(cls, v, values, **kwargs): diff --git a/src/nv_ingest/schemas/text_splitter_schema.py b/src/nv_ingest/schemas/text_splitter_schema.py index ef191749..37380af6 100644 --- a/src/nv_ingest/schemas/text_splitter_schema.py +++ b/src/nv_ingest/schemas/text_splitter_schema.py @@ -3,13 +3,14 @@ # SPDX-License-Identifier: Apache-2.0 +from typing import Optional from pydantic import Field, BaseModel, field_validator from typing_extensions import Annotated class TextSplitterSchema(BaseModel): - tokenizer: str = "intfloat/e5-large-unsupervised" + tokenizer: str = "meta-llama/Llama-3.2-1B" chunk_size: Annotated[int, Field(gt=0)] = 1024 chunk_overlap: Annotated[int, Field(ge=0)] = 20 raise_on_failure: bool = False diff --git a/tests/nv_ingest/modules/sources/test_message_broker_task_source.py b/tests/nv_ingest/modules/sources/test_message_broker_task_source.py index c897f246..3cb9297e 100644 --- a/tests/nv_ingest/modules/sources/test_message_broker_task_source.py +++ b/tests/nv_ingest/modules/sources/test_message_broker_task_source.py @@ -42,6 +42,7 @@ def job_payload(): "tokenizer": "intfloat/e5-large-unsupervised", "chunk_size": 100, "chunk_overlap": 0, + "params": {}, }, }, { diff --git a/tests/nv_ingest/schemas/test_ingest_job_schema.py b/tests/nv_ingest/schemas/test_ingest_job_schema.py index b98610f6..f9d559ca 100644 --- a/tests/nv_ingest/schemas/test_ingest_job_schema.py +++ b/tests/nv_ingest/schemas/test_ingest_job_schema.py @@ -29,6 +29,7 @@ def valid_task_properties(task_type): "tokenizer": "intfloat/e5-large-unsupervised", "chunk_size": 300, "chunk_overlap": 0, + "params": {}, } elif task_type == TaskTypeEnum.extract: return {"document_type": "pdf", "method": "OCR", "params": {"language": "en"}} @@ -122,6 +123,7 @@ def test_custom_validator_logic_for_sentence_window_size(): "tokanizer": "intfloat/e5-large-unsupervised", "chunk_size": 200, "chunk_overlap": 250, # chunk_overlap should always be less than chunk_size + "params": {}, }, } job_data = { @@ -150,6 +152,7 @@ def test_multiple_task_types(): "tokenizer": "intfloat/e5-large-unsupervised", "chunk_size": 100, "chunk_overlap": 0, + "params": {}, }, }, { @@ -244,6 +247,7 @@ def test_incorrect_property_types(): "tokenizer": "intfloat/e5-large-unsupervised", "chunk_size": {"not an int": 123}, # Incorrect type (should be int) "chunk_overlap": 0, + "params": {}, }, } ], @@ -262,6 +266,7 @@ def test_missing_required_fields(): "task_properties": { "tokenizer": "intfloat/e5-large-unsupervised", # Missing chunk_size "chunk_overlap": 0, + "params": {}, }, } ], diff --git a/tests/nv_ingest/schemas/test_text_splitter_schema.py b/tests/nv_ingest/schemas/test_text_splitter_schema.py index 66a725c2..bbdda847 100644 --- 
a/tests/nv_ingest/schemas/test_text_splitter_schema.py +++ b/tests/nv_ingest/schemas/test_text_splitter_schema.py @@ -13,7 +13,7 @@ def test_text_splitter_schema_defaults(): Test the TextSplitterSchema with default values. """ schema = TextSplitterSchema() - assert schema.tokenizer == "intfloat/e5-large-unsupervised" + assert schema.tokenizer == "meta-llama/Llama-3.2-1B" assert schema.chunk_size == 1024 assert schema.chunk_overlap == 20 assert schema.raise_on_failure is False diff --git a/tests/nv_ingest_client/primitives/tasks/test_split.py b/tests/nv_ingest_client/primitives/tasks/test_split.py index eedadf61..6af26afc 100644 --- a/tests/nv_ingest_client/primitives/tasks/test_split.py +++ b/tests/nv_ingest_client/primitives/tasks/test_split.py @@ -10,13 +10,15 @@ def test_split_task_initialization(): task = SplitTask( - tokenizer="intfloat/e5-large-unsupervised", + tokenizer="meta-llama/Llama-3.2-1B", chunk_size=1024, chunk_overlap=0, + params={}, ) - assert task._tokenizer == "intfloat/e5-large-unsupervised" + assert task._tokenizer == "meta-llama/Llama-3.2-1B" assert task._chunk_size == 1024 assert task._chunk_overlap == 0 + assert task._params == {} # String Representation Tests @@ -34,22 +36,24 @@ def test_split_task_str_representation(): @pytest.mark.parametrize( - "tokenizer, chunk_size, chunk_overlap", + "tokenizer, chunk_size, chunk_overlap, params", [ - ("intfloat/e5-large-unsupervised", 100, 10), - ("microsoft/deberta-large", 50, 5), - ("meta-llama/Llama-3.2-1B", 1024, 0), + ("intfloat/e5-large-unsupervised", 100, 10, {}), + ("microsoft/deberta-large", 50, 5, None), + ("meta-llama/Llama-3.2-1B", 1024, 0, {"hf_access_token": "TOKEN"}), ], ) def test_split_task_to_dict( tokenizer, chunk_size, chunk_overlap, + params, ): task = SplitTask( tokenizer=tokenizer, chunk_size=chunk_size, chunk_overlap=chunk_overlap, + params=params, ) expected_dict = {"type": "split", "task_properties": {}} @@ -61,6 +65,8 @@ def test_split_task_to_dict( expected_dict["task_properties"]["chunk_size"] = chunk_size if chunk_overlap is not None: expected_dict["task_properties"]["chunk_overlap"] = chunk_overlap + if params is not None: + expected_dict["task_properties"]["params"] = params assert task.to_dict() == expected_dict, "The to_dict method did not return the expected dictionary representation" @@ -71,7 +77,7 @@ def test_split_task_to_dict( def test_split_task_default_params(): task = SplitTask() expected_str_contains = [ - "tokenizer: intfloat/e5-large-unsupervised", + "tokenizer: meta-llama/Llama-3.2-1B", "chunk_size: 1024", "chunk_overlap: 20", ] @@ -80,6 +86,6 @@ def test_split_task_default_params(): expected_dict = { "type": "split", - "task_properties": {"tokenizer": "intfloat/e5-large-unsupervised", "chunk_size": 1024, "chunk_overlap": 20}, + "task_properties": {"tokenizer": "meta-llama/Llama-3.2-1B", "chunk_size": 1024, "chunk_overlap": 20, "params": {}}, } assert task.to_dict() == expected_dict From e250effc97b6c3cc1e23671bded870f8e53c96ff Mon Sep 17 00:00:00 2001 From: Chris Jarrett Date: Tue, 11 Feb 2025 16:57:47 -0800 Subject: [PATCH 13/18] Add llama license notice --- README.md | 4 ++++ client/src/nv_ingest_client/primitives/tasks/split.py | 2 -- src/nv_ingest/schemas/text_splitter_schema.py | 2 -- tests/nv_ingest_client/primitives/tasks/test_split.py | 7 ++++++- 4 files changed, 10 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 9f9fa906..5b40e591 100644 --- a/README.md +++ b/README.md @@ -397,6 +397,10 @@ https://pypi.org/project/pdfservices-sdk/ required if you 
want to use the Adobe extraction service for PDF decomposition. Please review the [license agreement](https://github.com/adobe/pdfservices-python-sdk?tab=License-1-ov-file) for the pdfservices-sdk before enabling this option. +- **`Split`**: + - **Description**: By default, the Split task tokenizer is set to `meta-llama/Llama-3.2-1B`, which will download + Llama-3.2-1B materials from Huggingface in order to perform tokenizer based splitting. Please review the + [license agreement](https://huggingface.co/meta-llama/Llama-3.2-1B) for Llama 3.2 materials before using this. ### Contributing diff --git a/client/src/nv_ingest_client/primitives/tasks/split.py b/client/src/nv_ingest_client/primitives/tasks/split.py index cfada11f..9c2d3e75 100644 --- a/client/src/nv_ingest_client/primitives/tasks/split.py +++ b/client/src/nv_ingest_client/primitives/tasks/split.py @@ -6,8 +6,6 @@ # pylint: disable=too-few-public-methods # pylint: disable=too-many-arguments -import os - import logging from typing import Dict diff --git a/src/nv_ingest/schemas/text_splitter_schema.py b/src/nv_ingest/schemas/text_splitter_schema.py index 37380af6..d78b26c7 100644 --- a/src/nv_ingest/schemas/text_splitter_schema.py +++ b/src/nv_ingest/schemas/text_splitter_schema.py @@ -2,8 +2,6 @@ # All rights reserved. # SPDX-License-Identifier: Apache-2.0 - -from typing import Optional from pydantic import Field, BaseModel, field_validator from typing_extensions import Annotated diff --git a/tests/nv_ingest_client/primitives/tasks/test_split.py b/tests/nv_ingest_client/primitives/tasks/test_split.py index 6af26afc..da2fa2fe 100644 --- a/tests/nv_ingest_client/primitives/tasks/test_split.py +++ b/tests/nv_ingest_client/primitives/tasks/test_split.py @@ -86,6 +86,11 @@ def test_split_task_default_params(): expected_dict = { "type": "split", - "task_properties": {"tokenizer": "meta-llama/Llama-3.2-1B", "chunk_size": 1024, "chunk_overlap": 20, "params": {}}, + "task_properties": { + "tokenizer": "meta-llama/Llama-3.2-1B", + "chunk_size": 1024, + "chunk_overlap": 20, + "params": {}, + }, } assert task.to_dict() == expected_dict From 3c2f246ab1c68ae9afeab23005fab13702799d75 Mon Sep 17 00:00:00 2001 From: Chris Jarrett Date: Tue, 11 Feb 2025 21:43:26 -0800 Subject: [PATCH 14/18] Add support for filtering by file type --- src/nv_ingest/modules/transforms/text_splitter.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/src/nv_ingest/modules/transforms/text_splitter.py b/src/nv_ingest/modules/transforms/text_splitter.py index 496ac094..322ee027 100644 --- a/src/nv_ingest/modules/transforms/text_splitter.py +++ b/src/nv_ingest/modules/transforms/text_splitter.py @@ -113,11 +113,6 @@ def split_and_forward(message: ControlMessage): with message.payload().mutable_dataframe() as mdf: df = mdf.to_pandas() - # Filter to txt files only - # bool_index = df["document_type"] == ContentTypeEnum.TEXT - # df_filtered = df.loc[bool_index] - logger.info(df.columns) - # Filter to document type bool_index = df["document_type"] == ContentTypeEnum.TEXT df_filtered = df.loc[bool_index] @@ -132,6 +127,7 @@ def split_and_forward(message: ControlMessage): params = task_props.get("params", {}) hf_access_token = params.get("hf_access_token", None) + split_source_types = params.get("split_source_types", ["TEXT"]) logger.debug( f"Splitting text with tokenizer: {tokenizer}, " @@ -139,6 +135,15 @@ def split_and_forward(message: ControlMessage): f"chunk_overlap: {chunk_overlap}" ) + # Filter to file type + bool_index = 
pd.json_normalize(df_filtered["metadata"])["source_metadata.source_type"].isin( + split_source_types + ) + df_filtered = df_filtered.loc[bool_index] + + if df_filtered.empty: + return message + tokenizer_model = AutoTokenizer.from_pretrained(tokenizer, token=hf_access_token) split_docs = [] From 6e32a9cd6c83badbff5ff88e82f386235c5abd59 Mon Sep 17 00:00:00 2001 From: Chris Jarrett Date: Wed, 12 Feb 2025 13:12:06 -0800 Subject: [PATCH 15/18] Fix offset mapping --- src/nv_ingest/modules/transforms/text_splitter.py | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/src/nv_ingest/modules/transforms/text_splitter.py b/src/nv_ingest/modules/transforms/text_splitter.py index 322ee027..a6e09939 100644 --- a/src/nv_ingest/modules/transforms/text_splitter.py +++ b/src/nv_ingest/modules/transforms/text_splitter.py @@ -56,27 +56,17 @@ def _split_into_chunks(text, tokenizer, chunk_size=1024, chunk_overlap=20): encoding = tokenizer.encode_plus(text, add_special_tokens=False, return_offsets_mapping=True) # Get the token IDs and offsets for splitting - tokens = encoding["input_ids"] offsets = encoding["offset_mapping"] # Split the tokens into chunks of the desired size with the desired overlap - chunks = [tokens[i : i + chunk_size] for i in range(0, len(tokens), chunk_size - chunk_overlap)] + chunks = [offsets[i : i + chunk_size] for i in range(0, len(offsets), chunk_size - chunk_overlap)] # Convert token chunks back to text while preserving original spacing and case text_chunks = [] for chunk in chunks: - # Find the start and end offsets for the current chunk - chunk_offsets = offsets[: len(chunk)] - start_offset = chunk_offsets[0][0] - end_offset = chunk_offsets[-1][1] - - # Extract the original text for this chunk based on offsets - text_chunk = text[start_offset:end_offset] + text_chunk = text[chunk[0][0] : chunk[-1][0]] text_chunks.append(text_chunk) - # Remove processed offsets for the next iteration - offsets = offsets[len(chunk) :] - return text_chunks From 67b7051b88b36828e1bac10f7d485be94e84de7b Mon Sep 17 00:00:00 2001 From: Chris Jarrett Date: Wed, 19 Feb 2025 10:55:06 -0800 Subject: [PATCH 16/18] Add built with llama --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index cb977419..5c835ea7 100644 --- a/README.md +++ b/README.md @@ -396,7 +396,7 @@ https://pypi.org/project/pdfservices-sdk/ required if you want to use the Adobe extraction service for PDF decomposition. Please review the [license agreement](https://github.com/adobe/pdfservices-python-sdk?tab=License-1-ov-file) for the pdfservices-sdk before enabling this option. -- **`Split`**: +- **`Built With Llama:`**: - **Description**: By default, the Split task tokenizer is set to `meta-llama/Llama-3.2-1B`, which will download Llama-3.2-1B materials from Huggingface in order to perform tokenizer based splitting. Please review the [license agreement](https://huggingface.co/meta-llama/Llama-3.2-1B) for Llama 3.2 materials before using this. 
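For reference, a client-side configuration of the token-based split task introduced in the patches above might look like the sketch below. It is illustrative only: the import path is assumed from the file layout in this series, the parameter values are arbitrary, and `hf_access_token` / `split_source_types` are optional keys that `text_splitter.py` reads from `params` (defaulting to no token and `["TEXT"]`).

    # Hedged sketch of the token-based SplitTask; import path assumed from
    # client/src/nv_ingest_client/primitives/tasks/split.py in this series.
    from nv_ingest_client.primitives.tasks.split import SplitTask

    task = SplitTask(
        tokenizer="meta-llama/Llama-3.2-1B",  # gated model; see the Llama 3.2 license note above
        chunk_size=1024,                      # chunk size in tokens
        chunk_overlap=20,                     # overlapping tokens between consecutive chunks
        params={
            "hf_access_token": "<your HuggingFace access token>",  # placeholder, not a real token
            "split_source_types": ["TEXT"],   # source_type values to split; matches the module default
        },
    )

    # Serializes to {"type": "split", "task_properties": {...}} for the ingest job payload.
    print(task.to_dict())
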
From 763403019d638e122afe27063f040d1f468d5463 Mon Sep 17 00:00:00 2001 From: Chris Jarrett Date: Wed, 19 Feb 2025 14:16:54 -0800 Subject: [PATCH 17/18] Change default chunk_overlap to 150 --- client/src/nv_ingest_client/primitives/tasks/split.py | 4 ++-- src/nv_ingest/schemas/text_splitter_schema.py | 2 +- tests/nv_ingest/schemas/test_text_splitter_schema.py | 2 +- tests/nv_ingest_client/primitives/tasks/test_split.py | 4 ++-- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/client/src/nv_ingest_client/primitives/tasks/split.py b/client/src/nv_ingest_client/primitives/tasks/split.py index 9c2d3e75..12f1b2d7 100644 --- a/client/src/nv_ingest_client/primitives/tasks/split.py +++ b/client/src/nv_ingest_client/primitives/tasks/split.py @@ -19,7 +19,7 @@ class SplitTaskSchema(BaseModel): tokenizer: str = "meta-llama/Llama-3.2-1B" chunk_size: int = 1024 - chunk_overlap: int = 20 + chunk_overlap: int = 150 params: dict = {} class Config: @@ -35,7 +35,7 @@ def __init__( self, tokenizer: str = "meta-llama/Llama-3.2-1B", chunk_size: int = 1024, - chunk_overlap: int = 20, + chunk_overlap: int = 150, params: dict = {}, ) -> None: """ diff --git a/src/nv_ingest/schemas/text_splitter_schema.py b/src/nv_ingest/schemas/text_splitter_schema.py index d78b26c7..7eb6d93c 100644 --- a/src/nv_ingest/schemas/text_splitter_schema.py +++ b/src/nv_ingest/schemas/text_splitter_schema.py @@ -10,7 +10,7 @@ class TextSplitterSchema(BaseModel): tokenizer: str = "meta-llama/Llama-3.2-1B" chunk_size: Annotated[int, Field(gt=0)] = 1024 - chunk_overlap: Annotated[int, Field(ge=0)] = 20 + chunk_overlap: Annotated[int, Field(ge=0)] = 150 raise_on_failure: bool = False @field_validator("chunk_overlap") diff --git a/tests/nv_ingest/schemas/test_text_splitter_schema.py b/tests/nv_ingest/schemas/test_text_splitter_schema.py index bbdda847..8cfc9d97 100644 --- a/tests/nv_ingest/schemas/test_text_splitter_schema.py +++ b/tests/nv_ingest/schemas/test_text_splitter_schema.py @@ -15,7 +15,7 @@ def test_text_splitter_schema_defaults(): schema = TextSplitterSchema() assert schema.tokenizer == "meta-llama/Llama-3.2-1B" assert schema.chunk_size == 1024 - assert schema.chunk_overlap == 20 + assert schema.chunk_overlap == 150 assert schema.raise_on_failure is False diff --git a/tests/nv_ingest_client/primitives/tasks/test_split.py b/tests/nv_ingest_client/primitives/tasks/test_split.py index da2fa2fe..533ef0c5 100644 --- a/tests/nv_ingest_client/primitives/tasks/test_split.py +++ b/tests/nv_ingest_client/primitives/tasks/test_split.py @@ -79,7 +79,7 @@ def test_split_task_default_params(): expected_str_contains = [ "tokenizer: meta-llama/Llama-3.2-1B", "chunk_size: 1024", - "chunk_overlap: 20", + "chunk_overlap: 150", ] for expected_part in expected_str_contains: assert expected_part in str(task) @@ -89,7 +89,7 @@ def test_split_task_default_params(): "task_properties": { "tokenizer": "meta-llama/Llama-3.2-1B", "chunk_size": 1024, - "chunk_overlap": 20, + "chunk_overlap": 150, "params": {}, }, } From d6401bc1c3821f176daf25ab04878e476c346aa5 Mon Sep 17 00:00:00 2001 From: Chris Jarrett Date: Thu, 20 Feb 2025 14:30:24 -0800 Subject: [PATCH 18/18] Add option to predownload llama tokenizer --- Dockerfile | 11 +++++++++++ README.md | 8 +++++--- docker-compose.yaml | 3 +++ docker/scripts/post_build_triggers.py | 9 +++++++++ 4 files changed, 28 insertions(+), 3 deletions(-) create mode 100644 docker/scripts/post_build_triggers.py diff --git a/Dockerfile b/Dockerfile index 4a5e73c0..fb974f0d 100644 --- a/Dockerfile +++ b/Dockerfile @@ 
-12,6 +12,8 @@ FROM $BASE_IMG:$BASE_IMG_TAG AS base ARG RELEASE_TYPE="dev" ARG VERSION="" ARG VERSION_REV="0" +ARG DOWNLOAD_LLAMA_TOKENIZER="" +ARG HF_ACCESS_TOKEN="" # Embed the `git rev-parse HEAD` as a Docker metadata label # Allows for linking container builds to git commits @@ -71,6 +73,9 @@ WORKDIR /workspace # Copy custom entrypoint script COPY ./docker/scripts/entrypoint.sh /workspace/docker/entrypoint.sh +# Copy post build triggers script +COPY ./docker/scripts/post_build_triggers.py /workspace/docker/post_build_triggers.py + FROM base AS nv_ingest_install # Copy the module code COPY setup.py setup.py @@ -124,6 +129,12 @@ RUN --mount=type=cache,target=/opt/conda/pkgs\ && pip install ./api/dist/*.whl \ && pip install ./client/dist/*.whl + +RUN --mount=type=cache,target=/opt/conda/pkgs \ + --mount=type=cache,target=/root/.cache/pip \ + source activate nv_ingest_runtime \ + && python3 /workspace/docker/post_build_triggers.py + RUN rm -rf src FROM nv_ingest_install AS runtime diff --git a/README.md b/README.md index 8eb10894..9aad924f 100644 --- a/README.md +++ b/README.md @@ -411,10 +411,12 @@ https://pypi.org/project/pdfservices-sdk/ required if you want to use the Adobe extraction service for PDF decomposition. Please review the [license agreement](https://github.com/adobe/pdfservices-python-sdk?tab=License-1-ov-file) for the pdfservices-sdk before enabling this option. -- **`Built With Llama:`**: - - **Description**: By default, the Split task tokenizer is set to `meta-llama/Llama-3.2-1B`, which will download - Llama-3.2-1B materials from Huggingface in order to perform tokenizer based splitting. Please review the +- **`DOWNLOAD_LLAMA_TOKENIZER` (Built With Llama):**: + - **Description**: The Split task uses the `meta-llama/Llama-3.2-1B` tokenizer, which will be downloaded + from HuggingFace at build time if `DOWNLOAD_LLAMA_TOKENIZER` is set to `True`. Please review the [license agreement](https://huggingface.co/meta-llama/Llama-3.2-1B) for Llama 3.2 materials before using this. + This is a gated model so you'll need to [request access](https://huggingface.co/meta-llama/Llama-3.2-1B) and + set `HF_ACCESS_TOKEN` to your HuggingFace access token in order to use it. ### Contributing diff --git a/docker-compose.yaml b/docker-compose.yaml index 44ed1107..451fc280 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -155,6 +155,9 @@ services: context: ${NV_INGEST_ROOT:-.} dockerfile: "./Dockerfile" target: runtime + args: + DOWNLOAD_LLAMA_TOKENIZER: ${DOWNLOAD_LLAMA_TOKENIZER:-False} + HF_ACCESS_TOKEN: ${HF_ACCESS_TOKEN:-hfaccesstoken} volumes: - ${DATASET_ROOT:-./data}:/workspace/data ports: diff --git a/docker/scripts/post_build_triggers.py b/docker/scripts/post_build_triggers.py new file mode 100644 index 00000000..676139ab --- /dev/null +++ b/docker/scripts/post_build_triggers.py @@ -0,0 +1,9 @@ +import os +from transformers import AutoTokenizer + +if os.getenv("DOWNLOAD_LLAMA_TOKENIZER") == "True": + tokenizer_path = "/workspace/models/tokenizer/" + os.makedirs(tokenizer_path, exist_ok=True) + + tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B", token=os.getenv("HF_ACCESS_TOKEN")) + tokenizer.save_pretrained(tokenizer_path)
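
Taken together, the final patch only fetches and saves the tokenizer at image build time; nothing in this series switches `text_splitter.py` over to the saved copy, so how the cached files are consumed at runtime is left open. Assuming the image was built with `DOWNLOAD_LLAMA_TOKENIZER=True` and a valid `HF_ACCESS_TOKEN` (as wired through the docker-compose build args above), one plausible way to use the pre-downloaded materials is to load them from the directory that `post_build_triggers.py` writes:

    # Hedged sketch: load the tokenizer saved by docker/scripts/post_build_triggers.py.
    # Assumes the image was built with DOWNLOAD_LLAMA_TOKENIZER=True so the directory exists.
    from transformers import AutoTokenizer

    tokenizer_model = AutoTokenizer.from_pretrained("/workspace/models/tokenizer/")

Because `AutoTokenizer.from_pretrained` accepts a local directory as well as a Hugging Face model id, passing `tokenizer="/workspace/models/tokenizer/"` in the split task properties should have the same effect, avoiding any network access or access token inside the running container.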