From 918740b6345bfde01406fa40eb0d3d82af7666ec Mon Sep 17 00:00:00 2001 From: ChrisJar Date: Thu, 12 Dec 2024 21:33:10 -0800 Subject: [PATCH 01/18] Switch split task to token based splitting --- .../primitives/tasks/split.py | 51 ++------ .../modules/transforms/nemo_doc_splitter.py | 119 +++++------------- src/nv_ingest/schemas/ingest_job_schema.py | 13 +- .../schemas/nemo_doc_splitter_schema.py | 17 +-- 4 files changed, 50 insertions(+), 150 deletions(-) diff --git a/client/src/nv_ingest_client/primitives/tasks/split.py b/client/src/nv_ingest_client/primitives/tasks/split.py index ddbae624..c9ac86cc 100644 --- a/client/src/nv_ingest_client/primitives/tasks/split.py +++ b/client/src/nv_ingest_client/primitives/tasks/split.py @@ -20,18 +20,8 @@ class SplitTaskSchema(BaseModel): - split_by: Optional[str] = "sentence" - split_length: Optional[int] = 10 - split_overlap: Optional[int] = 0 - max_character_length: Optional[int] = 1024 - sentence_window_size: Optional[int] = 0 - - @validator("split_by") - def split_by_must_be_valid(cls, v): - valid_criteria = ["page", "size", "word", "sentence"] - if v not in valid_criteria: - raise ValueError(f"split_by must be one of {valid_criteria}") - return v + split_by: str = "intfloat/e5-large-unsupervised" + chunk_size: int = 300 class Config: extra = "forbid" @@ -42,25 +32,17 @@ class SplitTask(Task): Object for document splitting task """ - _TypeSplitBy = Literal["word", "sentence", "passage"] - def __init__( self, - split_by: _TypeSplitBy = None, - split_length: int = None, - split_overlap: int = None, - max_character_length: int = None, - sentence_window_size: int = None, + tokenizer: str = None, + chunk_size: int = None, ) -> None: """ Setup Split Task Config """ super().__init__() - self._split_by = split_by - self._split_length = split_length - self._split_overlap = split_overlap - self._max_character_length = max_character_length - self._sentence_window_size = sentence_window_size + self._tokenizer = tokenizer + self._chunk_size = chunk_size def __str__(self) -> str: """ @@ -68,11 +50,8 @@ def __str__(self) -> str: """ info = "" info += "Split Task:\n" - info += f" split_by: {self._split_by}\n" - info += f" split_length: {self._split_length}\n" - info += f" split_overlap: {self._split_overlap}\n" - info += f" split_max_character_length: {self._max_character_length}\n" - info += f" split_sentence_window_size: {self._sentence_window_size}\n" + info += f" tokenizer: {self._tokenizer}\n" + info += f" chunk_size: {self._chunk_size}\n" return info def to_dict(self) -> Dict: @@ -81,15 +60,9 @@ def to_dict(self) -> Dict: """ split_params = {} - if self._split_by is not None: - split_params["split_by"] = self._split_by - if self._split_length is not None: - split_params["split_length"] = self._split_length - if self._split_overlap is not None: - split_params["split_overlap"] = self._split_overlap - if self._max_character_length is not None: - split_params["max_character_length"] = self._max_character_length - if self._sentence_window_size is not None: - split_params["sentence_window_size"] = self._sentence_window_size + if self._tokenizer is not None: + split_params["tokenizer"] = self._tokenizer + if self._chunk_size is not None: + split_params["chunk_size"] = self._chunk_size return {"type": "split", "task_properties": split_params} diff --git a/src/nv_ingest/modules/transforms/nemo_doc_splitter.py b/src/nv_ingest/modules/transforms/nemo_doc_splitter.py index 647d9bf1..b5685d8f 100644 --- a/src/nv_ingest/modules/transforms/nemo_doc_splitter.py +++ 
b/src/nv_ingest/modules/transforms/nemo_doc_splitter.py @@ -3,6 +3,7 @@ # SPDX-License-Identifier: Apache-2.0 +import os import copy import logging import traceback @@ -13,6 +14,7 @@ import mrc import pandas as pd +from transformers import AutoTokenizer from more_itertools import windowed from morpheus.messages import ControlMessage from morpheus.messages import MessageMeta @@ -33,24 +35,16 @@ logger = logging.getLogger(__name__) -def _build_split_documents(row, text_splits: List[str], sentence_window_size: int) -> List[dict[str, Any]]: - """Build documents from text splits with window text.""" +def _build_split_documents(row, chunks: List[str]) -> List[dict[str, Any]]: + """Build documents from text chunks""" documents: List[dict] = [] - window_size = sentence_window_size - for i, text in enumerate(text_splits): + for i, text in enumerate(chunks): if text is None or not text.strip(): continue metadata = row.metadata if hasattr(row, "metadata") and isinstance(row.metadata, dict) else {} metadata = copy.deepcopy(metadata) - if window_size > 0: - window_text = "".join( - text_splits[max(0, i - window_size) : min(i + 1 + window_size, len(text_splits))] # noqa: E203 - ) - - metadata["window"] = window_text - metadata["original_text"] = text metadata["content"] = text @@ -59,70 +53,33 @@ def _build_split_documents(row, text_splits: List[str], sentence_window_size: in return documents -def _split_into_units(text: str, split_by: Literal["word", "sentence", "passage"]) -> List[str]: - if split_by == "passage": - split_at = "\n\n" - elif split_by == "sentence": - split_at = "." # why not ?,!, etc..? - elif split_by == "word": - split_at = " " - else: - raise NotImplementedError("DocumentSplitter only supports 'passage', 'sentence'" " or 'word' split_by options.") - units = text.split(split_at) - # Add the delimiter back to all units except the last one - for i in range(len(units) - 1): - units[i] += split_at +def _split_into_chunks(text, tokenizer, chunk_size=300): + # Tokenize the text into token IDs + encoding = tokenizer.encode_plus(text, add_special_tokens=False, return_offsets_mapping=True) - return units + # Get the token IDs and offsets for splitting + tokens = encoding['input_ids'] + offsets = encoding['offset_mapping'] + # Split the tokens into chunks of the desired size + chunks = [tokens[i:i + chunk_size] for i in range(0, len(tokens), chunk_size)] -def _concatenate_units(units: List[str], split_length: int, split_overlap: int, max_character_length: int) -> List[str]: - text_splits = [] - segments = windowed(units, n=split_length, step=split_length - split_overlap) - for seg in segments: - current_units = [unit for unit in seg if unit is not None] - txt = "".join(current_units) - if max_character_length and len(txt) > max_character_length: - text_splits.extend(_split_long_text(txt, max_character_length)) - elif len(txt) > 0: - text_splits.append(txt) + # Convert token chunks back to text while preserving original spacing and case + text_chunks = [] + for chunk in chunks: + # Find the start and end offsets for the current chunk + chunk_offsets = offsets[:len(chunk)] + start_offset = chunk_offsets[0][0] + end_offset = chunk_offsets[-1][1] - return text_splits + # Extract the original text for this chunk based on offsets + text_chunk = text[start_offset:end_offset] + text_chunks.append(text_chunk) + # Remove processed offsets for the next iteration + offsets = offsets[len(chunk):] -def _split_long_text(text: str, max_character_length: int) -> List[str]: - """ - Splits a long text into 
smaller segments that - do not exceed max_character_length. - """ - split_texts = [] - while text: - # Take the maximum possible substring without exceeding max_character_length - segment = text[:max_character_length] - split_texts.append(segment) - text = text[max_character_length:] # noqa: E203 - - return split_texts - - -def _process_content(row, validated_config): - content = row["metadata"]["content"] - - if content is None: - raise ValueError( - "DocumentSplitter only works with text documents but one or more 'content' " "values are None." - ) - - units = _split_into_units(content, validated_config.split_by) - text_splits = _concatenate_units( - units, - validated_config.split_length, - validated_config.split_overlap, - max_character_length=validated_config.max_character_length, - ) - split_docs = _build_split_documents(row, text_splits, sentence_window_size=validated_config.sentence_window_size) - - return split_docs + return text_chunks MODULE_NAME = "nemo_document_splitter" @@ -167,16 +124,11 @@ def split_and_forward(message: ControlMessage): return message # Override parameters if set - split_by = task_props.get("split_by", validated_config.split_by) - split_length = task_props.get("split_length", validated_config.split_length) - split_overlap = task_props.get("split_overlap", validated_config.split_overlap) - max_character_length = task_props.get("max_character_length", validated_config.max_character_length) - sentence_window_size = task_props.get("sentence_window_size", validated_config.sentence_window_size) + tokenizer = task_props.get("tokenizer", validated_config.tokenizer) + chunk_size = task_props.get("chunk_size", validated_config.chunk_size) logger.info( - f"Splitting documents with split_by: {split_by}, split_length: {split_length}, " - f"split_overlap: {split_overlap}, max_character_length: {max_character_length}, " - f"sentence_window_size: {sentence_window_size}" + f"Splitting documents with tokenizer: {tokenizer}, chunk_size: {chunk_size} tokens" ) split_docs = [] @@ -188,14 +140,11 @@ def split_and_forward(message: ControlMessage): "DocumentSplitter only works with text documents but one or more " "'content' values are None." 
) - units = _split_into_units(content, split_by) - text_splits = _concatenate_units( - units, - split_length, - split_overlap, - max_character_length=max_character_length, - ) - split_docs.extend(_build_split_documents(row, text_splits, sentence_window_size=sentence_window_size)) + os.environ['TOKENIZERS_PARALLELISM'] = "False" + tokenizer_model = AutoTokenizer.from_pretrained(tokenizer) + + chunks = _split_into_chunks(content, tokenizer_model, chunk_size) + split_docs.extend(_build_split_documents(row, chunks)) split_docs_df = pd.DataFrame(split_docs) diff --git a/src/nv_ingest/schemas/ingest_job_schema.py b/src/nv_ingest/schemas/ingest_job_schema.py index 83c987ff..e3a9255c 100644 --- a/src/nv_ingest/schemas/ingest_job_schema.py +++ b/src/nv_ingest/schemas/ingest_job_schema.py @@ -61,17 +61,8 @@ class TracingOptionsSchema(BaseModelNoExt): class IngestTaskSplitSchema(BaseModelNoExt): - split_by: Literal["word", "sentence", "passage"] - split_length: conint(gt=0) - split_overlap: conint(ge=0) - max_character_length: Optional[conint(gt=0)] - sentence_window_size: Optional[conint(ge=0)] - - @validator("sentence_window_size") - def check_sentence_window_size(cls, v, values, **kwargs): - if v is not None and v > 0 and values["split_by"] != "sentence": - raise ValueError("When using sentence_window_size, split_by must be 'sentence'.") - return v + tokenizer: str + chunk_size: conint(gt=0) class IngestTaskExtractSchema(BaseModelNoExt): diff --git a/src/nv_ingest/schemas/nemo_doc_splitter_schema.py b/src/nv_ingest/schemas/nemo_doc_splitter_schema.py index 9c392459..9231eb4f 100644 --- a/src/nv_ingest/schemas/nemo_doc_splitter_schema.py +++ b/src/nv_ingest/schemas/nemo_doc_splitter_schema.py @@ -3,24 +3,11 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Literal -from typing import Optional - from pydantic import BaseModel from pydantic import conint -from pydantic import validator class DocumentSplitterSchema(BaseModel): - split_by: Literal["word", "sentence", "passage"] = "word" - split_length: conint(gt=0) = 60 - split_overlap: conint(ge=0) = 10 - max_character_length: Optional[conint(gt=0)] = 450 - sentence_window_size: Optional[conint(ge=0)] = 0 + tokenizer: str = "intfloat/e5-large-unsupervised" + chunk_size: conint(gt=0) = 300 raise_on_failure: bool = False - - @validator("sentence_window_size") - def check_sentence_window_size(cls, v, values, **kwargs): - if v is not None and v > 0 and values["split_by"] != "sentence": - raise ValueError("When using sentence_window_size, split_by must be 'sentence'.") - return v From c12df54eea864fd3ec6908cd87faa65cb7a1c839 Mon Sep 17 00:00:00 2001 From: Chris Jarrett Date: Wed, 18 Dec 2024 14:18:53 -0800 Subject: [PATCH 02/18] Move tokenizer out of loop --- src/nv_ingest/modules/transforms/nemo_doc_splitter.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/nv_ingest/modules/transforms/nemo_doc_splitter.py b/src/nv_ingest/modules/transforms/nemo_doc_splitter.py index b5685d8f..f168432d 100644 --- a/src/nv_ingest/modules/transforms/nemo_doc_splitter.py +++ b/src/nv_ingest/modules/transforms/nemo_doc_splitter.py @@ -131,6 +131,8 @@ def split_and_forward(message: ControlMessage): f"Splitting documents with tokenizer: {tokenizer}, chunk_size: {chunk_size} tokens" ) + tokenizer_model = AutoTokenizer.from_pretrained(tokenizer, token="") + split_docs = [] for _, row in df_filtered.iterrows(): content = row["metadata"]["content"] @@ -140,9 +142,6 @@ def split_and_forward(message: ControlMessage): "DocumentSplitter only works 
with text documents but one or more " "'content' values are None." ) - os.environ['TOKENIZERS_PARALLELISM'] = "False" - tokenizer_model = AutoTokenizer.from_pretrained(tokenizer) - chunks = _split_into_chunks(content, tokenizer_model, chunk_size) split_docs.extend(_build_split_documents(row, chunks)) From c88a4a283824ebe617b6f5f62b1c4c29ad8b7827 Mon Sep 17 00:00:00 2001 From: Chris Jarrett Date: Wed, 18 Dec 2024 15:17:03 -0800 Subject: [PATCH 03/18] Fix CLI --- client/src/nv_ingest_client/primitives/tasks/split.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/client/src/nv_ingest_client/primitives/tasks/split.py b/client/src/nv_ingest_client/primitives/tasks/split.py index c9ac86cc..24e7f956 100644 --- a/client/src/nv_ingest_client/primitives/tasks/split.py +++ b/client/src/nv_ingest_client/primitives/tasks/split.py @@ -20,7 +20,7 @@ class SplitTaskSchema(BaseModel): - split_by: str = "intfloat/e5-large-unsupervised" + tokenizer: str = "intfloat/e5-large-unsupervised" chunk_size: int = 300 class Config: From 0cacc9eba501a5878578098742e1fd2847ab702c Mon Sep 17 00:00:00 2001 From: Chris Jarrett Date: Wed, 8 Jan 2025 10:00:59 -0800 Subject: [PATCH 04/18] Add chunk_overlap parameter --- .../nv_ingest_client/primitives/tasks/split.py | 10 ++++++++-- .../modules/transforms/nemo_doc_splitter.py | 15 ++++++++------- src/nv_ingest/schemas/ingest_job_schema.py | 1 + src/nv_ingest/schemas/nemo_doc_splitter_schema.py | 1 + 4 files changed, 18 insertions(+), 9 deletions(-) diff --git a/client/src/nv_ingest_client/primitives/tasks/split.py b/client/src/nv_ingest_client/primitives/tasks/split.py index 24e7f956..d60f067d 100644 --- a/client/src/nv_ingest_client/primitives/tasks/split.py +++ b/client/src/nv_ingest_client/primitives/tasks/split.py @@ -22,6 +22,7 @@ class SplitTaskSchema(BaseModel): tokenizer: str = "intfloat/e5-large-unsupervised" chunk_size: int = 300 + chunk_overlap: int = 0 class Config: extra = "forbid" @@ -34,8 +35,9 @@ class SplitTask(Task): def __init__( self, - tokenizer: str = None, - chunk_size: int = None, + tokenizer: str = "intfloat/e5-large-unsupervised", + chunk_size: int = 300, + chunk_overlap: int = 0, ) -> None: """ Setup Split Task Config @@ -43,6 +45,7 @@ def __init__( super().__init__() self._tokenizer = tokenizer self._chunk_size = chunk_size + self._chunk_overlap = chunk_overlap def __str__(self) -> str: """ @@ -52,6 +55,7 @@ def __str__(self) -> str: info += "Split Task:\n" info += f" tokenizer: {self._tokenizer}\n" info += f" chunk_size: {self._chunk_size}\n" + info += f" chunk_overlap: {self._chunk_overlap}\n" return info def to_dict(self) -> Dict: @@ -64,5 +68,7 @@ def to_dict(self) -> Dict: split_params["tokenizer"] = self._tokenizer if self._chunk_size is not None: split_params["chunk_size"] = self._chunk_size + if self._chunk_overlap is not None: + split_params["chunk_overlap"] = self._chunk_overlap return {"type": "split", "task_properties": split_params} diff --git a/src/nv_ingest/modules/transforms/nemo_doc_splitter.py b/src/nv_ingest/modules/transforms/nemo_doc_splitter.py index f168432d..b3fc2fa6 100644 --- a/src/nv_ingest/modules/transforms/nemo_doc_splitter.py +++ b/src/nv_ingest/modules/transforms/nemo_doc_splitter.py @@ -58,17 +58,17 @@ def _split_into_chunks(text, tokenizer, chunk_size=300): encoding = tokenizer.encode_plus(text, add_special_tokens=False, return_offsets_mapping=True) # Get the token IDs and offsets for splitting - tokens = encoding['input_ids'] - offsets = encoding['offset_mapping'] + tokens = encoding["input_ids"] 
+ offsets = encoding["offset_mapping"] # Split the tokens into chunks of the desired size - chunks = [tokens[i:i + chunk_size] for i in range(0, len(tokens), chunk_size)] + chunks = [tokens[i : i + chunk_size] for i in range(0, len(tokens), chunk_size)] # Convert token chunks back to text while preserving original spacing and case text_chunks = [] for chunk in chunks: # Find the start and end offsets for the current chunk - chunk_offsets = offsets[:len(chunk)] + chunk_offsets = offsets[: len(chunk)] start_offset = chunk_offsets[0][0] end_offset = chunk_offsets[-1][1] @@ -77,7 +77,7 @@ def _split_into_chunks(text, tokenizer, chunk_size=300): text_chunks.append(text_chunk) # Remove processed offsets for the next iteration - offsets = offsets[len(chunk):] + offsets = offsets[len(chunk) :] return text_chunks @@ -124,11 +124,12 @@ def split_and_forward(message: ControlMessage): return message # Override parameters if set - tokenizer = task_props.get("tokenizer", validated_config.tokenizer) + tokenizer = task_props.get("tokenizer", validated_config.tokenizer) chunk_size = task_props.get("chunk_size", validated_config.chunk_size) + chunk_overlap = task_props.get("chunk_overlap", validated_config.chunk_overlap) logger.info( - f"Splitting documents with tokenizer: {tokenizer}, chunk_size: {chunk_size} tokens" + f"Splitting text with tokenizer: {tokenizer}, chunk_size: {chunk_size} tokens, chunk_overlap: {chunk_overlap}" ) tokenizer_model = AutoTokenizer.from_pretrained(tokenizer, token="") diff --git a/src/nv_ingest/schemas/ingest_job_schema.py b/src/nv_ingest/schemas/ingest_job_schema.py index e3a9255c..b7a97f19 100644 --- a/src/nv_ingest/schemas/ingest_job_schema.py +++ b/src/nv_ingest/schemas/ingest_job_schema.py @@ -63,6 +63,7 @@ class TracingOptionsSchema(BaseModelNoExt): class IngestTaskSplitSchema(BaseModelNoExt): tokenizer: str chunk_size: conint(gt=0) + chunk_overlap: conint(ge=0) class IngestTaskExtractSchema(BaseModelNoExt): diff --git a/src/nv_ingest/schemas/nemo_doc_splitter_schema.py b/src/nv_ingest/schemas/nemo_doc_splitter_schema.py index 9231eb4f..86a13d6e 100644 --- a/src/nv_ingest/schemas/nemo_doc_splitter_schema.py +++ b/src/nv_ingest/schemas/nemo_doc_splitter_schema.py @@ -10,4 +10,5 @@ class DocumentSplitterSchema(BaseModel): tokenizer: str = "intfloat/e5-large-unsupervised" chunk_size: conint(gt=0) = 300 + chunk_overlap: conint(ge=0) = 0 raise_on_failure: bool = False From d8a2de9a80cfdaf5044d01ebdb41873f29054443 Mon Sep 17 00:00:00 2001 From: Chris Jarrett Date: Wed, 8 Jan 2025 15:02:09 -0800 Subject: [PATCH 05/18] Fix broken tests --- .../primitives/tasks/split.py | 4 +- .../modules/transforms/nemo_doc_splitter.py | 7 +- src/nv_ingest/schemas/ingest_job_schema.py | 5 +- .../schemas/nemo_doc_splitter_schema.py | 9 ++- .../test_message_broker_task_source.py | 6 +- .../schemas/test_ingest_job_schema.py | 45 +++-------- .../schemas/test_nemo_doc_splitter_schema.py | 51 ++---------- tests/nv_ingest_client/cli/util/test_click.py | 2 +- tests/nv_ingest_client/client/test_client.py | 14 ++-- .../nv_ingest_client/client/test_interface.py | 6 +- .../primitives/tasks/test_split.py | 79 ++++++++----------- 11 files changed, 73 insertions(+), 155 deletions(-) diff --git a/client/src/nv_ingest_client/primitives/tasks/split.py b/client/src/nv_ingest_client/primitives/tasks/split.py index b0178885..5f45e806 100644 --- a/client/src/nv_ingest_client/primitives/tasks/split.py +++ b/client/src/nv_ingest_client/primitives/tasks/split.py @@ -8,10 +8,8 @@ import logging from typing import Dict 
-from typing import Literal -from typing import Optional -from pydantic import BaseModel, field_validator +from pydantic import BaseModel from .task_base import Task diff --git a/src/nv_ingest/modules/transforms/nemo_doc_splitter.py b/src/nv_ingest/modules/transforms/nemo_doc_splitter.py index 3ed12355..79c315b2 100644 --- a/src/nv_ingest/modules/transforms/nemo_doc_splitter.py +++ b/src/nv_ingest/modules/transforms/nemo_doc_splitter.py @@ -3,19 +3,16 @@ # SPDX-License-Identifier: Apache-2.0 -import os import copy import logging import traceback import uuid from typing import Any from typing import List -from typing import Literal import mrc import pandas as pd from transformers import AutoTokenizer -from more_itertools import windowed from morpheus.messages import ControlMessage from morpheus.messages import MessageMeta from morpheus.utils.control_message_utils import cm_skip_processing_if_failed @@ -133,7 +130,9 @@ def split_and_forward(message: ControlMessage): chunk_overlap = task_props.get("chunk_overlap", validated_config.chunk_overlap) logger.info( - f"Splitting text with tokenizer: {tokenizer}, chunk_size: {chunk_size} tokens, chunk_overlap: {chunk_overlap}" + f"Splitting text with tokenizer: {tokenizer}, " + f"chunk_size: {chunk_size} tokens, " + f"chunk_overlap: {chunk_overlap}" ) tokenizer_model = AutoTokenizer.from_pretrained(tokenizer, token="") diff --git a/src/nv_ingest/schemas/ingest_job_schema.py b/src/nv_ingest/schemas/ingest_job_schema.py index a11f03cb..e179894b 100644 --- a/src/nv_ingest/schemas/ingest_job_schema.py +++ b/src/nv_ingest/schemas/ingest_job_schema.py @@ -8,7 +8,6 @@ from typing import Any from typing import Dict from typing import List -from typing import Literal from typing import Optional from typing import Union @@ -61,8 +60,8 @@ class TracingOptionsSchema(BaseModelNoExt): class IngestTaskSplitSchema(BaseModelNoExt): tokenizer: str - chunk_size: conint(gt=0) - chunk_overlap: conint(ge=0) + chunk_size: Annotated[int, Field(gt=0)] + chunk_overlap: Annotated[int, Field(ge=0)] class IngestTaskExtractSchema(BaseModelNoExt): diff --git a/src/nv_ingest/schemas/nemo_doc_splitter_schema.py b/src/nv_ingest/schemas/nemo_doc_splitter_schema.py index 86a13d6e..ad34e583 100644 --- a/src/nv_ingest/schemas/nemo_doc_splitter_schema.py +++ b/src/nv_ingest/schemas/nemo_doc_splitter_schema.py @@ -3,12 +3,13 @@ # SPDX-License-Identifier: Apache-2.0 -from pydantic import BaseModel -from pydantic import conint +from pydantic import Field, BaseModel + +from typing_extensions import Annotated class DocumentSplitterSchema(BaseModel): tokenizer: str = "intfloat/e5-large-unsupervised" - chunk_size: conint(gt=0) = 300 - chunk_overlap: conint(ge=0) = 0 + chunk_size: Annotated[int, Field(gt=0)] = 300 + chunk_overlap: Annotated[int, Field(ge=0)] = 0 raise_on_failure: bool = False diff --git a/tests/nv_ingest/modules/sources/test_message_broker_task_source.py b/tests/nv_ingest/modules/sources/test_message_broker_task_source.py index a824f60d..08cd3a01 100644 --- a/tests/nv_ingest/modules/sources/test_message_broker_task_source.py +++ b/tests/nv_ingest/modules/sources/test_message_broker_task_source.py @@ -39,9 +39,9 @@ def job_payload(): { "type": "split", "task_properties": { - "split_by": "word", - "split_length": 100, - "split_overlap": 0, + "tokenizer": "intfloat/e5-large-unsupervised", + "chunk_size": 100, + "chunk_overlap": 0, }, }, { diff --git a/tests/nv_ingest/schemas/test_ingest_job_schema.py b/tests/nv_ingest/schemas/test_ingest_job_schema.py index 97045fbb..e9fb7f22 100644 
--- a/tests/nv_ingest/schemas/test_ingest_job_schema.py +++ b/tests/nv_ingest/schemas/test_ingest_job_schema.py @@ -26,11 +26,9 @@ def valid_task_properties(task_type): """Returns valid task properties based on the task type.""" if task_type == TaskTypeEnum.split: return { - "split_by": "sentence", - "split_length": 10, - "split_overlap": 0, - "max_character_length": 100, - "sentence_window_size": None, # This is valid when not required + "tokenizer": "intfloat/e5-large-unsupervised", + "chunk_size": 300, + "chunk_overlap": 0, } elif task_type == TaskTypeEnum.extract: return {"document_type": "pdf", "method": "OCR", "params": {"language": "en"}} @@ -119,27 +117,6 @@ def test_field_type_correctness(): validate_ingest_job(job_data) -def test_custom_validator_logic_for_sentence_window_size(): - """Tests custom validator logic related to sentence_window_size in split tasks.""" - task = { - "type": "split", - "task_properties": { - "split_by": "word", # Incorrect usage of sentence_window_size - "split_length": 10, - "split_overlap": 5, - "sentence_window_size": 5, # Should not be set when split_by is not 'sentence' - }, - } - job_data = { - "job_payload": valid_job_payload(), - "job_id": "123", - "tasks": [task], - } - with pytest.raises(ValidationError) as exc_info: - validate_ingest_job(job_data) - assert "sentence_window_size" in str(exc_info.value) and "must be 'sentence'" in str(exc_info.value) - - def test_multiple_task_types(): job_data = { "job_payload": { @@ -153,9 +130,9 @@ def test_multiple_task_types(): { "type": "split", "task_properties": { - "split_by": "word", - "split_length": 100, - "split_overlap": 0, + "tokenizer": "intfloat/e5-large-unsupervised", + "chunk_size": 100, + "chunk_overlap": 0, }, }, { @@ -250,9 +227,9 @@ def test_incorrect_property_types(): { "type": "split", "task_properties": { - "split_by": "word", - "split_length": {"not an int": 123}, # Incorrect type (should be int) - "split_overlap": 0, + "tokenizer": "intfloat/e5-large-unsupervised", + "chunk_size": {"not an int": 123}, # Incorrect type (should be int) + "chunk_overlap": 0, }, } ], @@ -269,8 +246,8 @@ def test_missing_required_fields(): { "type": "split", "task_properties": { - "split_by": "sentence", # Missing split_length - "split_overlap": 0, + "tokenizer": "intfloat/e5-large-unsupervised", # Missing chunk_size + "chunk_overlap": 0, }, } ], diff --git a/tests/nv_ingest/schemas/test_nemo_doc_splitter_schema.py b/tests/nv_ingest/schemas/test_nemo_doc_splitter_schema.py index 00f4e606..73636ccf 100644 --- a/tests/nv_ingest/schemas/test_nemo_doc_splitter_schema.py +++ b/tests/nv_ingest/schemas/test_nemo_doc_splitter_schema.py @@ -13,57 +13,16 @@ def test_document_splitter_schema_defaults(): Test the DocumentSplitterSchema with default values. """ schema = DocumentSplitterSchema() - assert schema.split_by == "word" - assert schema.split_length == 60 - assert schema.split_overlap == 10 - assert schema.max_character_length == 450 - assert schema.sentence_window_size == 0 + assert schema.tokenizer == "intfloat/e5-large-unsupervised" + assert schema.chunk_size == 300 + assert schema.chunk_overlap == 0 assert schema.raise_on_failure is False @pytest.mark.parametrize("invalid_value", [-1, 0]) def test_document_splitter_schema_invalid_split_length(invalid_value): """ - Test DocumentSplitterSchema with invalid split_length values. + Test DocumentSplitterSchema with invalid chunk_size values. 
""" with pytest.raises(ValidationError): - DocumentSplitterSchema(split_length=invalid_value) - - -@pytest.mark.parametrize( - "split_by, sentence_window_size, is_valid", - [ - ("sentence", 5, True), # Valid use of sentence_window_size - ( - "word", - 0, - True, - ), # Valid when split_by is not 'sentence' but sentence_window_size is 0 - ( - "word", - 5, - False, - ), # Invalid because sentence_window_size > 0 requires split_by to be 'sentence' - ], -) -def test_document_splitter_schema_sentence_window_size_validation(split_by, sentence_window_size, is_valid): - """ - Parametrized test for validating the sentence_window_size logic in DocumentSplitterSchema. - """ - if is_valid: - schema = DocumentSplitterSchema(split_by=split_by, sentence_window_size=sentence_window_size) - assert schema.sentence_window_size == sentence_window_size - assert schema.split_by == split_by - else: - with pytest.raises(ValidationError) as excinfo: - DocumentSplitterSchema(split_by=split_by, sentence_window_size=sentence_window_size) - assert "split_by must be 'sentence'" in str(excinfo.value) - - -def test_document_splitter_schema_optional_fields_none(): - """ - Test DocumentSplitterSchema with optional fields set to None. - """ - schema = DocumentSplitterSchema(max_character_length=None, sentence_window_size=None) - assert schema.max_character_length is None - assert schema.sentence_window_size is None + DocumentSplitterSchema(chunk_size=invalid_value) diff --git a/tests/nv_ingest_client/cli/util/test_click.py b/tests/nv_ingest_client/cli/util/test_click.py index f4f15ebd..a8ed1b0a 100644 --- a/tests/nv_ingest_client/cli/util/test_click.py +++ b/tests/nv_ingest_client/cli/util/test_click.py @@ -93,7 +93,7 @@ def test_debug_print_click_options(mock_pprint): def test_validate_task_with_valid_split(): """Test with valid split task options.""" - value = ['split:{"split_by": "page", "split_length": 10}'] + value = ['split:{"tokenizer": "intfloat/e5-large-unsupervised", "chunk_size": 300}'] result = click_validate_task(None, None, value) assert "split" in result diff --git a/tests/nv_ingest_client/client/test_client.py b/tests/nv_ingest_client/client/test_client.py index 05c90425..5a45936c 100644 --- a/tests/nv_ingest_client/client/test_client.py +++ b/tests/nv_ingest_client/client/test_client.py @@ -276,7 +276,7 @@ def test_correct_storage_of_job_details(nv_ingest_client): def test_successful_task_creation(nv_ingest_client_with_jobs): job_id = "12345678-1234-5678-1234-567812345678" task_type = TaskType.SPLIT - task_params = {"split_by": "sentence"} + task_params = {"tokenizer": "intfloat/e5-large-unsupervised"} # Assuming task_factory and task creation are implemented nv_ingest_client_with_jobs.create_task(job_id, task_type, task_params) @@ -288,7 +288,9 @@ def test_successful_task_creation(nv_ingest_client_with_jobs): def test_non_existent_job(nv_ingest_client): with pytest.raises(ValueError): - nv_ingest_client.create_task("nonexistent_job_id", TaskType.SPLIT, {"split_by": "sentence"}) + nv_ingest_client.create_task( + "nonexistent_job_id", TaskType.SPLIT, {"tokenizer": "intfloat/e5-large-unsupervised"} + ) def test_add_task_post_submission(nv_ingest_client_with_jobs): @@ -297,13 +299,13 @@ def test_add_task_post_submission(nv_ingest_client_with_jobs): nv_ingest_client_with_jobs._job_states[job_id].state = JobStateEnum.PROCESSING with pytest.raises(ValueError): - nv_ingest_client_with_jobs.create_task(job_id, TaskType.SPLIT, {"split_by": "sentence"}) + nv_ingest_client_with_jobs.create_task(job_id, TaskType.SPLIT, 
{"tokenizer": "intfloat/e5-large-unsupervised"}) def test_parameter_validation(nv_ingest_client_with_jobs): job_id = "12345678-1234-5678-1234-567812345678" task_type = TaskType.SPLIT - task_params = {"split_by": "sentence", "split_length": 128} + task_params = {"tokenizer": "intfloat/e5-large-unsupervised", "chunk_size": 128} nv_ingest_client_with_jobs.create_task(job_id, task_type, task_params) job_state = nv_ingest_client_with_jobs._job_states[job_id] @@ -580,8 +582,8 @@ def test_create_jobs_for_batch_duplicate_task(nv_ingest_client, mock_create_job_ files = ["file1.pdf"] duplicate_tasks = { - "split": SplitTask(split_by="sentence"), - "store": SplitTask(split_by="sentence"), # Duplicate task + "split": SplitTask(tokenizer="intfloat/e5-large-unsupervised"), + "store": SplitTask(tokenizer="intfloat/e5-large-unsupervised"), # Duplicate task } with pytest.raises(ValueError, match="Duplicate task detected"): diff --git a/tests/nv_ingest_client/client/test_interface.py b/tests/nv_ingest_client/client/test_interface.py index 64b9dc63..786b1f31 100644 --- a/tests/nv_ingest_client/client/test_interface.py +++ b/tests/nv_ingest_client/client/test_interface.py @@ -153,12 +153,12 @@ def test_split_task_no_args(ingestor): def test_split_task_some_args(ingestor): - ingestor.split(split_by="word", split_length=42) + ingestor.split(tokenizer="intfloat/e5-large-unsupervised", chunk_size=42) task = ingestor._job_specs.job_specs["pdf"][0]._tasks[0] assert isinstance(task, SplitTask) - assert task._split_by == "word" - assert task._split_length == 42 + assert task._tokenizer == "intfloat/e5-large-unsupervised" + assert task._chunk_size == 42 def test_store_task_no_args(ingestor): diff --git a/tests/nv_ingest_client/primitives/tasks/test_split.py b/tests/nv_ingest_client/primitives/tasks/test_split.py index 3fb0dbeb..a4531002 100644 --- a/tests/nv_ingest_client/primitives/tasks/test_split.py +++ b/tests/nv_ingest_client/primitives/tasks/test_split.py @@ -10,31 +10,22 @@ def test_split_task_initialization(): task = SplitTask( - split_by="word", - split_length=100, - split_overlap=10, - max_character_length=1000, - sentence_window_size=5, + tokenizer="intfloat/e5-large-unsupervised", + chunk_size=300, + chunk_overlap=0, ) - assert task._split_by == "word" - assert task._split_length == 100 - assert task._split_overlap == 10 - assert task._max_character_length == 1000 - assert task._sentence_window_size == 5 + assert task._tokenizer == "intfloat/e5-large-unsupervised" + assert task._chunk_size == 300 + assert task._chunk_overlap == 0 # String Representation Tests def test_split_task_str_representation(): - task = SplitTask(split_by="sentence", split_length=50, split_overlap=5) + task = SplitTask(tokenizer="intfloat/e5-large-unsupervised", chunk_size=50, chunk_overlap=5) expected_str = ( - "Split Task:\n" - " split_by: sentence\n" - " split_length: 50\n" - " split_overlap: 5\n" - " split_max_character_length: None\n" - " split_sentence_window_size: None\n" + "Split Task:\n" " tokenizer: intfloat/e5-large-unsupervised\n" " chunk_size: 50\n" " chunk_overlap: 5\n" ) assert str(task) == expected_str @@ -43,42 +34,33 @@ def test_split_task_str_representation(): @pytest.mark.parametrize( - "split_by, split_length, split_overlap, max_character_length, sentence_window_size", + "tokenizer, chunk_size, chunk_overlap", [ - ("word", 100, 10, 1000, 5), - ("sentence", 50, 5, None, None), - ("passage", None, None, 1500, 3), - (None, None, None, None, None), # Test default parameters + ("intfloat/e5-large-unsupervised", 100, 
10), + ("microsoft/deberta-large", 50, 5), + ("meta-llama/Llama-3.2-1B", 300, 0), ], ) def test_split_task_to_dict( - split_by, - split_length, - split_overlap, - max_character_length, - sentence_window_size, + tokenizer, + chunk_size, + chunk_overlap, ): task = SplitTask( - split_by=split_by, - split_length=split_length, - split_overlap=split_overlap, - max_character_length=max_character_length, - sentence_window_size=sentence_window_size, + tokenizer=tokenizer, + chunk_size=chunk_size, + chunk_overlap=chunk_overlap, ) expected_dict = {"type": "split", "task_properties": {}} # Only add properties to expected_dict if they are not None - if split_by is not None: - expected_dict["task_properties"]["split_by"] = split_by - if split_length is not None: - expected_dict["task_properties"]["split_length"] = split_length - if split_overlap is not None: - expected_dict["task_properties"]["split_overlap"] = split_overlap - if max_character_length is not None: - expected_dict["task_properties"]["max_character_length"] = max_character_length - if sentence_window_size is not None: - expected_dict["task_properties"]["sentence_window_size"] = sentence_window_size + if tokenizer is not None: + expected_dict["task_properties"]["tokenizer"] = tokenizer + if chunk_size is not None: + expected_dict["task_properties"]["chunk_size"] = chunk_size + if chunk_overlap is not None: + expected_dict["task_properties"]["chunk_overlap"] = chunk_overlap assert task.to_dict() == expected_dict, "The to_dict method did not return the expected dictionary representation" @@ -89,14 +71,15 @@ def test_split_task_to_dict( def test_split_task_default_params(): task = SplitTask() expected_str_contains = [ - "split_by: None", - "split_length: None", - "split_overlap: None", - "split_max_character_length: None", - "split_sentence_window_size: None", + "tokenizer: intfloat/e5-large-unsupervised", + "chunk_size: 300", + "chunk_overlap: 0", ] for expected_part in expected_str_contains: assert expected_part in str(task) - expected_dict = {"type": "split", "task_properties": {}} + expected_dict = { + "type": "split", + "task_properties": {"tokenizer": "intfloat/e5-large-unsupervised", "chunk_size": 300, "chunk_overlap": 0}, + } assert task.to_dict() == expected_dict From a21d0658017610eaa810f800eabc0fed802c603e Mon Sep 17 00:00:00 2001 From: Chris Jarrett Date: Wed, 8 Jan 2025 15:16:07 -0800 Subject: [PATCH 06/18] Add chunk overlap --- src/nv_ingest/modules/transforms/nemo_doc_splitter.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/nv_ingest/modules/transforms/nemo_doc_splitter.py b/src/nv_ingest/modules/transforms/nemo_doc_splitter.py index 79c315b2..e730f819 100644 --- a/src/nv_ingest/modules/transforms/nemo_doc_splitter.py +++ b/src/nv_ingest/modules/transforms/nemo_doc_splitter.py @@ -51,7 +51,7 @@ def _build_split_documents(row, chunks: List[str]) -> List[dict[str, Any]]: return documents -def _split_into_chunks(text, tokenizer, chunk_size=300): +def _split_into_chunks(text, tokenizer, chunk_size=300, chunk_overlap=0): # Tokenize the text into token IDs encoding = tokenizer.encode_plus(text, add_special_tokens=False, return_offsets_mapping=True) @@ -60,7 +60,7 @@ def _split_into_chunks(text, tokenizer, chunk_size=300): offsets = encoding["offset_mapping"] # Split the tokens into chunks of the desired size - chunks = [tokens[i : i + chunk_size] for i in range(0, len(tokens), chunk_size)] + chunks = [tokens[i : i + chunk_size] for i in range(0, len(tokens), chunk_size - chunk_overlap)] # Convert token 
chunks back to text while preserving original spacing and case text_chunks = [] @@ -146,7 +146,7 @@ def split_and_forward(message: ControlMessage): "DocumentSplitter only works with text documents but one or more " "'content' values are None." ) - chunks = _split_into_chunks(content, tokenizer_model, chunk_size) + chunks = _split_into_chunks(content, tokenizer_model, chunk_size, chunk_overlap) split_docs.extend(_build_split_documents(row, chunks)) split_docs_df = pd.DataFrame(split_docs) From bc2bf48dfc47ca016150b595b8d6e7ae3a45704b Mon Sep 17 00:00:00 2001 From: Chris Jarrett Date: Wed, 8 Jan 2025 17:19:47 -0800 Subject: [PATCH 07/18] Rename nemo document splitter to text splitter --- src/nv_ingest/modules/transforms/__init__.py | 4 ++-- .../{nemo_doc_splitter.py => text_splitter.py} | 12 ++++++------ src/nv_ingest/schemas/__init__.py | 4 ++-- .../schemas/ingest_pipeline_config_schema.py | 4 ++-- ..._splitter_schema.py => text_splitter_schema.py} | 2 +- src/nv_ingest/util/pipeline/__init__.py | 4 ++-- src/nv_ingest/util/pipeline/pipeline_builders.py | 6 +++--- src/nv_ingest/util/pipeline/stage_builders.py | 14 +++++++------- ...tter_schema.py => test_text_splitter_schema.py} | 14 +++++++------- 9 files changed, 32 insertions(+), 32 deletions(-) rename src/nv_ingest/modules/transforms/{nemo_doc_splitter.py => text_splitter.py} (93%) rename src/nv_ingest/schemas/{nemo_doc_splitter_schema.py => text_splitter_schema.py} (90%) rename tests/nv_ingest/schemas/{test_nemo_doc_splitter_schema.py => test_text_splitter_schema.py} (56%) diff --git a/src/nv_ingest/modules/transforms/__init__.py b/src/nv_ingest/modules/transforms/__init__.py index 4a39c32a..941cd120 100644 --- a/src/nv_ingest/modules/transforms/__init__.py +++ b/src/nv_ingest/modules/transforms/__init__.py @@ -3,6 +3,6 @@ # SPDX-License-Identifier: Apache-2.0 from .associate_nearby_text import AssociateNearbyTextLoaderFactory -from .nemo_doc_splitter import NemoDocSplitterLoaderFactory +from .text_splitter import TextSplitterLoaderFactory -__all__ = ["NemoDocSplitterLoaderFactory", "AssociateNearbyTextLoaderFactory"] +__all__ = ["TextSplitterLoaderFactory", "AssociateNearbyTextLoaderFactory"] diff --git a/src/nv_ingest/modules/transforms/nemo_doc_splitter.py b/src/nv_ingest/modules/transforms/text_splitter.py similarity index 93% rename from src/nv_ingest/modules/transforms/nemo_doc_splitter.py rename to src/nv_ingest/modules/transforms/text_splitter.py index e730f819..eb3ec875 100644 --- a/src/nv_ingest/modules/transforms/nemo_doc_splitter.py +++ b/src/nv_ingest/modules/transforms/text_splitter.py @@ -24,7 +24,7 @@ import cudf from nv_ingest.schemas.metadata_schema import ContentTypeEnum -from nv_ingest.schemas.nemo_doc_splitter_schema import DocumentSplitterSchema +from nv_ingest.schemas.text_splitter_schema import TextSplitterSchema from nv_ingest.util.exception_handlers.decorators import nv_ingest_node_failure_context_manager from nv_ingest.util.flow_control import filter_by_task from nv_ingest.util.modules.config_validator import fetch_and_validate_module_config @@ -80,19 +80,19 @@ def _split_into_chunks(text, tokenizer, chunk_size=300, chunk_overlap=0): return text_chunks -MODULE_NAME = "nemo_document_splitter" +MODULE_NAME = "text_splitter" MODULE_NAMESPACE = "nv_ingest" -NemoDocSplitterLoaderFactory = ModuleLoaderFactory(MODULE_NAME, MODULE_NAMESPACE, DocumentSplitterSchema) +TextSplitterLoaderFactory = ModuleLoaderFactory(MODULE_NAME, MODULE_NAMESPACE, TextSplitterSchema) @register_module(MODULE_NAME, MODULE_NAMESPACE) -def 
_nemo_document_splitter(builder: mrc.Builder): +def _text_splitter(builder: mrc.Builder): """ A pipeline module that splits documents into smaller parts based on the specified criteria. """ - validated_config = fetch_and_validate_module_config(builder, DocumentSplitterSchema) + validated_config = fetch_and_validate_module_config(builder, TextSplitterSchema) @filter_by_task(["split"]) @traceable(MODULE_NAME) @@ -143,7 +143,7 @@ def split_and_forward(message: ControlMessage): if content is None: raise ValueError( - "DocumentSplitter only works with text documents but one or more " "'content' values are None." + "TextSplitter only works with text documents but one or more " "'content' values are None." ) chunks = _split_into_chunks(content, tokenizer_model, chunk_size, chunk_overlap) diff --git a/src/nv_ingest/schemas/__init__.py b/src/nv_ingest/schemas/__init__.py index 43a94055..f3ff4659 100644 --- a/src/nv_ingest/schemas/__init__.py +++ b/src/nv_ingest/schemas/__init__.py @@ -13,13 +13,13 @@ from .message_broker_source_schema import MessageBrokerTaskSourceSchema from .metadata_injector_schema import MetadataInjectorSchema from .metadata_schema import validate_metadata -from .nemo_doc_splitter_schema import DocumentSplitterSchema +from .text_splitter_schema import TextSplitterSchema from .pdf_extractor_schema import PDFExtractorSchema from .task_injection_schema import TaskInjectionSchema from .vdb_task_sink_schema import VdbTaskSinkSchema __all__ = [ - "DocumentSplitterSchema", + "TextSplitterSchema", "ImageCaptionExtractionSchema", "ImageStorageModuleSchema", "IngestJobSchema", diff --git a/src/nv_ingest/schemas/ingest_pipeline_config_schema.py b/src/nv_ingest/schemas/ingest_pipeline_config_schema.py index 1471a338..b3c82f42 100644 --- a/src/nv_ingest/schemas/ingest_pipeline_config_schema.py +++ b/src/nv_ingest/schemas/ingest_pipeline_config_schema.py @@ -18,7 +18,7 @@ from nv_ingest.schemas.message_broker_sink_schema import MessageBrokerTaskSinkSchema from nv_ingest.schemas.message_broker_source_schema import MessageBrokerTaskSourceSchema from nv_ingest.schemas.metadata_injector_schema import MetadataInjectorSchema -from nv_ingest.schemas.nemo_doc_splitter_schema import DocumentSplitterSchema +from nv_ingest.schemas.text_splitter_schema import TextSplitterSchema from nv_ingest.schemas.otel_meter_schema import OpenTelemetryMeterSchema from nv_ingest.schemas.otel_tracer_schema import OpenTelemetryTracerSchema from nv_ingest.schemas.pdf_extractor_schema import PDFExtractorSchema @@ -30,7 +30,7 @@ class PipelineConfigSchema(BaseModel): chart_extractor_module: ChartExtractorSchema = ChartExtractorSchema() - document_splitter_module: DocumentSplitterSchema = DocumentSplitterSchema() + text_splitter_module: TextSplitterSchema = TextSplitterSchema() embedding_storage_module: EmbeddingStorageModuleSchema = EmbeddingStorageModuleSchema() embed_extractions_module: EmbedExtractionsSchema = EmbedExtractionsSchema() image_caption_extraction_module: ImageCaptionExtractionSchema = ImageCaptionExtractionSchema() diff --git a/src/nv_ingest/schemas/nemo_doc_splitter_schema.py b/src/nv_ingest/schemas/text_splitter_schema.py similarity index 90% rename from src/nv_ingest/schemas/nemo_doc_splitter_schema.py rename to src/nv_ingest/schemas/text_splitter_schema.py index ad34e583..db9b1508 100644 --- a/src/nv_ingest/schemas/nemo_doc_splitter_schema.py +++ b/src/nv_ingest/schemas/text_splitter_schema.py @@ -8,7 +8,7 @@ from typing_extensions import Annotated -class DocumentSplitterSchema(BaseModel): +class 
TextSplitterSchema(BaseModel): tokenizer: str = "intfloat/e5-large-unsupervised" chunk_size: Annotated[int, Field(gt=0)] = 300 chunk_overlap: Annotated[int, Field(ge=0)] = 0 diff --git a/src/nv_ingest/util/pipeline/__init__.py b/src/nv_ingest/util/pipeline/__init__.py index 5754c667..6957b2ea 100644 --- a/src/nv_ingest/util/pipeline/__init__.py +++ b/src/nv_ingest/util/pipeline/__init__.py @@ -17,7 +17,7 @@ add_table_extractor_stage, add_chart_extractor_stage, add_image_caption_stage, - add_nemo_splitter_stage, + add_text_splitter_stage, add_embed_extractions_stage, add_embedding_storage_stage, add_image_storage_stage, @@ -39,7 +39,7 @@ "add_table_extractor_stage", "add_chart_extractor_stage", "add_image_caption_stage", - "add_nemo_splitter_stage", + "add_text_splitter_stage", "add_embed_extractions_stage", "add_embedding_storage_stage", "add_image_storage_stage", diff --git a/src/nv_ingest/util/pipeline/pipeline_builders.py b/src/nv_ingest/util/pipeline/pipeline_builders.py index 842682f0..b431778c 100644 --- a/src/nv_ingest/util/pipeline/pipeline_builders.py +++ b/src/nv_ingest/util/pipeline/pipeline_builders.py @@ -47,7 +47,7 @@ def setup_ingestion_pipeline( ######################################################################################################## ## Transforms and data synthesis ######################################################################################################## - nemo_splitter_stage = add_nemo_splitter_stage(pipe, morpheus_pipeline_config, ingest_config) + text_splitter_stage = add_text_splitter_stage(pipe, morpheus_pipeline_config, ingest_config) embed_extractions_stage = add_embed_extractions_stage(pipe, morpheus_pipeline_config, ingest_config) ######################################################################################################## ## Storage and output @@ -80,8 +80,8 @@ def setup_ingestion_pipeline( pipe.add_edge(image_dedup_stage, image_filter_stage) pipe.add_edge(image_filter_stage, table_extraction_stage) pipe.add_edge(table_extraction_stage, chart_extraction_stage) - pipe.add_edge(chart_extraction_stage, nemo_splitter_stage) - pipe.add_edge(nemo_splitter_stage, image_caption_stage) + pipe.add_edge(chart_extraction_stage, text_splitter_stage) + pipe.add_edge(text_splitter_stage, image_caption_stage) pipe.add_edge(image_caption_stage, embed_extractions_stage) pipe.add_edge(embed_extractions_stage, image_storage_stage) pipe.add_edge(image_storage_stage, embedding_storage_stage) diff --git a/src/nv_ingest/util/pipeline/stage_builders.py b/src/nv_ingest/util/pipeline/stage_builders.py index b01338de..94e1aa94 100644 --- a/src/nv_ingest/util/pipeline/stage_builders.py +++ b/src/nv_ingest/util/pipeline/stage_builders.py @@ -21,7 +21,7 @@ from nv_ingest.modules.telemetry.otel_meter import OpenTelemetryMeterLoaderFactory from nv_ingest.modules.telemetry.otel_tracer import OpenTelemetryTracerLoaderFactory from nv_ingest.modules.transforms.embed_extractions import EmbedExtractionsLoaderFactory -from nv_ingest.modules.transforms.nemo_doc_splitter import NemoDocSplitterLoaderFactory +from nv_ingest.modules.transforms.text_splitter import TextSplitterLoaderFactory from nv_ingest.stages.docx_extractor_stage import generate_docx_extractor_stage from nv_ingest.stages.extractors.image_extractor_stage import generate_image_extractor_stage from nv_ingest.stages.filters import generate_dedup_stage @@ -331,15 +331,15 @@ def add_image_filter_stage(pipe, morpheus_pipeline_config, ingest_config, defaul return image_filter_stage -def 
add_nemo_splitter_stage(pipe, morpheus_pipeline_config, ingest_config): - nemo_splitter_loader = NemoDocSplitterLoaderFactory.get_instance( - module_name="nemo_doc_splitter", +def add_text_splitter_stage(pipe, morpheus_pipeline_config, ingest_config): + text_splitter_loader = TextSplitterLoaderFactory.get_instance( + module_name="text_splitter", module_config=ingest_config.get("text_splitting_module", {}), ) - nemo_splitter_stage = pipe.add_stage( + text_splitter_stage = pipe.add_stage( LinearModulesStage( morpheus_pipeline_config, - nemo_splitter_loader, + text_splitter_loader, input_type=ControlMessage, output_type=ControlMessage, input_port_name="input", @@ -347,7 +347,7 @@ def add_nemo_splitter_stage(pipe, morpheus_pipeline_config, ingest_config): ) ) - return nemo_splitter_stage + return text_splitter_stage def add_image_caption_stage(pipe, morpheus_pipeline_config, ingest_config, default_cpu_count): diff --git a/tests/nv_ingest/schemas/test_nemo_doc_splitter_schema.py b/tests/nv_ingest/schemas/test_text_splitter_schema.py similarity index 56% rename from tests/nv_ingest/schemas/test_nemo_doc_splitter_schema.py rename to tests/nv_ingest/schemas/test_text_splitter_schema.py index 73636ccf..5e2c2a80 100644 --- a/tests/nv_ingest/schemas/test_nemo_doc_splitter_schema.py +++ b/tests/nv_ingest/schemas/test_text_splitter_schema.py @@ -5,14 +5,14 @@ import pytest from pydantic import ValidationError -from nv_ingest.schemas import DocumentSplitterSchema +from nv_ingest.schemas import TextSplitterSchema -def test_document_splitter_schema_defaults(): +def test_text_splitter_schema_defaults(): """ - Test the DocumentSplitterSchema with default values. + Test the TextSplitterSchema with default values. """ - schema = DocumentSplitterSchema() + schema = TextSplitterSchema() assert schema.tokenizer == "intfloat/e5-large-unsupervised" assert schema.chunk_size == 300 assert schema.chunk_overlap == 0 @@ -20,9 +20,9 @@ def test_document_splitter_schema_defaults(): @pytest.mark.parametrize("invalid_value", [-1, 0]) -def test_document_splitter_schema_invalid_split_length(invalid_value): +def test_text_splitter_schema_invalid_split_length(invalid_value): """ - Test DocumentSplitterSchema with invalid chunk_size values. + Test TextSplitterSchema with invalid chunk_size values. 
""" with pytest.raises(ValidationError): - DocumentSplitterSchema(chunk_size=invalid_value) + TextSplitterSchema(chunk_size=invalid_value) From cd18083efdbe85d36cb8aa4fafd6e6788e9772b5 Mon Sep 17 00:00:00 2001 From: Chris Jarrett Date: Wed, 8 Jan 2025 17:27:08 -0800 Subject: [PATCH 08/18] Temp fix --- src/nv_ingest/modules/transforms/text_splitter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/nv_ingest/modules/transforms/text_splitter.py b/src/nv_ingest/modules/transforms/text_splitter.py index eb3ec875..1ba95d4b 100644 --- a/src/nv_ingest/modules/transforms/text_splitter.py +++ b/src/nv_ingest/modules/transforms/text_splitter.py @@ -135,7 +135,7 @@ def split_and_forward(message: ControlMessage): f"chunk_overlap: {chunk_overlap}" ) - tokenizer_model = AutoTokenizer.from_pretrained(tokenizer, token="") + tokenizer_model = AutoTokenizer.from_pretrained(tokenizer) split_docs = [] for _, row in df_filtered.iterrows(): From bab7f3d0fd71b6b54aad747ef34989c5724880a1 Mon Sep 17 00:00:00 2001 From: Chris Jarrett Date: Mon, 13 Jan 2025 15:02:07 -0800 Subject: [PATCH 09/18] Address reviews --- .../modules/transforms/text_splitter.py | 11 +--- src/nv_ingest/schemas/ingest_job_schema.py | 6 ++ src/nv_ingest/schemas/text_splitter_schema.py | 8 ++- .../schemas/test_ingest_job_schema.py | 20 +++++++ .../schemas/test_text_splitter_schema.py | 59 ++++++++++++++++++- 5 files changed, 92 insertions(+), 12 deletions(-) diff --git a/src/nv_ingest/modules/transforms/text_splitter.py b/src/nv_ingest/modules/transforms/text_splitter.py index 1ba95d4b..20210659 100644 --- a/src/nv_ingest/modules/transforms/text_splitter.py +++ b/src/nv_ingest/modules/transforms/text_splitter.py @@ -118,10 +118,6 @@ def split_and_forward(message: ControlMessage): df_filtered = df.loc[bool_index] if df_filtered.empty: - gdf = cudf.from_pandas(df) - message_meta = MessageMeta(df=gdf) - message.payload(message_meta) - return message # Override parameters if set @@ -139,12 +135,7 @@ def split_and_forward(message: ControlMessage): split_docs = [] for _, row in df_filtered.iterrows(): - content = row["metadata"]["content"] - - if content is None: - raise ValueError( - "TextSplitter only works with text documents but one or more " "'content' values are None." 
-                )
+            content = row["metadata"]["content"] if row["metadata"]["content"] is not None else ""
 
             chunks = _split_into_chunks(content, tokenizer_model, chunk_size, chunk_overlap)
             split_docs.extend(_build_split_documents(row, chunks))
diff --git a/src/nv_ingest/schemas/ingest_job_schema.py b/src/nv_ingest/schemas/ingest_job_schema.py
index f78cadd9..3decf647 100644
--- a/src/nv_ingest/schemas/ingest_job_schema.py
+++ b/src/nv_ingest/schemas/ingest_job_schema.py
@@ -63,6 +63,12 @@ class IngestTaskSplitSchema(BaseModelNoExt):
     chunk_size: Annotated[int, Field(gt=0)]
     chunk_overlap: Annotated[int, Field(ge=0)]
 
+    @field_validator("chunk_overlap")
+    def check_chunk_overlap(cls, v, values, **kwargs):
+        if v is not None and "chunk_size" in values.data and v >= values.data["chunk_size"]:
+            raise ValueError("chunk_overlap must be less than chunk_size")
+        return v
+
 
 class IngestTaskExtractSchema(BaseModelNoExt):
     document_type: DocumentTypeEnum
diff --git a/src/nv_ingest/schemas/text_splitter_schema.py b/src/nv_ingest/schemas/text_splitter_schema.py
index db9b1508..3fe639ca 100644
--- a/src/nv_ingest/schemas/text_splitter_schema.py
+++ b/src/nv_ingest/schemas/text_splitter_schema.py
@@ -3,7 +3,7 @@
 # SPDX-License-Identifier: Apache-2.0
 
 
-from pydantic import Field, BaseModel
+from pydantic import Field, BaseModel, field_validator
 
 from typing_extensions import Annotated
 
@@ -13,3 +13,9 @@ class TextSplitterSchema(BaseModel):
     chunk_size: Annotated[int, Field(gt=0)] = 300
     chunk_overlap: Annotated[int, Field(ge=0)] = 0
     raise_on_failure: bool = False
+
+    @field_validator("chunk_overlap")
+    def check_chunk_overlap(cls, v, values, **kwargs):
+        if v is not None and "chunk_size" in values.data and v >= values.data["chunk_size"]:
+            raise ValueError("chunk_overlap must be less than chunk_size")
+        return v
diff --git a/tests/nv_ingest/schemas/test_ingest_job_schema.py b/tests/nv_ingest/schemas/test_ingest_job_schema.py
index e9fb7f22..386e756d 100644
--- a/tests/nv_ingest/schemas/test_ingest_job_schema.py
+++ b/tests/nv_ingest/schemas/test_ingest_job_schema.py
@@ -117,6 +117,26 @@ def test_field_type_correctness():
     validate_ingest_job(job_data)
 
 
+def test_custom_validator_logic_for_sentence_window_size():
+    """Tests custom validator logic related to chunk_size and chunk_overlap in split tasks."""
+    task = {
+        "type": "split",
+        "task_properties": {
+            "tokenizer": "intfloat/e5-large-unsupervised",
+            "chunk_size": 200,
+            "chunk_overlap": 250,  # chunk_overlap should always be less than chunk_size
+        },
+    }
+    job_data = {
+        "job_payload": valid_job_payload(),
+        "job_id": "123",
+        "tasks": [task],
+    }
+    with pytest.raises(ValidationError) as exc_info:
+        validate_ingest_job(job_data)
+    assert "chunk_overlap must be less than chunk_size" in str(exc_info.value)
+
+
 def test_multiple_task_types():
     job_data = {
         "job_payload": {
diff --git a/tests/nv_ingest/schemas/test_text_splitter_schema.py b/tests/nv_ingest/schemas/test_text_splitter_schema.py
index 5e2c2a80..86488105 100644
--- a/tests/nv_ingest/schemas/test_text_splitter_schema.py
+++ b/tests/nv_ingest/schemas/test_text_splitter_schema.py
@@ -19,10 +19,67 @@ def test_text_splitter_schema_defaults():
     assert schema.raise_on_failure is False
 
 
+def test_text_splitter_schema_custom_values():
+    """
+    Test the TextSplitterSchema with custom values.
+ """ + tokenizer = "meta-llama/Llama-3.2-1B" + chunk_size = 500 + chunk_overlap = 10 + schema = TextSplitterSchema( + tokenizer=tokenizer, chunk_size=chunk_size, chunk_overlap=chunk_overlap, raise_on_failure=True + ) + assert schema.tokenizer == tokenizer + assert schema.chunk_size == chunk_size + assert schema.chunk_overlap == chunk_overlap + assert schema.raise_on_failure is True + + +@pytest.mark.parametrize("invalid_value", [50, 5.5]) +def test_text_splitter_schema_invalid_tokenizer(invalid_value): + """ + Test TextSplitterSchema with invalid tokenizer values. + """ + with pytest.raises(ValidationError): + TextSplitterSchema(tokenizer=invalid_value) + + @pytest.mark.parametrize("invalid_value", [-1, 0]) -def test_text_splitter_schema_invalid_split_length(invalid_value): +def test_text_splitter_schema_invalid_chunk_size(invalid_value): """ Test TextSplitterSchema with invalid chunk_size values. """ with pytest.raises(ValidationError): TextSplitterSchema(chunk_size=invalid_value) + + +@pytest.mark.parametrize("invalid_value", [-1, "a"]) +def test_text_splitter_schema_invalid_chunk_overlap(invalid_value): + """ + Test TextSplitterSchema with invalid chunk_overlap values. + """ + with pytest.raises(ValidationError): + TextSplitterSchema(chunk_overlap=invalid_value) + + +@pytest.mark.parametrize( + "chunk_size, chunk_overlap, is_valid", + [ + (300, 50, True), + (150, 0, True), + (100, 100, False), + (50, 200, False), + ], +) +def test_text_splitter_schema_chunk_overlap_validation(chunk_size, chunk_overlap, is_valid): + """ + Parametrized test for validating the chunk_overlap logic in TextSplitterSchema. + """ + if is_valid: + schema = TextSplitterSchema(chunk_size=chunk_size, chunk_overlap=chunk_overlap) + assert schema.chunk_size == chunk_size + assert schema.chunk_overlap == chunk_overlap + else: + with pytest.raises(ValidationError) as excinfo: + TextSplitterSchema(chunk_size=chunk_size, chunk_overlap=chunk_overlap) + assert "chunk_overlap must be less than chunk_size" in str(excinfo.value) From 2f4b979c1a114b6c6559673136ce90a412d57ccc Mon Sep 17 00:00:00 2001 From: Chris Jarrett Date: Mon, 10 Feb 2025 13:42:09 -0800 Subject: [PATCH 10/18] Change default chunk_size to 1024 --- client/src/nv_ingest_client/primitives/tasks/split.py | 4 ++-- src/nv_ingest/modules/transforms/text_splitter.py | 2 +- src/nv_ingest/schemas/text_splitter_schema.py | 2 +- tests/nv_ingest/schemas/test_text_splitter_schema.py | 2 +- tests/nv_ingest_client/primitives/tasks/test_split.py | 10 +++++----- 5 files changed, 10 insertions(+), 10 deletions(-) diff --git a/client/src/nv_ingest_client/primitives/tasks/split.py b/client/src/nv_ingest_client/primitives/tasks/split.py index 5f45e806..86094701 100644 --- a/client/src/nv_ingest_client/primitives/tasks/split.py +++ b/client/src/nv_ingest_client/primitives/tasks/split.py @@ -18,7 +18,7 @@ class SplitTaskSchema(BaseModel): tokenizer: str = "intfloat/e5-large-unsupervised" - chunk_size: int = 300 + chunk_size: int = 1024 chunk_overlap: int = 0 class Config: @@ -33,7 +33,7 @@ class SplitTask(Task): def __init__( self, tokenizer: str = "intfloat/e5-large-unsupervised", - chunk_size: int = 300, + chunk_size: int = 1024, chunk_overlap: int = 0, ) -> None: """ diff --git a/src/nv_ingest/modules/transforms/text_splitter.py b/src/nv_ingest/modules/transforms/text_splitter.py index 20210659..45b46ccd 100644 --- a/src/nv_ingest/modules/transforms/text_splitter.py +++ b/src/nv_ingest/modules/transforms/text_splitter.py @@ -51,7 +51,7 @@ def _build_split_documents(row, chunks: 
List[str]) -> List[dict[str, Any]]: return documents -def _split_into_chunks(text, tokenizer, chunk_size=300, chunk_overlap=0): +def _split_into_chunks(text, tokenizer, chunk_size=1024, chunk_overlap=0): # Tokenize the text into token IDs encoding = tokenizer.encode_plus(text, add_special_tokens=False, return_offsets_mapping=True) diff --git a/src/nv_ingest/schemas/text_splitter_schema.py b/src/nv_ingest/schemas/text_splitter_schema.py index 3fe639ca..46c7ba64 100644 --- a/src/nv_ingest/schemas/text_splitter_schema.py +++ b/src/nv_ingest/schemas/text_splitter_schema.py @@ -10,7 +10,7 @@ class TextSplitterSchema(BaseModel): tokenizer: str = "intfloat/e5-large-unsupervised" - chunk_size: Annotated[int, Field(gt=0)] = 300 + chunk_size: Annotated[int, Field(gt=0)] = 1024 chunk_overlap: Annotated[int, Field(ge=0)] = 0 raise_on_failure: bool = False diff --git a/tests/nv_ingest/schemas/test_text_splitter_schema.py b/tests/nv_ingest/schemas/test_text_splitter_schema.py index 86488105..e9406d8d 100644 --- a/tests/nv_ingest/schemas/test_text_splitter_schema.py +++ b/tests/nv_ingest/schemas/test_text_splitter_schema.py @@ -14,7 +14,7 @@ def test_text_splitter_schema_defaults(): """ schema = TextSplitterSchema() assert schema.tokenizer == "intfloat/e5-large-unsupervised" - assert schema.chunk_size == 300 + assert schema.chunk_size == 1024 assert schema.chunk_overlap == 0 assert schema.raise_on_failure is False diff --git a/tests/nv_ingest_client/primitives/tasks/test_split.py b/tests/nv_ingest_client/primitives/tasks/test_split.py index a4531002..2976d2ee 100644 --- a/tests/nv_ingest_client/primitives/tasks/test_split.py +++ b/tests/nv_ingest_client/primitives/tasks/test_split.py @@ -11,11 +11,11 @@ def test_split_task_initialization(): task = SplitTask( tokenizer="intfloat/e5-large-unsupervised", - chunk_size=300, + chunk_size=1024, chunk_overlap=0, ) assert task._tokenizer == "intfloat/e5-large-unsupervised" - assert task._chunk_size == 300 + assert task._chunk_size == 1024 assert task._chunk_overlap == 0 @@ -38,7 +38,7 @@ def test_split_task_str_representation(): [ ("intfloat/e5-large-unsupervised", 100, 10), ("microsoft/deberta-large", 50, 5), - ("meta-llama/Llama-3.2-1B", 300, 0), + ("meta-llama/Llama-3.2-1B", 1024, 0), ], ) def test_split_task_to_dict( @@ -72,7 +72,7 @@ def test_split_task_default_params(): task = SplitTask() expected_str_contains = [ "tokenizer: intfloat/e5-large-unsupervised", - "chunk_size: 300", + "chunk_size: 1024", "chunk_overlap: 0", ] for expected_part in expected_str_contains: @@ -80,6 +80,6 @@ def test_split_task_default_params(): expected_dict = { "type": "split", - "task_properties": {"tokenizer": "intfloat/e5-large-unsupervised", "chunk_size": 300, "chunk_overlap": 0}, + "task_properties": {"tokenizer": "intfloat/e5-large-unsupervised", "chunk_size": 1024, "chunk_overlap": 0}, } assert task.to_dict() == expected_dict From b0529427072855ba720c74965fc147c1cd3aa1ca Mon Sep 17 00:00:00 2001 From: Chris Jarrett Date: Mon, 10 Feb 2025 21:29:59 -0800 Subject: [PATCH 11/18] Change default chunk_overlap to 20 --- client/src/nv_ingest_client/primitives/tasks/split.py | 4 ++-- src/nv_ingest/modules/transforms/text_splitter.py | 2 +- src/nv_ingest/schemas/text_splitter_schema.py | 2 +- tests/nv_ingest/schemas/test_text_splitter_schema.py | 2 +- tests/nv_ingest_client/primitives/tasks/test_split.py | 4 ++-- 5 files changed, 7 insertions(+), 7 deletions(-) diff --git a/client/src/nv_ingest_client/primitives/tasks/split.py b/client/src/nv_ingest_client/primitives/tasks/split.py 
index 86094701..ece29605 100644 --- a/client/src/nv_ingest_client/primitives/tasks/split.py +++ b/client/src/nv_ingest_client/primitives/tasks/split.py @@ -19,7 +19,7 @@ class SplitTaskSchema(BaseModel): tokenizer: str = "intfloat/e5-large-unsupervised" chunk_size: int = 1024 - chunk_overlap: int = 0 + chunk_overlap: int = 20 class Config: extra = "forbid" @@ -34,7 +34,7 @@ def __init__( self, tokenizer: str = "intfloat/e5-large-unsupervised", chunk_size: int = 1024, - chunk_overlap: int = 0, + chunk_overlap: int = 20, ) -> None: """ Setup Split Task Config diff --git a/src/nv_ingest/modules/transforms/text_splitter.py b/src/nv_ingest/modules/transforms/text_splitter.py index 45b46ccd..694335ec 100644 --- a/src/nv_ingest/modules/transforms/text_splitter.py +++ b/src/nv_ingest/modules/transforms/text_splitter.py @@ -51,7 +51,7 @@ def _build_split_documents(row, chunks: List[str]) -> List[dict[str, Any]]: return documents -def _split_into_chunks(text, tokenizer, chunk_size=1024, chunk_overlap=0): +def _split_into_chunks(text, tokenizer, chunk_size=1024, chunk_overlap=20): # Tokenize the text into token IDs encoding = tokenizer.encode_plus(text, add_special_tokens=False, return_offsets_mapping=True) diff --git a/src/nv_ingest/schemas/text_splitter_schema.py b/src/nv_ingest/schemas/text_splitter_schema.py index 46c7ba64..ef191749 100644 --- a/src/nv_ingest/schemas/text_splitter_schema.py +++ b/src/nv_ingest/schemas/text_splitter_schema.py @@ -11,7 +11,7 @@ class TextSplitterSchema(BaseModel): tokenizer: str = "intfloat/e5-large-unsupervised" chunk_size: Annotated[int, Field(gt=0)] = 1024 - chunk_overlap: Annotated[int, Field(ge=0)] = 0 + chunk_overlap: Annotated[int, Field(ge=0)] = 20 raise_on_failure: bool = False @field_validator("chunk_overlap") diff --git a/tests/nv_ingest/schemas/test_text_splitter_schema.py b/tests/nv_ingest/schemas/test_text_splitter_schema.py index e9406d8d..66a725c2 100644 --- a/tests/nv_ingest/schemas/test_text_splitter_schema.py +++ b/tests/nv_ingest/schemas/test_text_splitter_schema.py @@ -15,7 +15,7 @@ def test_text_splitter_schema_defaults(): schema = TextSplitterSchema() assert schema.tokenizer == "intfloat/e5-large-unsupervised" assert schema.chunk_size == 1024 - assert schema.chunk_overlap == 0 + assert schema.chunk_overlap == 20 assert schema.raise_on_failure is False diff --git a/tests/nv_ingest_client/primitives/tasks/test_split.py b/tests/nv_ingest_client/primitives/tasks/test_split.py index 2976d2ee..eedadf61 100644 --- a/tests/nv_ingest_client/primitives/tasks/test_split.py +++ b/tests/nv_ingest_client/primitives/tasks/test_split.py @@ -73,13 +73,13 @@ def test_split_task_default_params(): expected_str_contains = [ "tokenizer: intfloat/e5-large-unsupervised", "chunk_size: 1024", - "chunk_overlap: 0", + "chunk_overlap: 20", ] for expected_part in expected_str_contains: assert expected_part in str(task) expected_dict = { "type": "split", - "task_properties": {"tokenizer": "intfloat/e5-large-unsupervised", "chunk_size": 1024, "chunk_overlap": 0}, + "task_properties": {"tokenizer": "intfloat/e5-large-unsupervised", "chunk_size": 1024, "chunk_overlap": 20}, } assert task.to_dict() == expected_dict From 317a42621f6a4dfc0041ad03037707b419325c93 Mon Sep 17 00:00:00 2001 From: Chris Jarrett Date: Tue, 11 Feb 2025 16:32:31 -0800 Subject: [PATCH 12/18] Pass huggingface access token as param --- .../primitives/tasks/split.py | 13 +++++++++-- .../modules/transforms/text_splitter.py | 16 ++++++++++---- src/nv_ingest/schemas/ingest_job_schema.py | 1 + 
src/nv_ingest/schemas/text_splitter_schema.py | 3 ++- .../test_message_broker_task_source.py | 1 + .../schemas/test_ingest_job_schema.py | 5 +++++ .../schemas/test_text_splitter_schema.py | 2 +- .../primitives/tasks/test_split.py | 22 ++++++++++++------- 8 files changed, 47 insertions(+), 16 deletions(-) diff --git a/client/src/nv_ingest_client/primitives/tasks/split.py b/client/src/nv_ingest_client/primitives/tasks/split.py index ece29605..cfada11f 100644 --- a/client/src/nv_ingest_client/primitives/tasks/split.py +++ b/client/src/nv_ingest_client/primitives/tasks/split.py @@ -6,6 +6,8 @@ # pylint: disable=too-few-public-methods # pylint: disable=too-many-arguments +import os + import logging from typing import Dict @@ -17,9 +19,10 @@ class SplitTaskSchema(BaseModel): - tokenizer: str = "intfloat/e5-large-unsupervised" + tokenizer: str = "meta-llama/Llama-3.2-1B" chunk_size: int = 1024 chunk_overlap: int = 20 + params: dict = {} class Config: extra = "forbid" @@ -32,9 +35,10 @@ class SplitTask(Task): def __init__( self, - tokenizer: str = "intfloat/e5-large-unsupervised", + tokenizer: str = "meta-llama/Llama-3.2-1B", chunk_size: int = 1024, chunk_overlap: int = 20, + params: dict = {}, ) -> None: """ Setup Split Task Config @@ -43,6 +47,7 @@ def __init__( self._tokenizer = tokenizer self._chunk_size = chunk_size self._chunk_overlap = chunk_overlap + self._params = params def __str__(self) -> str: """ @@ -53,6 +58,8 @@ def __str__(self) -> str: info += f" tokenizer: {self._tokenizer}\n" info += f" chunk_size: {self._chunk_size}\n" info += f" chunk_overlap: {self._chunk_overlap}\n" + for key, value in self._params.items(): + info += f" {key}: {value}\n" return info def to_dict(self) -> Dict: @@ -67,5 +74,7 @@ def to_dict(self) -> Dict: split_params["chunk_size"] = self._chunk_size if self._chunk_overlap is not None: split_params["chunk_overlap"] = self._chunk_overlap + if self._params is not None: + split_params["params"] = self._params return {"type": "split", "task_properties": split_params} diff --git a/src/nv_ingest/modules/transforms/text_splitter.py b/src/nv_ingest/modules/transforms/text_splitter.py index 694335ec..496ac094 100644 --- a/src/nv_ingest/modules/transforms/text_splitter.py +++ b/src/nv_ingest/modules/transforms/text_splitter.py @@ -59,7 +59,7 @@ def _split_into_chunks(text, tokenizer, chunk_size=1024, chunk_overlap=20): tokens = encoding["input_ids"] offsets = encoding["offset_mapping"] - # Split the tokens into chunks of the desired size + # Split the tokens into chunks of the desired size with the desired overlap chunks = [tokens[i : i + chunk_size] for i in range(0, len(tokens), chunk_size - chunk_overlap)] # Convert token chunks back to text while preserving original spacing and case @@ -113,7 +113,12 @@ def split_and_forward(message: ControlMessage): with message.payload().mutable_dataframe() as mdf: df = mdf.to_pandas() - # Filter to text only + # Filter to txt files only + # bool_index = df["document_type"] == ContentTypeEnum.TEXT + # df_filtered = df.loc[bool_index] + logger.info(df.columns) + + # Filter to document type bool_index = df["document_type"] == ContentTypeEnum.TEXT df_filtered = df.loc[bool_index] @@ -124,14 +129,17 @@ def split_and_forward(message: ControlMessage): tokenizer = task_props.get("tokenizer", validated_config.tokenizer) chunk_size = task_props.get("chunk_size", validated_config.chunk_size) chunk_overlap = task_props.get("chunk_overlap", validated_config.chunk_overlap) + params = task_props.get("params", {}) + + hf_access_token = 
params.get("hf_access_token", None) - logger.info( + logger.debug( f"Splitting text with tokenizer: {tokenizer}, " f"chunk_size: {chunk_size} tokens, " f"chunk_overlap: {chunk_overlap}" ) - tokenizer_model = AutoTokenizer.from_pretrained(tokenizer) + tokenizer_model = AutoTokenizer.from_pretrained(tokenizer, token=hf_access_token) split_docs = [] for _, row in df_filtered.iterrows(): diff --git a/src/nv_ingest/schemas/ingest_job_schema.py b/src/nv_ingest/schemas/ingest_job_schema.py index 17a92903..0341394f 100644 --- a/src/nv_ingest/schemas/ingest_job_schema.py +++ b/src/nv_ingest/schemas/ingest_job_schema.py @@ -62,6 +62,7 @@ class IngestTaskSplitSchema(BaseModelNoExt): tokenizer: str chunk_size: Annotated[int, Field(gt=0)] chunk_overlap: Annotated[int, Field(ge=0)] + params: dict @field_validator("chunk_overlap") def check_chunk_overlap(cls, v, values, **kwargs): diff --git a/src/nv_ingest/schemas/text_splitter_schema.py b/src/nv_ingest/schemas/text_splitter_schema.py index ef191749..37380af6 100644 --- a/src/nv_ingest/schemas/text_splitter_schema.py +++ b/src/nv_ingest/schemas/text_splitter_schema.py @@ -3,13 +3,14 @@ # SPDX-License-Identifier: Apache-2.0 +from typing import Optional from pydantic import Field, BaseModel, field_validator from typing_extensions import Annotated class TextSplitterSchema(BaseModel): - tokenizer: str = "intfloat/e5-large-unsupervised" + tokenizer: str = "meta-llama/Llama-3.2-1B" chunk_size: Annotated[int, Field(gt=0)] = 1024 chunk_overlap: Annotated[int, Field(ge=0)] = 20 raise_on_failure: bool = False diff --git a/tests/nv_ingest/modules/sources/test_message_broker_task_source.py b/tests/nv_ingest/modules/sources/test_message_broker_task_source.py index c897f246..3cb9297e 100644 --- a/tests/nv_ingest/modules/sources/test_message_broker_task_source.py +++ b/tests/nv_ingest/modules/sources/test_message_broker_task_source.py @@ -42,6 +42,7 @@ def job_payload(): "tokenizer": "intfloat/e5-large-unsupervised", "chunk_size": 100, "chunk_overlap": 0, + "params": {}, }, }, { diff --git a/tests/nv_ingest/schemas/test_ingest_job_schema.py b/tests/nv_ingest/schemas/test_ingest_job_schema.py index b98610f6..f9d559ca 100644 --- a/tests/nv_ingest/schemas/test_ingest_job_schema.py +++ b/tests/nv_ingest/schemas/test_ingest_job_schema.py @@ -29,6 +29,7 @@ def valid_task_properties(task_type): "tokenizer": "intfloat/e5-large-unsupervised", "chunk_size": 300, "chunk_overlap": 0, + "params": {}, } elif task_type == TaskTypeEnum.extract: return {"document_type": "pdf", "method": "OCR", "params": {"language": "en"}} @@ -122,6 +123,7 @@ def test_custom_validator_logic_for_sentence_window_size(): "tokanizer": "intfloat/e5-large-unsupervised", "chunk_size": 200, "chunk_overlap": 250, # chunk_overlap should always be less than chunk_size + "params": {}, }, } job_data = { @@ -150,6 +152,7 @@ def test_multiple_task_types(): "tokenizer": "intfloat/e5-large-unsupervised", "chunk_size": 100, "chunk_overlap": 0, + "params": {}, }, }, { @@ -244,6 +247,7 @@ def test_incorrect_property_types(): "tokenizer": "intfloat/e5-large-unsupervised", "chunk_size": {"not an int": 123}, # Incorrect type (should be int) "chunk_overlap": 0, + "params": {}, }, } ], @@ -262,6 +266,7 @@ def test_missing_required_fields(): "task_properties": { "tokenizer": "intfloat/e5-large-unsupervised", # Missing chunk_size "chunk_overlap": 0, + "params": {}, }, } ], diff --git a/tests/nv_ingest/schemas/test_text_splitter_schema.py b/tests/nv_ingest/schemas/test_text_splitter_schema.py index 66a725c2..bbdda847 100644 --- 
a/tests/nv_ingest/schemas/test_text_splitter_schema.py +++ b/tests/nv_ingest/schemas/test_text_splitter_schema.py @@ -13,7 +13,7 @@ def test_text_splitter_schema_defaults(): Test the TextSplitterSchema with default values. """ schema = TextSplitterSchema() - assert schema.tokenizer == "intfloat/e5-large-unsupervised" + assert schema.tokenizer == "meta-llama/Llama-3.2-1B" assert schema.chunk_size == 1024 assert schema.chunk_overlap == 20 assert schema.raise_on_failure is False diff --git a/tests/nv_ingest_client/primitives/tasks/test_split.py b/tests/nv_ingest_client/primitives/tasks/test_split.py index eedadf61..6af26afc 100644 --- a/tests/nv_ingest_client/primitives/tasks/test_split.py +++ b/tests/nv_ingest_client/primitives/tasks/test_split.py @@ -10,13 +10,15 @@ def test_split_task_initialization(): task = SplitTask( - tokenizer="intfloat/e5-large-unsupervised", + tokenizer="meta-llama/Llama-3.2-1B", chunk_size=1024, chunk_overlap=0, + params={}, ) - assert task._tokenizer == "intfloat/e5-large-unsupervised" + assert task._tokenizer == "meta-llama/Llama-3.2-1B" assert task._chunk_size == 1024 assert task._chunk_overlap == 0 + assert task._params == {} # String Representation Tests @@ -34,22 +36,24 @@ def test_split_task_str_representation(): @pytest.mark.parametrize( - "tokenizer, chunk_size, chunk_overlap", + "tokenizer, chunk_size, chunk_overlap, params", [ - ("intfloat/e5-large-unsupervised", 100, 10), - ("microsoft/deberta-large", 50, 5), - ("meta-llama/Llama-3.2-1B", 1024, 0), + ("intfloat/e5-large-unsupervised", 100, 10, {}), + ("microsoft/deberta-large", 50, 5, None), + ("meta-llama/Llama-3.2-1B", 1024, 0, {"hf_access_token": "TOKEN"}), ], ) def test_split_task_to_dict( tokenizer, chunk_size, chunk_overlap, + params, ): task = SplitTask( tokenizer=tokenizer, chunk_size=chunk_size, chunk_overlap=chunk_overlap, + params=params, ) expected_dict = {"type": "split", "task_properties": {}} @@ -61,6 +65,8 @@ def test_split_task_to_dict( expected_dict["task_properties"]["chunk_size"] = chunk_size if chunk_overlap is not None: expected_dict["task_properties"]["chunk_overlap"] = chunk_overlap + if params is not None: + expected_dict["task_properties"]["params"] = params assert task.to_dict() == expected_dict, "The to_dict method did not return the expected dictionary representation" @@ -71,7 +77,7 @@ def test_split_task_to_dict( def test_split_task_default_params(): task = SplitTask() expected_str_contains = [ - "tokenizer: intfloat/e5-large-unsupervised", + "tokenizer: meta-llama/Llama-3.2-1B", "chunk_size: 1024", "chunk_overlap: 20", ] @@ -80,6 +86,6 @@ def test_split_task_default_params(): expected_dict = { "type": "split", - "task_properties": {"tokenizer": "intfloat/e5-large-unsupervised", "chunk_size": 1024, "chunk_overlap": 20}, + "task_properties": {"tokenizer": "meta-llama/Llama-3.2-1B", "chunk_size": 1024, "chunk_overlap": 20, "params": {}}, } assert task.to_dict() == expected_dict From e250effc97b6c3cc1e23671bded870f8e53c96ff Mon Sep 17 00:00:00 2001 From: Chris Jarrett Date: Tue, 11 Feb 2025 16:57:47 -0800 Subject: [PATCH 13/18] Add llama license notice --- README.md | 4 ++++ client/src/nv_ingest_client/primitives/tasks/split.py | 2 -- src/nv_ingest/schemas/text_splitter_schema.py | 2 -- tests/nv_ingest_client/primitives/tasks/test_split.py | 7 ++++++- 4 files changed, 10 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 9f9fa906..5b40e591 100644 --- a/README.md +++ b/README.md @@ -397,6 +397,10 @@ https://pypi.org/project/pdfservices-sdk/ required if you 
want to use the Adobe extraction service for PDF decomposition. Please review the [license agreement](https://github.com/adobe/pdfservices-python-sdk?tab=License-1-ov-file) for the pdfservices-sdk before enabling this option. +- **`Split`**: + - **Description**: By default, the Split task tokenizer is set to `meta-llama/Llama-3.2-1B`, which will download + Llama-3.2-1B materials from Huggingface in order to perform tokenizer based splitting. Please review the + [license agreement](https://huggingface.co/meta-llama/Llama-3.2-1B) for Llama 3.2 materials before using this. ### Contributing diff --git a/client/src/nv_ingest_client/primitives/tasks/split.py b/client/src/nv_ingest_client/primitives/tasks/split.py index cfada11f..9c2d3e75 100644 --- a/client/src/nv_ingest_client/primitives/tasks/split.py +++ b/client/src/nv_ingest_client/primitives/tasks/split.py @@ -6,8 +6,6 @@ # pylint: disable=too-few-public-methods # pylint: disable=too-many-arguments -import os - import logging from typing import Dict diff --git a/src/nv_ingest/schemas/text_splitter_schema.py b/src/nv_ingest/schemas/text_splitter_schema.py index 37380af6..d78b26c7 100644 --- a/src/nv_ingest/schemas/text_splitter_schema.py +++ b/src/nv_ingest/schemas/text_splitter_schema.py @@ -2,8 +2,6 @@ # All rights reserved. # SPDX-License-Identifier: Apache-2.0 - -from typing import Optional from pydantic import Field, BaseModel, field_validator from typing_extensions import Annotated diff --git a/tests/nv_ingest_client/primitives/tasks/test_split.py b/tests/nv_ingest_client/primitives/tasks/test_split.py index 6af26afc..da2fa2fe 100644 --- a/tests/nv_ingest_client/primitives/tasks/test_split.py +++ b/tests/nv_ingest_client/primitives/tasks/test_split.py @@ -86,6 +86,11 @@ def test_split_task_default_params(): expected_dict = { "type": "split", - "task_properties": {"tokenizer": "meta-llama/Llama-3.2-1B", "chunk_size": 1024, "chunk_overlap": 20, "params": {}}, + "task_properties": { + "tokenizer": "meta-llama/Llama-3.2-1B", + "chunk_size": 1024, + "chunk_overlap": 20, + "params": {}, + }, } assert task.to_dict() == expected_dict From 3c2f246ab1c68ae9afeab23005fab13702799d75 Mon Sep 17 00:00:00 2001 From: Chris Jarrett Date: Tue, 11 Feb 2025 21:43:26 -0800 Subject: [PATCH 14/18] Add support for filtering by file type --- src/nv_ingest/modules/transforms/text_splitter.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/src/nv_ingest/modules/transforms/text_splitter.py b/src/nv_ingest/modules/transforms/text_splitter.py index 496ac094..322ee027 100644 --- a/src/nv_ingest/modules/transforms/text_splitter.py +++ b/src/nv_ingest/modules/transforms/text_splitter.py @@ -113,11 +113,6 @@ def split_and_forward(message: ControlMessage): with message.payload().mutable_dataframe() as mdf: df = mdf.to_pandas() - # Filter to txt files only - # bool_index = df["document_type"] == ContentTypeEnum.TEXT - # df_filtered = df.loc[bool_index] - logger.info(df.columns) - # Filter to document type bool_index = df["document_type"] == ContentTypeEnum.TEXT df_filtered = df.loc[bool_index] @@ -132,6 +127,7 @@ def split_and_forward(message: ControlMessage): params = task_props.get("params", {}) hf_access_token = params.get("hf_access_token", None) + split_source_types = params.get("split_source_types", ["TEXT"]) logger.debug( f"Splitting text with tokenizer: {tokenizer}, " @@ -139,6 +135,15 @@ def split_and_forward(message: ControlMessage): f"chunk_overlap: {chunk_overlap}" ) + # Filter to file type + bool_index = 
pd.json_normalize(df_filtered["metadata"])["source_metadata.source_type"].isin( + split_source_types + ) + df_filtered = df_filtered.loc[bool_index] + + if df_filtered.empty: + return message + tokenizer_model = AutoTokenizer.from_pretrained(tokenizer, token=hf_access_token) split_docs = [] From 6e32a9cd6c83badbff5ff88e82f386235c5abd59 Mon Sep 17 00:00:00 2001 From: Chris Jarrett Date: Wed, 12 Feb 2025 13:12:06 -0800 Subject: [PATCH 15/18] Fix offset mapping --- src/nv_ingest/modules/transforms/text_splitter.py | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/src/nv_ingest/modules/transforms/text_splitter.py b/src/nv_ingest/modules/transforms/text_splitter.py index 322ee027..a6e09939 100644 --- a/src/nv_ingest/modules/transforms/text_splitter.py +++ b/src/nv_ingest/modules/transforms/text_splitter.py @@ -56,27 +56,17 @@ def _split_into_chunks(text, tokenizer, chunk_size=1024, chunk_overlap=20): encoding = tokenizer.encode_plus(text, add_special_tokens=False, return_offsets_mapping=True) # Get the token IDs and offsets for splitting - tokens = encoding["input_ids"] offsets = encoding["offset_mapping"] # Split the tokens into chunks of the desired size with the desired overlap - chunks = [tokens[i : i + chunk_size] for i in range(0, len(tokens), chunk_size - chunk_overlap)] + chunks = [offsets[i : i + chunk_size] for i in range(0, len(offsets), chunk_size - chunk_overlap)] # Convert token chunks back to text while preserving original spacing and case text_chunks = [] for chunk in chunks: - # Find the start and end offsets for the current chunk - chunk_offsets = offsets[: len(chunk)] - start_offset = chunk_offsets[0][0] - end_offset = chunk_offsets[-1][1] - - # Extract the original text for this chunk based on offsets - text_chunk = text[start_offset:end_offset] + text_chunk = text[chunk[0][0] : chunk[-1][0]] text_chunks.append(text_chunk) - # Remove processed offsets for the next iteration - offsets = offsets[len(chunk) :] - return text_chunks From 67b7051b88b36828e1bac10f7d485be94e84de7b Mon Sep 17 00:00:00 2001 From: Chris Jarrett Date: Wed, 19 Feb 2025 10:55:06 -0800 Subject: [PATCH 16/18] Add built with llama --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index cb977419..5c835ea7 100644 --- a/README.md +++ b/README.md @@ -396,7 +396,7 @@ https://pypi.org/project/pdfservices-sdk/ required if you want to use the Adobe extraction service for PDF decomposition. Please review the [license agreement](https://github.com/adobe/pdfservices-python-sdk?tab=License-1-ov-file) for the pdfservices-sdk before enabling this option. -- **`Split`**: +- **`Built With Llama:`**: - **Description**: By default, the Split task tokenizer is set to `meta-llama/Llama-3.2-1B`, which will download Llama-3.2-1B materials from Huggingface in order to perform tokenizer based splitting. Please review the [license agreement](https://huggingface.co/meta-llama/Llama-3.2-1B) for Llama 3.2 materials before using this. 
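For reference, a client-side configuration of the token-based split task introduced in the patches above might look like the sketch below. It is illustrative only: the import path is assumed from the file layout in this series, the parameter values are arbitrary, and `hf_access_token` / `split_source_types` are optional keys that `text_splitter.py` reads from `params` (defaulting to no token and `["TEXT"]`).

    # Hedged sketch of the token-based SplitTask; import path assumed from
    # client/src/nv_ingest_client/primitives/tasks/split.py in this series.
    from nv_ingest_client.primitives.tasks.split import SplitTask

    task = SplitTask(
        tokenizer="meta-llama/Llama-3.2-1B",  # gated model; see the Llama 3.2 license note above
        chunk_size=1024,                      # chunk size in tokens
        chunk_overlap=20,                     # overlapping tokens between consecutive chunks
        params={
            "hf_access_token": "<your HuggingFace access token>",  # placeholder, not a real token
            "split_source_types": ["TEXT"],   # source_type values to split; matches the module default
        },
    )

    # Serializes to {"type": "split", "task_properties": {...}} for the ingest job payload.
    print(task.to_dict())
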
From 763403019d638e122afe27063f040d1f468d5463 Mon Sep 17 00:00:00 2001 From: Chris Jarrett Date: Wed, 19 Feb 2025 14:16:54 -0800 Subject: [PATCH 17/18] Change default chunk_overlap to 150 --- client/src/nv_ingest_client/primitives/tasks/split.py | 4 ++-- src/nv_ingest/schemas/text_splitter_schema.py | 2 +- tests/nv_ingest/schemas/test_text_splitter_schema.py | 2 +- tests/nv_ingest_client/primitives/tasks/test_split.py | 4 ++-- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/client/src/nv_ingest_client/primitives/tasks/split.py b/client/src/nv_ingest_client/primitives/tasks/split.py index 9c2d3e75..12f1b2d7 100644 --- a/client/src/nv_ingest_client/primitives/tasks/split.py +++ b/client/src/nv_ingest_client/primitives/tasks/split.py @@ -19,7 +19,7 @@ class SplitTaskSchema(BaseModel): tokenizer: str = "meta-llama/Llama-3.2-1B" chunk_size: int = 1024 - chunk_overlap: int = 20 + chunk_overlap: int = 150 params: dict = {} class Config: @@ -35,7 +35,7 @@ def __init__( self, tokenizer: str = "meta-llama/Llama-3.2-1B", chunk_size: int = 1024, - chunk_overlap: int = 20, + chunk_overlap: int = 150, params: dict = {}, ) -> None: """ diff --git a/src/nv_ingest/schemas/text_splitter_schema.py b/src/nv_ingest/schemas/text_splitter_schema.py index d78b26c7..7eb6d93c 100644 --- a/src/nv_ingest/schemas/text_splitter_schema.py +++ b/src/nv_ingest/schemas/text_splitter_schema.py @@ -10,7 +10,7 @@ class TextSplitterSchema(BaseModel): tokenizer: str = "meta-llama/Llama-3.2-1B" chunk_size: Annotated[int, Field(gt=0)] = 1024 - chunk_overlap: Annotated[int, Field(ge=0)] = 20 + chunk_overlap: Annotated[int, Field(ge=0)] = 150 raise_on_failure: bool = False @field_validator("chunk_overlap") diff --git a/tests/nv_ingest/schemas/test_text_splitter_schema.py b/tests/nv_ingest/schemas/test_text_splitter_schema.py index bbdda847..8cfc9d97 100644 --- a/tests/nv_ingest/schemas/test_text_splitter_schema.py +++ b/tests/nv_ingest/schemas/test_text_splitter_schema.py @@ -15,7 +15,7 @@ def test_text_splitter_schema_defaults(): schema = TextSplitterSchema() assert schema.tokenizer == "meta-llama/Llama-3.2-1B" assert schema.chunk_size == 1024 - assert schema.chunk_overlap == 20 + assert schema.chunk_overlap == 150 assert schema.raise_on_failure is False diff --git a/tests/nv_ingest_client/primitives/tasks/test_split.py b/tests/nv_ingest_client/primitives/tasks/test_split.py index da2fa2fe..533ef0c5 100644 --- a/tests/nv_ingest_client/primitives/tasks/test_split.py +++ b/tests/nv_ingest_client/primitives/tasks/test_split.py @@ -79,7 +79,7 @@ def test_split_task_default_params(): expected_str_contains = [ "tokenizer: meta-llama/Llama-3.2-1B", "chunk_size: 1024", - "chunk_overlap: 20", + "chunk_overlap: 150", ] for expected_part in expected_str_contains: assert expected_part in str(task) @@ -89,7 +89,7 @@ def test_split_task_default_params(): "task_properties": { "tokenizer": "meta-llama/Llama-3.2-1B", "chunk_size": 1024, - "chunk_overlap": 20, + "chunk_overlap": 150, "params": {}, }, } From d6401bc1c3821f176daf25ab04878e476c346aa5 Mon Sep 17 00:00:00 2001 From: Chris Jarrett Date: Thu, 20 Feb 2025 14:30:24 -0800 Subject: [PATCH 18/18] Add option to predownload llama tokenizer --- Dockerfile | 11 +++++++++++ README.md | 8 +++++--- docker-compose.yaml | 3 +++ docker/scripts/post_build_triggers.py | 9 +++++++++ 4 files changed, 28 insertions(+), 3 deletions(-) create mode 100644 docker/scripts/post_build_triggers.py diff --git a/Dockerfile b/Dockerfile index 4a5e73c0..fb974f0d 100644 --- a/Dockerfile +++ b/Dockerfile @@ 
-12,6 +12,8 @@ FROM $BASE_IMG:$BASE_IMG_TAG AS base ARG RELEASE_TYPE="dev" ARG VERSION="" ARG VERSION_REV="0" +ARG DOWNLOAD_LLAMA_TOKENIZER="" +ARG HF_ACCESS_TOKEN="" # Embed the `git rev-parse HEAD` as a Docker metadata label # Allows for linking container builds to git commits @@ -71,6 +73,9 @@ WORKDIR /workspace # Copy custom entrypoint script COPY ./docker/scripts/entrypoint.sh /workspace/docker/entrypoint.sh +# Copy post build triggers script +COPY ./docker/scripts/post_build_triggers.py /workspace/docker/post_build_triggers.py + FROM base AS nv_ingest_install # Copy the module code COPY setup.py setup.py @@ -124,6 +129,12 @@ RUN --mount=type=cache,target=/opt/conda/pkgs\ && pip install ./api/dist/*.whl \ && pip install ./client/dist/*.whl + +RUN --mount=type=cache,target=/opt/conda/pkgs \ + --mount=type=cache,target=/root/.cache/pip \ + source activate nv_ingest_runtime \ + && python3 /workspace/docker/post_build_triggers.py + RUN rm -rf src FROM nv_ingest_install AS runtime diff --git a/README.md b/README.md index 8eb10894..9aad924f 100644 --- a/README.md +++ b/README.md @@ -411,10 +411,12 @@ https://pypi.org/project/pdfservices-sdk/ required if you want to use the Adobe extraction service for PDF decomposition. Please review the [license agreement](https://github.com/adobe/pdfservices-python-sdk?tab=License-1-ov-file) for the pdfservices-sdk before enabling this option. -- **`Built With Llama:`**: - - **Description**: By default, the Split task tokenizer is set to `meta-llama/Llama-3.2-1B`, which will download - Llama-3.2-1B materials from Huggingface in order to perform tokenizer based splitting. Please review the +- **`DOWNLOAD_LLAMA_TOKENIZER` (Built With Llama):**: + - **Description**: The Split task uses the `meta-llama/Llama-3.2-1B` tokenizer, which will be downloaded + from HuggingFace at build time if `DOWNLOAD_LLAMA_TOKENIZER` is set to `True`. Please review the [license agreement](https://huggingface.co/meta-llama/Llama-3.2-1B) for Llama 3.2 materials before using this. + This is a gated model so you'll need to [request access](https://huggingface.co/meta-llama/Llama-3.2-1B) and + set `HF_ACCESS_TOKEN` to your HuggingFace access token in order to use it. ### Contributing diff --git a/docker-compose.yaml b/docker-compose.yaml index 44ed1107..451fc280 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -155,6 +155,9 @@ services: context: ${NV_INGEST_ROOT:-.} dockerfile: "./Dockerfile" target: runtime + args: + DOWNLOAD_LLAMA_TOKENIZER: ${DOWNLOAD_LLAMA_TOKENIZER:-False} + HF_ACCESS_TOKEN: ${HF_ACCESS_TOKEN:-hfaccesstoken} volumes: - ${DATASET_ROOT:-./data}:/workspace/data ports: diff --git a/docker/scripts/post_build_triggers.py b/docker/scripts/post_build_triggers.py new file mode 100644 index 00000000..676139ab --- /dev/null +++ b/docker/scripts/post_build_triggers.py @@ -0,0 +1,9 @@ +import os +from transformers import AutoTokenizer + +if os.getenv("DOWNLOAD_LLAMA_TOKENIZER") == "True": + tokenizer_path = "/workspace/models/tokenizer/" + os.makedirs(tokenizer_path, exist_ok=True) + + tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B", token=os.getenv("HF_ACCESS_TOKEN")) + tokenizer.save_pretrained(tokenizer_path)
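
Taken together, the final patch only fetches and saves the tokenizer at image build time; nothing in this series switches `text_splitter.py` over to the saved copy, so how the cached files are consumed at runtime is left open. Assuming the image was built with `DOWNLOAD_LLAMA_TOKENIZER=True` and a valid `HF_ACCESS_TOKEN` (as wired through the docker-compose build args above), one plausible way to use the pre-downloaded materials is to load them from the directory that `post_build_triggers.py` writes:

    # Hedged sketch: load the tokenizer saved by docker/scripts/post_build_triggers.py.
    # Assumes the image was built with DOWNLOAD_LLAMA_TOKENIZER=True so the directory exists.
    from transformers import AutoTokenizer

    tokenizer_model = AutoTokenizer.from_pretrained("/workspace/models/tokenizer/")

Because `AutoTokenizer.from_pretrained` accepts a local directory as well as a Hugging Face model id, passing `tokenizer="/workspace/models/tokenizer/"` in the split task properties should have the same effect, avoiding any network access or access token inside the running container.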