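Minor CrossFit improvements

- classifiers/base.py: stop pre-creating a zero-filled probability column;
  when the caller supplies `prob_col`, pass `keep_cols=[prob_col]` to
  `op.Labeler` instead, and otherwise fall back to the "_prob" column.
- classifiers/prompt_task_complexity.py: drop the temporary "sliced_text"
  column and pass `max_chars` to `op.Tokenizer` on the original text field.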

Signed-off-by: Sarah Yurick <[email protected]>
NVIDIA · Jan 16, 2025 · fde7e6d
1 parent 7cfda44
commit fde7e6d
Show file tree

Hide file tree

Showing 2 changed files with 13 additions and 6 deletions.
diff --git a/nemo_curator/classifiers/base.py b/nemo_curator/classifiers/base.py
@@ -121,10 +121,13 @@ def _run_classifier_helper(
     prob_col: str = None,
 ) -> "dask_cudf.DataFrame":
 
-    if prob_col:
-        df[prob_col] = 0
-    else:
+    if prob_col is None:
         prob_col = "_prob"
+        labeler = op.Labeler(labels, cols=[prob_col], suffix=label_col)
+    else:
+        labeler = op.Labeler(
+            labels, cols=[prob_col], keep_cols=[prob_col], suffix=label_col
+        )
 
     columns_to_keep_list = df.columns.to_list()
 
@@ -138,7 +141,7 @@ def _run_classifier_helper(
             batch_size=batch_size,
             pred_output_col=prob_col,
         ),
-        op.Labeler(labels, cols=[prob_col], suffix=label_col),
+        labeler,
         repartition=df.npartitions,
         keep_cols=columns_to_keep_list,
     )

diff --git a/nemo_curator/classifiers/prompt_task_complexity.py b/nemo_curator/classifiers/prompt_task_complexity.py
@@ -336,11 +336,15 @@ def _run_classifier(self, dataset: DocumentDataset) -> DocumentDataset:
 
         df = dataset.df
         columns_to_keep_list = df.columns.to_list()
-        df["sliced_text"] = df[self.text_field].str.slice(0, self.max_chars)
 
         model = self.model
         classifier_pipe = op.Sequential(
-            op.Tokenizer(model, cols=["sliced_text"], tokenizer_type="default"),
+            op.Tokenizer(
+                model,
+                cols=[self.text_field],
+                tokenizer_type="default",
+                max_chars=self.max_chars,
+            ),
             op.Predictor(
                 model,
                 sorted_data_loader=True,