
Commit

Updated training script to use sequential dataset pre-processing instead of parallel to avoid MD5 hash checks
abhijeet-dhumal authored and openshift-merge-bot[bot] committed Nov 29, 2024
1 parent 9bd6959 commit 53c2eb4
Showing 1 changed file with 1 addition and 2 deletions.
3 changes: 1 addition & 2 deletions tests/kfto/core/hf_llm_training.py
@@ -85,11 +85,10 @@ def load_and_preprocess_data(dataset_file, transformer_type, tokenizer):
 
     logger.info("Tokenize dataset")
     # TODO (andreyvelich): Discuss how user should set the tokenizer function.
-    num_cores = os.cpu_count()
     dataset = dataset.map(
         lambda x: tokenizer(x["output"], padding=True, truncation=True, max_length=128),
         batched=True,
-        num_proc=num_cores
+        keep_in_memory=True
     )
 
     # Check if dataset contains `train` key. Otherwise, load full dataset to train_data.
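For context on the change: omitting `num_proc` makes `datasets.Dataset.map` run in a single process, and `keep_in_memory=True` keeps the processed table in RAM instead of writing it to the on-disk cache, which is what triggers the cache-fingerprint (hash) bookkeeping being avoided here. A stdlib-only sketch of what sequential, batched mapping does conceptually (the `map_batched` and `toy_tokenize` helpers and the batch size are illustrative stand-ins, not code from this repo):

```python
def map_batched(rows, fn, batch_size=2):
    """Apply fn to successive batches in one process, sequentially,
    analogous to Dataset.map(batched=True) with no num_proc set."""
    out = []
    for i in range(0, len(rows), batch_size):
        out.extend(fn(rows[i:i + batch_size]))
    return out

def toy_tokenize(batch, max_length=4):
    # Illustrative stand-in for the HF tokenizer call: whitespace-split,
    # truncate to max_length, then pad with "<pad>" tokens.
    result = []
    for text in batch:
        toks = text.split()[:max_length]
        toks += ["<pad>"] * (max_length - len(toks))
        result.append(toks)
    return result

rows = ["a b c d e", "x y"]
tokens = map_batched(rows, toy_tokenize)
# tokens holds one truncated/padded token list per input row
```

Because everything happens in the parent process and stays in memory, there is no worker pool to spin up and no cached result file whose hash would need to be computed and compared on subsequent runs.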
