Fixed the --dataset_config_name argument for the RedPajama-Data-1T-Sample example

Vahe1994 · Nov 22, 2024 · a0e2c34
1 parent c4a022b
commit a0e2c34
Showing 1 changed file with 1 addition and 1 deletion.
diff --git a/README.md b/README.md
@@ -246,7 +246,7 @@ SEQLEN=4096
 DATASET=togethercomputer/RedPajama-Data-1T-Sample
 OUTPUT_PATH=./redpajama_tokenized_llama2
 
-CUDA_VISIBLE_DEVICES=0 HF_HOME=/mnt/LLM OMP_NUM_THREADS=16 torchrun --master-port 3456 --nproc-per-node=1 finetune.py --base_model $TARGET_MODEL --quantized_model ./doesnt_matter --dtype bfloat16 --block_type LlamaDecoderLayer --dataset_name=$DATASET --split train --dataset_config_name=plain_text --cache_dir=./cache_dir --trust_remote_code --model_seqlen=$SEQLEN --preprocessing_num_workers=64 --preprocessing_chunk_length 100000 --save_dataset_and_exit $OUTPUT_PATH
+CUDA_VISIBLE_DEVICES=0 HF_HOME=/mnt/LLM OMP_NUM_THREADS=16 torchrun --master-port 3456 --nproc-per-node=1 finetune.py --base_model $TARGET_MODEL --quantized_model ./doesnt_matter --dtype bfloat16 --block_type LlamaDecoderLayer --dataset_name=$DATASET --split train --dataset_config_name plain_text --cache_dir=./cache_dir --trust_remote_code --model_seqlen=$SEQLEN --preprocessing_num_workers=64 --preprocessing_chunk_length 100000 --save_dataset_and_exit $OUTPUT_PATH
 
 tar -cvf tokenized_data_llama2.tar $OUTPUT_PATH   # optionally pack for distribution
 ```