From 5715d3983f12379e0c3c0dedd7f5047e778e8a9d Mon Sep 17 00:00:00 2001
From: Malinovskii Vladimir
Date: Fri, 22 Nov 2024 12:12:11 +0300
Subject: [PATCH 1/3] fixed tuning readme

---
 README.md | 24 +++++++++++++++++++++++-
 1 file changed, 23 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 18c8905..e301e0b 100644
--- a/README.md
+++ b/README.md
@@ -235,6 +235,28 @@ Main CLI arguments:
 
 There are additional hyperparameters available. Run `python main.py --help` for more details on command line arguments, including compression parameters.
 
+
+### Preparing fine-tuning dataset
+
+This script is used to pre-tokenize a subset of RedPajama data for future fine-tuning.
+
+```sh
+TARGET_MODEL=meta-llama/Llama-2-7b-hf  # used for tokenization
+SEQLEN=4096
+DATASET=togethercomputer/RedPajama-Data-1T-Sample
+OUTPUT_PATH=./redpajama_tokenized_llama2
+
+CUDA_VISIBLE_DEVICES=0 HF_HOME=/mnt/LLM OMP_NUM_THREADS=16 torchrun --master-port 3456 --nproc-per-node=1 finetune.py --base_model $TARGET_MODEL --quantized_model ./doesnt_matter --dtype bfloat16 --block_type LlamaDecoderLayer --dataset_name=$DATASET --split train --cache_dir=./cache_dir --trust_remote_code --model_seqlen=$SEQLEN --preprocessing_num_workers=64 --preprocessing_chunk_length 100000 --save_dataset_and_exit $OUTPUT_PATH
+
+tar -cvf tokenized_data_llama2.tar $OUTPUT_PATH  # optionally pack for distribution
+```
+
+The tokenized dataset is specific to the model family (or, more precisely, to its tokenizer). For instance, Llama-3 8B is compatible with Llama-3 70B, but not with Llama-2, which uses a different tokenizer.
+To tokenize the data for another model, set 1) --base_model, 2) --model_seqlen, and 3) the path passed to --save_dataset_and_exit.
+
+You can also set --preprocessing_num_workers to something hardware-appropriate. Note that setting --download_num_workers > 1 may cause download errors, possibly due to rate limiting. These and other parameters are explained in the script's --help.
+The job requires 150-200 GiB of disk space to store the dataset sample and the preprocessing cache. Both are stored in ./cache_dir and can be deleted afterwards.
+
 ### Finetuning
 
 **Note** to reproduce results with old finetuning (before Aug 21), use commit [559a366](https://github.com/Vahe1994/AQLM/commit/559a36681398d7189297fccf3b1e59e8e030e942).
@@ -253,7 +275,7 @@ torchrun --nproc-per-node=$NUM_GPUS finetune.py \
     --load_dtype bfloat16 \
     --amp_dtype bfloat16 \
     --code_dtype uint16 \
-    --dataset_name=pajama \
+    --dataset_name=$TOKENIZED_DATASET_PATH \
     --split none \
     --seed 42 \
     --preprocessing_chunk_length 100000 \

From c4a022bd1c4739e6860c1fba42983371492cf441 Mon Sep 17 00:00:00 2001
From: vahe1994
Date: Fri, 22 Nov 2024 20:40:12 +0400
Subject: [PATCH 2/3] added dataset_config_name for Red_Pajama_sample

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index e301e0b..95ccefb 100644
--- a/README.md
+++ b/README.md
@@ -246,7 +246,7 @@ SEQLEN=4096
 DATASET=togethercomputer/RedPajama-Data-1T-Sample
 OUTPUT_PATH=./redpajama_tokenized_llama2
 
-CUDA_VISIBLE_DEVICES=0 HF_HOME=/mnt/LLM OMP_NUM_THREADS=16 torchrun --master-port 3456 --nproc-per-node=1 finetune.py --base_model $TARGET_MODEL --quantized_model ./doesnt_matter --dtype bfloat16 --block_type LlamaDecoderLayer --dataset_name=$DATASET --split train --cache_dir=./cache_dir --trust_remote_code --model_seqlen=$SEQLEN --preprocessing_num_workers=64 --preprocessing_chunk_length 100000 --save_dataset_and_exit $OUTPUT_PATH
+CUDA_VISIBLE_DEVICES=0 HF_HOME=/mnt/LLM OMP_NUM_THREADS=16 torchrun --master-port 3456 --nproc-per-node=1 finetune.py --base_model $TARGET_MODEL --quantized_model ./doesnt_matter --dtype bfloat16 --block_type LlamaDecoderLayer --dataset_name=$DATASET --split train --dataset_config_name=plain_text --cache_dir=./cache_dir --trust_remote_code --model_seqlen=$SEQLEN --preprocessing_num_workers=64 --preprocessing_chunk_length 100000 --save_dataset_and_exit $OUTPUT_PATH
 
 tar -cvf tokenized_data_llama2.tar $OUTPUT_PATH  # optionally pack for distribution
 ```

From a0e2c347f42f9c6c9dbd1b7d8a852ae5e333a9d2 Mon Sep 17 00:00:00 2001
From: vahe1994
Date: Fri, 22 Nov 2024 20:40:51 +0400
Subject: [PATCH 3/3] fixed dataset_config_name for Red_Pajama_sample

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 95ccefb..b049c57 100644
--- a/README.md
+++ b/README.md
@@ -246,7 +246,7 @@ SEQLEN=4096
 DATASET=togethercomputer/RedPajama-Data-1T-Sample
 OUTPUT_PATH=./redpajama_tokenized_llama2
 
-CUDA_VISIBLE_DEVICES=0 HF_HOME=/mnt/LLM OMP_NUM_THREADS=16 torchrun --master-port 3456 --nproc-per-node=1 finetune.py --base_model $TARGET_MODEL --quantized_model ./doesnt_matter --dtype bfloat16 --block_type LlamaDecoderLayer --dataset_name=$DATASET --split train --dataset_config_name=plain_text --cache_dir=./cache_dir --trust_remote_code --model_seqlen=$SEQLEN --preprocessing_num_workers=64 --preprocessing_chunk_length 100000 --save_dataset_and_exit $OUTPUT_PATH
+CUDA_VISIBLE_DEVICES=0 HF_HOME=/mnt/LLM OMP_NUM_THREADS=16 torchrun --master-port 3456 --nproc-per-node=1 finetune.py --base_model $TARGET_MODEL --quantized_model ./doesnt_matter --dtype bfloat16 --block_type LlamaDecoderLayer --dataset_name=$DATASET --split train --dataset_config_name plain_text --cache_dir=./cache_dir --trust_remote_code --model_seqlen=$SEQLEN --preprocessing_num_workers=64 --preprocessing_chunk_length 100000 --save_dataset_and_exit $OUTPUT_PATH
 
 tar -cvf tokenized_data_llama2.tar $OUTPUT_PATH  # optionally pack for distribution
 ```
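
For readers adapting the tokenization command above to another model family, a minimal sketch follows. The Meta-Llama-3-8B model id, the 8192 sequence length, and the output path are illustrative assumptions; every other flag is copied verbatim from the command in the patch above.

```sh
# Illustrative adaptation for a Llama-3 tokenizer (assumed values; adjust to your setup).
TARGET_MODEL=meta-llama/Meta-Llama-3-8B   # assumption: the HF model id whose tokenizer you want
SEQLEN=8192                               # assumption: the sequence length you plan to fine-tune at
DATASET=togethercomputer/RedPajama-Data-1T-Sample
OUTPUT_PATH=./redpajama_tokenized_llama3  # assumption: one output directory per tokenizer

CUDA_VISIBLE_DEVICES=0 HF_HOME=/mnt/LLM OMP_NUM_THREADS=16 torchrun --master-port 3456 --nproc-per-node=1 finetune.py \
    --base_model $TARGET_MODEL --quantized_model ./doesnt_matter --dtype bfloat16 --block_type LlamaDecoderLayer \
    --dataset_name=$DATASET --split train --dataset_config_name plain_text --cache_dir=./cache_dir --trust_remote_code \
    --model_seqlen=$SEQLEN --preprocessing_num_workers=64 --preprocessing_chunk_length 100000 \
    --save_dataset_and_exit $OUTPUT_PATH
```

Keeping a separate output directory per tokenizer avoids mixing incompatible token ids, per the compatibility note in the README text above.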