From 5f049023267056df2dd27b5f44b0b5dc95826544 Mon Sep 17 00:00:00 2001
From: mobicham <37179323+mobicham@users.noreply.github.com>
Date: Mon, 20 Nov 2023 17:51:48 +0100
Subject: [PATCH] Delete code/llama2_benchmark directory

---
 code/llama2_benchmark/eval_model.py           | 52 --------------
 .../llama2_benchmark/quant_llama2_awq_demo.py | 39 ----------
 .../quant_llama2_gptq_demo.py                 | 71 -------------------
 .../llama2_benchmark/quant_llama2_hqq_demo.py | 36 ----------
 4 files changed, 198 deletions(-)
 delete mode 100644 code/llama2_benchmark/eval_model.py
 delete mode 100644 code/llama2_benchmark/quant_llama2_awq_demo.py
 delete mode 100644 code/llama2_benchmark/quant_llama2_gptq_demo.py
 delete mode 100644 code/llama2_benchmark/quant_llama2_hqq_demo.py

diff --git a/code/llama2_benchmark/eval_model.py b/code/llama2_benchmark/eval_model.py
deleted file mode 100644
index 2f7829a..0000000
--- a/code/llama2_benchmark/eval_model.py
+++ /dev/null
@@ -1,52 +0,0 @@
-from datasets import load_dataset
-import torch, time
-import numpy as np
-from tqdm import tqdm
-
-import gc
-def cleanup():
-    torch.cuda.empty_cache()
-    gc.collect()
-
-#Adapted from https://huggingface.co/transformers/v4.2.2/perplexity.html
-def eval_wikitext2(model, tokenizer, max_length=1024, stride=512, verbose=True):
-    model.eval()
-    tokenizer.pad_token = tokenizer.eos_token
-    tokenizer.padding_side = "right"
-    tokenizer.add_eos_token = False
-
-    dataset = load_dataset('wikitext', 'wikitext-2-raw-v1', split='test')
-    encodings = tokenizer('\n\n'.join(dataset['text']), return_tensors='pt')
-
-    encodings['input_ids'] = encodings['input_ids'].to('cuda')
-
-    lls, t = [], []
-    for i in tqdm(range(0, encodings['input_ids'].size(1), stride), disable=not verbose):
-        begin_loc = max(i + stride - max_length, 0)
-        end_loc = min(i + stride, encodings['input_ids'].size(1))
-        trg_len = end_loc - i
-        input_ids = encodings['input_ids'][:,begin_loc:end_loc]
-        target_ids = input_ids.clone()
-        target_ids[:,:-trg_len] = -100 #ignore context
-
-        t1 = time.time()
-        with torch.no_grad():
-            log_likelihood = model(input_ids, labels=target_ids).loss * trg_len
-        torch.cuda.synchronize()
-        t2 = time.time()
-        t.append((t2-t1))
-        lls.append(log_likelihood)
-
-        del input_ids, target_ids
-
-    ppl = np.round(float(torch.exp(torch.stack(lls).sum() / end_loc)), 4)
-    pred_time = np.round(np.mean(t), 3)
-    if(verbose):
-        print('perplexity', ppl)
-        print('time', str(pred_time) + ' sec')
-
-    del encodings
-    cleanup()
-
-    return {'perplexity':ppl, 'prediction_time':pred_time}
-
diff --git a/code/llama2_benchmark/quant_llama2_awq_demo.py b/code/llama2_benchmark/quant_llama2_awq_demo.py
deleted file mode 100644
index e8370c4..0000000
--- a/code/llama2_benchmark/quant_llama2_awq_demo.py
+++ /dev/null
@@ -1,39 +0,0 @@
-import torch, transformers
-
-#Settings
-######################################################################################
-hf_auth = None #HuggingFace token
-cache_path = '' #cache directory to store data
-
-#Chose a model
-model_id = "meta-llama/Llama-2-7b-hf"
-#model_id = "meta-llama/Llama-2-13b-hf"
-#model_id = "meta-llama/Llama-2-70b-hf"
-
-#AWQ settings
-######################################################################################
-from awq import AutoAWQForCausalLM
-import gc, time
-
-# Load model
-tokenizer = transformers.AutoTokenizer.from_pretrained(model_id, use_auth_token=hf_auth)
-model = AutoAWQForCausalLM.from_pretrained(model_id, use_auth_token=hf_auth, cache_dir=cache_path, resume_download=True)
-
-#quant_config = {"w_bit": 4, "q_group_size": 128, "zero_point": True, 'version':'GEMM'}
-quant_config = {"w_bit": 4, "q_group_size": 64, "zero_point": True, 'version':'GEMM'}
-
-t1 = time.time()
-model.quantize(tokenizer, quant_config=quant_config)
-t2 = time.time()
-print('Took ' + str(t2-t1) + ' seconds to quantize the model with AWQ')
-
-model = model.cuda()
-torch.cuda.empty_cache()
-gc.collect()
-
-#Evaluate the quantized model
-######################################################################################
-from eval_model import eval_wikitext2
-
-eval_wikitext2(model, tokenizer, verbose=True)
-
diff --git a/code/llama2_benchmark/quant_llama2_gptq_demo.py b/code/llama2_benchmark/quant_llama2_gptq_demo.py
deleted file mode 100644
index f4fdd48..0000000
--- a/code/llama2_benchmark/quant_llama2_gptq_demo.py
+++ /dev/null
@@ -1,71 +0,0 @@
-import torch, transformers
-
-#Important: limit the number of threads otherwise the process will hang for a long time
-#num_threads=32;
-#OMP_NUM_THREADS=$num_threads OPENBLAS_NUM_THREADS=$num_threads MKL_NUM_THREADS=$num_threads VECLIB_MAXIMUM_THREADS=$num_threads NUMEXPR_NUM_THREADS=$num_threads CUDA_VISIBLE_DEVICES=0 ipython3
-
-#Settings
-######################################################################################
-hf_auth = None #HuggingFace token
-cache_path = '' #cache directory to store data
-
-#Chose a model
-model_id = "meta-llama/Llama-2-7b-hf"
-#model_id = "meta-llama/Llama-2-13b-hf"
-#model_id = "meta-llama/Llama-2-70b-hf"
-
-#GPTQ settings
-######################################################################################
-from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
-import logging, gc, time
-from tqdm import tqdm
-
-logging.basicConfig(format="%(asctime)s %(levelname)s [%(name)s] %(message)s", level=logging.INFO, datefmt="%Y-%m-%d %H:%M:%S")
-
-#Adapted from: https://towardsdatascience.com/4-bit-quantization-with-gptq-36b0f4f02c34
-def prepare_model(model, tokenizer, n_samples=1024, max_tokens=512, use_triton=True):
-    # Load data and tokenize examples
-    from datasets import load_dataset
-    import random
-    data = load_dataset("allenai/c4", data_files="en/c4-train.00001-of-01024.json.gz", split=f"train[:{n_samples}]", cache_dir=cache_path)
-    tokenized_data = torch.cat([tokenizer(data[i]['text'], return_tensors='pt').input_ids for i in tqdm(range(len(data)))], axis=-1) #~536K tokens
-
-    # Format tokenized examples
-    random.seed(1)
-    examples_ids = []
-    for _ in range(n_samples):
-        i = random.randint(0, tokenized_data.shape[1] - max_tokens - 1)
-        j = i + max_tokens
-        input_ids = tokenized_data[:, i:j]
-        attention_mask = torch.ones_like(input_ids)
-        examples_ids.append({'input_ids': input_ids, 'attention_mask': attention_mask})
-
-    print('Using ' + str(len(examples_ids)) + ' samples for calibration.')
-    model.quantize(examples_ids, batch_size=1, use_triton=use_triton)
-    model = model.cuda();
-    with torch.no_grad(): x = model(input_ids.to('cuda'));
-    del examples_ids, x
-    torch.cuda.empty_cache()
-    gc.collect()
-    return model
-
-#quantize_config = BaseQuantizeConfig(bits=8, group_size=128, damp_percent=0.01, desc_act=False); use_triton=True;
-#quantize_config = BaseQuantizeConfig(bits=4, group_size=128, damp_percent=0.01, desc_act=False); use_triton=True;
-quantize_config = BaseQuantizeConfig(bits=4, group_size=64, damp_percent=0.01, desc_act=False); use_triton=True;
-#quantize_config = BaseQuantizeConfig(bits=3, group_size=128, damp_percent=0.01, desc_act=False); use_triton=False;
-#quantize_config = BaseQuantizeConfig(bits=3, group_size=64, damp_percent=0.01, desc_act=False); use_triton=False;
-#quantize_config = BaseQuantizeConfig(bits=2, group_size=64, damp_percent=0.01, desc_act=False); use_triton=True;
-
-tokenizer = transformers.AutoTokenizer.from_pretrained(model_id, use_auth_token=hf_auth)
-model = AutoGPTQForCausalLM.from_pretrained(model_id, quantize_config, use_auth_token=hf_auth, cache_dir=cache_path)
-t1 = time.time()
-model = prepare_model(model, tokenizer, use_triton=use_triton)
-t2 = time.time()
-print('Took ' + str(t2-t1) + ' seconds to quantize the model with GPTQ')
-
-#Evaluate the quantized model
-######################################################################################
-from eval_model import eval_wikitext2
-
-eval_wikitext2(model, tokenizer, verbose=True)
-
diff --git a/code/llama2_benchmark/quant_llama2_hqq_demo.py b/code/llama2_benchmark/quant_llama2_hqq_demo.py
deleted file mode 100644
index daf6fff..0000000
--- a/code/llama2_benchmark/quant_llama2_hqq_demo.py
+++ /dev/null
@@ -1,36 +0,0 @@
-import torch, transformers
-
-#Settings
-######################################################################################
-hf_auth = None #HuggingFace token
-cache_path = '' #cache directory to store data
-
-#Chose a model
-model_id = "meta-llama/Llama-2-7b-hf"
-#model_id = "meta-llama/Llama-2-13b-hf"
-#model_id = "meta-llama/Llama-2-70b-hf"
-
-#Load model on the CPU
-######################################################################################
-model = transformers.AutoModelForCausalLM.from_pretrained(model_id, use_auth_token=hf_auth, cache_dir=cache_path)
-tokenizer = transformers.AutoTokenizer.from_pretrained(model_id, use_auth_token=hf_auth)
-
-#Quantize the model
-######################################################################################
-from hqq.core import *
-from hqq.llama2 import quantize_model
-
-#quant_config = hqq_base_quant_config(nbits=8, group_size=128)
-quant_config = hqq_base_quant_config(nbits=4, group_size=64)
-#quant_config = hqq_base_quant_config(nbits=3, group_size=64)
-#quant_config = hqq_base_quant_config(nbits=2, group_size=16)
-#quant_config = hqq_base_quant_config(nbits=2, group_size=16, quant_scale=True) #scale is quantized to 8-bit/g=128
-
-quantize_model(model, quant_config=quant_config)
-
-#Evaluate the quantized model
-######################################################################################
-from eval_model import eval_wikitext2
-
-eval_wikitext2(model, tokenizer, verbose=True)
-