From 5f049023267056df2dd27b5f44b0b5dc95826544 Mon Sep 17 00:00:00 2001
From: mobicham <37179323+mobicham@users.noreply.github.com>
Date: Mon, 20 Nov 2023 17:51:48 +0100
Subject: [PATCH] Delete code/llama2_benchmark directory

---
 code/llama2_benchmark/eval_model.py           | 52 --------------
 .../llama2_benchmark/quant_llama2_awq_demo.py | 39 ----------
 .../quant_llama2_gptq_demo.py                 | 71 -------------------
 .../llama2_benchmark/quant_llama2_hqq_demo.py | 36 ----------
 4 files changed, 198 deletions(-)
 delete mode 100644 code/llama2_benchmark/eval_model.py
 delete mode 100644 code/llama2_benchmark/quant_llama2_awq_demo.py
 delete mode 100644 code/llama2_benchmark/quant_llama2_gptq_demo.py
 delete mode 100644 code/llama2_benchmark/quant_llama2_hqq_demo.py

diff --git a/code/llama2_benchmark/eval_model.py b/code/llama2_benchmark/eval_model.py
deleted file mode 100644
index 2f7829a..0000000
--- a/code/llama2_benchmark/eval_model.py
+++ /dev/null
@@ -1,52 +0,0 @@
-from datasets import load_dataset
-import torch, time
-import numpy as np
-from tqdm import tqdm
-
-import gc
-def cleanup():
-    torch.cuda.empty_cache()
-    gc.collect()
-
-#Adapted from https://huggingface.co/transformers/v4.2.2/perplexity.html
-def eval_wikitext2(model, tokenizer, max_length=1024, stride=512, verbose=True):
-    model.eval()
-    tokenizer.pad_token = tokenizer.eos_token
-    tokenizer.padding_side = "right"
-    tokenizer.add_eos_token = False
-
-    dataset = load_dataset('wikitext', 'wikitext-2-raw-v1', split='test')
-    encodings = tokenizer('\n\n'.join(dataset['text']), return_tensors='pt')
-
-    encodings['input_ids'] = encodings['input_ids'].to('cuda')
-
-    lls, t = [], []
-    for i in tqdm(range(0, encodings['input_ids'].size(1), stride), disable=not verbose):
-        begin_loc = max(i + stride - max_length, 0)
-        end_loc = min(i + stride, encodings['input_ids'].size(1))
-        trg_len = end_loc - i
-        input_ids = encodings['input_ids'][:,begin_loc:end_loc]
-        target_ids = input_ids.clone()
-        target_ids[:,:-trg_len] = -100 #ignore context
-
-        t1 = time.time()
-        with torch.no_grad():
-            log_likelihood = model(input_ids, labels=target_ids).loss * trg_len
-        torch.cuda.synchronize()
-        t2 = time.time()
-        t.append((t2-t1))
-        lls.append(log_likelihood)
-
-        del input_ids, target_ids
-
-    ppl = np.round(float(torch.exp(torch.stack(lls).sum() / end_loc)), 4)
-    pred_time = np.round(np.mean(t), 3)
-    if(verbose):
-        print('perplexity', ppl)
-        print('time', str(pred_time) + ' sec')
-
-    del encodings
-    cleanup()
-
-    return {'perplexity':ppl, 'prediction_time':pred_time}
-
diff --git a/code/llama2_benchmark/quant_llama2_awq_demo.py b/code/llama2_benchmark/quant_llama2_awq_demo.py
deleted file mode 100644
index e8370c4..0000000
--- a/code/llama2_benchmark/quant_llama2_awq_demo.py
+++ /dev/null
@@ -1,39 +0,0 @@
-import torch, transformers
-
-#Settings
-######################################################################################
-hf_auth = None #HuggingFace token
-cache_path = '' #cache directory to store data
-
-#Chose a model
-model_id = "meta-llama/Llama-2-7b-hf"
-#model_id = "meta-llama/Llama-2-13b-hf"
-#model_id = "meta-llama/Llama-2-70b-hf"
-
-#AWQ settings
-######################################################################################
-from awq import AutoAWQForCausalLM
-import gc, time
-
-# Load model
-tokenizer = transformers.AutoTokenizer.from_pretrained(model_id, use_auth_token=hf_auth)
-model = AutoAWQForCausalLM.from_pretrained(model_id, use_auth_token=hf_auth, cache_dir=cache_path, resume_download=True)
-
-#quant_config = {"w_bit": 4, "q_group_size": 128, "zero_point": True, 'version':'GEMM'}
-quant_config = {"w_bit": 4, "q_group_size": 64, "zero_point": True, 'version':'GEMM'}
-
-t1 = time.time()
-model.quantize(tokenizer, quant_config=quant_config)
-t2 = time.time()
-print('Took ' + str(t2-t1) + ' seconds to quantize the model with AWQ')
-
-model = model.cuda()
-torch.cuda.empty_cache()
-gc.collect()
-
-#Evaluate the quantized model
-######################################################################################
-from eval_model import eval_wikitext2
-
-eval_wikitext2(model, tokenizer, verbose=True)
-
diff --git a/code/llama2_benchmark/quant_llama2_gptq_demo.py b/code/llama2_benchmark/quant_llama2_gptq_demo.py
deleted file mode 100644
index f4fdd48..0000000
--- a/code/llama2_benchmark/quant_llama2_gptq_demo.py
+++ /dev/null
@@ -1,71 +0,0 @@
-import torch, transformers
-
-#Important: limit the number of threads otherwise the process will hang for a long time
-#num_threads=32;
-#OMP_NUM_THREADS=$num_threads OPENBLAS_NUM_THREADS=$num_threads MKL_NUM_THREADS=$num_threads VECLIB_MAXIMUM_THREADS=$num_threads NUMEXPR_NUM_THREADS=$num_threads CUDA_VISIBLE_DEVICES=0 ipython3
-
-#Settings
-######################################################################################
-hf_auth = None #HuggingFace token
-cache_path = '' #cache directory to store data
-
-#Chose a model
-model_id = "meta-llama/Llama-2-7b-hf"
-#model_id = "meta-llama/Llama-2-13b-hf"
-#model_id = "meta-llama/Llama-2-70b-hf"
-
-#GPTQ settings
-######################################################################################
-from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
-import logging, gc, time
-from tqdm import tqdm
-
-logging.basicConfig(format="%(asctime)s %(levelname)s [%(name)s] %(message)s", level=logging.INFO, datefmt="%Y-%m-%d %H:%M:%S")
-
-#Adapted from: https://towardsdatascience.com/4-bit-quantization-with-gptq-36b0f4f02c34
-def prepare_model(model, tokenizer, n_samples=1024, max_tokens=512, use_triton=True):
-    # Load data and tokenize examples
-    from datasets import load_dataset
-    import random
-    data = load_dataset("allenai/c4", data_files="en/c4-train.00001-of-01024.json.gz", split=f"train[:{n_samples}]", cache_dir=cache_path)
-    tokenized_data = torch.cat([tokenizer(data[i]['text'], return_tensors='pt').input_ids for i in tqdm(range(len(data)))], axis=-1) #~536K tokens
-
-    # Format tokenized examples
-    random.seed(1)
-    examples_ids = []
-    for _ in range(n_samples):
-        i = random.randint(0, tokenized_data.shape[1] - max_tokens - 1)
-        j = i + max_tokens
-        input_ids = tokenized_data[:, i:j]
-        attention_mask = torch.ones_like(input_ids)
-        examples_ids.append({'input_ids': input_ids, 'attention_mask': attention_mask})
-
-    print('Using ' + str(len(examples_ids)) + ' samples for calibration.')
-    model.quantize(examples_ids, batch_size=1, use_triton=use_triton)
-    model = model.cuda();
-    with torch.no_grad(): x = model(input_ids.to('cuda'));
-    del examples_ids, x
-    torch.cuda.empty_cache()
-    gc.collect()
-    return model
-
-#quantize_config = BaseQuantizeConfig(bits=8, group_size=128, damp_percent=0.01, desc_act=False); use_triton=True;
-#quantize_config = BaseQuantizeConfig(bits=4, group_size=128, damp_percent=0.01, desc_act=False); use_triton=True;
-quantize_config = BaseQuantizeConfig(bits=4, group_size=64, damp_percent=0.01, desc_act=False); use_triton=True;
-#quantize_config = BaseQuantizeConfig(bits=3, group_size=128, damp_percent=0.01, desc_act=False); use_triton=False;
-#quantize_config = BaseQuantizeConfig(bits=3, group_size=64, damp_percent=0.01, desc_act=False); use_triton=False;
-#quantize_config = BaseQuantizeConfig(bits=2, group_size=64, damp_percent=0.01, desc_act=False); use_triton=True;
-
-tokenizer = transformers.AutoTokenizer.from_pretrained(model_id, use_auth_token=hf_auth)
-model = AutoGPTQForCausalLM.from_pretrained(model_id, quantize_config, use_auth_token=hf_auth, cache_dir=cache_path)
-t1 = time.time()
-model = prepare_model(model, tokenizer, use_triton=use_triton)
-t2 = time.time()
-print('Took ' + str(t2-t1) + ' seconds to quantize the model with GPTQ')
-
-#Evaluate the quantized model
-######################################################################################
-from eval_model import eval_wikitext2
-
-eval_wikitext2(model, tokenizer, verbose=True)
-
diff --git a/code/llama2_benchmark/quant_llama2_hqq_demo.py b/code/llama2_benchmark/quant_llama2_hqq_demo.py
deleted file mode 100644
index daf6fff..0000000
--- a/code/llama2_benchmark/quant_llama2_hqq_demo.py
+++ /dev/null
@@ -1,36 +0,0 @@
-import torch, transformers
-
-#Settings
-######################################################################################
-hf_auth = None #HuggingFace token
-cache_path = '' #cache directory to store data
-
-#Chose a model
-model_id = "meta-llama/Llama-2-7b-hf"
-#model_id = "meta-llama/Llama-2-13b-hf"
-#model_id = "meta-llama/Llama-2-70b-hf"
-
-#Load model on the CPU
-######################################################################################
-model = transformers.AutoModelForCausalLM.from_pretrained(model_id, use_auth_token=hf_auth, cache_dir=cache_path)
-tokenizer = transformers.AutoTokenizer.from_pretrained(model_id, use_auth_token=hf_auth)
-
-#Quantize the model
-######################################################################################
-from hqq.core import *
-from hqq.llama2 import quantize_model
-
-#quant_config = hqq_base_quant_config(nbits=8, group_size=128)
-quant_config = hqq_base_quant_config(nbits=4, group_size=64)
-#quant_config = hqq_base_quant_config(nbits=3, group_size=64)
-#quant_config = hqq_base_quant_config(nbits=2, group_size=16)
-#quant_config = hqq_base_quant_config(nbits=2, group_size=16, quant_scale=True) #scale is quantized to 8-bit/g=128
-
-quantize_model(model, quant_config=quant_config)
-
-#Evaluate the quantized model
-######################################################################################
-from eval_model import eval_wikitext2
-
-eval_wikitext2(model, tokenizer, verbose=True)
-