not run #135

Open
werruww opened this issue Dec 13, 2024 · 6 comments

Comments

@werruww

werruww commented Dec 13, 2024

Colab T4

import torch
from hqq.engine.hf import HQQModelForCausalLM, AutoTokenizer

#Load the model
model_id = 'mobiuslabsgmbh/Llama-2-7b-chat-hf_1bitgs8_hqq'
model = HQQModelForCausalLM.from_quantized(model_id, adapter='adapter_v0.1.lora')
tokenizer = AutoTokenizer.from_pretrained(model_id)

#Setup Inference Mode
tokenizer.add_bos_token = False
tokenizer.add_eos_token = False
if not tokenizer.pad_token: tokenizer.add_special_tokens({'pad_token': '[PAD]'})
model.config.use_cache = True
model.eval();

#Optional: torch compile for faster inference

model = torch.compile(model)

#Streaming Inference
import torch, transformers
from threading import Thread

def chat_processor(chat, max_new_tokens=100, do_sample=True, device='cuda'):
    tokenizer.use_default_system_prompt = False
    streamer = transformers.TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    generate_params = dict(
        tokenizer("<s> [INST] " + chat + " [/INST] ", return_tensors="pt").to(device),
        streamer=streamer,
        max_new_tokens=max_new_tokens,
        do_sample=do_sample,
        pad_token_id=tokenizer.pad_token_id,
        top_p=0.90 if do_sample else None,
        top_k=50 if do_sample else None,
        temperature=0.6 if do_sample else None,
        num_beams=1,
        repetition_penalty=1.2,
    )

    t = Thread(target=model.generate, kwargs=generate_params)
    t.start()

    print("User: ", chat)
    print("Assistant: ")
    outputs = ""
    for text in streamer:
        outputs += text
        print(text, end="", flush=True)

    torch.cuda.empty_cache()

    return outputs

outputs = chat_processor("What is the solution to x^2 - 1 = 0", max_new_tokens=1000).to(cuda)

outputs = chat_processor("What is the solution to x^2 - 1 = 0", max_new_tokens=1000).to(cuda)
Exception in thread Thread-17 (generate):
Traceback (most recent call last):
File "/usr/lib/python3.10/threading.py", line 1016, in _bootstrap_inner
self.run()
File "/usr/lib/python3.10/threading.py", line 953, in run
self._target(*self._args, **self._kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/utils/_contextlib.py", line 116, in decorate_context
return func(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/transformers/generation/utils.py", line 2215, in generate
result = self._sample(
File "/usr/local/lib/python3.10/dist-packages/transformers/generation/utils.py", line 3206, in _sample
outputs = self(**model_inputs, return_dict=True)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1747, in _call_impl
return forward_call(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/transformers/models/llama/modeling_llama.py", line 1190, in forward
outputs = self.model(
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1747, in _call_impl
return forward_call(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/transformers/models/llama/modeling_llama.py", line 921, in forward
position_embeddings = self.rotary_emb(hidden_states, position_ids)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1747, in _call_impl
return forward_call(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/utils/_contextlib.py", line 116, in decorate_context
return func(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/transformers/models/llama/modeling_llama.py", line 158, in forward
freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument mat2 in method wrapper_CUDA_bmm)
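
The RuntimeError comes from the rotary-embedding matmul: the tokenized prompt is moved to 'cuda' inside chat_processor, but part of the model state evidently stayed on the CPU, since from_quantized was called without a target device. (Separately, the trailing .to(cuda) on the chat_processor call can never work: cuda is not defined and the function returns a plain string.) A minimal sketch of keeping everything on one device, assuming the device and compute_dtype keywords of from_quantized behave as they are used in the next comment below:

import torch
from hqq.engine.hf import HQQModelForCausalLM, AutoTokenizer

model_id = 'mobiuslabsgmbh/Llama-2-7b-chat-hf_1bitgs8_hqq'
# Load directly onto the GPU instead of leaving the quantized weights on the CPU
model = HQQModelForCausalLM.from_quantized(model_id, adapter='adapter_v0.1.lora',
                                           compute_dtype=torch.float16, device='cuda')
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Tokenize onto the device the model's parameters actually live on,
# rather than a hard-coded 'cuda' string
inputs = tokenizer("<s> [INST] test [/INST] ", return_tensors="pt")
inputs = inputs.to(next(model.parameters()).device)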

@werruww
Author

werruww commented Dec 13, 2024

from hqq.engine.hf import HQQModelForCausalLM, AutoTokenizer
import torch
import transformers # Make sure transformers is imported
from threading import Thread # Make sure Thread is imported

# Load the model

model_id = 'mobiuslabsgmbh/Llama-2-7b-chat-hf_1bitgs8_hqq'
model = HQQModelForCausalLM.from_quantized(model_id, adapter='adapter_v0.1.lora', compute_dtype=torch.float16, device="cuda")
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Define the device before using it

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Move the model to the selected device

model.to(device)

# Setup Inference Mode

tokenizer.add_bos_token = False
tokenizer.add_eos_token = False
if not tokenizer.pad_token:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
model.config.use_cache = True
model.eval()

# Optional: torch compile for faster inference

model = torch.compile(model)  # You might want to enable this for potential speedup

def chat_processor(chat, max_new_tokens=100, do_sample=True, device='cuda'):
    tokenizer.use_default_system_prompt = False
    streamer = transformers.TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    # Get the input tensor
    inputs = tokenizer("<s> [INST] " + chat + " [/INST] ", return_tensors="pt").to(device)

    # Access the shape attribute of the input tensor
    batch_size = inputs["input_ids"].shape[0]

    generate_params = dict(
        inputs=inputs,  # Pass the input tensor directly
        streamer=streamer,
        max_new_tokens=max_new_tokens,
        do_sample=do_sample,
        pad_token_id=tokenizer.pad_token_id,
        top_p=0.90 if do_sample else None,
        top_k=50 if do_sample else None,
        temperature=0.6 if do_sample else None,
        num_beams=1,
        repetition_penalty=1.2,
    )

    t = Thread(target=model.generate, kwargs=generate_params)
    t.start()

    print("User: ", chat)
    print("Assistant: ")
    outputs = ""
    for text in streamer:
        outputs += text
        print(text, end="", flush=True)

    torch.cuda.empty_cache()

    return outputs

# Now you can call the function:

results = chat_processor("What is the solution to x^2 - 1 = 0", max_new_tokens=100, device=device)
print(results)

/usr/local/lib/python3.10/dist-packages/huggingface_hub/utils/_auth.py:94: UserWarning:
The secret HF_TOKEN does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
warnings.warn(
Fetching 9 files: 100%
 9/9 [00:00<00:00, 169.05it/s]
/usr/local/lib/python3.10/dist-packages/hqq/models/base.py:251: FutureWarning: You are using torch.load with weights_only=False (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for weights_only will be flipped to True. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via torch.serialization.add_safe_globals. We recommend you start setting weights_only=True for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.
return torch.load(cls.get_weight_file(save_dir), map_location=map_location)
100%|██████████| 32/32 [00:00<00:00, 764.78it/s]
100%|██████████| 32/32 [00:01<00:00, 20.53it/s]
/usr/local/lib/python3.10/dist-packages/hqq/core/peft.py:513: FutureWarning: You are using torch.load with weights_only=False (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for weights_only will be flipped to True. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via torch.serialization.add_safe_globals. We recommend you start setting weights_only=True for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.
lora_data = torch.load(filename, map_location="cpu")
100%|██████████| 32/32 [00:00<00:00, 32.98it/s]
100%|██████████| 32/32 [00:00<00:00, 182.07it/s]
100%|██████████| 32/32 [00:00<00:00, 1747.24it/s]
Exception in thread Thread-12 (generate):
Traceback (most recent call last):
File "/usr/local/lib/python3.10/dist-packages/transformers/tokenization_utils_base.py", line 283, in getattr
return self.data[item]
KeyError: 'shape'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
File "/usr/lib/python3.10/threading.py", line 1016, in _bootstrap_inner
self.run()
File "/usr/lib/python3.10/threading.py", line 953, in run
self._target(*self._args, **self._kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/utils/_contextlib.py", line 116, in decorate_context
return func(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/transformers/generation/utils.py", line 1990, in generate
batch_size = inputs_tensor.shape[0]
File "/usr/local/lib/python3.10/dist-packages/transformers/tokenization_utils_base.py", line 285, in getattr
raise AttributeError
AttributeError
User: What is the solution to x^2 - 1 = 0
Assistant:
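
This time the generation thread dies before the device question even comes up: the whole BatchEncoding was handed to generate() through the inputs= keyword, so inputs_tensor.shape[0] is evaluated on the BatchEncoding itself, and its __getattr__ turns the missing 'shape' key into the AttributeError shown above. A minimal sketch of the two usual ways to pass the tokenizer output to generate, assuming this runs inside chat_processor with the same tokenizer, streamer, device, and max_new_tokens as in the snippet above:

encoded = tokenizer("<s> [INST] " + chat + " [/INST] ", return_tensors="pt").to(device)

# Either unpack the encoding so generate() receives input_ids / attention_mask as keywords...
generate_params = dict(
    **encoded,
    streamer=streamer,
    max_new_tokens=max_new_tokens,
)

# ...or pass only the token ids as the inputs tensor:
# generate_params = dict(
#     inputs=encoded["input_ids"],
#     attention_mask=encoded.get("attention_mask"),
#     streamer=streamer,
#     max_new_tokens=max_new_tokens,
# )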

@werruww
Author

werruww commented Dec 13, 2024

Colab T4

@werruww
Author

werruww commented Dec 13, 2024

from hqq.engine.hf import HQQModelForCausalLM, AutoTokenizer
from transformers import AutoModelForCausalLM
import transformers  # needed below for TextIteratorStreamer
import torch

# Device configuration

device = 'cuda' if torch.cuda.is_available() else 'cpu'
torch_dtype = torch.float16

# Load the quantized model

quantized_model_id = 'mobiuslabsgmbh/Llama-2-7b-chat-hf_1bitgs8_hqq'
one_bit_model = HQQModelForCausalLM.from_quantized(
    quantized_model_id,
    adapter='adapter_v0.1.lora'
)
one_bit_model = one_bit_model.to(device)
one_bit_model.config.use_cache = True
one_bit_model.eval()

# Load tokenizer

tokenizer = AutoTokenizer.from_pretrained(quantized_model_id)
tokenizer.add_bos_token = False
tokenizer.add_eos_token = False
if not tokenizer.pad_token:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
tokenizer.padding_side = 'left'

def debug_tensor_devices(inputs):
    """Helper function to print device information for all tensors"""
    for key, value in inputs.items():
        if torch.is_tensor(value):
            print(f"Tensor {key} is on device: {value.device}")

def chat_processor(chat, current_model, current_tokenizer, max_new_tokens=100, do_sample=True, device=device):
    print(f"\nStarting chat_processor with device: {device}")
    current_tokenizer.use_default_system_prompt = False

    # Create inputs
    input_text = "<s> [INST] " + chat + " [/INST] "

    # Tokenize and explicitly move to device
    inputs = current_tokenizer(
        input_text,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=2048
    )

    # Move each tensor to the correct device
    inputs = {k: v.to(device) for k, v in inputs.items()}

    print("\nInput tensor devices:")
    debug_tensor_devices(inputs)
    print(f"Model device: {next(current_model.parameters()).device}")

    # Create streamer
    streamer = transformers.TextIteratorStreamer(
        current_tokenizer,
        timeout=10.0,
        skip_prompt=True,
        skip_special_tokens=True
    )

    # Prepare generation parameters
    generate_params = {
        'input_ids': inputs['input_ids'],
        'attention_mask': inputs['attention_mask'] if 'attention_mask' in inputs else None,
        'streamer': streamer,
        'max_new_tokens': max_new_tokens,
        'do_sample': do_sample,
        'pad_token_id': current_tokenizer.pad_token_id,
        'top_p': 0.90 if do_sample else None,
        'top_k': 50 if do_sample else None,
        'temperature': 0.6 if do_sample else None,
        'num_beams': 1,
        'repetition_penalty': 1.2,
    }

    # Remove None values from generate_params
    generate_params = {k: v for k, v in generate_params.items() if v is not None}

    print("\nGeneration parameters devices:")
    debug_tensor_devices(generate_params)

    # Start generation in a separate thread
    from threading import Thread
    t = Thread(target=current_model.generate, kwargs=generate_params)
    t.start()

    print("\nUser: ", chat)
    print("Assistant: ")
    outputs = ""
    for text in streamer:
        outputs += text
        print(text, end="", flush=True)

    # Clean up
    torch.cuda.empty_cache()
    return outputs

# Test with explicit error handling

question = "What is 2 + 2?"
try:
    with torch.no_grad():
        outputs = chat_processor(
            question,
            one_bit_model,
            tokenizer,
            max_new_tokens=256,
            do_sample=False
        )
except Exception as e:
    print(f"\nError during processing: {str(e)}")
    import traceback
    print("\nFull traceback:")
    print(traceback.format_exc())

Fetching 9 files: 100%
 9/9 [00:00<00:00, 325.83it/s]
100%|██████████| 32/32 [00:00<00:00, 3756.34it/s]
100%|██████████| 32/32 [00:01<00:00, 29.25it/s]
100%|██████████| 32/32 [00:00<00:00, 39.99it/s]
100%|██████████| 32/32 [00:00<00:00, 465.35it/s]
100%|██████████| 32/32 [00:00<00:00, 3185.12it/s]
Exception in thread Thread-16 (generate):
Traceback (most recent call last):
File "/usr/lib/python3.10/threading.py", line 1016, in _bootstrap_inner
self.run()
File "/usr/lib/python3.10/threading.py", line 953, in run
self._target(*self._args, **self._kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/utils/_contextlib.py", line 116, in decorate_context
return func(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/transformers/generation/utils.py", line 2215, in generate
result = self._sample(
File "/usr/local/lib/python3.10/dist-packages/transformers/generation/utils.py", line 3206, in _sample
outputs = self(**model_inputs, return_dict=True)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1747, in _call_impl
return forward_call(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/transformers/models/llama/modeling_llama.py", line 1190, in forward
outputs = self.model(
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1747, in _call_impl
return forward_call(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/transformers/models/llama/modeling_llama.py", line 921, in forward
position_embeddings = self.rotary_emb(hidden_states, position_ids)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1747, in _call_impl
return forward_call(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/utils/_contextlib.py", line 116, in decorate_context
return func(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/transformers/models/llama/modeling_llama.py", line 158, in forward
freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument mat2 in method wrapper_CUDA_bmm)

Starting chat_processor with device: cuda

Input tensor devices:
Tensor input_ids is on device: cuda:0
Tensor attention_mask is on device: cuda:0
Model device: cuda:0

Generation parameters devices:
Tensor input_ids is on device: cuda:0
Tensor attention_mask is on device: cuda:0

User: What is 2 + 2?
Assistant:

Error during processing:

Full traceback:
Traceback (most recent call last):
File "", line 105, in <cell line: 103>
outputs = chat_processor(
File "", line 93, in chat_processor
for text in streamer:
File "/usr/local/lib/python3.10/dist-packages/transformers/generation/streamers.py", line 223, in next
value = self.text_queue.get(timeout=self.timeout)
File "/usr/lib/python3.10/queue.py", line 179, in get
raise Empty
_queue.Empty
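
Two different failures are interleaved here. The worker thread hits the same cpu-vs-cuda:0 mismatch in rotary_emb as before, even though the debug prints show input_ids, attention_mask, and the first model parameter all on cuda:0; the _queue.Empty in the main thread is only a secondary symptom (the streamer was created with timeout=10.0, and once the generation thread dies nothing ever feeds its queue). Checking next(model.parameters()).device only inspects a single tensor, so one possible explanation is that some non-parameter state, such as a rotary-embedding buffer, never left the CPU. A small sketch, plain PyTorch, for auditing every parameter and buffer of the loaded model (assuming one_bit_model from the snippet above exposes the usual nn.Module API, as its .parameters() call already does):

def report_cpu_tensors(model):
    """Return the names of parameters/buffers that are not on a CUDA device."""
    stragglers = []
    for name, tensor in list(model.named_parameters()) + list(model.named_buffers()):
        if tensor.device.type != 'cuda':
            stragglers.append((name, str(tensor.device)))
    return stragglers

for name, dev in report_cpu_tensors(one_bit_model):
    print(f"{name} is still on {dev}")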

@werruww
Author

werruww commented Dec 13, 2024

What is the solution?

@werruww
Author

werruww commented Dec 13, 2024

None of the ideas I have tried for the 1-bit and 2-bit models ever work.

@mobicham
Collaborator

Hi, sorry I don't understand, what is the problem exactly?
I just tried it with the specified versions and it works fine: https://huggingface.co/mobiuslabsgmbh/Llama-2-7b-chat-hf_1bitgs8_hqq#usage
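
Since the linked usage section specifies particular library versions, a quick way to compare the Colab environment against them is to print what is actually installed (the exact pins live on the model card and are not repeated here):

from importlib.metadata import version, PackageNotFoundError

for pkg in ("hqq", "transformers", "torch"):
    try:
        print(pkg, version(pkg))
    except PackageNotFoundError:
        print(pkg, "is not installed")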
