it run #149

Open

werruww opened this issue Nov 20, 2024 · 4 comments

Comments

werruww commented Nov 20, 2024

https://huggingface.co/ISTA-DASLab/Llama-3.2-1B-Instruct-AQLM-PV-2Bit-2x8
https://github.com/yeyu2/Youtube_demos/blob/main/Mixtral_of_aqlm_transformers.ipynb
https://huggingface.co/docs/transformers/main/en/quantization/aqlm
https://pytorch.org/get-started/previous-versions/

%%capture
!pip install "aqlm[gpu]>=1.0.1"
!pip install "accelerate>=0.27.0"
!pip install "transformers>=4.38.0"

!pip install torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --index-url https://download.pytorch.org/whl/cu118

!pip install "aqlm[gpu]==1.0.1"
!pip install git+https://github.com/huggingface/accelerate.git@main
!pip install git+https://github.com/BlackSamorez/transformers.git@aqlm

from transformers import AutoTokenizer, AutoModelForCausalLM

quantized_model = AutoModelForCausalLM.from_pretrained(
    "ISTA-DASLab/Mixtral-8x7b-AQLM-2Bit-1x16-hf",
    torch_dtype="auto", device_map="auto", low_cpu_mem_usage=True,
)
tokenizer = AutoTokenizer.from_pretrained("ISTA-DASLab/Mixtral-8x7b-AQLM-2Bit-1x16-hf")

%%time
output = quantized_model.generate(
    tokenizer("The relationship between humans and AI ", return_tensors="pt")["input_ids"].cuda(),
    min_new_tokens=128, max_new_tokens=128,
)
print(tokenizer.decode(output[0]))

import json
import textwrap

system_prompt = "A chat between a curious user and a blog-writing assistant. "

def get_prompt(human_prompt):
    prompt_template = f"{system_prompt}\n\nUSER: {human_prompt} \nASSISTANT: "
    return prompt_template

def remove_human_text(text):
    return text.split('USER:', 1)[0]

def parse_text(data):
    for item in data:
        text = item['generated_text']
        assistant_text_index = text.find('ASSISTANT:')
        if assistant_text_index != -1:
            assistant_text = text[assistant_text_index + len('ASSISTANT:'):].strip()
            assistant_text = remove_human_text(assistant_text)
            wrapped_text = textwrap.fill(assistant_text, width=100)
            print("#####", wrapped_text)
            # return assistant_text

from transformers import GenerationConfig, pipeline

pipe = pipeline(
    "text-generation",
    model=quantized_model,
    tokenizer=tokenizer,
    max_length=1200,
    temperature=0.7,
    top_p=0.95,
    do_sample=True,
)

%%time
prompt = '''Write a short and engaging blog post about travelling in Bohol Island.
'''
raw_output = pipe(get_prompt(prompt))

parse_text(raw_output)

werruww commented Nov 20, 2024

on colab h4
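
Presumably this refers to the Colab GPU runtime the notebook was run on. A minimal sanity check of the attached accelerator and the installed CUDA build (not part of the original notebook, just a hypothetical helper) could look like this:

# Quick check of the Colab runtime (assumes a CUDA GPU is attached).
import torch

print(torch.__version__)                  # e.g. 2.4.0+cu118, matching the pinned wheels above
print(torch.cuda.is_available())          # must be True for the aqlm[gpu] kernels
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))              # e.g. "Tesla T4"
    free, total = torch.cuda.mem_get_info()           # free / total VRAM in bytes
    print(f"{free / 1e9:.1f} GB free of {total / 1e9:.1f} GB")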

werruww commented Nov 20, 2024

from transformers import AutoTokenizer, AutoModelForCausalLM

quantized_model = AutoModelForCausalLM.from_pretrained(
    "ISTA-DASLab/Mixtral-8x7b-AQLM-2Bit-1x16-hf",
    torch_dtype="auto", device_map="auto", low_cpu_mem_usage=True,
)
tokenizer = AutoTokenizer.from_pretrained("ISTA-DASLab/Mixtral-8x7b-AQLM-2Bit-1x16-hf")

output = quantized_model.generate(
    tokenizer("The relationship between humans and AI ", return_tensors="pt")["input_ids"].cuda(),
    min_new_tokens=128, max_new_tokens=128,
)
print(tokenizer.decode(output[0]))

import json
import textwrap

system_prompt = "You are a helpful assistant. "

def get_prompt(human_prompt):
    prompt_template = f"{system_prompt}\n\nUSER: {human_prompt} \nASSISTANT: "
    return prompt_template

def remove_human_text(text):
    return text.split('USER:', 1)[0]

def parse_text(data):
    for item in data:
        text = item['generated_text']
        assistant_text_index = text.find('ASSISTANT:')
        if assistant_text_index != -1:
            assistant_text = text[assistant_text_index + len('ASSISTANT:'):].strip()
            assistant_text = remove_human_text(assistant_text)
            wrapped_text = textwrap.fill(assistant_text, width=100)
            print("#####", wrapped_text)
            # return assistant_text

from transformers import GenerationConfig, pipeline

pipe = pipeline(
    "text-generation",
    model=quantized_model,
    tokenizer=tokenizer,
    max_length=1200,
    temperature=0.2,
    top_p=0.8,
    do_sample=True,
)

prompt = '''who is python?'''

raw_output = pipe(get_prompt(prompt))

parse_text(raw_output)

werruww commented Nov 20, 2024

werruww commented Dec 19, 2024

The 2-bit model works efficiently, but the 1-bit output is incomprehensible for all models.
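
A minimal way to reproduce that comparison is to run the same prompt through a 2-bit and a 1-bit checkpoint side by side. The sketch below reuses the loading code from the earlier comments; the 1-bit repo id is a placeholder and should be replaced with the checkpoint that was actually tested:

# Side-by-side check of a 2-bit vs. a 1-bit AQLM checkpoint.
from transformers import AutoTokenizer, AutoModelForCausalLM

def sample(repo_id, prompt="The relationship between humans and AI "):
    tokenizer = AutoTokenizer.from_pretrained(repo_id)
    model = AutoModelForCausalLM.from_pretrained(
        repo_id, torch_dtype="auto", device_map="auto", low_cpu_mem_usage=True,
    )
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    output = model.generate(**inputs, min_new_tokens=64, max_new_tokens=64)
    return tokenizer.decode(output[0], skip_special_tokens=True)

print(sample("ISTA-DASLab/Mixtral-8x7b-AQLM-2Bit-1x16-hf"))     # coherent output expected
print(sample("ISTA-DASLab/<some-1-bit-AQLM-checkpoint>"))        # placeholder; reportedly incomprehensible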
