Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

llm tutorial #39

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
77 changes: 77 additions & 0 deletions llm-inference.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
#%%
from transformers.models.llama import LlamaForCausalLM, LlamaTokenizer
from transformers import GenerationConfig

# Local directory holding the Llama-2 checkpoint; the commented-out line is the
# alternative (non-chat) weights directory.
# NOTE(review): machine-specific absolute path — adjust for your environment.
#path = '/home/oamontoy/workspace/weights-llama-2-7B'
path = '/home/oamontoy/workspace/weights-llama-2-7B-chat'
# Tokenizer and causal-LM weights are both loaded from the same directory.
tokenizer = LlamaTokenizer.from_pretrained(path)
model = LlamaForCausalLM.from_pretrained(path)
# %%
from peft import PeftModel
# Rebind `model` to the PEFT-wrapped model loaded from this adapter identifier;
# every later cell in this script uses the wrapped model.
model = PeftModel.from_pretrained(model, "dominguesm/alpaca-lora-ptbr-7b")
# %%

def generate_prompt(instruction, input=None):
    """Build the text prompt that is fed to the model.

    Args:
        instruction: Task description placed under the "### instruction:" header.
        input: Optional extra context. When falsy (``None`` or an empty
            string) the shorter template without an "### input:" section
            is used.

    Returns:
        The prompt string, ending with the "### response:" header so the
        model continues from there.
    """
    # Guard clause: no additional context, use the instruction-only template.
    if not input:
        return f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.

### instruction:
{instruction}

### response:"""

    return f"""Below is a statement that describes a task, paired with an input that provides more context. Write a response that appropriately completes the request.

### instruction:
{instruction}

### input:
{input}

### response:"""

# %%
from pprint import pprint
# %%
# Decoding settings shared by both model.generate calls in this script.
# NOTE(review): do_sample is not set here. In transformers, temperature and
# top_p apply to sampling-based decoding, so with plain beam search
# (num_beams=4) they probably have no effect — confirm, or add do_sample=True
# if sampling was intended.
generation_config = GenerationConfig(
temperature=0.1,
top_p=0.75,
num_beams=4,
)

def evaluate(instruction, input=None):
    """Generate a completion for ``instruction`` and pretty-print it.

    Args:
        instruction: Task description inserted into the prompt template.
        input: Optional extra context for the task; passed through to
            ``generate_prompt``.

    Returns:
        None. Each generated sequence is printed via ``pprint``, prefixed
        with "response: ".
    """
    marker = "### response:"
    prompt = generate_prompt(instruction, input)
    inputs = tokenizer(prompt, return_tensors="pt")
    input_ids = inputs["input_ids"]
    generation_output = model.generate(
        input_ids=input_ids,
        generation_config=generation_config,
        return_dict_in_generate=True,
        output_scores=True,
        max_new_tokens=256,
    )
    for sequence in generation_output.sequences:
        # Skip special tokens so begin/end-of-sequence markers do not leak
        # into the printed text.
        decoded = tokenizer.decode(sequence, skip_special_tokens=True)
        # The decoded text echoes the prompt; keep the segment right after
        # the first marker. Fall back to the whole text when the marker is
        # absent instead of raising IndexError.
        parts = decoded.split(marker)
        completion = parts[1] if len(parts) > 1 else decoded
        pprint("response: " + completion.strip())
# %%
#evaluate(input("instruction: "))
# %%
# Manual, cell-by-cell version of the steps performed inside evaluate(),
# kept separate so each intermediate value can be inspected.
instruction = 'print a long paragraph of giberish'
prompt = generate_prompt(instruction, None)
inputs = tokenizer(prompt, return_tensors="pt")
# Bare expression: displays the tokenizer output when run as a notebook cell.
inputs
#%%
input_ids = inputs["input_ids"]
# %%
# Same generate arguments as in evaluate(): shared generation_config and up to
# 256 newly generated tokens.
generation_output = model.generate(
input_ids=input_ids,
generation_config=generation_config,
return_dict_in_generate=True,
output_scores=True,
max_new_tokens=256
)
# %%
# Decode every returned sequence and print the text that follows the prompt's
# response marker.
for s in generation_output.sequences:
    # Skip special tokens so begin/end-of-sequence markers are not printed.
    output = tokenizer.decode(s, skip_special_tokens=True)
    parts = output.split("### response:")
    # Fall back to the full decoded text when the marker is missing rather
    # than failing with IndexError.
    pprint("response: " + (parts[1] if len(parts) > 1 else output).strip())
# %%
192 changes: 192 additions & 0 deletions llm-lora_classification.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,192 @@
#%%
import numpy as np
from datasets import load_dataset, load_metric
from peft import LoraConfig, TaskType, get_peft_model


from transformers import (AutoModelForSequenceClassification, AutoTokenizer,
DataCollatorWithPadding, Trainer,
TrainingArguments)

# %%
# GLUE task name; used both to load the dataset and to pick the matching metric.
task = "mrpc"
# NOTE(review): num_epochs, lr and batch_size are not read anywhere in the
# visible file; the training cells use NUM_EPOCHS / LR / BATCH_SIZE instead
# (with a different epoch count). Confirm which values are intended.
num_epochs = 20
lr = 1e-3
batch_size = 32
dataset = load_dataset("glue", task)
# Passed to the tokenizer constructor below.
padding_side = "right"

# Module-level metric object shared by compute_metrics and eval_metrics.
metric = load_metric('glue', task)
def compute_metrics(eval_pred):
    """Score model outputs with the module-level GLUE ``metric``.

    Args:
        eval_pred: A pair ``(predictions, labels)``; the first element is
            reduced with an argmax over axis 1 to obtain predicted class ids.

    Returns:
        Whatever ``metric.compute`` returns for the predicted ids and labels.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    return metric.compute(predictions=predicted_ids, references=references)

#%%
model_checkpoint = "roberta-large"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, padding_side=padding_side)
# If the checkpoint defines no padding token, reuse the end-of-sequence token
# id so that batches can still be padded.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

def tokenize_function(examples):
    """Tokenize a batch of sentence pairs with the module-level ``tokenizer``.

    Args:
        examples: Mapping with "sentence1" and "sentence2" entries.

    Returns:
        The tokenizer output for the paired sentences, truncated but not padded.
    """
    first_sentences = examples["sentence1"]
    second_sentences = examples["sentence2"]
    # max_length=None means "use the model max length", which is also the default.
    return tokenizer(first_sentences, second_sentences, truncation=True, max_length=None)

# Tokenize every split in batches and drop the raw text / index columns so
# only the tokenizer outputs and the label column remain.
tokenized_datasets = dataset.map(
tokenize_function,
batched=True,
remove_columns=["idx", "sentence1", "sentence2"],
)
# The training code below reads the targets from batch["labels"].
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets.set_format("torch")
# Pads each batch to its longest example at collation time.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding="longest")

# %%
# Load the base sequence-classification model and record its parameter names
# before the PEFT wrapper is applied.
model = AutoModelForSequenceClassification.from_pretrained(
model_checkpoint, return_dict=True)
print(len(list(model.named_parameters())))
# NOTE(review): orig_layers / withpeft_layers are only assigned, never read,
# in the visible file — presumably kept for manual before/after comparison.
orig_layers = [n for n,p in model.named_parameters()]

# LoRA settings for sequence classification.
peft_config = LoraConfig(
task_type=TaskType.SEQ_CLS, inference_mode=False, r=16, lora_alpha=16, lora_dropout=0.1, bias="all"
)

#%%

# Rebind `model` to the PEFT-wrapped model and print the new parameter count.
model = get_peft_model(model, peft_config)
print(len(list(model.named_parameters())))
# Strip the wrapper prefix so the names line up with those in orig_layers.
withpeft_layers = [n.replace('base_model.model.','') for n,p in model.named_parameters()]
model.print_trainable_parameters()
# Hyperparameters consumed by the TrainingArguments in the next cell.
BATCH_SIZE = 32
NUM_EPOCHS = 5
LR = 1e-3
WEIGHT_DECAY = 0.01
#%%

# Trainer-based fine-tuning run: evaluate and checkpoint once per epoch, then
# reload the best checkpoint when training ends.
training_args = TrainingArguments(
output_dir="roberta-large-lora-seq1_tests",
learning_rate=LR,
per_device_train_batch_size=BATCH_SIZE,
per_device_eval_batch_size=BATCH_SIZE,
num_train_epochs=NUM_EPOCHS,
weight_decay=WEIGHT_DECAY,
evaluation_strategy="epoch",
save_strategy="epoch",
load_best_model_at_end=True,
)
# Only the first 200 examples of each split are used, keeping the run short.
# NOTE(review): evaluation uses the "test" split; confirm that this split
# carries real labels for this task, otherwise switch to "validation".
trainer = Trainer(
model=model,
args=training_args,
train_dataset=tokenized_datasets["train"].select(range(200)),
eval_dataset=tokenized_datasets["test"].select(range(200)),
tokenizer=tokenizer,
compute_metrics=compute_metrics,
data_collator=data_collator
)

trainer.train()
#%%
#model.save_pretrained('saved_model')
# %%

from accelerate import Accelerator
from torch.utils.data import DataLoader
from torch.optim import AdamW
from transformers import (
AutoConfig,
AutoModelForSequenceClassification,
AutoTokenizer,
DataCollatorWithPadding,
PretrainedConfig,
SchedulerType,
default_data_collator,
get_scheduler,
)
import torch
from tqdm import tqdm

#%%
# Start again from a freshly loaded base model for the hand-written training
# loop below; this repeats the earlier setup cell with NUM_EPOCHS reduced to 1.
model = AutoModelForSequenceClassification.from_pretrained(
model_checkpoint, return_dict=True)
print(len(list(model.named_parameters())))
# NOTE(review): orig_layers / withpeft_layers are only assigned, never read,
# in the visible file.
orig_layers = [n for n,p in model.named_parameters()]

# Same LoRA settings as the Trainer-based run above.
peft_config = LoraConfig(
task_type=TaskType.SEQ_CLS, inference_mode=False, r=16, lora_alpha=16, lora_dropout=0.1, bias="all"
)

#%%

# Rebind `model` to the PEFT-wrapped model and print the new parameter count.
model = get_peft_model(model, peft_config)
print(len(list(model.named_parameters())))
# Strip the wrapper prefix so the names line up with those in orig_layers.
withpeft_layers = [n.replace('base_model.model.','') for n,p in model.named_parameters()]
model.print_trainable_parameters()
# Hyperparameters for the manual loop; these rebind the earlier constants.
BATCH_SIZE = 32
NUM_EPOCHS = 1
LR = 1e-3
WEIGHT_DECAY = 0.01
#%%

#accelerator = Accelerator()
#accelerator.wait_for_everyone()

def compute_metrics(eval_pred):
    """Turn raw model outputs into GLUE metric values.

    This rebinds the function of the same name defined earlier in the file
    with the same behavior.

    Args:
        eval_pred: A pair ``(predictions, labels)``; predicted class ids are
            the argmax of the first element over axis 1.

    Returns:
        The result of ``metric.compute`` on the predicted ids and the labels.
    """
    raw_outputs, gold_labels = eval_pred
    class_ids = np.argmax(raw_outputs, axis=1)
    return metric.compute(predictions=class_ids, references=gold_labels)

# Parameters whose names contain one of these fragments are exempt from
# weight decay.
no_decay = ["bias", "LayerNorm.weight"]
optimizer_grouped_parameters = [
    {
        # Everything else is decayed. Use the WEIGHT_DECAY constant defined
        # with the other hyperparameters instead of repeating its value, so
        # the two cannot drift apart.
        "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
        "weight_decay": WEIGHT_DECAY,
    },
    {
        "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
        "weight_decay": 0.0,
    },
]
optimizer = AdamW(optimizer_grouped_parameters, lr=LR)


# Both loaders use only the first 200 examples of their split and pad each
# batch through the shared data_collator.
# NOTE(review): the training loader does not pass shuffle=True, so batches are
# visited in dataset order every epoch — confirm this is intended.
# NOTE(review): validation reads the "test" split; confirm it carries real labels.
train_dataloader = DataLoader(tokenized_datasets["train"].select(range(200)), collate_fn=data_collator, batch_size=BATCH_SIZE)
valid_dataloader = DataLoader(tokenized_datasets["test"].select(range(200)), collate_fn=data_collator, batch_size=BATCH_SIZE)

# Linear schedule without warmup, spanning one scheduler step per training batch.
lr_scheduler = get_scheduler(
name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=len(train_dataloader) * NUM_EPOCHS
)
def eval_metrics(model, dataloader, device='cpu'):
    """Run ``model`` over ``dataloader`` and compute the module-level ``metric``.

    Args:
        model: Model whose output exposes ``.logits``; called as ``model(**batch)``.
        dataloader: Iterable of dict batches that include a "labels" entry.
        device: Device every tensor in a batch is moved to before the forward
            pass. The model itself is not moved here.

    Returns:
        The result of ``metric.compute()`` over all batches.
    """
    # Remember the caller's mode so evaluation does not silently force the
    # model into training mode afterwards.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for batch in dataloader:
                batch = {k: v.to(device) for k, v in batch.items()}
                logits = model(**batch).logits
                predictions = torch.argmax(logits, dim=-1)
                metric.add_batch(predictions=predictions, references=batch["labels"])
    finally:
        # Restore the previous mode even if a batch raised.
        model.train(was_training)
    return metric.compute()

#model, optimizer, train_dataloader, valid_dataloader, lr_scheduler = accelerator.prepare(
# model, optimizer, train_dataloader, valid_dataloader, lr_scheduler
# )

# Hand-written training loop: one optimizer and scheduler step per batch,
# followed by an evaluation pass at the end of every epoch.
for epoch in range(NUM_EPOCHS):
    model.train()
    losses = []
    for batch in tqdm(train_dataloader):
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        # Store a plain Python float: averaging a list of tensors with NumPy
        # is fragile and fails outright for tensors that are not on the CPU.
        losses.append(loss.item())

    accuracy = eval_metrics(model, valid_dataloader)
    print(accuracy, 'loss: ', np.mean(losses))
# %%
Loading