Add some fixes #4

Open
wants to merge 11 commits into main
6 changes: 5 additions & 1 deletion masked_language_modeling.py
@@ -97,10 +97,14 @@ def group_texts(examples):
 print("Training...")
 trainer.train()
 
+# Use the evaluate() method to evaluate the model and get its perplexity:
+eval_results = trainer.evaluate()
+print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")
+
 # Performing inference
 text = "The Milky Way is a <mask> galaxy."
 # We need to tokenize the inputs and turn them to PyTorch tensors
-encoded_input = tokenizer(text, return_tensors="pt").input_ids
+encoded_input = tokenizer(text, return_tensors="pt")
 
 # To move the batch to the right device automatically, use `PartialState().device`
 # which will always work no matter the environment
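Dropping `.input_ids` here keeps the whole `BatchEncoding`, so the batch can be moved to the right device as one object and unpacked straight into the model. A minimal sketch of how the inference might continue after this hunk, assuming the `tokenizer`/`model` pair trained above (the mask-lookup code and `mask_index` name are illustrative, not part of the PR):

    import torch
    from accelerate import PartialState

    # Move every tensor in the batch (input_ids, attention_mask, ...) at once
    encoded_input = tokenizer(text, return_tensors="pt").to(PartialState().device)
    with torch.no_grad():
        logits = model(**encoded_input).logits

    # Locate the <mask> position and decode the highest-scoring token
    mask_index = (encoded_input.input_ids == tokenizer.mask_token_id)[0].nonzero(as_tuple=True)[0]
    print(tokenizer.decode(logits[0, mask_index].argmax(dim=-1)))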
2 changes: 1 addition & 1 deletion multiple_choice.py
@@ -19,7 +19,7 @@
 
 # Load dataset
 print(f"Downloading dataset ({dataset_name})")
-dataset = load_dataset(dataset_name, "regular", split="train[:8%]")
+dataset = load_dataset(dataset_name, "regular", split="train[:8%]", trust_remote_code=True)
 dataset = dataset.train_test_split(test_size=0.2)
 
 # Tokenize the dataset
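Recent `datasets` releases require an explicit opt-in before running a dataset's bundled loading script (older versions only warned), so this flag keeps the script-based download working in non-interactive runs. A sketch of the opt-in, where `"swag"` is an illustrative value standing in for the `dataset_name` defined earlier in the file:

    from datasets import load_dataset

    # trust_remote_code=True allows the dataset's own loading script to run;
    # leave it unset for sources you do not trust
    dataset = load_dataset("swag", "regular", split="train[:8%]",
                           trust_remote_code=True)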
9 changes: 9 additions & 0 deletions requirements.txt
@@ -0,0 +1,9 @@
+accelerate==0.26.1
+datasets==2.16.1
+evaluate==0.4.1
+numpy==1.23.5
+rouge-score==0.1.2
+sacrebleu==2.4.0
+seqeval==1.2.2
+torch==2.1.0
+transformers==4.35.2
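Pinning matters for these scripts: for example, transformers 4.35 still accepts the `evaluation_strategy` argument used in the training-arguments blocks below, which later releases renamed to `eval_strategy`. The whole set installs in one step with `pip install -r requirements.txt`.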
3 changes: 2 additions & 1 deletion sequence_classification.py
@@ -2,6 +2,7 @@
 # for sequence classification. Based on the Tasks documentation
 # originally from: https://hf.co/docs/transformers/tasks/sequence_classification
 import evaluate
+from accelerate import PartialState
 import numpy as np
 import torch
 from datasets import load_dataset
@@ -96,7 +97,7 @@ def compute_metrics(evaluation_preds):
 # Performing inference
 text = "This was a masterpiece. Not completely faithful to the books, but enthralling from beginning to end. Might be my favorite of the three."
 # We need to tokenize the inputs and turn them to PyTorch tensors
-encoded_input = tokenizer(text, return_tensors="pt").to("cuda")
+encoded_input = tokenizer(text, return_tensors="pt").to(PartialState().device)
 
 # Then we can perform raw torch inference:
 print("Performing inference...")
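Replacing the hard-coded `"cuda"` with `PartialState().device` is what lets this inference block run unmodified on CPU-only machines, Apple silicon, and multi-process launches. A minimal sketch of what the call resolves to (the device order described here reflects accelerate's defaults and is stated as an assumption):

    from accelerate import PartialState

    # Lazily initializes the process state and returns the device this process
    # should use (e.g. cuda:N under a GPU launch, mps or cpu otherwise)
    device = PartialState().device
    encoded_input = tokenizer(text, return_tensors="pt").to(device)

The model weights already live on that device after `trainer.train()`, so inputs and parameters line up without a manual `model.to(...)`.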
7 changes: 4 additions & 3 deletions summarization.py
@@ -2,6 +2,7 @@
 # for sequence classification. Based on the Tasks documentation
 # originally from: https://hf.co/docs/transformers/tasks/sequence_classification
 import evaluate
+from accelerate import PartialState
 import numpy as np
 from datasets import load_dataset
 from transformers import (
@@ -68,8 +69,8 @@ def compute_metrics(eval_pred):
 training_args = Seq2SeqTrainingArguments(
     output_dir="results/summarization", # Where weights are stored
     learning_rate=2e-5, # The learning rate during training
-    per_device_train_batch_size=16, # Number of samples per batch during training
-    per_device_eval_batch_size=16, # Number of samples per batch during evaluation
+    per_device_train_batch_size=8, # Number of samples per batch during training
+    per_device_eval_batch_size=8, # Number of samples per batch during evaluation
     num_train_epochs=4, # How many iterations through the dataloaders should be done
     weight_decay=0.01, # Regularization penalization
     evaluation_strategy="epoch", # How often metrics on the evaluation dataset should be computed
@@ -97,7 +98,7 @@ def compute_metrics(eval_pred):
 # Performing inference
 text = "summarize: The Inflation Reduction Act lowers prescription drug costs, health care costs, and energy costs. It's the most aggressive action on tackling the climate crisis in American history, which will lift up American workers and create good-paying, union jobs across the country. It'll lower the deficit and ask the ultra-wealthy and corporations to pay their fair share. And no one making under $400,000 per year will pay a penny more in taxes."
 # We need to tokenize the inputs and turn them to PyTorch tensors
-encoded_input = tokenizer(text, return_tensors="pt").input_ids
+encoded_input = tokenizer(text, return_tensors="pt").input_ids.to(PartialState().device)
 
 # Then we can perform inference using `model.generate`:
 print("Performing inference...")
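Halving the per-device batch size from 16 to 8 trades some throughput for lower peak memory during training. For inference, only the `input_ids` tensor is kept because `model.generate` accepts it positionally; a sketch of the generation step that follows this hunk, where `max_new_tokens=100` is an illustrative value, not taken from the PR:

    # Generate a summary from the device-resident input_ids and decode it
    outputs = model.generate(encoded_input, max_new_tokens=100)
    print(tokenizer.decode(outputs[0], skip_special_tokens=True))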
5 changes: 3 additions & 2 deletions token_classification.py
@@ -2,6 +2,7 @@
 # for token classification. Based on the Tasks documentation
 # originally from: https://hf.co/docs/transformers/tasks/token_classification
 import evaluate
+from accelerate import PartialState
 import numpy as np
 import torch
 from datasets import load_dataset
@@ -143,7 +144,7 @@ def compute_metrics(evaluation_preds):
 # Performing inference
 text = "The Golden State Warriors are an American professional basketball team based in San Francisco."
 # We need to tokenize the inputs and turn them to PyTorch tensors
-encoded_input = tokenizer(text, return_tensors="pt")
+encoded_input = tokenizer(text, return_tensors="pt").to(PartialState().device)
 
 # Then we can perform raw torch inference:
 print("Performing inference...")
@@ -153,5 +154,5 @@ def compute_metrics(evaluation_preds):
 
 # Finally, decode our outputs
 predictions = logits.argmax(dim=2)
-print(f"Prediction: {[id2label[pred] for pred in predictions[0]]}")
+print(f"Prediction: {[id2label[pred.item()] for pred in predictions[0]]}")
 # Can also use `model.config.id2label` instead
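The `.item()` change fixes a real crash: iterating over `predictions[0]` yields 0-d tensors, and a dict keyed by plain ints will not match them, since tensors hash by identity rather than by value. A minimal reproduction sketch (the label map is illustrative):

    import torch

    id2label = {0: "O", 1: "B-PER"}    # illustrative label map
    pred = torch.tensor([1, 0])[0]     # a 0-d tensor, not an int
    # id2label[pred]                   # KeyError: tensor(1)
    print(id2label[pred.item()])       # "B-PER"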
3 changes: 2 additions & 1 deletion translation.py
@@ -3,6 +3,7 @@
 # originally from: https://hf.co/docs/transformers/tasks/translation
 import evaluate
 import numpy as np
+from accelerate import PartialState
 from datasets import load_dataset
 from transformers import (
     AutoModelForSeq2SeqLM,
@@ -113,7 +114,7 @@ def compute_metrics(eval_preds):
 # Performing inference
 text = "translate English to French: Legumes share resources with nitrogen-fixing bacteria."
 # We need to tokenize the inputs and turn them to PyTorch tensors
-encoded_input = tokenizer(text, return_tensors="pt").input_ids
+encoded_input = tokenizer(text, return_tensors="pt").input_ids.to(PartialState().device)
 
 # Then we can perform inference using `model.generate()`:
 print("Performing inference...")