Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add some fixes #4

Open
wants to merge 11 commits into
base: main
Choose a base branch
from
6 changes: 5 additions & 1 deletion masked_language_modeling.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,10 +97,14 @@ def group_texts(examples):
print("Training...")
trainer.train()

# Use the evaluate() method to evaluate the model and get its perplexity:
eval_results = trainer.evaluate()
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

# Performing inference
text = "The Milky Way is a <mask> galaxy."
# We need to tokenize the inputs and turn them into PyTorch tensors
encoded_input = tokenizer(text, return_tensors="pt").input_ids
encoded_input = tokenizer(text, return_tensors="pt")

# To move the batch to the right device automatically, use `PartialState().device`
# which will always work no matter the environment
Expand Down
2 changes: 1 addition & 1 deletion multiple_choice.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@

# Load dataset
print(f"Downloading dataset ({dataset_name})")
dataset = load_dataset(dataset_name, "regular", split="train[:8%]")
dataset = load_dataset(dataset_name, "regular", split="train[:8%]", trust_remote_code=True)
dataset = dataset.train_test_split(test_size=0.2)

# Tokenize the dataset
Expand Down
9 changes: 9 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
accelerate==0.26.1
datasets==2.16.1
evaluate==0.4.1
numpy==1.23.5
rouge-score==0.1.2
sacrebleu==2.4.0
seqeval==1.2.2
torch==2.1.0+cu121
Foxglove144 marked this conversation as resolved.
Show resolved Hide resolved
transformers==4.35.2
4 changes: 2 additions & 2 deletions summarization.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,8 +68,8 @@ def compute_metrics(eval_pred):
training_args = Seq2SeqTrainingArguments(
output_dir="results/summarization", # Where weights are stored
learning_rate=2e-5, # The learning rate during training
per_device_train_batch_size=16, # Number of samples per batch during training
per_device_eval_batch_size=16, # Number of samples per batch during evaluation
per_device_train_batch_size=8, # Number of samples per batch during training
per_device_eval_batch_size=8, # Number of samples per batch during evaluation
num_train_epochs=4, # How many iterations through the dataloaders should be done
weight_decay=0.01, # Regularization penalization
evaluation_strategy="epoch", # How often metrics on the evaluation dataset should be computed
Expand Down
4 changes: 2 additions & 2 deletions token_classification.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,7 @@ def compute_metrics(evaluation_preds):
# Performing inference
text = "The Golden State Warriors are an American professional basketball team based in San Francisco."
# We need to tokenize the inputs and turn them into PyTorch tensors
encoded_input = tokenizer(text, return_tensors="pt")
encoded_input = tokenizer(text, return_tensors="pt").to("cuda")
Foxglove144 marked this conversation as resolved.
Show resolved Hide resolved

# Then we can perform raw torch inference:
print("Performing inference...")
Expand All @@ -153,5 +153,5 @@ def compute_metrics(evaluation_preds):

# Finally, decode our outputs
predictions = logits.argmax(dim=2)
print(f"Prediction: {[id2label[pred] for pred in predictions[0]]}")
print(f"Prediction: {[id2label[pred.item()] for pred in predictions[0]]}")
# Can also use `model.config.id2label` instead
2 changes: 1 addition & 1 deletion translation.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,7 @@ def compute_metrics(eval_preds):
# Performing inference
text = "translate English to French: Legumes share resources with nitrogen-fixing bacteria."
# We need to tokenize the inputs and turn them into PyTorch tensors
encoded_input = tokenizer(text, return_tensors="pt").input_ids
encoded_input = tokenizer(text, return_tensors="pt").input_ids.to("cuda")
Foxglove144 marked this conversation as resolved.
Show resolved Hide resolved

# Then we can perform inference using `model.generate()`:
print("Performing inference...")
Expand Down