diff --git a/masked_language_modeling.py b/masked_language_modeling.py
index b0c0c48..41bc260 100644
--- a/masked_language_modeling.py
+++ b/masked_language_modeling.py
@@ -97,10 +97,14 @@ def group_texts(examples):
 print("Training...")
 trainer.train()

+# Use the evaluate() method to evaluate the model and get its perplexity:
+eval_results = trainer.evaluate()
+print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")
+
 # Performing inference
 text = "The Milky Way is a galaxy."
 # We need to tokenize the inputs and turn them to PyTorch tensors
-encoded_input = tokenizer(text, return_tensors="pt").input_ids
+encoded_input = tokenizer(text, return_tensors="pt")

 # To move the batch to the right device automatically, use `PartialState().device`
 # which will always work no matter the environment
diff --git a/multiple_choice.py b/multiple_choice.py
index 660c74c..71cbf4c 100644
--- a/multiple_choice.py
+++ b/multiple_choice.py
@@ -19,7 +19,7 @@

 # Load dataset
 print(f"Downloading dataset ({dataset_name})")
-dataset = load_dataset(dataset_name, "regular", split="train[:8%]")
+dataset = load_dataset(dataset_name, "regular", split="train[:8%]", trust_remote_code=True)
 dataset = dataset.train_test_split(test_size=0.2)

 # Tokenize the dataset
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..da06145
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,9 @@
+accelerate==0.26.1
+datasets==2.16.1
+evaluate==0.4.1
+numpy==1.23.5
+rouge-score==0.1.2
+sacrebleu==2.4.0
+seqeval==1.2.2
+torch==2.1.0
+transformers==4.35.2
diff --git a/sequence_classification.py b/sequence_classification.py
index 5ce1805..7ca045a 100644
--- a/sequence_classification.py
+++ b/sequence_classification.py
@@ -2,6 +2,7 @@
 # for sequence classification. Based on the Tasks documentation
 # originally from: https://hf.co/docs/transformers/tasks/sequence_classification
 import evaluate
+from accelerate import PartialState
 import numpy as np
 import torch
 from datasets import load_dataset
@@ -96,7 +97,7 @@ def compute_metrics(evaluation_preds):
 # Performing inference
 text = "This was a masterpiece. Not completely faithful to the books, but enthralling from beginning to end. Might be my favorite of the three."
 # We need to tokenize the inputs and turn them to PyTorch tensors
-encoded_input = tokenizer(text, return_tensors="pt").to("cuda")
+encoded_input = tokenizer(text, return_tensors="pt").to(PartialState().device)

 # Then we can perform raw torch inference:
 print("Performing inference...")
diff --git a/summarization.py b/summarization.py
index 0c68c2c..a90dd69 100644
--- a/summarization.py
+++ b/summarization.py
@@ -2,6 +2,7 @@
 # for sequence classification. Based on the Tasks documentation
 # originally from: https://hf.co/docs/transformers/tasks/sequence_classification
 import evaluate
+from accelerate import PartialState
 import numpy as np
 from datasets import load_dataset
 from transformers import (
@@ -68,8 +69,8 @@ def compute_metrics(eval_pred):
 training_args = Seq2SeqTrainingArguments(
     output_dir="results/summarization",  # Where weights are stored
     learning_rate=2e-5,  # The learning rate during training
-    per_device_train_batch_size=16,  # Number of samples per batch during training
-    per_device_eval_batch_size=16,  # Number of samples per batch during evaluation
+    per_device_train_batch_size=8,  # Number of samples per batch during training
+    per_device_eval_batch_size=8,  # Number of samples per batch during evaluation
     num_train_epochs=4,  # How many iterations through the dataloaders should be done
     weight_decay=0.01,  # Regularization penalization
     evaluation_strategy="epoch",  # How often metrics on the evaluation dataset should be computed
@@ -97,7 +98,7 @@ def compute_metrics(eval_pred):
 # Performing inference
 text = "summarize: The Inflation Reduction Act lowers prescription drug costs, health care costs, and energy costs. It's the most aggressive action on tackling the climate crisis in American history, which will lift up American workers and create good-paying, union jobs across the country. It'll lower the deficit and ask the ultra-wealthy and corporations to pay their fair share. And no one making under $400,000 per year will pay a penny more in taxes."
 # We need to tokenize the inputs and turn them to PyTorch tensors
-encoded_input = tokenizer(text, return_tensors="pt").input_ids
+encoded_input = tokenizer(text, return_tensors="pt").input_ids.to(PartialState().device)

 # Then we can perform inference using `model.generate`:
 print("Performing inference...")
diff --git a/token_classification.py b/token_classification.py
index 49aa02d..6ca8f06 100644
--- a/token_classification.py
+++ b/token_classification.py
@@ -2,6 +2,7 @@
 # for token classification. Based on the Tasks documentation
 # originally from: https://hf.co/docs/transformers/tasks/token_classification
 import evaluate
+from accelerate import PartialState
 import numpy as np
 import torch
 from datasets import load_dataset
@@ -143,7 +144,7 @@ def compute_metrics(evaluation_preds):
 # Performing inference
 text = "The Golden State Warriors are an American professional basketball team based in San Francisco."
 # We need to tokenize the inputs and turn them to PyTorch tensors
-encoded_input = tokenizer(text, return_tensors="pt")
+encoded_input = tokenizer(text, return_tensors="pt").to(PartialState().device)

 # Then we can perform raw torch inference:
 print("Performing inference...")
@@ -153,5 +154,5 @@ def compute_metrics(evaluation_preds):

 # Finally, decode our outputs
 predictions = logits.argmax(dim=2)
-print(f"Prediction: {[id2label[pred] for pred in predictions[0]]}")
+print(f"Prediction: {[id2label[pred.item()] for pred in predictions[0]]}")
 # Can also use `model.config.id2label` instead
diff --git a/translation.py b/translation.py
index 33dc215..2701886 100644
--- a/translation.py
+++ b/translation.py
@@ -3,6 +3,7 @@
 # originally from: https://hf.co/docs/transformers/tasks/translation
 import evaluate
 import numpy as np
+from accelerate import PartialState
 from datasets import load_dataset
 from transformers import (
     AutoModelForSeq2SeqLM,
@@ -113,7 +114,7 @@ def compute_metrics(eval_preds):
 # Performing inference
 text = "translate English to French: Legumes share resources with nitrogen-fixing bacteria."
 # We need to tokenize the inputs and turn them to PyTorch tensors
-encoded_input = tokenizer(text, return_tensors="pt").input_ids
+encoded_input = tokenizer(text, return_tensors="pt").input_ids.to(PartialState().device)

 # Then we can perform inference using `model.generate()`:
 print("Performing inference...")
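Notes on the patch, with small standalone sketches. None of the code below is part of the diff above; checkpoint names, labels, and values are illustrative assumptions.

1. Perplexity in masked_language_modeling.py. Trainer reports `eval_loss` as the mean cross-entropy over the masked tokens, and perplexity is defined as exp(cross-entropy), so `math.exp(eval_results['eval_loss'])` needs no extra metric dependency. A toy illustration with made-up logits and labels:

import math

import torch
import torch.nn.functional as F

logits = torch.randn(4, 10)          # 4 masked positions, toy 10-token vocabulary
labels = torch.randint(0, 10, (4,))  # toy target token ids
loss = F.cross_entropy(logits, labels)  # mean cross-entropy, analogous to `eval_loss`
print(f"Perplexity: {math.exp(loss.item()):.2f}")  # same formula the patch prints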
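2. Device placement. Every inference block now moves tensors to `PartialState().device`, which resolves to CUDA, MPS, or CPU depending on what is available, so the scripts run unchanged on any machine instead of assuming "cuda". A minimal sketch of the pattern; the checkpoint name is an assumption for illustration, not one these scripts use:

import torch
from accelerate import PartialState
from transformers import AutoModelForSequenceClassification, AutoTokenizer

device = PartialState().device  # resolves to the right device automatically

# Illustrative checkpoint, not from the patch
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased").to(device)

# Tokenized batches expose .to(device) just like tensors do
encoded_input = tokenizer("An example sentence.", return_tensors="pt").to(device)
with torch.no_grad():
    logits = model(**encoded_input).logits
print(logits.argmax(dim=-1))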
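3. The `pred.item()` fix in token_classification.py. `id2label` is keyed by plain Python ints, while tensors hash by identity rather than by value, so indexing the dict with a 0-d tensor raises a KeyError; `.item()` converts the scalar tensor to an int first. The label map below is an illustrative subset, not the script's actual one:

import torch

id2label = {0: "O", 1: "B-corporation"}  # illustrative subset of a NER label map
pred = torch.tensor(1)
print(id2label[pred.item()])  # "B-corporation"; `id2label[pred]` would raise KeyError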