From 71acc229c83bdf83d7e1e442218c712ec8e812fb Mon Sep 17 00:00:00 2001
From: Arunabh
Date: Sun, 14 Jan 2024 15:18:03 +0100
Subject: [PATCH 01/11] Update multiple_choice.py

---
 multiple_choice.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/multiple_choice.py b/multiple_choice.py
index 660c74c..71cbf4c 100644
--- a/multiple_choice.py
+++ b/multiple_choice.py
@@ -19,7 +19,7 @@
 
 # Load dataset
 print(f"Downloading dataset ({dataset_name})")
-dataset = load_dataset(dataset_name, "regular", split="train[:8%]")
+dataset = load_dataset(dataset_name, "regular", split="train[:8%]", trust_remote_code=True)
 dataset = dataset.train_test_split(test_size=0.2)
 
 # Tokenize the dataset

From b783642fa917351c2096eb489e6666f4aa16557f Mon Sep 17 00:00:00 2001
From: Arunabh
Date: Sun, 14 Jan 2024 15:19:48 +0100
Subject: [PATCH 02/11] Update summarization.py

---
 summarization.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/summarization.py b/summarization.py
index 0c68c2c..66e8847 100644
--- a/summarization.py
+++ b/summarization.py
@@ -68,8 +68,8 @@ def compute_metrics(eval_pred):
 training_args = Seq2SeqTrainingArguments(
     output_dir="results/summarization",  # Where weights are stored
     learning_rate=2e-5,  # The learning rate during training
-    per_device_train_batch_size=16,  # Number of samples per batch during training
-    per_device_eval_batch_size=16,  # Number of samples per batch during evaluation
+    per_device_train_batch_size=8,  # Number of samples per batch during training
+    per_device_eval_batch_size=8,  # Number of samples per batch during evaluation
     num_train_epochs=4,  # How many iterations through the dataloaders should be done
     weight_decay=0.01,  # Regularization penalization
     evaluation_strategy="epoch",  # How often metrics on the evaluation dataset should be computed
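A note on patch 02: halving the per-device batch size also halves the effective training batch. A minimal alternative sketch, not part of this series, keeps the effective batch at 16 by accumulating gradients over two steps of 8 with the same Seq2SeqTrainingArguments as summarization.py:

    from transformers import Seq2SeqTrainingArguments

    # Sketch only: same arguments as summarization.py, but gradients are
    # accumulated over two steps of 8, preserving an effective batch of 16.
    training_args = Seq2SeqTrainingArguments(
        output_dir="results/summarization",
        learning_rate=2e-5,
        per_device_train_batch_size=8,   # Fits in less GPU memory per step
        gradient_accumulation_steps=2,   # 8 * 2 = effective train batch of 16
        per_device_eval_batch_size=8,
        num_train_epochs=4,
        weight_decay=0.01,
        evaluation_strategy="epoch",
    )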
From 31b0edcc7ca0ed51e276d79d2300c9620bd0d504 Mon Sep 17 00:00:00 2001
From: Arunabh
Date: Sun, 14 Jan 2024 15:42:14 +0100
Subject: [PATCH 03/11] Update token_classification.py

---
 token_classification.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/token_classification.py b/token_classification.py
index 49aa02d..7d292e2 100644
--- a/token_classification.py
+++ b/token_classification.py
@@ -143,7 +143,7 @@ def compute_metrics(evaluation_preds):
 # Performing inference
 text = "The Golden State Warriors are an American professional basketball team based in San Francisco."
 # We need to tokenize the inputs and turn them to PyTorch tensors
-encoded_input = tokenizer(text, return_tensors="pt")
+encoded_input = tokenizer(text, return_tensors="pt").to("cuda")
 
 # Then we can perform raw torch inference:
 print("Performing inference...")
@@ -153,5 +153,5 @@ def compute_metrics(evaluation_preds):
 
 # Finally, decode our outputs
 predictions = logits.argmax(dim=2)
-print(f"Prediction: {[id2label[pred] for pred in predictions[0]]}")
+print(f"Prediction: {[id2label[pred.item()] for pred in predictions[0]]}")
 # Can also use `model.config.id2label` instead

From 564c2dbc74f41b52356811241d1189d05e7d387e Mon Sep 17 00:00:00 2001
From: Arunabh
Date: Sun, 14 Jan 2024 16:12:43 +0100
Subject: [PATCH 04/11] Create requirements.txt

---
 requirements.txt | 9 +++++++++
 1 file changed, 9 insertions(+)
 create mode 100644 requirements.txt

diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..6453f49
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,9 @@
+accelerate==0.26.1
+datasets==2.16.1
+evaluate==0.4.1
+numpy==1.23.5
+rouge-score==0.1.2
+sacrebleu==2.4.0
+seqeval==1.2.2
+torch==2.1.0+cu121
+transformers==4.35.2

From 789bd24905e050d21f64d6c6da23ba9c745ff948 Mon Sep 17 00:00:00 2001
From: Arunabh
Date: Sun, 14 Jan 2024 16:58:01 +0100
Subject: [PATCH 05/11] Update masked_language_modeling.py

---
 masked_language_modeling.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/masked_language_modeling.py b/masked_language_modeling.py
index b0c0c48..41bc260 100644
--- a/masked_language_modeling.py
+++ b/masked_language_modeling.py
@@ -97,10 +97,14 @@ def group_texts(examples):
 print("Training...")
 trainer.train()
 
+# Use the evaluate() method to evaluate the model and get its perplexity:
+eval_results = trainer.evaluate()
+print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")
+
 # Performing inference
 text = "The Milky Way is a galaxy."
 # We need to tokenize the inputs and turn them to PyTorch tensors
-encoded_input = tokenizer(text, return_tensors="pt").input_ids
+encoded_input = tokenizer(text, return_tensors="pt")
 
 # To move the batch to the right device automatically, use `PartialState().device`
 # which will always work no matter the environment

From af601087ec811ba52958da7ec306797ce078fd37 Mon Sep 17 00:00:00 2001
From: Arunabh
Date: Sun, 14 Jan 2024 17:39:07 +0100
Subject: [PATCH 06/11] Update translation.py

---
 translation.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/translation.py b/translation.py
index 33dc215..1f97efb 100644
--- a/translation.py
+++ b/translation.py
@@ -113,7 +113,7 @@ def compute_metrics(eval_preds):
 # Performing inference
 text = "translate English to French: Legumes share resources with nitrogen-fixing bacteria."
 # We need to tokenize the inputs and turn them to PyTorch tensors
-encoded_input = tokenizer(text, return_tensors="pt").input_ids
+encoded_input = tokenizer(text, return_tensors="pt").input_ids.to("cuda")
 
 # Then we can perform inference using `model.generate()`:
 print("Performing inference...")
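A note on patch 05: `trainer.evaluate()` reports the mean cross-entropy over the evaluation set as `eval_loss`, and perplexity is simply its exponential, which is why `math.exp` is all the patch needs. A minimal sketch of the relationship, with a hypothetical loss value:

    import math

    eval_loss = 2.31  # hypothetical mean cross-entropy from trainer.evaluate()
    perplexity = math.exp(eval_loss)
    print(f"Perplexity: {perplexity:.2f}")  # 10.07 for this loss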
From 8788edabc1bdddf25eb8a2ab87cb4fc26d9d76d3 Mon Sep 17 00:00:00 2001
From: Arunabh
Date: Fri, 26 Jan 2024 21:38:36 +0100
Subject: [PATCH 07/11] Move batch to the right device automatically using PartialState().device

---
 token_classification.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/token_classification.py b/token_classification.py
index 7d292e2..6ca8f06 100644
--- a/token_classification.py
+++ b/token_classification.py
@@ -2,6 +2,7 @@
 # for token classification. Based on the Tasks documentation
 # originally from: https://hf.co/docs/transformers/tasks/token_classification
 import evaluate
+from accelerate import PartialState
 import numpy as np
 import torch
 from datasets import load_dataset
@@ -143,7 +144,7 @@ def compute_metrics(evaluation_preds):
 # Performing inference
 text = "The Golden State Warriors are an American professional basketball team based in San Francisco."
 # We need to tokenize the inputs and turn them to PyTorch tensors
-encoded_input = tokenizer(text, return_tensors="pt").to("cuda")
+encoded_input = tokenizer(text, return_tensors="pt").to(PartialState().device)
 
 # Then we can perform raw torch inference:
 print("Performing inference...")

From d8978a24cf7b6f7efbef74fa0228229ccbbc3160 Mon Sep 17 00:00:00 2001
From: Arunabh
Date: Fri, 26 Jan 2024 21:40:39 +0100
Subject: [PATCH 08/11] Move batch to the right device automatically using PartialState().device

---
 translation.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/translation.py b/translation.py
index 1f97efb..2701886 100644
--- a/translation.py
+++ b/translation.py
@@ -3,6 +3,7 @@
 # originally from: https://hf.co/docs/transformers/tasks/translation
 import evaluate
 import numpy as np
+from accelerate import PartialState
 from datasets import load_dataset
 from transformers import (
     AutoModelForSeq2SeqLM,
@@ -113,7 +114,7 @@ def compute_metrics(eval_preds):
 # Performing inference
 text = "translate English to French: Legumes share resources with nitrogen-fixing bacteria."
 # We need to tokenize the inputs and turn them to PyTorch tensors
-encoded_input = tokenizer(text, return_tensors="pt").input_ids.to("cuda")
+encoded_input = tokenizer(text, return_tensors="pt").input_ids.to(PartialState().device)
 
 # Then we can perform inference using `model.generate()`:
 print("Performing inference...")

From cc0de3b4f44afeea993e02a78d6fbe5758d05064 Mon Sep 17 00:00:00 2001
From: Arunabh
Date: Fri, 26 Jan 2024 21:48:08 +0100
Subject: [PATCH 09/11] Move batch to the right device automatically using PartialState().device

---
 sequence_classification.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/sequence_classification.py b/sequence_classification.py
index 5ce1805..7ca045a 100644
--- a/sequence_classification.py
+++ b/sequence_classification.py
@@ -2,6 +2,7 @@
 # for sequence classification. Based on the Tasks documentation
 # originally from: https://hf.co/docs/transformers/tasks/sequence_classification
 import evaluate
+from accelerate import PartialState
 import numpy as np
 import torch
 from datasets import load_dataset
@@ -96,7 +97,7 @@ def compute_metrics(evaluation_preds):
 # Performing inference
 text = "This was a masterpiece. Not completely faithful to the books, but enthralling from beginning to end. Might be my favorite of the three."
 # We need to tokenize the inputs and turn them to PyTorch tensors
-encoded_input = tokenizer(text, return_tensors="pt").to("cuda")
+encoded_input = tokenizer(text, return_tensors="pt").to(PartialState().device)
 
 # Then we can perform raw torch inference:
 print("Performing inference...")
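A note on patches 07-09: `PartialState().device` resolves to the first visible accelerator (e.g. a CUDA GPU) and falls back to CPU otherwise, so the inference sections no longer crash on machines without a GPU, unlike the hard-coded `.to("cuda")`. A minimal standalone sketch of the pattern; the checkpoint name is illustrative, not one used by these scripts:

    import torch
    from accelerate import PartialState
    from transformers import AutoModelForSequenceClassification, AutoTokenizer

    device = PartialState().device  # e.g. cuda:0 on a GPU machine, cpu otherwise

    checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"  # illustrative
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = AutoModelForSequenceClassification.from_pretrained(checkpoint).to(device)

    # `BatchEncoding.to()` moves every tensor in the batch at once
    encoded_input = tokenizer("A great film.", return_tensors="pt").to(device)
    with torch.no_grad():
        logits = model(**encoded_input).logits
    print(model.config.id2label[logits.argmax(dim=-1).item()])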
From 74d07fad21a275b523c12a4a7009aeb3b5f08b71 Mon Sep 17 00:00:00 2001
From: Arunabh
Date: Fri, 26 Jan 2024 21:50:08 +0100
Subject: [PATCH 10/11] Update requirements.txt

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 6453f49..da06145 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -5,5 +5,5 @@ numpy==1.23.5
 rouge-score==0.1.2
 sacrebleu==2.4.0
 seqeval==1.2.2
-torch==2.1.0+cu121
+torch==2.1.0
 transformers==4.35.2

From 9f5cd0d16b8d35c28d7ad8437efe1e3e0a2705b3 Mon Sep 17 00:00:00 2001
From: Arunabh
Date: Fri, 26 Jan 2024 21:58:54 +0100
Subject: [PATCH 11/11] Move batch to the right device automatically using PartialState().device

---
 summarization.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/summarization.py b/summarization.py
index 66e8847..a90dd69 100644
--- a/summarization.py
+++ b/summarization.py
@@ -2,6 +2,7 @@
 # for sequence classification. Based on the Tasks documentation
 # originally from: https://hf.co/docs/transformers/tasks/sequence_classification
 import evaluate
+from accelerate import PartialState
 import numpy as np
 from datasets import load_dataset
 from transformers import (
@@ -97,7 +98,7 @@ def compute_metrics(eval_pred):
 # Performing inference
 text = "summarize: The Inflation Reduction Act lowers prescription drug costs, health care costs, and energy costs. It's the most aggressive action on tackling the climate crisis in American history, which will lift up American workers and create good-paying, union jobs across the country. It'll lower the deficit and ask the ultra-wealthy and corporations to pay their fair share. And no one making under $400,000 per year will pay a penny more in taxes."
 # We need to tokenize the inputs and turn them to PyTorch tensors
-encoded_input = tokenizer(text, return_tensors="pt").input_ids
+encoded_input = tokenizer(text, return_tensors="pt").input_ids.to(PartialState().device)
 
 # Then we can perform inference using `model.generate`:
 print("Performing inference...")
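A note on patch 10: `torch==2.1.0+cu121` is a PEP 440 local version that is only published on the PyTorch package index, so pinning it makes a plain `pip install -r requirements.txt` against PyPI fail; dropping the `+cu121` suffix keeps the file installable anywhere.

A note on patch 11 (the same pattern as patches 06 and 08): for seq2seq models, only the input IDs need to move to the device before calling `model.generate()`. A minimal standalone sketch, using an illustrative `t5-small` checkpoint rather than the trained weights in `results/summarization`:

    from accelerate import PartialState
    from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

    device = PartialState().device
    tokenizer = AutoTokenizer.from_pretrained("t5-small")  # illustrative checkpoint
    model = AutoModelForSeq2SeqLM.from_pretrained("t5-small").to(device)

    text = "summarize: The Inflation Reduction Act lowers prescription drug costs."
    input_ids = tokenizer(text, return_tensors="pt").input_ids.to(device)
    outputs = model.generate(input_ids, max_new_tokens=64)
    print(tokenizer.decode(outputs[0], skip_special_tokens=True))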