"""4_sentiment_analysis_fine_tuning.py

Fine-tune a DistilBERT checkpoint on a labeled sentiment CSV with the
Hugging Face Trainer and push the result to the Hub.
"""

import numpy as np
from datasets import load_dataset
from sklearn.metrics import f1_score
from transformers import (
    AutoConfig,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)

# Load the labeled CSV and carve out a held-out evaluation split.
raw_dataset = load_dataset("csv", data_files="data.csv")
split = raw_dataset["train"].train_test_split(test_size=0.3, seed=42)
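
# Assumed layout of data.csv (the column names are what tokenizer_fn and the
# Trainer below expect; the example rows are purely illustrative):
#   sentence,label
#   "great movie, would watch again",1
#   "a complete waste of time",0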

# Start from a DistilBERT model already fine-tuned on SST-2.
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


def tokenizer_fn(batch):
    return tokenizer(batch["sentence"], truncation=True)


def compute_metrics(logits_and_labels):
    # The Trainer passes a (logits, labels) tuple at evaluation time.
    logits, labels = logits_and_labels
    predictions = np.argmax(logits, axis=-1)
    acc = np.mean(predictions == labels)
    f1 = f1_score(labels, predictions, average="macro")
    return {"accuracy": acc, "f1": f1}
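
# Quick sanity check with made-up logits (values are illustrative only):
#   compute_metrics((np.array([[0.1, 0.9], [2.0, -1.0]]), np.array([1, 0])))
#   -> {'accuracy': 1.0, 'f1': 1.0}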

# Tokenize both splits; batched=True feeds the fast tokenizer whole batches.
tokenized_datasets = split.map(tokenizer_fn, batched=True)

config = AutoConfig.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, config=config)

training_args = TrainingArguments(
    checkpoint,  # output_dir; push_to_hub also uses this as the Hub repo name
    evaluation_strategy="epoch",
    num_train_epochs=3,
    learning_rate=1e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    save_total_limit=3,
    fp16=False,
    push_to_hub=True,
)

trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()
trainer.push_to_hub()  # requires Hub authentication, e.g. `huggingface-cli login`
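
# A minimal inference sketch reusing the freshly fine-tuned model and tokenizer
# already in memory (the example sentence and printed output are illustrative):
from transformers import pipeline

classifier = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)
print(classifier("This movie was surprisingly good."))
# e.g. [{'label': 'POSITIVE', 'score': 0.99}]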