-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathtweets.py
97 lines (85 loc) · 3.65 KB
/
tweets.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
# see https://www.spinningbytes.com/resources/germansentiment/ and https://github.com/aritter/twitter_download for obtaining the data.
import os
from pathlib import Path
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from conversion import convert_examples_to_features, convert_text_to_examples
def load_datasets(data_dir, num_categories, test_size):
    """Load the labelled tweets and split them into train and test sets.

    Reads the header-less, tab-separated file ``downloaded.tsv`` from
    ``data_dir``. Cells equal to ``"Not Available"`` are treated as missing
    and every row with a missing value is dropped. The sentiment labels are
    encoded as ``negative -> 0``, ``positive -> 1``, ``neutral -> 2``; any
    other label is passed through unchanged.

    Args:
        data_dir: Directory containing ``downloaded.tsv``.
        num_categories: When equal to 2, neutral tweets are removed so only
            the negative/positive classes remain; any other value keeps all
            rows.
        test_size: Forwarded to ``train_test_split`` as ``test_size``.

    Returns:
        Tuple ``(X_train, y_train, X_test, y_test)`` of pandas Series holding
        the tweet texts (X) and the encoded sentiment labels (y). Note the
        order differs from the one ``train_test_split`` itself returns.
    """
    label_ids = {'negative': 0, 'positive': 1, 'neutral': 2}

    data = pd.read_csv(
        os.path.join(data_dir, "downloaded.tsv"),
        sep="\t",
        na_values="Not Available",
        names=["id", "sentiment", "tweet_id", "?", "text"],
        index_col='id',
    )
    # Take an explicit copy so the column assignment below targets an
    # independent frame rather than a slice of the dropna() result.
    data = data.dropna(how='any')[['sentiment', 'text']].copy()

    # Build the encoded column in one pass instead of chained indexing
    # assignment (data['sentiment'][mask] = value), which pandas does not
    # guarantee to write through to the frame. Labels without an entry in
    # label_ids are kept as-is, and the column stays object dtype, matching
    # what the in-place assignment produced.
    data['sentiment'] = (
        data['sentiment']
        .map(lambda label: label_ids.get(label, label))
        .astype(object)
    )

    if num_categories == 2:
        # Binary task: discard the neutral class.
        data = data[data['sentiment'] != 2]

    X = data['text']
    y = data['sentiment']
    # Fixed random_state keeps the split reproducible between runs.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=0)
    return (X_train, y_train, X_test, y_test)
def _tweets_to_features(texts, labels, tokenizer, max_seq_length):
    """Truncate, wrap and featurize one split (train or test) of the tweets.

    ``texts`` and ``labels`` are the pandas Series of one split. Returns
    whatever ``convert_examples_to_features`` returns for that split.
    """
    # Only keep up to max_seq_length whitespace-separated words per tweet
    # (for memory).
    truncated = [" ".join(t.split()[0:max_seq_length]) for t in texts.tolist()]
    # One text per row, as a column vector of Python strings.
    text_column = np.array(truncated, dtype=object)[:, np.newaxis]
    examples = convert_text_to_examples(text_column, labels.tolist())
    return convert_examples_to_features(
        tokenizer, examples, max_seq_length=max_seq_length
    )


def get_tweets_data(data_dir, subtask, num_categories, tokenizer, max_seq_length, test_size):
    """Return model-ready train/test features for the tweets, cached on disk.

    The features are stored in ``data_dir`` in an ``.npz`` file whose name is
    derived from ``subtask``, ``num_categories`` and ``max_seq_length``. If
    that file exists it is loaded and returned directly; otherwise the
    features are built via ``load_datasets`` and the ``conversion`` helpers,
    written to the file, and returned.

    NOTE: ``test_size`` is not part of the cache file name, so an existing
    cache is reused even when a different ``test_size`` is requested. Delete
    the file to force a rebuild.

    Args:
        data_dir: Directory holding ``downloaded.tsv`` and the cache file.
        subtask: String used only as part of the cache file name.
        num_categories: Forwarded to ``load_datasets`` and part of the cache
            file name.
        tokenizer: Forwarded to ``convert_examples_to_features``; unused when
            the cache file exists.
        max_seq_length: Word cap applied to each tweet, forwarded to
            ``convert_examples_to_features`` and part of the cache file name.
        test_size: Forwarded to ``load_datasets``; unused when the cache file
            exists.

    Returns:
        Tuple ``(train_input_ids, train_input_masks, train_segment_ids,
        train_labels, test_input_ids, test_input_masks, test_segment_ids,
        test_labels)``.
    """
    # Order matters: it defines both the archive keys and the return order.
    keys = (
        'train_input_ids',
        'train_input_masks',
        'train_segment_ids',
        'train_labels',
        'test_input_ids',
        'test_input_masks',
        'test_segment_ids',
        'test_labels',
    )
    fn = os.path.join(data_dir, "data_"+subtask+"_"+str(num_categories)+"cat_"+str(max_seq_length)+".npz")

    if Path(fn).is_file():
        # The context manager closes the archive even if a key is missing;
        # every array is read into memory before the file is closed.
        with np.load(fn) as cached:
            return tuple(cached[key] for key in keys)

    X_train, y_train, X_test, y_test = load_datasets(data_dir, num_categories, test_size)

    (
        train_input_ids,
        train_input_masks,
        train_segment_ids,
        train_labels,
    ) = _tweets_to_features(X_train, y_train, tokenizer, max_seq_length)
    (
        test_input_ids,
        test_input_masks,
        test_segment_ids,
        test_labels,
    ) = _tweets_to_features(X_test, y_test, tokenizer, max_seq_length)

    features = (
        train_input_ids,
        train_input_masks,
        train_segment_ids,
        train_labels,
        test_input_ids,
        test_input_masks,
        test_segment_ids,
        test_labels,
    )
    # Persist so later calls with the same file name skip tokenization.
    np.savez(fn, **dict(zip(keys, features)))
    return features