-
Notifications
You must be signed in to change notification settings - Fork 2
/
helpers.py
77 lines (64 loc) · 2.84 KB
/
helpers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
import pandas as pd
import torch
def load_tweets(PATH_DATA, small_dataset=1):
    '''
    Load the positive and the negative training tweets.

    inputs:
        PATH_DATA (str or path-like) : folder that contains the
                                       'twitter-datasets' directory; a
                                       trailing path separator is optional
        small_dataset (bool)         : if truthy read train_pos.txt and
                                       train_neg.txt, otherwise read
                                       train_pos_full.txt and
                                       train_neg_full.txt
    outputs:
        train_pos (DataFrame) : dataframe containing the positive tweets
                                in a single column named 'Tweet'
        train_neg (DataFrame) : dataframe containing the negative tweets
                                in a single column named 'Tweet'
    '''
    # Imported locally so this function stays self-contained.
    import os

    # The full dataset files only differ from the small ones by this suffix.
    suffix = '' if small_dataset else '_full'

    def _read(polarity):
        # os.path.join inserts the separator only when it is missing, so
        # both 'data' and 'data/' resolve to the same file.
        path = os.path.join(PATH_DATA, 'twitter-datasets',
                            'train_' + polarity + suffix + '.txt')
        # One fixed-width column spanning characters [0, 280) of each line;
        # anything past the 280th character is not read.
        return pd.read_fwf(path, header=None, names=['Tweet'],
                           colspecs=[(0, 280)])

    return _read('pos'), _read('neg')
def gpu_cpu_setup():
    '''
    Pick the pytorch device to run on: the GPU when CUDA is available,
    the CPU otherwise. The choice is also printed.

    outputs:
        device (torch.device) : torch.device("cuda") or torch.device("cpu")

    Based on:
        http://mccormickml.com/2019/07/22/BERT-fine-tuning/#11-using-colab-gpu-for-training
        https://github.com/huggingface/transformers
    '''
    # Guard clause: without CUDA there is nothing to choose from.
    if not torch.cuda.is_available():
        print('No GPU available, using the CPU instead.')
        return torch.device("cpu")

    # CUDA is available: select it and report the name of device 0.
    selected = torch.device("cuda")
    print('We will use the GPU:', torch.cuda.get_device_name(0))
    return selected
def create_input_df(train_pos,train_neg):
    '''
    Merge the positive and the negative tweets into one labelled dataframe.

    Inputs:
        train_pos (DataFrame) : dataframe containing the positive tweets
                                in the column "Tweet"
        train_neg (DataFrame) : dataframe containing the negative tweets
                                in the column "Tweet"
    Outputs :
        df (DataFrame) : positive tweets followed by negative tweets, with
                         the column "Tweet" renamed to "text" and a column
                         'label' holding 1 for positive and 0 for negative
                         tweets; the index is renumbered from 0
    '''
    new_names = {"Tweet": "text"}

    # rename/assign both return new frames, so the inputs stay untouched
    labelled_pos = train_pos.rename(columns=new_names).assign(label=1)
    labelled_neg = train_neg.rename(columns=new_names).assign(label=0)

    # stack positives on top of negatives and renumber the rows
    return pd.concat([labelled_pos, labelled_neg],
                     ignore_index=True, sort=False)