-
Notifications
You must be signed in to change notification settings - Fork 19
/
Copy pathextract_data.py
116 lines (81 loc) · 2.52 KB
/
extract_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
import pickle
import numpy as np
import os
# Load the tagged Brown corpus: every entry of the 'brown/' directory is
# treated as an input file.  os.listdir() returns names in arbitrary order,
# so sort them -- with a reduced n_sample_files the same subset of files is
# then selected on every run and on every machine.
files = sorted(os.listdir('brown/'))

# In case your system can't handle all 500 files, lower this to a
# reasonable number such as 20.
n_sample_files = 500

print('TOTAL NO. OF FILES ', len(files), '\n')
# Report the number of files actually read, which is smaller than the
# requested count when the directory holds fewer files.
print('RUNNING ON ', min(n_sample_files, len(files)), ' FILES\n')

# Collect the file contents in a list and join once at the end; repeated
# string concatenation would re-copy the growing corpus for every file.
# The leading '' reproduces the newline that precedes the first file's text,
# so the resulting line count is the same as with incremental concatenation.
file_texts = ['']
for file_name in files[:n_sample_files]:
    # Explicit encoding so decoding does not depend on the platform locale;
    # undecodable bytes become U+FFFD instead of aborting the whole run.
    with open('brown/' + file_name, encoding='utf-8', errors='replace') as f:
        file_texts.append(f.read())
raw_corpus = '\n'.join(file_texts)

# One entry per text line; blank lines are still present at this point.
corpus = raw_corpus.split('\n')
print('CORPUS SIZE', len(corpus), '\n')
def _split_tagged_token(token):
    """Split a ``word/tag`` token into ``(lower-cased word, tag)``.

    The split is made on the LAST slash so that words which themselves
    contain a slash (for example ``1/2/cd``) are still parsed.

    Returns ``None`` when the token contains no slash at all.
    """
    word, sep, tag = token.rpartition('/')
    if not sep:
        return None
    return word.lower(), tag


# X_train / Y_train: one list of words / tags per sentence (parallel lists).
# words / tags: flat lists of every accepted word / tag (with duplicates).
X_train = []
Y_train = []
words = []
tags = []
n_omitted = 0

for line in corpus:
    tokens = line.split()
    if not tokens:
        # Blank or whitespace-only line: it carries no sentence, so do not
        # emit an empty training sample for it.
        continue

    sentence_words = []
    sentence_tags = []
    for token in tokens:
        parsed = _split_tagged_token(token)
        if parsed is None:
            # Untagged token: the sentence cannot be aligned with its tags,
            # so drop the sentence as a whole.
            n_omitted = n_omitted + 1
            break
        sentence_words.append(parsed[0])
        sentence_tags.append(parsed[1])
    else:
        # Reached only when no token was rejected.  Recording the sentence
        # here (rather than token by token) keeps truncated sentences out of
        # the samples and keeps their words out of the vocabulary.
        words.extend(sentence_words)
        tags.extend(sentence_tags)
        X_train.append(sentence_words)
        Y_train.append(sentence_tags)

print('OMITTED sentences: ', n_omitted, '\n')
print('TOTAL NO OF SAMPLES: ', len(X_train), '\n')

# Show one example sentence; clamp the index so that small runs (fewer than
# 43 sentences) do not fail with an IndexError.
if X_train:
    sample_index = min(42, len(X_train) - 1)
    print('sample X_train: ', X_train[sample_index], '\n')
    print('sample Y_train: ', Y_train[sample_index], '\n')
# Collapse the flat token lists into the distinct vocabulary and tag set.
words = set(words)
tags = set(tags)

print('VOCAB SIZE: ', len(words))
print('TOTAL TAGS: ', len(tags))

# Sanity check: every sentence must have a matching tag sequence.
assert len(X_train) == len(Y_train)

# Build the string <-> integer id lookup tables.
# Ids are assigned in sorted order rather than set-iteration order: the
# iteration order of a set of strings changes between interpreter runs
# (string hash randomisation), which would give every run a different
# numbering for the same corpus.
# Ids start at 1, leaving 0 unused (presumably reserved as a padding value
# by downstream code -- confirm against the consumer of the pickle).
word2int = {word: idx for idx, word in enumerate(sorted(words), start=1)}
int2word = {idx: word for word, idx in word2int.items()}

tag2int = {tag: idx for idx, tag in enumerate(sorted(tags), start=1)}
int2tag = {idx: tag for tag, idx in tag2int.items()}
def _as_object_array(sequences):
    """Pack a list of Python lists into a 1-D NumPy object array.

    Each array element is one of the original lists.  Filling the array
    element by element is required because the lists differ in length:
    ``np.asarray`` on such ragged input raises ``ValueError`` on current
    NumPy releases, and would build a 2-D array if all lengths happened to
    be equal.
    """
    packed = np.empty(len(sequences), dtype=object)
    for idx, sequence in enumerate(sequences):
        packed[idx] = sequence
    return packed


# Replace every word / tag by its integer id, sentence by sentence.
# (The loop variables are deliberately not called `tags`, so the module-level
# tag set is not overwritten.)
X_train_numberised = [
    [word2int[word] for word in sentence] for sentence in X_train
]
Y_train_numberised = [
    [tag2int[tag] for tag in sentence_tags] for sentence_tags in Y_train
]

# Show one converted example; clamp the index so that small runs (fewer than
# 43 sentences) do not fail with an IndexError.
if X_train_numberised:
    sample_index = min(42, len(X_train_numberised) - 1)
    print('sample X_train_numberised: ', X_train_numberised[sample_index], '\n')
    print('sample Y_train_numberised: ', Y_train_numberised[sample_index], '\n')

# Variable-length sentences cannot form a rectangular array, so store them
# as 1-D object arrays holding one id list per sentence.
X_train_numberised = _as_object_array(X_train_numberised)
Y_train_numberised = _as_object_array(Y_train_numberised)
# Objects written to the pickle, in this fixed order: the numberised
# sentences, the numberised tag sequences, then the four lookup tables.
# Readers of data.pkl must unpack them in the same order.
pickle_files = [X_train_numberised, Y_train_numberised, word2int, int2word, tag2int, int2tag]

if not os.path.exists('PickledData/'):
    # The message names what is actually stored here: the corpus data.
    print('MAKING DIRECTORY PickledData/ to save pickled data file')
# exist_ok=True keeps this from failing if the directory is created by
# someone else between the check above and this call.
os.makedirs('PickledData/', exist_ok=True)

with open('PickledData/data.pkl', 'wb') as f:
    pickle.dump(pickle_files, f)

print('Saved as pickle file')