-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathdata_helper.py
142 lines (127 loc) · 4.75 KB
/
data_helper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
# encoding=utf8
import re
import jieba
import itertools
import os
from collections import defaultdict
import numpy as np
import io
import codecs
import pickle
import sys
# Python 2 only: make UTF-8 the interpreter-wide default codec so implicit
# str <-> unicode conversions of non-ASCII text do not raise.
# ``reload(sys)`` is needed because ``sys.setdefaultencoding`` is removed from
# the module namespace at interpreter start-up. On Python 3 ``reload`` is not
# a builtin and the default encoding is already UTF-8, so the hack is skipped.
if sys.version_info[0] < 3:
    reload(sys)  # noqa: F821 -- builtin on Python 2
    sys.setdefaultencoding('utf-8')
'''
input text data and its label
'''
def file_helper(input_file, output_dir='input_data'):
    """Split a ``label:content`` file into one text file per label.

    Every line of *input_file* is cut at its first ``':'``. The stripped text
    before the colon is the label, the stripped text after it is the content.
    Lines without a colon are skipped. For each label, a file named
    ``<label>.txt`` is written into *output_dir* holding that label's
    contents, one per line, in input order.

    Args:
        input_file: Path of the file to split; decoded as UTF-8.
        output_dir: Directory that receives the per-label files. Defaults to
            ``'input_data'``, the location this function always wrote to.
            The directory is created if it does not exist.
    """
    label2cont = defaultdict(list)
    # Undecodable bytes are dropped (errors='ignore'), which matches how
    # load_data_and_label reads the generated files back.
    with io.open(input_file, 'r', encoding='utf-8', errors='ignore') as src:
        # Stream the file instead of materialising every line first, and let
        # the context manager close the handle.
        for line in src:
            label, colon, content = line.partition(':')
            if not colon:
                continue
            label2cont[label.strip()].append(content.strip())

    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    for label, contents in label2cont.items():
        out_path = os.path.join(output_dir, label + '.txt')
        with io.open(out_path, 'w', encoding='utf-8') as dst:
            for content in contents:
                dst.write(content + u'\n')
def load_data_and_label(input_file, num_classes=12):
    """Load segmented sentences and one-hot labels from a directory.

    Each entry returned by ``os.listdir(input_file)`` is opened as a UTF-8
    text file (undecodable bytes ignored). The position of the entry in that
    listing is its class index. Every line is stripped, passed through
    ``clean_str`` and ``seperate_line``, and becomes one sample.

    Args:
        input_file: Directory containing one text file per class.
        num_classes: Width of the one-hot label vectors. Defaults to 12, the
            width that was previously hard-coded.

    Returns:
        A three-element list ``[x_train, y_train, label2str]``:
        ``x_train`` is the list of processed sentences for all classes in
        listing order; ``y_train`` is an integer array of shape
        ``(len(x_train), num_classes)`` with a single 1 per row;
        the third element is whatever the module-level name ``label2str``
        is bound to at call time.

    Raises:
        ValueError: If the directory holds more entries than *num_classes*.
    """
    class_files = os.listdir(input_file)
    if len(class_files) > num_classes:
        raise ValueError(
            'found %d class files in %r but label vectors only have %d slots'
            % (len(class_files), input_file, num_classes))

    x_train = []
    class_ids = []  # class index of each sample, parallel to x_train
    for class_idx, file_name in enumerate(class_files):
        path = os.path.join(input_file, file_name)
        with codecs.open(path, 'r', encoding='utf-8', errors='ignore') as f:
            for raw_line in f:
                x_train.append(seperate_line(clean_str(raw_line.strip())))
                class_ids.append(class_idx)

    # Build the label matrix in one shot. Unlike concatenating per-class
    # lists, this also works when a class file is empty or when there are no
    # class files at all.
    y_train = np.zeros((len(x_train), num_classes), dtype=int)
    y_train[np.arange(len(class_ids)), np.asarray(class_ids, dtype=int)] = 1

    # NOTE(review): the third element is the label2str *function*, not a
    # mapping. That looks unintended but is kept so existing callers that
    # unpack three values keep working -- confirm before changing.
    return [x_train, y_train, label2str]
def label2str(input_file):
    """Map class indices to class names for the files in a directory.

    The index of an entry is its position in ``os.listdir(input_file)``. The
    name is the file name with its final extension removed, so a file written
    as ``<label>.txt`` maps back to ``<label>`` even when the label itself
    contains dots.

    Args:
        input_file: Directory containing one file per class.

    Returns:
        A dict ``{index: class_name}``; empty if the directory is empty.
    """
    return {
        idx: os.path.splitext(file_name)[0]
        for idx, file_name in enumerate(os.listdir(input_file))
    }
# Runs of ASCII letters, digits and the listed ASCII / CJK punctuation marks.
_CLEAN_STR_NOISE_RE = re.compile(
    u'[A-Za-z0-9’!"#$%&\'()*+,-./:;<=>?@,。?★、…【】《》?“”‘’![\\]^_`{|}~]+')
# Runs of whitespace.
_CLEAN_STR_SPACE_RE = re.compile(r'\s+')


def clean_str(string):
    """Remove ASCII alphanumerics and punctuation and normalise whitespace.

    Every run of ASCII letters, digits or listed punctuation is replaced by a
    single space. Whitespace runs are then collapsed to one space and the
    result is stripped.

    Whitespace is collapsed *after* the punctuation pass because that pass
    inserts spaces itself; collapsing first would leave multi-space gaps.

    Args:
        string: Text to clean.

    Returns:
        The cleaned text, with no leading/trailing whitespace and no
        consecutive whitespace characters.
    """
    string = _CLEAN_STR_NOISE_RE.sub(' ', string)
    string = _CLEAN_STR_SPACE_RE.sub(' ', string)
    return string.strip()
def seperate_line(line):
    """Segment *line* with ``jieba.cut`` and join the pieces with spaces.

    Each segment produced by jieba is followed by exactly one space, so a
    non-empty result always ends with a trailing space.
    """
    segments = jieba.cut(line)
    return ''.join(segment + " " for segment in segments)
def batch_iter(data, batch_size, epoch_num, shuffle = True):
    """Yield mini-batches of *data* for a number of epochs.

    Args:
        data: Sequence of samples; converted with ``np.array``.
        batch_size: Maximum number of samples per batch.
        epoch_num: How many passes to make over the data.
        shuffle: If true, the samples are re-permuted with
            ``np.random.permutation`` at the start of every epoch.

    Yields:
        Array slices of at most *batch_size* samples. Only the last batch of
        an epoch may be shorter. Nothing is yielded for empty *data*.
    """
    data = np.array(data)
    data_size = len(data)
    # Ceiling of data_size / batch_size. The earlier expression
    # ``data_size - 1 / batch_size`` divided only the literal 1, giving
    # roughly data_size + 1 batches per epoch, almost all of them empty.
    batch_num_per_epoch = (data_size - 1) // batch_size + 1
    for _ in range(epoch_num):
        if shuffle:
            shuffle_indices = np.random.permutation(np.arange(data_size))
            shuffled_data = data[shuffle_indices]
        else:
            shuffled_data = data
        for batch_num in range(batch_num_per_epoch):
            start_idx = batch_num * batch_size
            end_idx = min(start_idx + batch_size, data_size)
            yield shuffled_data[start_idx:end_idx]
def padding_sentences(input_sentences, padding_token, padding_sentence_length = None):
    """Tokenise sentences on whitespace and pad/truncate them to one length.

    Args:
        input_sentences: Iterable of whitespace-separated strings.
        padding_token: Token appended to sentences that are too short.
        padding_sentence_length: Target length. When ``None`` the length of
            the longest tokenised sentence is used (0 for empty input).

    Returns:
        A tuple ``(sentences, length)`` where ``sentences`` is a list of
        token lists, each exactly ``length`` long, and ``length`` is the
        target length that was applied.
    """
    tokenized = [sentence.split() for sentence in input_sentences]
    if padding_sentence_length is not None:
        max_sentence_length = padding_sentence_length
    else:
        # max() of an empty sequence raises, so fall back to 0.
        max_sentence_length = max(len(tokens) for tokens in tokenized) if tokenized else 0

    # Build new lists: the earlier in-loop ``sentence = sentence[:n]`` only
    # rebound the loop variable, so over-long sentences were never truncated.
    padded = [
        tokens[:max_sentence_length]
        + [padding_token] * (max_sentence_length - len(tokens))
        for tokens in tokenized
    ]
    return (padded, max_sentence_length)
def saveDict(input_dict, output_file):
    """Pickle *input_dict* to the file at *output_file*.

    The file is opened in binary mode: pickle streams are bytes, and a
    text-mode handle fails on Python 3 and can corrupt the stream on
    platforms that translate newlines.

    Args:
        input_dict: Object to serialise.
        output_file: Destination path; overwritten if it exists.
    """
    with open(output_file, 'wb') as f:
        pickle.dump(input_dict, f)
def loadDict(dict_file):
    """Load and return the object pickled in *dict_file*.

    The file is opened in binary mode, which pickle requires.

    WARNING: unpickling can execute arbitrary code. Only call this on files
    produced by a trusted source.

    Args:
        dict_file: Path of a pickle file.

    Returns:
        The unpickled object.
    """
    with open(dict_file, 'rb') as f:
        return pickle.load(f)
if __name__ == '__main__':
    # Script entry point: print the class index -> class name mapping for
    # the files under input_data/.
    #file_helper("trainning_data/40_million_training_data_12/big_type_str_12.txt")
    #x_text, y = load_data_and_label("input_data/")
    #print "len of x_train is: ", len(x_text)
    #print "len of y_train is: ", y.shape
    #sentences, max_document_length = padding_sentences(x_text[0:100], '<PADDING>')
    #print "max document length = ", max_document_length
    # Use a distinct name so the module-level label2str function is not
    # replaced by its own result.
    label_names = label2str('input_data/')
    for k, v in label_names.items():
        # Single-argument print() emits the same text on Python 2 and 3; the
        # three spaces reproduce the separators of the earlier print statement.
        print('label : %d   class : %s' % (k, v))
    # sentences = [sentences.split( ) for sentences in x_text[0:5000]]