data_loader.py
import tensorflow as tf
from sklearn.utils import shuffle
import pandas as pd
import numpy as np
import os
import glob
import time
import math
from sklearn.model_selection import train_test_split
import ast
import scipy
import re
import sentencepiece as spm
import json
import tqdm
import swifter
from multiprocessing import Pool, cpu_count, freeze_support
import gc
import msgpack
from utils import replace_tokens, norm_len, replace_tandems
try:
    from urllib.parse import unquote
except ImportError:
    from urlparse import unquote
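
# This script builds the train/test dataset for the model:
#   1. joins sample metadata (.csv) with raw lines (.msgpack) by id,
#   2. normalizes and token-replaces every line in a multiprocessing Pool,
#   3. trains (or loads) a SentencePiece vocabulary and encodes lines as ids,
#   4. clips/pads the id sequences to max_seq_len and saves them as an .npz
#      archive together with a stats.json of the preprocessing parameters.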


def preprocessing_worker(x):
    # Normalize a single raw line; runs inside a worker process.
    x = replace_tokens(x, FLAGS.multy_replace)
    if FLAGS.multy_replace:
        # Truncate very long lines before collapsing repeated-token tandems.
        if len(x.split()) > FLAGS.max_seq_len:
            x = b" ".join(x.split()[:FLAGS.max_seq_len])
        x = replace_tandems(x)
    if FLAGS.tqdm_imap:
        gc.collect()
    return x


def main(argv=None):
    if not os.path.exists(FLAGS.output_dataset):
        os.mkdir(FLAGS.output_dataset)
    print("per class limiter - ", FLAGS.per_class_limiter)
    # Join per-sample metadata (csv) with the raw lines (msgpack) by id.
    info_df = pd.read_csv(FLAGS.dataset_info_file).set_index('id')
    with open(FLAGS.dataset_file, 'rb') as f:
        vectors = msgpack.unpackb(f.read())
    vectors_df = pd.DataFrame({'id': [v[0] for v in vectors],
                               'line': [v[1] for v in vectors]}).set_index('id')
    df = pd.concat([info_df, vectors_df], axis=1)
    df = df.drop_duplicates(subset=['line'])
    df = df.dropna()
    # Binary label: 1 for injection samples, 0 for benign ones.
    df['label'] = df.injection.apply(lambda x: 0 if not x else 1)
    print("df:\n", df.head())
    start_time = time.time()
    print("preprocessing data: ")
    with Pool(FLAGS.num_parallel_worker) as pool:
        if FLAGS.tqdm_imap:
            seq_in_data = tqdm.tqdm(
                pool.imap(preprocessing_worker, df["line"].values, FLAGS.chunksize),
                total=len(df["line"].values))
        else:
            seq_in_data = pool.map(
                preprocessing_worker,
                df["line"].values,
                FLAGS.chunksize)
        # Materialize the (possibly lazy) results before assigning the column.
        df["line"] = list(seq_in_data)
    print("end preprocessing: ", time.time() - start_time)
    # Drop lines that became empty after token replacement.
    df = df[df['line'] != b""]
    df = df[df['line'] != b" "]
    df = df[df['line'] != b"\n"]
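    # On a training run a fresh SentencePiece vocabulary is trained on the corpus;
    # on a test run the previously trained model (--spm_model) is loaded instead.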
    if not FLAGS.test:
        # Dump the corpus to disk and train a new SentencePiece model on it.
        with open("all_corpus.txt", "w") as f:
            for line in df.line.values:
                f.write(line.decode() + "\n")
        spm.SentencePieceTrainer.Train(
            '--input=./all_corpus.txt --model_prefix=' + FLAGS.output_dataset +
            '/main_vocab --vocab_size=' + str(FLAGS.vocab_size) +
            ' --model_type=unigram --hard_vocab_limit=false')
        sp = spm.SentencePieceProcessor()
        sp.Load(FLAGS.output_dataset + "/main_vocab.model")
    else:
        sp = spm.SentencePieceProcessor()
        sp.Load(FLAGS.spm_model)
    print(df.head())
    print("Encoding as Ids..")
    df["line"] = df["line"].apply(sp.EncodeAsIds)
    print("Calculating len of seqs..")
    # Real length of every sequence, clipped at max_seq_len.
    df["seq_in_len"] = df.line.apply(len).apply(
        lambda l: min(l, FLAGS.max_seq_len))
    print("Norming seqs..")
    # Pad or truncate every id sequence to exactly max_seq_len.
    df["line"] = df["line"].apply(
        lambda s: norm_len(s, FLAGS.max_seq_len))
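    # The encoded sequences, labels and true lengths go into
    # <output_dataset>/train.npz or <output_dataset>/test.npz, depending on --test.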
    if not FLAGS.test:
        postfix = "train"
    else:
        postfix = "test"
    np.savez(
        os.path.join(FLAGS.output_dataset, postfix),
        seq_in=np.array(df.line.values.tolist()),
        label=np.array(df.label.tolist()),
        seq_in_len=np.array(df.seq_in_len.values.tolist())
    )
print("\nBalance: 0:1 - {}:{}".format(len(df[df.injection == 0].values),
len(df[df.injection == 1].values)))
print(np.array(df.line.values.tolist()).shape)
stats = {"max_seq_len": FLAGS.max_seq_len,
"multy_replace": FLAGS.multy_replace}
with open(os.path.join(FLAGS.output_dataset, "stats.json"), "w") as f:
json.dump(stats, f)


if __name__ == '__main__':
    freeze_support()
    # FLAGS are defined under the __main__ guard; the preprocessing workers rely
    # on fork-based process start (the default on Linux) to inherit them.
    FLAGS = tf.app.flags.FLAGS
    tf.app.flags.DEFINE_boolean(
        'test', False, 'Process the test dataset instead of the train dataset')
    tf.app.flags.DEFINE_string(
        'output_dataset', './dataset', 'Path for storing the processed dataset')
    tf.app.flags.DEFINE_string(
        'dataset_info_file', './raw_dataset/train.csv', 'Path to the .csv file with dataset metadata')
    tf.app.flags.DEFINE_string(
        'dataset_file', './raw_dataset/dataset.msgpack', 'Path to the .msgpack file with vectors')
    tf.app.flags.DEFINE_integer(
        'max_seq_len', 800, 'Maximum sequence length')
    tf.app.flags.DEFINE_integer(
        'vocab_size', 4000, '(optional) SentencePiece vocabulary size; required if test is False')
    tf.app.flags.DEFINE_string(
        'spm_model', './dataset/main_vocab.model', '(optional) Path to a trained SentencePiece model')
    tf.app.flags.DEFINE_boolean(
        'multy_replace', True, 'If True, replace consecutive tokens from one group with a single token')
    tf.app.flags.DEFINE_integer(
        'per_class_limiter', 290000, 'Limit on the number of samples per class (for the dataset without nn errors); set 0 to disable')
    tf.app.flags.DEFINE_integer(
        'num_parallel_worker', cpu_count() - 2, 'Number of parallel workers; default is cpu_count - 2')
    tf.app.flags.DEFINE_integer(
        'chunksize', 500, 'Chunk size for the multiprocessing Pool')
    tf.app.flags.DEFINE_boolean(
        'tqdm_imap', False, 'Show a progress bar during preprocessing. Warning: experimental, may lead to MemoryError')
    tf.app.run()
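
# Example invocations (paths and values are illustrative):
#   python data_loader.py --dataset_info_file=./raw_dataset/train.csv \
#       --dataset_file=./raw_dataset/dataset.msgpack --output_dataset=./dataset
#   python data_loader.py --test=True --spm_model=./dataset/main_vocab.model \
#       --dataset_info_file=./raw_dataset/test.csv --output_dataset=./dataset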