preprocess_data.py
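"""Preprocess per-subreddit JSON dumps into sentence-per-line text corpora.

For each subreddit pair, this script reads data/results_<subreddit>.json,
cleans and tokenizes the 'selftext' field of every thread, and writes
shuffled train/dev/test files named sentiment.<split>.<0|1> under
preprocessed_data/<pair>/, finally archiving everything as
preprocessed_data.zip.
"""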
import json
import os
import sys
import nltk
import random
import math
import shutil
nltk.download('punkt')
if sys.version_info[0] < 3:
    raise Exception('You need to run this with Python 3.')

# Fraction of sentences assigned to each split.
train_split, dev_split, test_split = 0.8, 0.1, 0.1
# Field of each thread that holds the submission body text.
extraction_field = 'selftext'
path_prefix = "results_"
path_postfix = ".json"
# Subreddit pairs; each pair becomes one output directory holding two corpora.
pairs = [('Republican', 'Democrat'), ('learnpython', 'cpp'), ('mac', 'windows'),
         ('askmen', 'askwomen'), ('redpill', 'bluepill'),
         ('pokemon', 'digimon'), ('funny', 'sad'), ('stanford', 'berkeley'),
         ('bitcoin', 'ethereum'), ('ps3', 'xbox'), ('android', 'ios'),
         ('nike', 'adidas'), ('prolife', 'prochoice')]
input_dir = 'data/'
output_dir = 'preprocessed_data/'

if not os.path.exists(input_dir):
    raise ValueError('Data directory (%s) does not exist.' % input_dir)
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

for pair in pairs:
    pair_output_dir = os.path.join(output_dir, '%s_%s' % (pair[0], pair[1])).lower()
    if not os.path.exists(pair_output_dir):
        os.makedirs(pair_output_dir)
    for i, name in enumerate(pair):
        print(name)
        input_filename = path_prefix + name + path_postfix
        data = json.loads(open(os.path.join(input_dir, input_filename),
                               encoding='utf-8').read())
        # Use a set to deduplicate sentences.
        output_data = set()
        for thread in data:
            # Skip threads with no body text and posts marked removed or deleted.
            if (extraction_field in thread and thread[extraction_field] and
                    '[removed]' not in thread[extraction_field] and
                    '[deleted]' not in thread[extraction_field]):
                extracted_field = thread[extraction_field].lower()
                for sentence in extracted_field.split('.'):
                    # Tokenize and rejoin so punctuation is space-separated.
                    sentence = ' '.join(nltk.word_tokenize(sentence))
                    # Drop the Unicode replacement character.
                    for delete in ['�']:
                        sentence = sentence.replace(delete, '')
                    # Keep only Latin-1 letters, digits, spaces, and basic punctuation.
                    sentence = ''.join([c for c in sentence if ord(c) < 256 and
                                        (c == ' ' or c.isalnum() or c == '.' or c == ',' or
                                         c == ':' or c == '?' or c == '!')])
                    # Map standalone numbers to a placeholder token, then strip any
                    # remaining digits.
                    sentence = ' '.join([word if not word.isdigit() else 'num_num'
                                         for word in sentence.split(' ')])
                    sentence = ''.join([c for c in sentence if not c.isdigit()])
                    # Drop URL-like tokens and tokens without any alphanumeric character.
                    sentence = ' '.join([word for word in sentence.split(' ')
                                         if not any([block_word in word
                                                     for block_word in ['.net', '.org', 'x20',
                                                                        '.com', 'https', 'amp']])
                                         and any([c.isalnum() for c in word])])
                    # Collapse the double spaces left behind by the filters above.
                    sentence = sentence.replace('  ', ' ')
                    # print(sentence)
                    # Keep only sentences with more than five spaces (at least seven tokens).
                    if sentence.count(' ') > 5:
                        output_data.add(sentence + ' .')
        output_data = list(output_data)
        print(len(output_data))
        random.shuffle(output_data)
        # Slice the shuffled sentences into train/dev/test portions.
        train_data = output_data[:int(math.floor(len(output_data) * train_split))]
        dev_data = output_data[int(math.floor(len(output_data) * train_split)):
                               int(math.floor(len(output_data) *
                                              (train_split + dev_split)))]
        test_data = output_data[int(math.floor(len(output_data) *
                                               (train_split + dev_split))):]
        assert len(output_data) == len(train_data) + len(dev_data) + len(test_data)
        # Write one file per split; the suffix (0 or 1) marks the side of the pair.
        for split_data, split in [(train_data, "train"), (dev_data, "dev"),
                                  (test_data, "test")]:
            output_filename = 'sentiment.%s.%s' % (split, str(i))
            with open(os.path.join(pair_output_dir, output_filename), 'w') as file:
                for data in split_data:
                    try:
                        file.write(data + '\n')
                    except:
                        # Skip sentences the platform's default encoding cannot write.
                        pass

# Bundle all pair directories into preprocessed_data.zip.
shutil.make_archive('preprocessed_data', 'zip', 'preprocessed_data')