-
Notifications
You must be signed in to change notification settings - Fork 10
/
Copy pathclean_data.py
26 lines (21 loc) · 890 Bytes
/
clean_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
import sys
import os
import codecs
import string
def analyseTrans(en_path="data/en_ts/ents_parallel.en",
                 ts_path="data/en_ts/ents_parallel.ts",
                 clean_en_path='clean_en.txt',
                 clean_ts_path='clean_ts.txt'):
    """De-duplicate a line-aligned parallel corpus by its English side.

    Line *i* of ``en_path`` is paired with line *i* of ``ts_path``.  The
    first occurrence of every distinct English line is kept, together with
    the translation found on the same line number; later repeats of that
    English line (and their paired translations) are dropped.  The kept
    pairs are written, one per line and in their original order, to
    ``clean_en_path`` and ``clean_ts_path``.

    Args:
        en_path: UTF-8 file holding the English sentences, one per line.
        ts_path: UTF-8 file holding the paired translations, one per line.
        clean_en_path: Output file for the de-duplicated English lines.
        clean_ts_path: Output file for the translations paired with them.

    Returns:
        A dict mapping each distinct English line to the number of times
        it occurred in ``en_path``.

    Raises:
        ValueError: If ``ts_path`` has fewer lines than ``en_path``, i.e.
            some English line has no translation to pair with.  Raised
            before either output file is created or truncated.
        IOError / OSError: If an input file cannot be read or an output
            file cannot be written.
    """
    # Load both sides completely *before* touching the outputs, so that a
    # missing or unreadable input does not leave truncated output files.
    # The ``with`` blocks guarantee the input handles are closed.
    with codecs.open(en_path, "r", "utf-8") as en_in:
        en_transcriptions = [line.rstrip('\n') for line in en_in]
    with codecs.open(ts_path, "r", "utf-8") as ts_in:
        t_transcriptions = [line.rstrip('\n') for line in ts_in]

    # Fail early with a clear message instead of an IndexError halfway
    # through writing.  Surplus translation lines are tolerated (ignored).
    if len(t_transcriptions) < len(en_transcriptions):
        raise ValueError(
            "parallel files are misaligned: {0} has {1} lines but {2} has "
            "only {3}".format(en_path, len(en_transcriptions),
                              ts_path, len(t_transcriptions)))

    global_trans_counts = {}
    with codecs.open(clean_en_path, 'w', 'utf-8') as eng_file, \
            codecs.open(clean_ts_path, 'w', 'utf-8') as t_file:
        for transcription, translation in zip(en_transcriptions,
                                              t_transcriptions):
            if transcription in global_trans_counts:
                # Repeat of an English line already written: count, skip.
                global_trans_counts[transcription] += 1
                continue
            global_trans_counts[transcription] = 1
            # Re-append the '\n' stripped while reading; without it every
            # kept sentence runs together on a single line and the two
            # output files are no longer line-aligned.
            eng_file.write(transcription + u'\n')
            t_file.write(translation + u'\n')
    return global_trans_counts
if __name__ == "__main__":
    # Script entry point: run the corpus clean-up only when this file is
    # executed directly, not when it is imported as a module.
    analyseTrans()