-
Notifications
You must be signed in to change notification settings - Fork 0
/
baseline.py
61 lines (50 loc) · 2.02 KB
/
baseline.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
# -*- coding: utf-8 -*-
"""
Created on Sat Oct 1 11:36:07 2016
"""
import file_reader
def generate_baseline_files(input_folder_path,
                            output_folder_path,
                            baseline_dict):
    """Write a B/I/O-tagged copy of every text file found in a folder.

    The file names come from
    ``file_reader.list_all_text_files(input_folder_path)``. Each input file
    is read line by line and a file with the same name is written under
    ``output_folder_path``.

    Every non-blank input line is split on whitespace. Its first two fields
    are copied to the output, followed by a tag, all tab-separated:

    * ``O`` when the first field is not in ``baseline_dict``;
    * ``B`` when it is in ``baseline_dict`` and the previous line of the
      file was tagged ``O`` (or there was no previous line);
    * ``I`` when it is in ``baseline_dict`` and the previous line was
      tagged ``B`` or ``I``.

    A blank (whitespace-only) line is emitted as an empty line and resets
    the previous tag to ``O``.

    Args:
        input_folder_path: String prepended verbatim to each file name to
            build the path to read. No path separator is inserted, so it
            must already end with one if it names a directory.
        output_folder_path: String prepended verbatim to each file name to
            build the path to write; same separator caveat as above.
        baseline_dict: Container queried with ``in``; only membership of a
            line's first field is checked, stored values are not used.

    Raises:
        IndexError: If a non-blank line has fewer than two
            whitespace-separated fields.
    """
    # Take a snapshot of the names before any output is written, as the
    # original code did by copying them into a fresh list.
    all_text_files = list(file_reader.list_all_text_files(input_folder_path))

    for file_name in all_text_files:
        out_lines = []
        prev_tag = 'O'

        # `with` guarantees the input handle is closed even if a malformed
        # line raises part-way through the file.
        with open(input_folder_path + file_name, "r") as read_handle:
            for line in read_handle:
                if not line.strip():
                    # Blank line: keep it and restart the B/I chain.
                    out_lines.append("\n")
                    prev_tag = 'O'
                    continue

                fields = line.split()
                if fields[0] not in baseline_dict:
                    tag = 'O'
                elif prev_tag in ('B', 'I'):
                    # Continues a run of consecutive dictionary tokens.
                    tag = 'I'
                else:
                    # First dictionary token after an 'O' or a blank line.
                    tag = 'B'

                out_lines.append(
                    fields[0] + "\t" + fields[1] + "\t" + tag + "\n")
                prev_tag = tag

        # The input is fully read and closed before the output is opened,
        # so reading and writing the same path still works.
        with open(output_folder_path + file_name, "w") as write_handle:
            write_handle.write("".join(out_lines))
def generate_weasel_dictionary(folder_path):
    """Build a token -> tag dictionary from the tagged files in a folder.

    The file names come from
    ``file_reader.list_all_text_files(folder_path)``. Every non-blank line
    of each file is split on whitespace; when its third field is ``B`` or
    ``I``, the first field is stored as a key with that tag as its value.

    Args:
        folder_path: String prepended verbatim to each file name to build
            the path to read. No path separator is inserted, so it must
            already end with one if it names a directory.

    Returns:
        dict: Maps each first field seen with a ``B`` or ``I`` tag to that
        tag. If a token occurs several times, the tag of the last
        occurrence read is the one kept.

    Raises:
        IndexError: If a non-blank line has fewer than three
            whitespace-separated fields.
    """
    weasel_dict = {}

    for file_name in file_reader.list_all_text_files(folder_path):
        # `with` closes the handle even if a malformed line raises.
        with open(folder_path + file_name, "r") as read_handle:
            for line in read_handle:
                if not line.strip():
                    continue

                fields = line.split()
                if fields[2] in ('B', 'I'):
                    #
                    # Open question carried over from the original code:
                    # should we cross-check against the default weasel
                    # words before marking something as B? Could such
                    # tokens be marked I directly?
                    #
                    weasel_dict[fields[0]] = fields[2]

    return weasel_dict