-
Notifications
You must be signed in to change notification settings - Fork 4
/
data_parser.py
37 lines (33 loc) · 1.27 KB
/
data_parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
import os
import features
import string
import cPickle as pickle
def load(fileName):
    """Parse a labelled training file into per-sentence token lists.

    The file is read as consecutive line pairs: a sentence line followed by
    a label line. A label line is a comma-separated list of ``label:word``
    entries (for the current training datasets the labels are Date, Num -
    number of tickets, Dest - destination and Src - source location).

    Args:
        fileName: Path of the text file to read.

    Returns:
        A list with one entry per sentence. Each entry is a list of
        ``[word, category, label]`` lists, one per space-separated word of
        the sentence after ASCII punctuation has been removed. ``category``
        is the result of ``features.category(word)`` and ``label`` is the
        label given for that exact word on the label line, or ``'0'`` when
        the word has none.

    Note:
        Words are looked up after punctuation has been removed from the
        sentence and it has been split on single spaces, so a label whose
        text contains punctuation or a space never matches any word.
    """
    with open(fileName, 'r') as handle:
        lines = handle.read().splitlines()

    punctuation = frozenset(string.punctuation)
    sentences = []
    for start in range(0, len(lines), 2):
        sentence = lines[start]
        if start + 1 < len(lines):
            label_line = lines[start + 1]
        elif not sentence.strip():
            # A trailing blank line without a label line is not a sentence;
            # previously this raised IndexError.
            continue
        else:
            # Last sentence has no label line: treat every word as unlabelled.
            label_line = ''

        word_labels = {}
        for entry in label_line.split(','):
            label, colon, text = entry.partition(':')
            if not colon:
                # Skip entries without ':' (including the empty string an
                # empty label line splits into) instead of storing a bogus
                # mapping that could label empty words.
                continue
            word_labels[text] = label

        # Equivalent to the Python 2 only str.translate(None, punctuation),
        # but works on both Python 2 and Python 3.
        cleaned = ''.join(ch for ch in sentence if ch not in punctuation)

        tokens = []
        for word in cleaned.split(' '):
            category = features.category(word)
            tokens.append([word, category, word_labels.get(word, '0')])
        sentences.append(tokens)
    return sentences