-
Notifications
You must be signed in to change notification settings - Fork 22
/
Copy pathcreate_structured_csv.py
58 lines (49 loc) · 2.05 KB
/
create_structured_csv.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
import pickle
import pandas as pd
import os
import glob
def _kg_basename(pickle_stem):
    """Return the knowledge-graph file stem encoded in a NER pickle stem.

    The first two underscore-separated fields of ``pickle_stem`` are dropped
    and the remaining fields are re-joined with underscores. An empty string
    is returned when nothing remains.
    """
    return "_".join(pickle_stem.split("_")[2:])


def _extract_triplets(df, entities):
    """Build the de-duplicated typed triplets for one intermediate table.

    Args:
        df: DataFrame whose first three columns are read, by position, as
            entity 1, relationship and entity 2.
        entities: Mapping from entity name to its type label.

    Returns:
        A list of ``(type1, entity1, relationship, type2, entity2)`` tuples,
        without duplicates, in the order they were first produced.
    """
    # A dict is used as an insertion-ordered set so the output row order is
    # reproducible from run to run.
    seen = {}
    # itertuples over iloc reads the columns strictly by position, whatever
    # the header labels are.
    rows = df.iloc[:, :3].itertuples(index=False, name=None)
    for subject, relation, obj in rows:
        # Missing cells are parsed as float NaN; such rows cannot be
        # stripped or tokenised, so they are skipped.
        if not isinstance(subject, str) or not isinstance(obj, str):
            continue
        subject = subject.strip()
        # Only rows whose entity 1 is a known entity are kept.
        if subject not in entities:
            continue
        subject_type = entities[subject]
        matched = False
        # One triplet per space-separated word of entity 2 that is itself a
        # known entity.
        for token in obj.split(" "):
            if token in entities:
                key = (subject_type, subject, relation, entities[token], obj)
                seen[key] = None
                matched = True
        # No word of entity 2 is a known entity: emit one row typed 'O'.
        if not matched:
            seen[(subject_type, subject, relation, "O", obj)] = None
    return list(seen)


def main():
    """Combine NER pickles and intermediate CSVs into typed triplet CSVs.

    For every ``*.pickle`` file in ``<cwd>/data/output/ner`` the entity
    mapping is loaded, the matching ``<cwd>/data/output/kg/<name>.txt-out.csv``
    is read (``<name>`` is the pickle stem minus its first two ``_`` fields),
    and the resulting triplets are written to
    ``<cwd>/data/result/<pickle stem>.csv``. The result directory is created
    when it does not exist. Progress is reported with ``print``.
    """
    curr_dir = os.getcwd()
    ner_dir = os.path.join(curr_dir, "data", "output", "ner")
    kg_dir = os.path.join(curr_dir, "data", "output", "kg")
    result_dir = os.path.join(curr_dir, "data", "result")
    os.makedirs(result_dir, exist_ok=True)

    # Sorted so files are processed (and reported) in a stable order.
    for file in sorted(glob.glob(os.path.join(ner_dir, "*.pickle"))):
        # basename instead of splitting on '/', so the name is extracted
        # correctly whatever separator glob returned.
        pickle_name = os.path.basename(file)
        # Everything before the first '.' is the stem.
        stem = pickle_name.split(".")[0]

        file_name = _kg_basename(stem)
        if not file_name:
            print("Skipping " + pickle_name + ": no name after the second '_'")
            continue

        # NOTE(security): pickle.load can execute arbitrary code; only run
        # this on pickle files produced by a trusted pipeline step.
        with open(file, "rb") as f:
            entities = pickle.load(f)

        print(file_name)
        df = pd.read_csv(os.path.join(kg_dir, file_name + ".txt-out.csv"))

        triplets = _extract_triplets(df, entities)

        # Column names (including the repeated 'Type') are kept as-is so the
        # output header is unchanged.
        processed_pd = pd.DataFrame(
            triplets,
            columns=['Type', 'Entity 1', 'Relationship', 'Type', 'Entity2'],
        )
        processed_pd.to_csv(
            os.path.join(result_dir, stem + '.csv'),
            encoding='utf-8',
            index=False,
        )

        print("Processed " + pickle_name)

    print("Files processed and saved")
# Run the conversion only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()