-
Notifications
You must be signed in to change notification settings - Fork 0
/
Combined_babelfy_babelnet_system.py
149 lines (120 loc) · 4.57 KB
/
Combined_babelfy_babelnet_system.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
# import urllib2
import ssl
import urllib
import urllib.parse
from urllib.request import urlopen
import urllib.request
import json
import gzip
from BabelNet_BabelFy_functions import babelfy_entities, babelNet
try:
from StringIO import StringIO
except ImportError:
from io import StringIO, BytesIO
# Babelfy disambiguation endpoint (the request itself is built inside the
# imported helper functions).
service_url = 'https://babelfy.io/v1/disambiguate'
# Sample sentence kept for reference only — it is overwritten by the empty
# string below before the file-driven loop starts.
text = 'February and April 2017, about 4.9 million people, over 40 percent of total population, are estimated to be severely food insecure and this figure is projected to reach 5.5 million people at the peak of the lean season in July. Although most food insecure ' \
'people are concentrated in the Greater Upper Nile region, food security has drastically deteriorated in former Northern Bahr el Ghazal State and the Greater Equatoria Region.'
lang = 'EN'
key = 'KEY'
text = ""
# Input format (inferred from the loop below): one token per line, sentences
# separated by blank lines — TODO confirm against Test_small_V1.txt.
text_file = open("Test_small_V1.txt", "r")
out_file = open("Babelfy_Babelnet_output.txt", "w")
try:
    for line in text_file:
        words = line.strip().split()
        if len(words) > 0:
            # Accumulate tokens until a blank line ends the sentence.
            text += words[0] + " "
        else:
            # Blank line: disambiguate the accumulated sentence.
            # `entities` is a list of [start_token, end_token] index pairs
            # (see the dead-code copy of babelfy_entities below, which only
            # emits pairs when start != end).
            entities = babelfy_entities(text, lang, key)
            all_words = text.split()
            index = 0
            print("why the hell are we getting ", entities)
            for entity_ind in entities:
                # Tokens before the entity are outside any entity ("O").
                while index < entity_ind[0]:
                    out_file.write(all_words[index] + " " + "O" + "\n")
                    index += 1
                entity_name = ""
                # BUG FIX: the original iterated over the [start, end] pair
                # itself, which dropped every middle word of an entity spanning
                # more than two tokens and advanced `index` by exactly 2
                # regardless of span width, desynchronizing later labels.
                # Walk the full start..end token span instead.
                for ent_ind in range(entity_ind[0], entity_ind[-1] + 1):
                    entity_name += all_words[ent_ind]
                    entity_name += " "
                index = entity_ind[-1] + 1
                entity_name += "LOCATION" + "\n"
                out_file.write(entity_name)
            # BUG FIX: tokens after the last entity were silently dropped;
            # label them "O" so every token of the sentence reaches the output.
            while index < len(all_words):
                out_file.write(all_words[index] + " " + "O" + "\n")
                index += 1
            out_file.write("\n")
            text = ""
finally:
    # Original never closed its handles; ensure the output is flushed.
    text_file.close()
    out_file.close()
"""
def babelfy_entities(text, lang, key):
params = {
'text' : text,
'lang' : lang,
'key' : key
}
url = service_url + '?' + urllib.parse.urlencode(params)
# request = urllib2.Request(url)
request = urllib.request.Request(url)
request.add_header('Accept-encoding', 'gzip')
gcontext = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
# response = urlopen(url, context=gcontext)
response=urlopen(request, context=gcontext)
entity_ids=[]
if response.info().get('Content-Encoding') == 'gzip':
# print ("we are here")
buf = BytesIO( response.read())
f = gzip.GzipFile(fileobj=buf)
data = json.loads(f.read())
# retrieving data
for result in data:
# retrieving token fragment
tokenFragment = result.get('tokenFragment')
tfStart = tokenFragment.get('start')
tfEnd = tokenFragment.get('end')
# print (str(tfStart) + "\t" + str(tfEnd))
# retrieving char fragment
charFragment = result.get('charFragment')
cfStart = charFragment.get('start')
cfEnd = charFragment.get('end')
# print (str(cfStart) + "\t" + str(cfEnd))
# print (str(cfStart) + "\t" + str(cfEnd)+ "\t" +text[cfStart:cfEnd+1])
ent_word=text[cfStart:cfEnd+1]
# retrieving BabelSynset ID
synsetId = result.get('babelSynsetID')
type=babelNet(synsetId, ent_word)
if type == "GEONM":
ind_entity = []
if tfStart != tfEnd:
ind_entity.append(tfStart)
ind_entity.append(tfEnd)
entity_ids.append(ind_entity)
# print (synsetId)
def babelNet(syset_ID, word):
service_url = 'https://babelnet.io/v5/getSenses'
lemma = word
syn_id = str(syset_ID)
lang = 'EN'
key = 'f49269e8-4de9-44b9-9259-f9cad5544413'
params = {
'lemma': lemma,
'id': syn_id, #"bn:02987985n",
'searchLang': lang,
'key': key
}
# url = service_url + '?' + urllib.urlencode(params)
url = service_url + '?' + urllib.parse.urlencode(params)
# request = urllib2.Request(url)
request = urllib.request.Request(url)
request.add_header('Accept-encoding', 'gzip')
gcontext = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
# response = urllib2.urlopen(request)
response = urlopen(request, context=gcontext)
if response.info().get('Content-Encoding') == 'gzip':
# buf = StringIO(response.read())
buf = BytesIO(response.read())
f = gzip.GzipFile(fileobj=buf)
data = json.loads(f.read())
print (data[0]['properties']['source'])
# retrieving BabelSense data
for result in data:
# lemma = result.get('lemma')
# language = result.get('language')
source = result.get('source')
# print (str(source))
"""