-
Notifications
You must be signed in to change notification settings - Fork 0
/
BabelNet_BabelFy_functions.py
120 lines (96 loc) · 3.48 KB
/
BabelNet_BabelFy_functions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
# import urllib2
import ssl
import urllib
import urllib.parse
from urllib.request import urlopen
import urllib.request
import json
import gzip
try:
from StringIO import StringIO
except ImportError:
from io import StringIO, BytesIO
# Babelfy disambiguation endpoint; read by babelfy_entities() when it builds
# the request URL.
service_url = 'https://babelfy.io/v1/disambiguate'
# Module-level language code. babelfy_entities() takes its own `lang`
# parameter, so this value is only used if a caller passes it in explicitly.
lang = 'EN'
# Module-level API key value ('KEY'). As with `lang`, babelfy_entities()
# receives the key as a parameter rather than reading this global.
key = 'KEY'
def babelfy_entities(text, lang, key):
    """Return token positions of the entities in *text* sourced from "GEONM".

    The text is sent to the Babelfy disambiguation service (``service_url``).
    For every annotation in the response, the annotated substring and its
    BabelSynset ID are passed to ``babelNet``; annotations for which
    ``babelNet`` returns ``"GEONM"`` are kept.

    Args:
        text: The text to disambiguate.
        lang: Language code sent as the ``lang`` query parameter.
        key: API key sent as the ``key`` query parameter.

    Returns:
        A list with one entry per kept annotation. Each entry is
        ``[start]`` when the annotation covers a single token, or
        ``[start, end]`` (token-fragment indices as reported by the
        service) when it spans several tokens. Empty when nothing matched
        or when the service did not return a list of annotations.

    Raises:
        urllib.error.URLError: If the HTTP request fails.
        ValueError: If the response body is not valid JSON.
    """
    params = {
        'text': text,
        'lang': lang,
        'key': key,
    }
    url = service_url + '?' + urllib.parse.urlencode(params)
    request = urllib.request.Request(url)
    request.add_header('Accept-encoding', 'gzip')

    # The default context negotiates a current TLS version and verifies the
    # server certificate and hostname; a bare SSLContext(PROTOCOL_TLSv1)
    # pins the deprecated TLS 1.0 and verifies nothing.
    gcontext = ssl.create_default_context()
    with urlopen(request, context=gcontext) as response:
        payload = response.read()
        # gzip is only requested, not guaranteed: decompress when the server
        # says it compressed, otherwise use the body as-is.
        if response.info().get('Content-Encoding') == 'gzip':
            payload = gzip.decompress(payload)
    data = json.loads(payload.decode('utf-8'))

    entity_ids = []
    if not isinstance(data, list):
        # Anything other than a list (e.g. an error object) carries no
        # annotations to iterate over.
        return entity_ids

    for result in data:
        # Token-level span of the annotation.
        token_fragment = result.get('tokenFragment')
        tf_start = token_fragment.get('start')
        tf_end = token_fragment.get('end')

        # Character-level span; the end offset is used inclusively, hence +1.
        char_fragment = result.get('charFragment')
        cf_start = char_fragment.get('start')
        cf_end = char_fragment.get('end')
        ent_word = text[cf_start:cf_end + 1]

        synset_id = result.get('babelSynsetID')
        source = babelNet(synset_id, ent_word)
        if source != "GEONM":
            continue

        # Trace output retained from the original implementation.
        print("we are not able to reach here")
        if tf_start != tf_end:
            print("this is for multi-word entity...")
            ind_entity = [tf_start, tf_end]
        else:
            ind_entity = [tf_start]
        entity_ids.append(ind_entity)

    return entity_ids
# print (synsetId)
def babelNet(syset_ID, word, lang='EN', key=None):
    """Return the ``source`` of the first BabelNet sense found for *word*.

    Queries the BabelNet ``getSenses`` endpoint with the lemma and synset ID
    and reads ``['properties']['source']`` from the first element of the
    returned list.

    Args:
        syset_ID: BabelSynset identifier; converted to ``str`` and sent as
            the ``id`` query parameter.
        word: Lemma sent as the ``lemma`` query parameter.
        lang: Language code sent as ``searchLang``. Defaults to ``'EN'``.
        key: API key. When ``None`` the key embedded below is used.

    Returns:
        The source string of the first sense, or ``None`` when the service
        returned no senses or a payload that is not a list.

    Raises:
        urllib.error.URLError: If the HTTP request fails.
        ValueError: If the response body is not valid JSON.
    """
    if key is None:
        # NOTE(review): this credential is committed to source control;
        # prefer passing `key` explicitly and rotating this value.
        key = 'f49269e8-4de9-44b9-9259-f9cad5544413'

    service_url = 'https://babelnet.io/v5/getSenses'
    params = {
        'lemma': word,
        'id': str(syset_ID),
        'searchLang': lang,
        'key': key,
    }
    url = service_url + '?' + urllib.parse.urlencode(params)
    request = urllib.request.Request(url)
    request.add_header('Accept-encoding', 'gzip')

    # The default context negotiates a current TLS version and verifies the
    # server certificate and hostname; a bare SSLContext(PROTOCOL_TLSv1)
    # pins the deprecated TLS 1.0 and verifies nothing.
    gcontext = ssl.create_default_context()
    with urlopen(request, context=gcontext) as response:
        payload = response.read()
        # gzip is only requested, not guaranteed; previously an uncompressed
        # reply left `data` unbound and the function crashed.
        if response.info().get('Content-Encoding') == 'gzip':
            payload = gzip.decompress(payload)
    data = json.loads(payload.decode('utf-8'))

    # No senses (or a non-list payload such as an error object): report
    # "no source" instead of raising IndexError/KeyError on data[0].
    if not isinstance(data, list) or not data:
        return None

    source = data[0]['properties']['source']
    # Trace output retained from the original implementation.
    print(source)
    return source