This repository was archived by the owner on Sep 13, 2022. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 18
/
Copy pathDepParser.py
208 lines (177 loc) · 8.41 KB
/
DepParser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
# License: MIT
'''
:author: Nitin Madnani ([email protected])
:organization: ETS
'''
import ctypes as c
import logging
import os

# Do we have nltk installed and, if so, do we have its wordnet
# corpus installed?  Lemma annotation is only offered when both
# are present; the flag is checked by DepParser below.
try:
    import nltk
    nltk.data.find('corpora/wordnet')
except (ImportError, LookupError):
    _HAS_LEMMATIZER = False
else:
    _HAS_LEMMATIZER = True
    from nltk.stem.wordnet import WordNetLemmatizer
class DepParser(object):
    """The ZPar English Dependency Parser.

    Thin ``ctypes`` wrapper around the dependency-parsing entry points
    of the zpar shared library.  When NLTK and its WordNet corpus are
    installed (module-level ``_HAS_LEMMATIZER``), parses can optionally
    be annotated with WordNet lemmas.
    """

    def __init__(self, modelpath, libptr, zpar_session_obj):
        """Bind the C entry points and load the parser model.

        :param modelpath: path to the dependency parser model
        :param libptr: loaded ``ctypes`` handle to the zpar library
        :param zpar_session_obj: opaque zpar session pointer shared by
            all annotators created for this session
        :raises OSError: if the model cannot be loaded from ``modelpath``
        """
        super(DepParser, self).__init__()

        # save the zpar session object
        self._zpar_session_obj = zpar_session_obj

        # set up a logger
        self.logger = logging.getLogger(__name__)

        # get the library method that loads the parser models
        self._load_depparser = libptr.load_depparser
        self._load_depparser.restype = c.c_int
        self._load_depparser.argtypes = [c.c_void_p, c.c_char_p]

        # get the library methods that parse sentences and files,
        # declaring their C signatures so ctypes marshals arguments
        # and return values correctly
        self._dep_parse_sentence = libptr.dep_parse_sentence
        self._dep_parse_sentence.restype = c.c_char_p
        self._dep_parse_sentence.argtypes = [c.c_void_p, c.c_char_p, c.c_bool]

        self._dep_parse_file = libptr.dep_parse_file
        self._dep_parse_file.restype = None
        self._dep_parse_file.argtypes = [c.c_void_p, c.c_char_p,
                                         c.c_char_p, c.c_bool]

        self._dep_parse_tagged_sentence = libptr.dep_parse_tagged_sentence
        self._dep_parse_tagged_sentence.restype = c.c_char_p
        self._dep_parse_tagged_sentence.argtypes = [c.c_void_p, c.c_char_p,
                                                    c.c_char]

        self._dep_parse_tagged_file = libptr.dep_parse_tagged_file
        self._dep_parse_tagged_file.restype = None
        self._dep_parse_tagged_file.argtypes = [c.c_void_p, c.c_char_p,
                                                c.c_char_p, c.c_char]

        # a non-zero return code means the model could not be loaded
        if self._load_depparser(self._zpar_session_obj,
                                modelpath.encode('utf-8')):
            raise OSError('Cannot find dependency parser model at {}\n'.format(modelpath))

        # set up the wordnet lemmatizer if we have it
        self.lemmatizer = WordNetLemmatizer() if _HAS_LEMMATIZER else None

    def annotate_parse_with_lemmas(self, parse):
        """Append a WordNet lemma field to every line of a parse.

        :param parse: zpar output, one token per line with at least
            word and POS tag as the first two tab-separated fields
        :returns: the parse with a lemma column appended to each line
            (plus a trailing newline), or the input unchanged if it is
            empty/whitespace
        """
        if not parse.strip():
            return parse

        new_parse_lines = []
        for line in parse.strip().split('\n'):
            fields = line.strip().split('\t')
            word, pos = fields[:2]
            # map the Penn Treebank tag prefix to the WordNet POS code
            # the lemmatizer expects; anything unrecognized is treated
            # as a noun (the lemmatizer's own default)
            if pos.startswith('J'):
                param = 'a'
            elif pos.startswith('R'):
                param = 'r'
            elif pos.startswith('V'):
                param = 'v'
            else:
                param = 'n'
            lemma = self.lemmatizer.lemmatize(word.lower(), param)
            new_parse_lines.append('\t'.join(fields + [lemma]))
        return '\n'.join(new_parse_lines) + '\n'

    def dep_parse_sentence(self,
                           sentence,
                           tokenize=True,
                           with_lemmas=False):
        """Dependency-parse a single raw sentence.

        :param sentence: the sentence to parse
        :param tokenize: whether zpar should tokenize the input first
        :param with_lemmas: whether to append WordNet lemmas (requires
            NLTK and its WordNet corpus; otherwise a warning is logged)
        :returns: the parsed sentence as a string, or ``""`` for
            empty/whitespace input
        """
        if not sentence.strip():
            # return empty string if the input is empty
            ans = ""
        else:
            # zpar expects the sentence to end in a newline followed by
            # a space.  (Fix: the original applied this normalization
            # twice; the second pass was a no-op and has been removed.)
            zpar_compatible_sentence = sentence.strip() + "\n "
            parsed_sent = self._dep_parse_sentence(
                self._zpar_session_obj,
                zpar_compatible_sentence.encode('utf-8'),
                tokenize)
            ans = parsed_sent.decode('utf-8')

        # if we are asked to add lemma information, then we need
        # to add another field to each of the lines in the
        # parse returned from zpar
        if with_lemmas:
            if self.lemmatizer:
                ans = self.annotate_parse_with_lemmas(ans)
            else:
                self.logger.warning('No lemmatizer available. Please '
                                    'install NLTK and its Wordnet corpus.')
        return ans

    def dep_parse_file(self,
                       inputfile,
                       outputfile,
                       tokenize=True,
                       with_lemmas=False):
        """Dependency-parse a file of raw sentences, one per line.

        :param inputfile: path to the input file
        :param outputfile: path to write the parses to
        :param tokenize: whether zpar should tokenize each sentence
        :param with_lemmas: whether to append WordNet lemmas
        :raises OSError: if ``inputfile`` does not exist
        """
        if not os.path.exists(inputfile):
            raise OSError('File {} does not exist.'.format(inputfile))

        parsed = False
        # if we want lemmas, we have to individually parse
        # each sentence and then annotate its parse with lemmas
        if with_lemmas:
            if self.lemmatizer:
                # fix: open explicitly as UTF-8 to match the UTF-8
                # encoding used everywhere else in this class, rather
                # than relying on the locale default
                with open(inputfile, 'r', encoding='utf-8') as inputf, \
                        open(outputfile, 'w', encoding='utf-8') as outf:
                    for sentence in inputf:
                        outf.write(self.dep_parse_sentence(sentence,
                                                           tokenize=tokenize,
                                                           with_lemmas=True) + '\n')
                    parsed = True
            else:
                self.logger.warning('No lemmatizer available. Please '
                                    'install NLTK and its Wordnet corpus.')

        # otherwise we can just parse the whole file in C++ space
        if not parsed:
            self._dep_parse_file(self._zpar_session_obj,
                                 inputfile.encode('utf-8'),
                                 outputfile.encode('utf-8'),
                                 tokenize)

    def dep_parse_tagged_sentence(self,
                                  tagged_sentence,
                                  sep='/',
                                  with_lemmas=False):
        """Dependency-parse a sentence that is already POS-tagged.

        :param tagged_sentence: the sentence, tokens tagged as
            ``word<sep>TAG``
        :param sep: single character separating word and tag
        :param with_lemmas: whether to append WordNet lemmas (requires
            NLTK and its WordNet corpus; otherwise a warning is logged)
        :returns: the parsed sentence as a string, or ``""`` for
            empty/whitespace input
        """
        if not tagged_sentence.strip():
            # return empty string if the input is empty
            ans = ""
        else:
            zpar_compatible_sentence = tagged_sentence.strip().encode('utf-8')
            parsed_sent = self._dep_parse_tagged_sentence(
                self._zpar_session_obj,
                zpar_compatible_sentence,
                sep.encode('utf-8'))
            ans = parsed_sent.decode('utf-8')

        # if we are asked to add lemma information, then we need
        # to add another field to each of the lines in the
        # parse returned from zpar
        if with_lemmas:
            if self.lemmatizer:
                ans = self.annotate_parse_with_lemmas(ans)
            else:
                self.logger.warning('No lemmatizer available. Please '
                                    'install NLTK and its Wordnet corpus.')
        return ans

    def dep_parse_tagged_file(self, inputfile, outputfile, sep='/', with_lemmas=False):
        """Dependency-parse a file of POS-tagged sentences, one per line.

        :param inputfile: path to the input file
        :param outputfile: path to write the parses to
        :param sep: single character separating word and tag
        :param with_lemmas: whether to append WordNet lemmas
        :raises OSError: if ``inputfile`` does not exist
        """
        if not os.path.exists(inputfile):
            raise OSError('File {} does not exist.'.format(inputfile))

        parsed = False
        # if we want lemmas, we have to individually parse
        # each sentence and then annotate its parse with lemmas
        if with_lemmas:
            if self.lemmatizer:
                # fix: open explicitly as UTF-8 (see dep_parse_file)
                with open(inputfile, 'r', encoding='utf-8') as inputf, \
                        open(outputfile, 'w', encoding='utf-8') as outf:
                    for sentence in inputf:
                        outf.write(self.dep_parse_tagged_sentence(sentence,
                                                                  sep=sep,
                                                                  with_lemmas=with_lemmas) + '\n')
                    parsed = True
            else:
                self.logger.warning('No lemmatizer available. Please '
                                    'install NLTK and its Wordnet corpus.')

        # otherwise we can just parse the whole file in C++ space
        if not parsed:
            self._dep_parse_tagged_file(self._zpar_session_obj,
                                        inputfile.encode('utf-8'),
                                        outputfile.encode('utf-8'),
                                        sep.encode('utf-8'))

    def cleanup(self):
        """Drop all references to the C library and the zpar session."""
        self._load_depparser = None
        self._dep_parse_sentence = None
        self._dep_parse_file = None
        self._dep_parse_tagged_sentence = None
        self._dep_parse_tagged_file = None
        self._zpar_session_obj = None