This repository was archived by the owner on Sep 13, 2022. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 18
/
Copy pathTagger.py
64 lines (50 loc) · 2.16 KB
/
Tagger.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
# License: MIT
'''
:author: Nitin Madnani ([email protected])
:organization: ETS
'''
import ctypes as c
import logging
import os
class Tagger(object):
    """The ZPar English POS Tagger.

    Wraps three entry points of an already-loaded library object
    (``load_tagger``, ``tag_sentence`` and ``tag_file``) with ctypes
    signatures and exposes them as Python methods. Every library call
    receives the session object given to the constructor as its first
    argument.
    """

    def __init__(self, modelpath, libptr, zpar_session_obj):
        """Bind the library's tagger functions and load the tagger models.

        :param modelpath: Path passed (UTF-8 encoded) to the library's
            ``load_tagger`` function.
        :param libptr: Object exposing ``load_tagger``, ``tag_sentence``
            and ``tag_file`` attributes -- presumably the ctypes handle
            of the ZPar shared library (confirm against the caller).
        :param zpar_session_obj: Opaque session handle forwarded to every
            library call.
        :raises OSError: If ``load_tagger`` returns a non-zero value.
        """
        super(Tagger, self).__init__()

        # save the zpar session object
        self._zpar_session_obj = zpar_session_obj

        # set up a logger
        self.logger = logging.getLogger(__name__)

        # get the library method that loads the tagger models
        self._load_tagger = libptr.load_tagger
        self._load_tagger.restype = c.c_int
        self._load_tagger.argtypes = [c.c_void_p, c.c_char_p]

        # get the library methods that tag sentences and files
        self._tag_sentence = libptr.tag_sentence
        self._tag_sentence.restype = c.c_char_p
        self._tag_sentence.argtypes = [c.c_void_p, c.c_char_p, c.c_bool]

        self._tag_file = libptr.tag_file
        self._tag_file.restype = None
        self._tag_file.argtypes = [c.c_void_p, c.c_char_p, c.c_char_p, c.c_bool]

        # a non-zero return value from the loader is treated as failure
        if self._load_tagger(self._zpar_session_obj, modelpath.encode('utf-8')):
            raise OSError('Cannot find tagger model at {}\n'.format(modelpath))

    def tag_sentence(self, sentence, tokenize=True):
        """Tag a single sentence and return the library's output as text.

        :param sentence: The sentence to tag; leading and trailing
            whitespace is stripped before it is sent to the library.
        :param tokenize: Flag forwarded unchanged to the library's
            ``tag_sentence`` function.
        :returns: The UTF-8 decoded result of the library call, or the
            empty string if ``sentence`` is empty or whitespace-only (the
            library is not called in that case).
        """
        stripped = sentence.strip()

        # return empty string if the input is empty
        if not stripped:
            return ""

        # The library input is the stripped sentence terminated by a
        # newline plus a space -- presumably the format ZPar's reader
        # expects; kept exactly as the original sends it.
        zpar_compatible_sentence = (stripped + "\n ").encode('utf-8')
        tagged_sent = self._tag_sentence(self._zpar_session_obj,
                                         zpar_compatible_sentence,
                                         tokenize)
        return tagged_sent.decode('utf-8')

    def tag_file(self, inputfile, outputfile, tokenize=True):
        """Tag the contents of ``inputfile``, writing to ``outputfile``.

        :param inputfile: Path of the file to tag; must already exist.
        :param outputfile: Path handed to the library as the output file.
        :param tokenize: Flag forwarded unchanged to the library's
            ``tag_file`` function.
        :raises OSError: If ``inputfile`` does not exist.
        """
        if not os.path.exists(inputfile):
            raise OSError('File {} does not exist.'.format(inputfile))

        self._tag_file(self._zpar_session_obj,
                       inputfile.encode('utf-8'),
                       outputfile.encode('utf-8'),
                       tokenize)

    def cleanup(self):
        """Drop the references to the library functions and the session.

        The tagger cannot be used for tagging after this call.
        """
        self._load_tagger = None
        self._tag_sentence = None
        self._tag_file = None
        self._zpar_session_obj = None