#!/usr/bin/env python
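"""Train a part-of-speech tagger on an NLTK tagged corpus.

Builds a chain of sequential backoff taggers, optionally adds a
classifier-based tagger and/or a Brill tagger on top, evaluates the result,
and pickles it (by default to ~/nltk_data/taggers).
"""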
import argparse, math, itertools, os.path
import nltk.corpus, nltk.data
import nltk_trainer.classification.args
from nltk.classify import DecisionTreeClassifier, MaxentClassifier, NaiveBayesClassifier
# special case corpus readers
from nltk.corpus.reader import SwitchboardCorpusReader, NPSChatCorpusReader, IndianCorpusReader
from nltk.corpus.util import LazyCorpusLoader
from nltk.tag import ClassifierBasedPOSTagger
from nltk_trainer import dump_object, load_corpus_reader, simplify_wsj_tag
from nltk_trainer.tagging import readers
from nltk_trainer.tagging.training import train_brill_tagger
from nltk_trainer.tagging.taggers import PhoneticClassifierBasedPOSTagger
########################################
## command options & argument parsing ##
########################################
parser = argparse.ArgumentParser(description='Train an NLTK tagger',
    formatter_class=argparse.RawTextHelpFormatter)
parser.add_argument('corpus',
    help='''The name of a tagged corpus included with NLTK, such as treebank,
brown, cess_esp, floresta, or the root path to a corpus directory,
which can be either an absolute path or relative to a nltk_data directory.''')
parser.add_argument('--filename',
    help='''filename/path for where to store the pickled tagger.
The default is {corpus}_{algorithm}.pickle in ~/nltk_data/taggers''')
parser.add_argument('--no-pickle', action='store_true', default=False,
    help="Don't pickle and save the tagger")
parser.add_argument('--trace', default=1, type=int,
    help='How much trace output you want, defaults to %(default)d. 0 is no trace output.')

corpus_group = parser.add_argument_group('Corpus Reader Options')
corpus_group.add_argument('--reader', default=None,
    help='''Full module path to a corpus reader class, such as
nltk.corpus.reader.tagged.TaggedCorpusReader''')
corpus_group.add_argument('--fileids', default=None,
    help='Specify fileids to load from corpus')
corpus_group.add_argument('--fraction', default=1.0, type=float,
    help='Fraction of corpus to use for training, defaults to %(default)f')

tagger_group = parser.add_argument_group('Tagger Choices')
tagger_group.add_argument('--default', default='-None-',
    help='''The default tag "%(default)s". Set this to a different tag, such as "NN",
to change the default tag.''')
tagger_group.add_argument('--backoff', default=None,
    help='Path to pickled backoff tagger. If given, replaces default tagger.')

if simplify_wsj_tag:
    tagger_group.add_argument('--simplify_tags', action='store_true', default=False,
        help='Use simplified tags')
else:
    tagger_group.add_argument('--tagset', default=None,
        help='Map tags to a given tagset, such as "universal"')

sequential_group = parser.add_argument_group('Sequential Tagger')
sequential_group.add_argument('--sequential', default='aubt',
    help='''Sequential Backoff Algorithm. This can be any combination of the following letters:
    a: AffixTagger
    u: UnigramTagger
    b: BigramTagger
    t: TrigramTagger
The default is "%(default)s", but you can set this to the empty string
to not train a sequential backoff tagger.''')
sequential_group.add_argument('-a', '--affix', action='append', type=int,
    help='''Add affixes to use for one or more AffixTaggers.
Negative numbers are suffixes, positive numbers are prefixes.
You can use this option multiple times to create multiple AffixTaggers with different affixes.
The affixes will be used in the order given.''')

brill_group = parser.add_argument_group('Brill Tagger Options')
brill_group.add_argument('--brill', action='store_true', default=False,
    help='Train a Brill Tagger in front of the other tagger.')
brill_group.add_argument('--template_bounds', type=int, default=1,
    help='''Choose the max bounds for Brill Templates to train a Brill Tagger.
The default is %(default)d.''')
brill_group.add_argument('--max_rules', type=int, default=200)
brill_group.add_argument('--min_score', type=int, default=2)

classifier_group = parser.add_argument_group('Classifier Based Tagger')
classifier_group.add_argument('--classifier', nargs='*',
    choices=nltk_trainer.classification.args.classifier_choices,
    help='''ClassifierBasedPOSTagger algorithm to use, default is %(default)s.
Maxent uses the default Maxent training algorithm, either CG or iis.''')
classifier_group.add_argument('--cutoff_prob', default=0, type=float,
    help='Cutoff probability for classifier tagger to backoff to previous tagger')

phonetic_group = parser.add_argument_group('Phonetic Feature Options for a Classifier Based Tagger')
phonetic_group.add_argument('--metaphone', action='store_true',
    default=False, help='Use metaphone feature')
phonetic_group.add_argument('--double-metaphone', action='store_true',
    default=False, help='Use double metaphone feature')
phonetic_group.add_argument('--soundex', action='store_true',
    default=False, help='Use soundex feature')
phonetic_group.add_argument('--nysiis', action='store_true',
    default=False, help='Use NYSIIS feature')
phonetic_group.add_argument('--caverphone', action='store_true',
    default=False, help='Use caverphone feature')

nltk_trainer.classification.args.add_maxent_args(parser)
nltk_trainer.classification.args.add_decision_tree_args(parser)

eval_group = parser.add_argument_group('Tagger Evaluation',
    'Evaluation metrics for part-of-speech taggers')
eval_group.add_argument('--no-eval', action='store_true', default=False,
    help="don't do any evaluation")
# TODO: word coverage of test words, how many get a tag != '-NONE-'

args = parser.parse_args()
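# Example invocation (a sketch; assumes the treebank corpus is available in
# your nltk_data): python train_tagger.py treebank --sequential ubt --brill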
###################
## corpus reader ##
###################
if args.trace:
    print('loading %s' % args.corpus)
tagged_corpus = load_corpus_reader(args.corpus, reader=args.reader, fileids=args.fileids)
fileids = args.fileids
kwargs = {}
# all other corpora are assumed to support the simplify_tags kwarg
if simplify_wsj_tag and args.simplify_tags and args.corpus not in ['conll2000', 'switchboard', 'pl196x']:
    kwargs['simplify_tags'] = True
# these corpora do not support simplify_tags, and have no known workaround
elif simplify_wsj_tag and args.simplify_tags and args.corpus in ['pl196x']:
    raise ValueError('%s does not support simplify_tags' % args.corpus)
elif not simplify_wsj_tag and args.tagset:
    kwargs['tagset'] = args.tagset

    if args.trace:
        print('using %s tagset' % args.tagset)
if isinstance(tagged_corpus, SwitchboardCorpusReader):
    if fileids:
        raise ValueError('fileids cannot be used with switchboard')

    tagged_sents = list(itertools.chain(*[[list(s) for s in d if s] for d in tagged_corpus.tagged_discourses(**kwargs)]))
elif isinstance(tagged_corpus, NPSChatCorpusReader):
    tagged_sents = tagged_corpus.tagged_posts(**kwargs)
else:
    if isinstance(tagged_corpus, IndianCorpusReader) and not fileids:
        fileids = 'hindi.pos'

    if fileids and fileids in tagged_corpus.fileids():
        kwargs['fileids'] = [fileids]

        if args.trace:
            print('using tagged sentences from %s' % fileids)

    tagged_sents = tagged_corpus.tagged_sents(**kwargs)

# manual simplification is needed for these corpora
if simplify_wsj_tag and args.simplify_tags and args.corpus in ['conll2000', 'switchboard']:
    tagged_sents = [[(word, simplify_wsj_tag(tag)) for (word, tag) in sent] for sent in tagged_sents]
##################
## tagged sents ##
##################
# can't trust corpus to provide valid list of sents (indian)
tagged_sents = [sent for sent in tagged_sents if sent]
nsents = len(tagged_sents)
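# with the default --fraction 1.0, train and test sets are identical, so the
# tagger is evaluated on its own training data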
if args.fraction == 1.0:
    train_sents = test_sents = tagged_sents
else:
    cutoff = int(math.ceil(nsents * args.fraction))
    train_sents = tagged_sents[:cutoff]
    test_sents = tagged_sents[cutoff:]
if args.trace:
    print('%d tagged sents, training on %d' % (nsents, len(train_sents)))
####################
## default tagger ##
####################
if args.backoff:
    if args.trace:
        print('loading backoff tagger %s' % args.backoff)

    tagger = nltk.data.load(args.backoff)
else:
    tagger = nltk.tag.DefaultTagger(args.default)
################################
## sequential backoff taggers ##
################################
# NOTE: passing in verbose=args.trace doesn't produce useful printouts
def affix_constructor(train_sents, backoff=None):
    affixes = args.affix or [-3]

    for affix in affixes:
        if args.trace:
            print('training AffixTagger with affix %d and backoff %s' % (affix, backoff))

        # abs() keeps min_stem_length positive for suffix (negative) affixes
        backoff = nltk.tag.AffixTagger(train_sents, affix_length=affix,
            min_stem_length=min(abs(affix), 2), backoff=backoff)

    return backoff
def ngram_constructor(cls):
    def f(train_sents, backoff=None):
        if args.trace:
            print('training %s tagger with backoff %s' % (cls, backoff))

        # TODO: args.cutoff option
        return cls(train_sents, backoff=backoff)

    return f
sequential_constructors = {
    'a': affix_constructor,
    'u': ngram_constructor(nltk.tag.UnigramTagger),
    'b': ngram_constructor(nltk.tag.BigramTagger),
    't': ngram_constructor(nltk.tag.TrigramTagger)
}
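# each constructor wraps the current tagger as its backoff, so the default
# '--sequential aubt' yields TrigramTagger -> BigramTagger -> UnigramTagger
# -> AffixTagger -> default/backoff tagger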
if args.sequential:
    for c in args.sequential:
        if c not in sequential_constructors:
            raise NotImplementedError('%s is not a valid sequential backoff tagger' % c)

        constructor = sequential_constructors[c]
        tagger = constructor(train_sents, backoff=tagger)
#######################
## classifier tagger ##
#######################
if args.classifier:
    kwargs = {
        'train': train_sents,
        'verbose': args.trace,
        'backoff': tagger,
        'cutoff_prob': args.cutoff_prob,
        'classifier_builder': nltk_trainer.classification.args.make_classifier_builder(args)
    }

    phonetic_keys = ['metaphone', 'double_metaphone', 'soundex', 'nysiis', 'caverphone']

    if any(getattr(args, key) for key in phonetic_keys):
        cls = PhoneticClassifierBasedPOSTagger

        for key in phonetic_keys:
            kwargs[key] = getattr(args, key)
    else:
        cls = ClassifierBasedPOSTagger

    if args.trace:
        print('training %s %s' % (args.classifier, cls.__name__))

    tagger = cls(**kwargs)
##################
## brill tagger ##
##################
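# the tagger built so far serves as the initial tagger for Brill
# transformation-based learning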
if args.brill:
    tagger = train_brill_tagger(tagger, train_sents, args.template_bounds,
        trace=args.trace, max_rules=args.max_rules, min_score=args.min_score)
################
## evaluation ##
################
if not args.no_eval:
    print('evaluating %s' % tagger.__class__.__name__)
    print('accuracy: %f' % tagger.evaluate(test_sents))
##############
## pickling ##
##############
if not args.no_pickle:
    if args.filename:
        fname = os.path.expanduser(args.filename)
    else:
        # use the last part of the corpus name/path as the prefix
        parts = [os.path.split(args.corpus.rstrip('/'))[-1]]

        if args.brill:
            parts.append('brill')

        if args.classifier:
            parts.append('_'.join(args.classifier))

        if args.sequential:
            parts.append(args.sequential)

        name = '%s.pickle' % '_'.join(parts)
        fname = os.path.join(os.path.expanduser('~/nltk_data/taggers'), name)

    dump_object(tagger, fname, trace=args.trace)
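# Loading the pickled tagger later (a sketch; the actual filename depends on
# the corpus and options used above):
#   import nltk.data
#   tagger = nltk.data.load('taggers/treebank_aubt.pickle')
#   print(tagger.tag(['A', 'simple', 'example']))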