This repository has been archived by the owner on May 28, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 17
/
Copy pathannotate_timex
executable file
·166 lines (144 loc) · 8.37 KB
/
annotate_timex
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
#!/usr/bin/env python
#
# TERNIP: Temporal Expression Recognition and Normalisation in Python
#
import optparse
import sys
import logging
import ternip.formats
from ternip.formats.tern import TernDocument
from ternip.formats.timeml import TimeMlDocument
from ternip.formats.timex2 import Timex2XmlDocument
from ternip.formats.timex3 import Timex3XmlDocument
import ternip.rule_engine
from ternip.rule_engine.normalisation_rule_engine import NormalisationRuleEngine
from ternip.rule_engine.recognition_rule_engine import RecognitionRuleEngine
# Build the command-line interface. Options are grouped by purpose: the format
# of the input document, the recognition engine and the normalisation engine.
option_parser = optparse.OptionParser(usage='%prog [options] FILENAME', version=""" $Id$, """)

# Options describing the input document and any markup it already carries
io_group = optparse.OptionGroup(
    option_parser,
    "Format Options",
    "Options for dealing with the type of input and output files",
)
io_group.add_option(
    '-t', '--doctype', dest='doc_type', type='choice',
    choices=['timex2', 'timex3', 'tern', 'timeml'],
    help="The format of the document and resulting tags. Supported values: "
         "timex2 - XML document resulting in TIMEX2 tags; "
         "timex3 - XML document resulting in TIMEX3 tags; "
         "tern - a document from the TERN corpus; "
         "timeml - a document annotated with TimeML",
)
io_group.add_option(
    '-s', '--strip-timexes', dest='strip_timexes', default=False, action="store_true",
    help="If set, any timexes in the document are stripped, and then tagging starts afresh. "
         "If you don't enable this, feed in a document which already has TIMEXes in it "
         "and are doing recognition, you may end up with duplicate TIMEX tags.",
)
io_group.add_option(
    '-b', '--body-tag', dest='body_tag', default=None, type='string',
    help="If set, only the contents of this tag are tagged.",
)
io_group.add_option(
    '--s-tag', dest='has_S', metavar='S_tag', default=None, type='string',
    help="If set, this tag name is used to denote sentence boundaries. "
         "If unset, NLTK is used to tokenise.",
)
io_group.add_option(
    '--lex-tag', dest='has_LEX', metavar='LEX_tag', default=None, type='string',
    help="If set, this tag name is used to denote token boundaries. "
         "If unset, NLTK is used to tokenise.",
)
io_group.add_option(
    '--pos-attr', dest='pos_attr', metavar='POS_attr', default=None, type='string',
    help="If set, then this attribute on the tag set by --lex-tag is used to denote "
         "the POS tag of that token. If unset, NLTK is used for POS tagging.",
)
option_parser.add_option_group(io_group)

# Options selecting and configuring the TIMEX recognition engine
recog_group = optparse.OptionGroup(option_parser, "Recognition Rules")
recog_group.add_option(
    '-r', '--recognition-engine', dest='recognition_engine', type='choice',
    default='default', choices=['none', 'rule', 'default'],
    help="Selects the engine to use for TIMEX recognition. "
         "Defaults to the currently recommended TERNIP engine. "
         "Other options are 'rule' for the rule engine and 'none' to disable recognition "
         "(e.g., if the document already has TIMEXs annotated, but just needs normalising)",
)
recog_group.add_option(
    '--recognition-rules', dest='recognition_rules', type='string', default=None,
    help="Path to recognition rules. Defaults to ./rules/recognition/",
)
option_parser.add_option_group(recog_group)

# Options selecting and configuring the TIMEX normalisation engine
norm_group = optparse.OptionGroup(option_parser, "Normalisation Rules")
norm_group.add_option(
    '-n', '--normalisation-engine', dest='normalisation_engine', type='choice',
    default='default', choices=['none', 'rule', 'default'],
    # This help text used to be a verbatim copy of the recognition option's
    # text; it now describes normalisation.
    help="Selects the engine to use for TIMEX normalisation. "
         "Defaults to the currently recommended TERNIP engine. "
         "Other options are 'rule' for the rule engine and 'none' to disable normalisation "
         "(e.g., to just do recognition)",
)
norm_group.add_option(
    '--normalisation-rules', dest='normalisation_rules', type='string', default=None,
    help="Path to normalisation rules. Defaults to ./rules/normalisation/",
)
# The DCT override is registered on the parser itself, not on a group
option_parser.add_option(
    '-c', '--dct', dest='dct', default=None, type='string',
    help="The document creation time used as the basis for normalisation. "
         "If not set, it will attempt to be extracted from the document.",
)
option_parser.add_option_group(norm_group)
# Parse the command line; exactly one positional argument (the file to
# annotate) must be supplied.
options, args = option_parser.parse_args()
if len(args) != 1:
    # Only one file is processed per invocation, so show usage and bail out
    option_parser.print_help()
    if len(args) > 1:
        problem = "ERROR: multiple input files specified"
    else:
        problem = "ERROR: no input files specified"
    print >>sys.stderr, problem
    sys.exit(1)
input_file = args[0]
# Configure the 'ternip' logger: attach a default stream handler and only let
# warnings and more severe records through.
logger = logging.getLogger('ternip')
logger.addHandler(logging.StreamHandler())
logger.setLevel(logging.WARNING)
# Build the document object that matches --doctype. The input file is only
# opened once the option combination has been validated.
requested_type = options.doc_type
if requested_type in ('timex2', 'timex3'):
    # Plain XML documents take the optional body tag as a positional argument
    xml_document_class = Timex2XmlDocument if requested_type == 'timex2' else Timex3XmlDocument
    with open(input_file) as fd:
        doc = xml_document_class(
            fd.read(),
            options.body_tag,
            options.has_S,
            options.has_LEX,
            options.pos_attr,
        )
elif requested_type in ('timeml', 'tern'):
    # --body-tag is rejected for these document types
    if options.body_tag is not None:
        option_parser.print_help()
        print >>sys.stderr, "ERROR: incompatible options with document type"
        sys.exit(1)
    corpus_document_class = TimeMlDocument if requested_type == 'timeml' else TernDocument
    with open(input_file) as fd:
        doc = corpus_document_class(
            fd.read(),
            has_S=options.has_S,
            has_LEX=options.has_LEX,
            pos_attr=options.pos_attr,
        )
else:
    # Also reached when -t/--doctype was not given at all (doc_type is None)
    option_parser.print_help()
    print >>sys.stderr, "ERROR: invalid document type specified"
    sys.exit(1)

# Drop any TIMEX tags already present when the user asked to start afresh
if options.strip_timexes:
    doc.strip_timexes()

# Get internal representation
sents = doc.get_sents()
# Load correct recognition engine. 'none' leaves recogniser as None, which
# disables the recognition pass entirely.
if options.recognition_engine == 'none':
    recogniser = None
elif options.recognition_engine == 'default':
    recogniser = ternip.recogniser()
elif options.recognition_engine == 'rule':
    recogniser = RecognitionRuleEngine()
else:
    option_parser.print_help()
    print >>sys.stderr, "ERROR: invalid recognition engine specified"
    sys.exit(1)

# Load rules. A rule path is only meaningful for the 'rule' engine; any other
# combination just produces a warning.
if options.recognition_rules is not None and options.recognition_engine == 'rule':
    recogniser.load_rules(options.recognition_rules)
elif options.recognition_rules is None and options.recognition_engine == 'rule':
    print >>sys.stderr, "WARNING: no recognition rules set to load"
elif options.recognition_rules is not None and options.recognition_engine != 'rule':
    print >>sys.stderr, "WARNING: recognition rule path only valid when rule engine specified"

# Do recognition
if options.dct is None:
    # No DCT was given with -c, so the normalisation step reads it from these
    # sentences. Fetch them even when recognition is disabled ('-r none');
    # otherwise dct_sents would be undefined there and raise a NameError.
    dct_sents = doc.get_dct_sents()
    if recogniser is not None:
        dct_sents = recogniser.tag(dct_sents)
if recogniser is not None:
    sents = recogniser.tag(sents)
# Load correct normalisation engine. 'none' leaves normaliser as None, which
# disables the normalisation pass entirely.
if options.normalisation_engine == 'none':
    normaliser = None
elif options.normalisation_engine == 'default':
    normaliser = ternip.normaliser()
elif options.normalisation_engine == 'rule':
    # Rules are loaded exactly once, in the "Load rules" step below, and only
    # when a path was supplied. Loading them here as well would load them
    # twice, and would call load_rules(None) when no path was given.
    normaliser = NormalisationRuleEngine()
else:
    option_parser.print_help()
    print >>sys.stderr, "ERROR: invalid normalisation engine specified"
    sys.exit(1)

# Load rules. A rule path is only meaningful for the 'rule' engine; any other
# combination just produces a warning.
if options.normalisation_rules is not None and options.normalisation_engine == 'rule':
    normaliser.load_rules(options.normalisation_rules)
elif options.normalisation_rules is None and options.normalisation_engine == 'rule':
    print >>sys.stderr, "WARNING: no normalisation rules set to load"
elif options.normalisation_rules is not None and options.normalisation_engine != 'rule':
    print >>sys.stderr, "WARNING: normalisation rule path only valid when rule engine specified"

# Do normalisation
if normaliser is not None:
    dct = ''
    if options.dct is None:
        # Get DCT from normaliser: annotate the DCT sentences against a
        # placeholder reference date and write the result back to the document.
        # NOTE(review): dct_sents is expected to have been prepared by the
        # recognition step earlier in this script - confirm it is always set.
        normaliser.annotate(dct_sents, 'XXXXXXXX')
        doc.reconcile_dct(dct_sents)
        # Get dct value: the first timex with a non-None value wins
        for sent in dct_sents:
            for (tok, pos, ts) in sent:
                for t in ts:
                    if dct == '' and t.value is not None:
                        dct = t.value
    else:
        dct = options.dct
    # Now check if we got a dct; normalisation still runs with an empty one
    if dct == '':
        print >>sys.stderr, "WARNING: Could not determine document creation time, use -c to override"
    normaliser.annotate(sents, dct)
# Push the annotated sentences back into the document, then write the
# serialised document to standard output.
doc.reconcile(sents)
rendered = str(doc)
if options.doc_type == 'tern':
    # TERN output drops the first 22 characters of the serialisation
    # NOTE(review): presumably this removes a leading XML declaration - confirm
    rendered = rendered[22:]
print >>sys.stdout, rendered