# -*- coding: utf-8 -*-
"""
This is a tokenizer for programming-specific or software-specific texts in Stack Overflow.
Such text is both social, e.g., people use ungrammatical and informal language in comments,
and domain-specific, e.g., printf() should be recognized as one token rather than 3 tokens printf, '(' and ')'.
Acknowledgement goes to Brendan O'Connor, Kevin Gimpel and Daniel Mills,
who are the authors of a Twitter tokenizer. This tokenizer modifies their Twitter one.
general philosophy is to throw as little out as possible.
development philosophy: every time you change a rule, do a diff of this
program's output on ~100k tweets. if you iterate through many possible rules
and only accept the ones that seeem to result in good diffs, it's a sort of
statistical learning with in-the-loop human evaluation :)
"""
import re, sys
import emoticons
#import util
mycompile = lambda pat: re.compile(pat, re.UNICODE)
def regex_or(*items):
    r = '|'.join(items)
    r = '(' + r + ')'
    return r
def pos_lookahead(r):
    return '(?=' + r + ')'
def neg_lookahead(r):
    return '(?!' + r + ')'
def optional(r):
    return '(%s)?' % r
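# For example, regex_or('a', 'b') -> '(a|b)', optional('x') -> '(x)?', and
# pos_lookahead('y') / neg_lookahead('y') -> '(?=y)' / '(?!y)'.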
PunctChars = r'''['“".?!,(/:;]'''
Punct = '%s+' % PunctChars
PunctSeq = r"""['`\"“”‘’)\]]+|[.?!,…]+|[:;/(\[]+"""
Entity = '&(amp|lt|gt|quot);'
# one-liner URL recognition:
#Url = r'''https?://\S+'''
# more complex version:
UrlStart1 = regex_or('https?://', r'www\.')
CommonTLDs = regex_or('com','co\\.uk','org','net','info','ca','edu','gov')
UrlStart2 = r'[a-z0-9\.-]+?' + r'\.' + CommonTLDs + pos_lookahead(r'[/ \W\b]')
UrlBody = r'[^ \t\r\n<>]*?' # * not + for case of: "go to bla.com." -- don't want period
UrlExtraCrapBeforeEnd = '%s+?' % regex_or(PunctChars, Entity)
UrlEnd = regex_or( r'\.\.+', r'[<>]', r'\s', '$') # / added by Deheng
Url = (r'\b' +
       regex_or(UrlStart1, UrlStart2) +
       UrlBody +
       pos_lookahead( optional(UrlExtraCrapBeforeEnd) + UrlEnd))
Url_RE = re.compile("(%s)" % Url, re.U|re.I)
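# Illustrative example: Url_RE.search(u'go to bla.com. now').group() gives
# u'bla.com' -- the lazy UrlBody plus the UrlEnd lookahead leave the trailing
# period behind.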
EmailName = r'([a-zA-Z0-9-_+]+[.])*[a-zA-Z0-9-_+]+'
EmailDomain = r'([a-zA-Z0-9]+[.])+' + CommonTLDs
Email = EmailName + "@" + EmailDomain
Email_RE = mycompile(Email)
API = regex_or(r'((\w+)[\.\-])+\w+\(\)', r'\w+\(\)', r'((\w+)[\.\-])+(\w+)', r'\.\w+\(\)', r'\.\w+')
API_RE = mycompile(API)
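# Illustrative examples of strings API_RE is meant to match whole:
# u'printf()', u'os.path.join()', u'.append()', and dotted names like
# u'java.util.List'.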
plural = r'\w+\(s\)'
plural_RE = mycompile(plural)
ProgrammingOperators = regex_or(r'==', r'!=', r'>=', r'<=', r'&&', r'\|\|')
Concat = r'\w+[/—]\w+'
Timelike = r'\d+:\d+'
NumNum = r'\d+\.\d+'
NumberWithCommas = r'(\d+,)+?\d{3}' + pos_lookahead(regex_or('[^,]','$'))
Abbrevs1 = ['am','pm','us','usa','ie','eg']
def regexify_abbrev(a):
    chars = list(a)
    icase = ["[%s%s]" % (c, c.upper()) for c in chars]
    dotted = [r'%s\.' % x for x in icase]
    return "".join(dotted)
Abbrevs = [regexify_abbrev(a) for a in Abbrevs1]
BoundaryNotDot = regex_or(r'\s', '[“"?!,:;]', Entity)
aa1 = r'''([A-Za-z]\.){2,}''' + pos_lookahead(BoundaryNotDot)
aa2 = r'''([A-Za-z]\.){1,}[A-Za-z]''' + pos_lookahead(BoundaryNotDot)
ArbitraryAbbrev = regex_or(aa1,aa2)
assert '-' != '―'
Separators = regex_or('--+', '―')
Decorations = r' [ ♫ ]+ '.replace(' ','')
EmbeddedApostrophe = r"\w+'\w+" # \S -> \w, by Deheng
Hashtag = r'#[a-zA-Z0-9_]+'
AtMention = r'@[a-zA-Z0-9_]+'
ProtectThese = [
    emoticons.Emoticon,
    Url,
    Email,
    Entity,
    Hashtag,
    AtMention,
    Timelike,
    #NumNum,
    NumberWithCommas,
    ArbitraryAbbrev,
    Separators,
    Decorations,
    EmbeddedApostrophe,
    API, # Deheng
    PunctSeq, # Deheng
    #plural, # Deheng
    ProgrammingOperators # Deheng
]
Protect_RE = mycompile(regex_or(*ProtectThese))
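# simple_tokenize() below emits every Protect_RE match verbatim as one token
# and only whitespace-splits the text between matches. Illustrative example:
# in u'meet at 10:30 , printf() fails', u'10:30' (Timelike) and u'printf()'
# (API) each come out as a single token.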
class Tokenization(list):
    """ list of tokens, plus extra info """
    def __init__(self):
        self.alignments = []
        self.text = ""
    def subset(self, tok_inds):
        new = Tokenization()
        new += [self[i] for i in tok_inds]
        new.alignments = [self.alignments[i] for i in tok_inds]
        new.text = self.text
        return new
def assert_consistent(t):
    assert len(t) == len(t.alignments)
    assert [t.text[t.alignments[i] : (t.alignments[i]+len(t[i]))] for i in range(len(t))] == list(t)
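# align() maps each token to its character offset in the original string by
# scanning forward. Illustrative example: align([u'a', u'bc'], u'a  bc')
# returns [0, 3].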
def align(toks, orig):
    s_i = 0
    alignments = [None]*len(toks)
    for tok_i in range(len(toks)):
        while True:
            L = len(toks[tok_i])
            if orig[s_i:(s_i+L)] == toks[tok_i]:
                alignments[tok_i] = s_i
                s_i += L
                break
            s_i += 1
            if s_i >= len(orig): raise AlignmentFailed((orig,toks,alignments))
            #if orig[s_i] != ' ': raise AlignmentFailed("nonspace advance: %s" % ((s_i,orig),))
    if any(a is None for a in alignments): raise AlignmentFailed((orig,toks,alignments))
    return alignments
class AlignmentFailed(Exception): pass
def unicodify(s, encoding='utf8', *args):
    if isinstance(s, unicode): return s
    if isinstance(s, str): return s.decode(encoding, *args)
    return unicode(s)
def tokenize(tweet):
    #tweet = re.sub(u'—', ' ', tweet) # Deheng
    #tweet = tweet.replace('\x97', ' ') # Deheng
    text = unicodify(tweet)
    text = re.sub(u'—', ' ', text) # Deheng
    text = re.sub(r'(?<=\w)/(\s|$)', ' ', text) # Deheng
    text = squeeze_whitespace(text)
    # Convert HTML escape sequences into their actual characters (so that the tokenizer and emoticon finder are not fooled).
    text = re.sub(r'&lt;', '<', text)
    text = re.sub(r'&gt;', '>', text)
    text = re.sub(r'&amp;', '&', text)
    t = Tokenization()
    t += simple_tokenize(text)
    t.text = text
    t.alignments = align(t, text)
    return t
def simple_tokenize(text):
    s = text
    s = edge_punct_munge(s)
    # Strictly alternate through the string: spans matched by Protect_RE ("bad")
    # are kept verbatim; the spans between them ("good") get whitespace-tokenized.
    # The first and last spans are good:
    # good bad good bad good bad good
    goods = []
    bads = []
    i = 0
    if Protect_RE.search(s):
        for m in Protect_RE.finditer(s):
            goods.append( (i, m.start()) )
            bads.append(m.span())
            i = m.end()
        goods.append( (m.end(), len(s)) )
    else:
        goods = [ (0, len(s)) ]
    assert len(bads)+1 == len(goods)
    goods = [s[i:j] for i,j in goods]
    bads = [s[i:j] for i,j in bads]
    # print goods
    # print bads
    goods = [unprotected_tokenize(x) for x in goods]
    res = []
    for i in range(len(bads)):
        res += goods[i]
        res.append(bads[i])
    res += goods[-1]
    #res = post_process(res)
    return res
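# Illustrative example: simple_tokenize(u'see bla.com now') should return
# [u'see', u'bla.com', u'now'], with u'bla.com' kept whole by the Url pattern.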
AposS = mycompile(r"(\S+)('s)$")
def post_process(pre_toks):
    # hacky: further splitting of certain tokens
    post_toks = []
    for tok in pre_toks:
        m = AposS.search(tok)
        if m:
            post_toks += m.groups()
        else:
            post_toks.append( tok )
    return post_toks
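# Note: post_process() is currently disabled (its call in simple_tokenize is
# commented out). If enabled, it splits possessives: u"dog's" becomes
# [u'dog', u"'s"].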
WS_RE = mycompile(r'\s+')
def squeeze_whitespace(s):
    new_string = WS_RE.sub(" ", s)
    return new_string.strip()
# fun: copy and paste outta http://en.wikipedia.org/wiki/Smart_quotes
#EdgePunct = r"""[ ' " “ ” ‘ ’ * < > « » { } ( \) [ \] ]""".replace(' ','')
# Removed < and > because they were causing problems with <3 emoticons
EdgePunct = r"""[ ' " “ ” ‘ ’ * « » { } ( \) [ \] ]""".replace(' ','')
#EdgePunct = r'[^a-zA-Z0-9\s]'
#NotEdgePunct = r"""[^'"([\)\]]""" # alignment failures?
NotEdgePunct = r"""[a-zA-Z0-9]"""
EdgePunctLeft = r"""(\s|^)(%s+)(%s)""" % (EdgePunct, NotEdgePunct)
# keep a trailing '()' (or '(s)') attached to the preceding word, to cover API calls like printf() and plurals like file(s)
EdgePunctRight = r"""(%s(?:\(s?\))?)(%s*)(\s|$)""" % (NotEdgePunct, EdgePunct)
EdgePunctLeft_RE = mycompile(EdgePunctLeft)
EdgePunctRight_RE= mycompile(EdgePunctRight)
def edge_punct_munge(s):
    s = EdgePunctLeft_RE.sub( r"\1\2 \3", s)
    # print s
    s = EdgePunctRight_RE.sub(r"\1 \2\3", s)
    # print s
    return s
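# Illustrative example: edge_punct_munge(u'"printf()"') gives u'" printf() "':
# the quotes are split off, while the call parentheses stay attached via the
# (?:\(s?\))? suffix in EdgePunctRight.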
def unprotected_tokenize(s):
    return s.split()
if __name__=='__main__':
    for line in sys.stdin:
        print u" ".join(tokenize(line[:-1])).encode('utf-8')
        #print "CUR\t" + " ".join(tokenize(line[:-1]))
        #print "WS\t" + " ".join(line[:-1].split())
        #print ansi.color(line.strip(),'red')
        #print ansi.color(" ".join(tokenize(line.strip())),'blue','bold')