-
Notifications
You must be signed in to change notification settings - Fork 17
/
charSimilarity.py
357 lines (319 loc) · 13.5 KB
/
charSimilarity.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
versionRCS='$Id: charSimilarity.py,v 1.11 2008/05/15 18:21:47 black Exp $'
# *created "Sat Feb 9 08:54:30 2008" *by "Paul E. Black"
versionMod=' *modified "Thu May 15 14:13:04 2008" *by "Paul E. Black"'
#----------------------------------------------------------------------------
#
# Return the basic visual similarity between two characters, like a and o,
# or between a pair of characters and a character, like rn and m. Context
# adjustments, for instance, leading characters are more carefully
# scrutinized than other characters and iiii is similar to iiiii, are
# handled elsewhere.
#
# Character similarity ranges from 0, completely different, to 1,
# visually indistinguishable (at least in some font).
#
# Similarity is based on typical fonts, like Courier and Times. We ignore
# "fancy" fonts, like script - they're rarely used for URLs and have an
# entirely different set of similarities.
#
# Top-Level Domains are case insensitive. To model this, all comparisons
# are done in lower case, even if it is the upper case version that causes
# confusion, like O and Q.
#
# Similarity is symmetric. SIM_SCORE(X, Y) = SIM_SCORE(Y, X).
#
#
# This software was developed at the National Institute of Standards
# and Technology by employees of the Federal Government in the course
# of their official duties. Pursuant to title 17 Section 105 of the
# United States Code this software is not subject to copyright
# protection and is in the public domain. This software is an
# experimental system. NIST assumes no responsibility whatsoever for
# its use by other parties, and makes no guarantees, expressed or
# implied, about its quality, reliability, or any other
# characteristic.
#
# We would appreciate acknowledgment if this software is used.
#
# This software may be redistributed and/or modified freely provided
# that any modified versions bear some notice that they were modified.
#
# Paul E. Black [email protected]
#
#----------------------------------------------------------------------------
# Even though similarity is symmetric, the tables only have one of the pairs.
# That is, the tables don't have both X Y and Y X.
# single character similarity table
scsimtab = {
('0', 'O'): .9,
('1', '7'): .5, # for European 1s
('1', 'i'): .9, # in some fonts
('1', 'l'): 1, # identical in some fonts
('2', 'Z'): .2,
('3', 'E'): .1, # maybe?
('4', 'A'): .1, # in some fonts
('4', 'H'): .1, # in some fonts
('4', '9'): .2, # in some fonts
('5', 'S'): .2,
('6', 'b'): .3,
('8', 'B'): .3,
('9', 'P'): .1, # maybe?
('a', 'c'): .2,
('a', 'd'): .2,
('a', 'e'): .2,
('a', 'o'): .3,
('A', 'H'): .1,
('b', 'd'): .2,
#('B', 'D'): .1, more similar as lower case, so ignore
('b', 'h'): .2,
('B', 'E'): .1,
('B', 'P'): .2,
('B', 'R'): .2,
('c', 'e'): .2,
('c', 'o'): .3,
#('C', 'O'): .2, more similar as lower case, so ignore
('C', 'G'): .2,
('d', 'o'): .2,
('e', 'o'): .2,
('E', 'F'): .2,
('F', 'P'): .1,
('f', 't'): .3,
('g', 'q'): .2, # in some fonts
('G', 'O'): .1,
('h', 'k'): .1,
#('H', 'K'): .1, as similar as lower case, so ignore
('h', 'n'): .4,
#('H', 'N'): .2, more similar as lower case, so ignore
('i', 'j'): .5, # in some fonts
('I', 'l'): 1, # identical in some fonts
('K', 'X'): .1,
('m', 'n'): .1, # proportional fonts
('n', 'r'): .1,
('o', 'p'): .1,
('O', 'Q'): .4,
('p', 'q'): .1,
('P', 'R'): .2,
('u', 'v'): .1,
('v', 'w'): .1,
('v', 'x'): .1,
('v', 'y'): .2,
# TO TEST SELF-TEST
#('y', 'x'): .1,
('x', 'y'): .1
}
# double character similarity table
dcsimtab = {
('cl', 'd'): .4, # in proportional fonts
('fl', 'f'): .5, # in fonts with ligatures
('mn', 'nm'): .5, # in proportional fonts
('nn', 'm'): .5, # in proportional fonts
('AA', 'M'): .2,
('rn', 'm'): 1, # I misread "warns" as "wams" once!
('VV', 'W'): .8, # in proportional fonts
}
#------------------------------------------------------------------------------
# initialize the similarity tables
#
# make sure there is a lower case version of every pair and
# check for table inconsistencies
#------------------------------------------------------------------------------
for (ch1, ch2) in scsimtab.keys():
if ch1.isupper() or ch2.isupper():
# make sure we don't override another entry
if (ch1.lower(), ch2.lower()) in scsimtab:
print 'Inconsistency: ('+ch1+', '+ch2+') to be inserted, but is already present as lower case.'
scsimtab[(ch1.lower(), ch2.lower())] = scsimtab[(ch1, ch2)]
for (str1, str2) in dcsimtab.keys():
# (python note: isupper() means is *all* upper case, so can't use it here)
if not str1.islower() or not str2.islower():
# make sure we don't override another entry
if (str1.lower(), str2.lower()) in dcsimtab:
print 'Inconsistency: ('+str1+', '+str2+') to be inserted, but is already present as lower case.'
dcsimtab[(str1.lower(), str2.lower())] = dcsimtab[(str1, str2)]
#------------------------------------------------------------------------------
# how similar are two characters
#------------------------------------------------------------------------------
def characterSimilarity(ch1, ch2):
"""Rate the similarity of two characters."""
# both parameters must be single characters
assert(len(ch1) == 1)
assert(len(ch2) == 1)
# Characters must not be upper case
assert(not ch1.isupper())
assert(not ch2.isupper())
# look up character pair in table
if (ch1, ch2) in scsimtab:
similarity = scsimtab[(ch1, ch2)]
elif (ch2, ch1) in scsimtab:
similarity = scsimtab[(ch2, ch1)]
else:
# character pair not in table, use default
if ch1 == ch2:
similarity = 1
else:
similarity = 0
return similarity
def characterSimilarity_chkSym(str1, str2):
"""Check that characterSimilarity() is symmetric."""
resultScore = characterSimilarity(str1, str2)
reverseScore = characterSimilarity(str2, str1)
if resultScore != reverseScore:
print 'characterSimilarity failed built-in test for', \
str1, 'and', str2 + '.'
print ' It returned', resultScore, 'one way, and', \
reverseScore, 'the other.'
return resultScore
def characterSimilarity_chkPair(str1, str2, expectedScore):
"""Check that characterSimilarity() works in one instance."""
resultScore = characterSimilarity_chkSym(str1.lower(), str2.lower())
if resultScore != expectedScore:
print 'characterSimilarity failed built-in test for', \
str1, 'and', str2 + '.'
print ' It returned', resultScore, \
'instead of', str(expectedScore) + '.'
def characterSimilarity_selftest():
"""Built-in self test for characterSimilarity()."""
print ' running self test for characterSimilarity() ...'
#--------------------------------------------------------------------------
# check consistency in the single character similarity table
#--------------------------------------------------------------------------
for (ch1, ch2) in scsimtab:
# no pair should be in backwards
if (ch2, ch1) in scsimtab:
print 'inconsistency in single char similarity table (scsimtab):'
print ' both ('+ch1+', '+ch2+') and ('+ch2+', '+ch1+') are present'
# there should be lower case versions of every pair
if not (ch1.lower(), ch2.lower()) in scsimtab:
print 'inconsistency in single char similarity table (scsimtab):'
print ' ('+ch1+', '+ch2+') present, but not ('\
+ch1.lower()+', '+ch2.lower()+')'
#--------------------------------------------------------------------------
# some pairs of characters
#--------------------------------------------------------------------------
# Empty strings and None parameters are not handled
#characterSimilarity_chkPair('', 'light on a hill', 0)
characterSimilarity_chkPair('a', 'a', 1)
characterSimilarity_chkPair('A', 'a', 1)
characterSimilarity_chkPair('a', 'b', 0)
characterSimilarity_chkPair('a', 'c', .2)
characterSimilarity_chkPair('p', 'b', .2)
characterSimilarity_chkPair('c', 'a', .2)
characterSimilarity_chkPair('i', '1', .9)
characterSimilarity_chkPair('l', '1', 1)
characterSimilarity_chkPair('u', 'v', .1)
characterSimilarity_chkPair('v', 'x', .1)
characterSimilarity_chkPair('v', 'y', .2)
#--------------------------------------------------------------------------
print ' try all pairs of single characters'
# for regression, I suppose - *I'm* not going to check each one manually
#--------------------------------------------------------------------------
# see RFC 1035 2.3.1 http://www.faqs.org/rfcs/rfc1035.html
allowedCharacters = \
'abcdefghijklmnopqrstuvwxyz0123456789-'
for ch1 in allowedCharacters:
for ch2 in allowedCharacters:
resultScore = characterSimilarity_chkSym(ch1, ch2)
print ch1, ch2, resultScore
print ' self test for characterSimilarity() done.'
#------------------------------------------------------------------------------
# how similar are a digraph and a character or two digraphs
#------------------------------------------------------------------------------
def digraphSimilarity(dg1, dg2):
"""Rate the similarity of digraphs and characters."""
#print dg1, dg2 # TEMP
# both "digraphs" must be one or two characters
assert(len(dg1) in [1, 2])
assert(len(dg2) in [1, 2])
# at least one of them must be two characters
assert(len(dg1) == 2 or len(dg2) == 2)
# Isn't it inefficient to convert to lower case over and over?
# Yup, but speed isn't an issue, and it is clearer, more reliable,
# and more flexible to do it in one place, here.
dg1l = dg1.lower()
dg2l = dg2.lower()
# look up character pair in table
if (dg1l, dg2l) in dcsimtab:
similarity = dcsimtab[(dg1l, dg2l)]
elif (dg2l, dg1l) in dcsimtab:
similarity = dcsimtab[(dg2l, dg1l)]
else:
# pair not in table, use default
if dg1l == dg2l:
similarity = 1
else:
similarity = 0
return similarity
def digraphSimilarity_chkSym(str1, str2):
"""Check that digraphSimilarity() is symmetric."""
resultScore = digraphSimilarity(str1, str2)
reverseScore = digraphSimilarity(str2, str1)
if resultScore != reverseScore:
print 'digraphSimilarity failed built-in test for', \
str1, 'and', str2 + '.'
print ' It returned', resultScore, 'one way, and', \
reverseScore, 'the other.'
return resultScore
def digraphSimilarity_chkPair(str1, str2, expectedScore):
"""Check that digraphSimilarity() works in one instance."""
resultScore = digraphSimilarity_chkSym(str1, str2)
if resultScore != expectedScore:
print 'digraphSimilarity failed built-in test for', \
str1, 'and', str2 + '.'
print ' It returned', resultScore, \
'instead of', str(expectedScore) + '.'
def digraphSimilarity_selftest():
"""Built-in self test for digraphSimilarity()."""
print ' running self test for digraphSimilarity() ...'
#--------------------------------------------------------------------------
# check consistency in the digraph similarity table
#--------------------------------------------------------------------------
for (str1, str2) in dcsimtab:
# no pair should be in backwards
if (str2, str1) in dcsimtab:
print 'inconsistency in digraph similarity table (dcsimtab):'
print ' both ('+str1+', '+str2+') and ('+str2+', '+str1+') are present'
# there should be lower case versions of every pair
if not (str1.lower(), str2.lower()) in dcsimtab:
print 'inconsistency in digraph similarity table (dcsimtab):'
print ' ('+str1+', '+str2+') present, but not ('\
+str1.lower()+', '+str2.lower()+')'
#--------------------------------------------------------------------------
# some pairs
#--------------------------------------------------------------------------
# Empty strings and None parameters are not handled
#digraphSimilarity_chkPair('', 'light on a hill', 0)
digraphSimilarity_chkPair('cl', 'd', .4)
digraphSimilarity_chkPair('D', 'cl', .4)
digraphSimilarity_chkPair('E', 'cl', 0)
digraphSimilarity_chkPair('f', 'fL', .5)
digraphSimilarity_chkPair('fl', 'F', .5)
digraphSimilarity_chkPair('fm', 'FM', 1)
digraphSimilarity_chkPair('MN', 'NM', .5)
digraphSimilarity_chkPair('nm', 'mn', .5)
digraphSimilarity_chkPair('nn', 'mn', 0)
digraphSimilarity_chkPair('m', 'Nn', .5)
digraphSimilarity_chkPair('nn', 'm', .5)
digraphSimilarity_chkPair('nn', 'j', 0)
digraphSimilarity_chkPair('Aa', 'M', .2)
digraphSimilarity_chkPair('m', 'AA', .2)
digraphSimilarity_chkPair('AA', 'AA', 1)
digraphSimilarity_chkPair('m', 'Rn', 1)
digraphSimilarity_chkPair('rn', 'm', 1)
digraphSimilarity_chkPair('rP', 'm', 0)
digraphSimilarity_chkPair('vv', 'w', .8)
digraphSimilarity_chkPair('W', 'VV', .8)
digraphSimilarity_chkPair('xb', 'JK', 0)
# what WOULD a good semi-exhaustive test be??
#--------------------------------------------------------------------------
#print ' try all pairs of single characters'
# for regression, I suppose - *I'm* not going to check each one manually
#--------------------------------------------------------------------------
# see RFC 1035 2.3.1 http://www.faqs.org/rfcs/rfc1035.html
#allowedCharacters = \
# 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-'
#for str1 in allowedCharacters:
# for str2 in allowedCharacters:
# resultScore = digraphSimilarity_chkSym(str1, str2)
# print str1, str2, resultScore
print ' self test for digraphSimilarity() done.'
# end of $Source: /home/black/GTLD/RCS/charSimilarity.py,v $