-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathcluster-locations.py
executable file
·147 lines (119 loc) · 3.9 KB
/
cluster-locations.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
#!/usr/bin/python
"""
Reads in locations.txt (produced by generate-geocodes.py with
--output_mode=locations.txt) and clusters very close points. This reduces the
number of unique map markers and makes it easier to find things.
Output is an exhaustive mapping of "old_lat,old_lon\tnew_lat,new_lon" pairs.
TODO:
- remove large catcodes from consideration
- break up clusters which have grown too large
- look over pre-built eval pairs
- implement this as a hook for lat-lons.js
- look at before/after
"""
from collections import defaultdict
import fileinput
output_mode = 'map' # 'urls'
catcodes = {}
for line in file('cat-codes.txt'):
line = line.strip()
if not line: continue
lat_lon = line.split(':')[0].split(',')
assert len(lat_lon) == 2, line
lat = float(lat_lon[0])
lon = float(lat_lon[1])
catcodes['%.6f,%.6f' % (lat, lon)] = True
counts = []
lat_lons = []
orig_points = 0
for line in fileinput.input():
line = line.strip()
if not line: continue
orig_points += 1
count, ll = line.split('\t')
lat, lon = [float(x) for x in ll.split(',')]
count = int(count)
# 188 clusters, 451/2378 points
# ignore large catcodes -- these should be left unaltered
key = '%.6f,%.6f' % (lat, lon)
if key in catcodes and count >= 10:
continue
counts.append(count)
lat_lons.append((lat, lon))
def UrlForIndex(idx):
global lat_lons
lat, lon = lat_lons[idx]
return 'http://localhost:8080/#ll:%.6f|%.6f,m:%.5f|%.5f|19' % (lat, lon, lat, lon)
def dist(a, b):
d1 = a[0] - b[0]
d2 = a[1] - b[1]
return 1.0e8 * (d1*d1 + d2*d2)
def centroidForIndices(idxs):
global lat_lons, counts
lat_sum = 0
lon_sum = 0
total = 0
for i in idxs:
ll = lat_lons[i]
lat_sum += ll[0] * counts[i]
lon_sum += ll[1] * counts[i]
total += counts[i]
return (lat_sum / total, lon_sum / total)
# calculate all-pairs distances
nns = [] # index -> list of (distance, index) neighbors
for i in range(0, len(lat_lons)):
neighbors = [] # (dist, index)
a = lat_lons[i]
for j in range(i + 1, len(lat_lons)):
b = lat_lons[j]
d = dist(a, b)
if d > 4: continue
neighbors.append((-d, j))
neighbors.sort()
nns.append([(-x[0], x[1]) for x in neighbors])
# we hope there aren't any really degenerate cases
cluster_map = {} # idx -> cluster representative idx
for i, buds in enumerate(nns):
if not buds: continue
cluster_idx = i
if i in cluster_map: cluster_idx = cluster_map[i]
for d, j in buds:
if j in cluster_map:
if cluster_map[j] != cluster_idx:
old_idx = cluster_map[j]
for idx, rep in cluster_map.iteritems():
if rep == old_idx: cluster_map[idx] = cluster_idx
cluster_map[old_idx] = cluster_idx
# this is pathological behavior; we artificially break the cluster
#a = j
#b = cluster_map[j]
#c = cluster_idx
#ll = lat_lons[a]
#print ' Current: %d = 0.000 %s %s' % (a, ll, UrlForIndex(b))
#print 'Old cluster: %d = %.3f %s %s' % (
# b, dist(ll, lat_lons[b]), lat_lons[b], UrlForIndex(b))
#print 'New cluster: %d = %.3f %s %s' % (
# c, dist(ll, lat_lons[c]), lat_lons[c], UrlForIndex(c))
#assert False
cluster_map[j] = cluster_idx
clusters = {} # representative idx -> list of constituent indices
for i, rep in cluster_map.iteritems():
if rep not in clusters:
clusters[rep] = [rep]
clusters[rep].append(i)
if output_mode == 'map':
for base, members in clusters.iteritems():
ll = centroidForIndices(members)
for i in members:
b = lat_lons[i]
print '%.6f,%.6f->%.6f,%.6f' % (b[0], b[1], ll[0], ll[1])
if output_mode == 'urls':
num_points = 0
for base, members in clusters.iteritems():
if not members: continue
print '(%d)' % len(members)
for i in members:
print ' %s' % UrlForIndex(i)
print ''
num_points += len(members)
print '%d clusters, %d/%d points' % (len(clusters), num_points, orig_points)