'''
Commutes to Combo Preprocessor
Garrett Dash Nelson and Alasdair Rae, 2016
Read the paper: http://journals.plos.org/plosone/article?id=10.1371/journal.pone.0166083
'''
import csv
import sys
import time
from os import remove, path
weightFlows = True # set to False to use raw ACS flow data unweighted by margin of error
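# (When weighting is on, each flow is divided by its margin of error below,
# so noisier ACS estimates contribute proportionally less to the network.)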
debug = False # set to True for debug throttling
startTime = time.time()
sourceDatabaseFile = 'data-src/commutes.csv' # Source file: CSV file with columns origin,destination,flow,moe
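# Illustrative input rows (hypothetical FIPS codes and values):
#   origin,destination,flow,moe
#   25017350103,25017350500,120,35
#   25017350103,25025000502,45,22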
nodesDatabaseFile = 'data-stage1/fips_table.csv' # Output file: Lookup table matching FIPS to serialized ID used in Combo
pajekFile = 'data-stage1/commutes.net' # Output file: Pajek format for feeding into Combo
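# Sketch of the finished Pajek file this script writes (hypothetical IDs,
# FIPS codes, and weights; the format follows the write calls below):
#   *Vertices 2
#    1 "25017350103"
#    2 "25017350500"
#   *Arcs
#    1 2 3.428571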
if not path.isfile(sourceDatabaseFile):
    print('Failed: No source file in ./data-src/commutes.csv')
    sys.exit(1)
if not path.isdir('data-stage1'):
    print('Failed: Requires an output directory data-stage1')
    sys.exit(1)
# Look for a file ./data-src/subselection.txt
# If it's there, we're running an extract; create a set from FIPS in that file
if path.isfile('data-src/subselection.txt'):
    subSelection = True
    with open('data-src/subselection.txt') as subFile:
        subSet = set(line.strip() for line in subFile)
else:
    subSelection = False
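# subselection.txt is expected to hold one FIPS code per line, e.g.
# (hypothetical values):
#   25017350103
#   25017350500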
# Initialize a dict which will hold the FIPS and their serialized ID for Combo
serializedFips = {}
# Function to lookup/add FIPS to serialization list
def getFipsSerialId(fips):
    if fips not in serializedFips:
        serialId = len(serializedFips) + 1
        serializedFips[fips] = serialId
        return serialId
    else:
        return serializedFips[fips]
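# Example behavior (hypothetical FIPS): the first code seen, say
# getFipsSerialId('25017350103'), is assigned serial ID 1, and every later
# call with the same code returns 1 again, so arcs and vertices stay consistent.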
# Create a temporary file to hold the arcs (commutes)
arcTmp = open('arctmp.tmp','w+')
arcTmp.write('*Arcs\n')
# Begin building the list of arcs (commutes) here
i = 0  # row counter
reader = csv.reader(open(sourceDatabaseFile, 'r'))
next(reader)  # skip header
print('Beginning reading through commutes database')
for row in reader:
    # for debug throttling, stop after 200 rows
    if debug:
        if i > 200:
            break
    i = i + 1
    origin = row[0]
    dest = row[1]
    if subSelection:
        if origin not in subSet or dest not in subSet:
            continue
    origId = getFipsSerialId(origin)
    destId = getFipsSerialId(dest)
    if weightFlows:
        strength = float(row[2]) / float(row[3])
    else:
        strength = float(row[2])
    arcTmp.write(' %s %s %f\n' % (origId, destId, strength))
    if i % 10000 == 0:
        print(str(i) + ' rows processed, total time ' + str(time.time() - startTime))
# Create a temporary file to hold the vertices (FIPS census tracts)
verticesTmp = open('verttmp.tmp','w+')
verticesTmp.write('*Vertices %d\n' % (len(serializedFips)))
# Initialize the lookup table CSV and give it a header row
nodesDatabase = open(nodesDatabaseFile,'w')
nodesDatabase.write('serial_id,fips\n')
# Loop through all the vertices we created and write them into both verttmp.tmp and the lookup csv
for fips, serialId in sorted(serializedFips.items(), key=lambda kv: (kv[1], kv[0])):
    verticesTmp.write(' ' + str(serialId) + ' "' + fips + '"\n')
    nodesDatabase.write(str(serialId) + ',' + fips + '\n')
# Join the two temporary files to make a Pajek file
outFile = open(pajekFile, 'w')
verticesTmp.seek(0)
outFile.write(verticesTmp.read())
arcTmp.seek(0)
outFile.write(arcTmp.read())
# close everything before deleting the temp files so buffers are flushed
outFile.close()
nodesDatabase.close()
verticesTmp.close()
arcTmp.close()
remove('verttmp.tmp')
remove('arctmp.tmp')
print('done, total time %d' % (time.time() - startTime))
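# Typical invocation, assuming a Python 3 interpreter on the PATH:
#   python combo-preprocessor.py
# The resulting data-stage1/commutes.net can then be fed to Combo, and
# data-stage1/fips_table.csv maps Combo's serialized node IDs back to FIPS codes.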