-
Notifications
You must be signed in to change notification settings - Fork 2
/
importer.py
149 lines (112 loc) · 5.9 KB
/
importer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
import os
import subprocess
import urllib
import time
from collections import namedtuple
from log import set_up_logger
from django.db import connection
F = namedtuple('F', ['url', 'filename', 'schema', 'sql_table','cycle'])
cycle_ints = range(1980,2014,2)
cycles = [unicode(x)[-2:] for x in cycle_ints]
def long_cycle(cycle):
if int(cycle)>60: return '19' + cycle
return '20'+cycle
def cycle_config(cycle):
return [
F('ftp://ftp.fec.gov/FEC/%s/indiv%s.zip' % (long_cycle(cycle), cycle), 'itcont.txt', None, 'fec_indiv', long_cycle(cycle)),
F('ftp://ftp.fec.gov/FEC/%s/pas2%s.zip' % (long_cycle(cycle), cycle), 'itpas2.txt', None, 'fec_pac2cand', long_cycle(cycle)),
F('ftp://ftp.fec.gov/FEC/%s/oth%s.zip' % (long_cycle(cycle), cycle), 'itoth.txt', None, 'fec_pac2pac', long_cycle(cycle)),
F('ftp://ftp.fec.gov/FEC/%s/cm%s.zip' % (long_cycle(cycle), cycle), 'cm.txt', 'fec_committee_master_schema.csv', 'fec_committees', long_cycle(cycle)),
F('ftp://ftp.fec.gov/FEC/%s/cn%s.zip' % (long_cycle(cycle), cycle), 'cn.txt', 'fec_candidate_master_schema.csv', 'fec_candidates', long_cycle(cycle)),
F('ftp://ftp.fec.gov/FEC/%s/webl%s.zip' % (long_cycle(cycle), cycle), 'webl%s.txt' % cycle, 'fec_candidate_summary.csv', 'fec_candidate_summaries', long_cycle(cycle)),
F('ftp://ftp.fec.gov/FEC/%s/webk%s.zip' % (long_cycle(cycle), cycle), 'webk%s.txt' % cycle, 'fec_pac_summary.csv', 'fec_committee_summaries', long_cycle(cycle))
]
SQL_PRELOAD_FILE = os.path.join(os.path.dirname(__file__), 'preload.sql')
SQL_POSTLOAD_FILE = os.path.join(os.path.dirname(__file__), 'postload.sql')
FEC_CONFIG = cycle_config('12')
class FECImporter():
def __init__(self, processing_dir, config=FEC_CONFIG):
self.processing_dir = os.path.expanduser(processing_dir)
self.FEC_CONFIG = config
self.log = set_up_logger('fec_importer', self.processing_dir, 'Unhappy FEC Importer')
def update_csv(self):
try:
self.log.info("Downloading files to %s..." % self.processing_dir)
self.download()
self.log.info("Extracting files...")
self.extract()
self.log.info("Converting to unicode...")
self.fix_unicode()
except Exception as e:
self.log.error(e)
raise
def update_db(self):
try:
c = connection.cursor()
self.log.info("Uploading data...")
self.upload(c)
self.log.info("Done.")
except Exception as e:
self.log.error(e)
raise
def _download_file(self, conf):
filename = conf.url.split("/")[-1]
dirname = filename.split(".")[0]
return os.path.join(self.processing_dir, dirname, filename)
def _working_dir(self, conf):
filename = conf.url.split("/")[-1]
dirname = filename.split(".")[0]
return os.path.join(self.processing_dir, dirname)
def download(self):
for conf in self.FEC_CONFIG:
if not os.path.isdir(self._working_dir(conf)):
os.makedirs(self._working_dir(conf))
self.log.info("downloading %s to %s..." % (conf.url, self._download_file(conf)))
urllib.urlretrieve(conf.url, self._download_file(conf))
def extract(self):
for conf in self.FEC_CONFIG:
subprocess.check_call(['unzip', '-oqu', self._download_file(conf), "-d" + self._working_dir(conf)])
def fix_unicode(self):
for conf in self.FEC_CONFIG:
try:
infile = open(os.path.join(self._working_dir(conf), conf.filename), 'r')
except:
infile = open(os.path.join(self._working_dir(conf), conf.filename)[:-3]+'dta', 'r')
outfile = open(os.path.join(self._working_dir(conf), conf.filename[:-3] + "txt.utf8"), 'w')
for line in infile:
try:
fixed_line = line.decode('utf8', 'replace').encode('utf8', 'replace')
except:
fixed_line = line
self.log.info('utf problem' + line)
continue #don't include this line in the database (!)
if not conf.schema:
parts = fixed_line.split('|')
if len(parts[13])==7:
parts[13] = '0' + parts[13][:2] + parts[13][3:]
date = parts[13][-4:] + '-' + parts[13][:2] + '-' + parts[13][2:4]
try:
time.strptime(date,"%Y-%m-%d")
except:
date = ''
parts[13] = date
fixed_line = '|'.join(parts)
try:
outfile.write(conf.cycle+'|'+fixed_line)
except:
self.log.info('couldnt write ' + fixed_line)
def execute_file(self, cursor, filename):
print 'executing ' + filename
contents = " ".join([line for line in open(filename, 'r') if line[0:2] != '--'])
statements = contents.split(';')[:-1] # split on semi-colon. Last element will be trailing whitespace
for statement in statements:
self.log.info("Executing %s" % statement)
cursor.execute(statement)
def upload(self, c):
for conf in self.FEC_CONFIG:
infile = os.path.join(self._working_dir(conf), conf.filename + '.utf8')
c.execute("DELETE FROM %s WHERE cycle=%s" % (conf.sql_table, conf.cycle))
# note: quote character is an arbitrary ASCII code that is unlikely to appear in data.
# FEC uses single and double quotes and most other printable characters in the data,
# so we have to be sure not to misinterpret any of them as semantically meaningful.
os.system("""psql datacommons -c "COPY %s FROM '%s' CSV HEADER DELIMITER '|' QUOTE E'\\x1F';\"""" % (conf.sql_table, infile))