-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathprocess_filing_headers.py
93 lines (69 loc) · 3.04 KB
/
process_filing_headers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
import os
import fecfile
import json
import csv
import sys
from settings import RAW_ELECTRONIC_DIR, MASTER_HEADER_ROW, HEADER_DUMP_FILE
# Directories whose name starts with a year earlier than this are skipped
# when the raw filing tree is walked.
START_YEAR = 2021

# Column names of the CSV that records files which could not be read.
ERROR_HEADERS = ["path", "error"]
def readfile(filepath, writer):
    """Parse the header of one ``.fec`` filing and write it out as one row.

    Only the first two lines of the file are read. The first line is passed
    to ``fecfile.parse_header``; the second is passed to
    ``fecfile.parse_line`` and a handful of its fields are copied into the
    header dict before the dict is handed to ``writer``.

    Args:
        filepath: Path to a file named ``<number>.fec``. The numeric part of
            the file name is stored in the row as ``filing_number``.
        writer: Object exposing ``writerow(dict)``; the script passes a
            ``csv.DictWriter``.

    Raises:
        ValueError: If the file name without ``.fec`` is not an integer, or
            if a non-empty ``report_id`` is not an integer once the
            ``FEC-`` prefix is removed.
        Exception: Errors from ``open`` or from the ``fecfile`` parsers are
            not caught here; the caller decides how to report them.
    """
    filename = os.path.basename(filepath).replace(".fec", "")
    file_number = int(filename)

    # Read just the two lines we need and close the handle right away, so a
    # walk over many filings does not leak file descriptors.
    with open(filepath, encoding="ISO-8859-1") as fec_file:
        firstline = fec_file.readline().replace("\n", "")
        secondline = fec_file.readline()

    raw_results = fecfile.parse_header(firstline)
    results = raw_results[0]
    results["filing_number"] = file_number
    version = raw_results[1]

    # A non-empty report_id means this filing amends an earlier one; store
    # the earlier filing's number without its "FEC-" prefix.
    original_report = results.get('report_id', None)
    if original_report:
        results["amends"] = int(original_report.replace("FEC-", ""))

    # Copy the summary fields of the second line into the header row,
    # defaulting to an empty string when a field is absent.
    secondlineparsed = fecfile.parse_line(secondline, version)
    for field in (
        "form_type",
        "filer_committee_id_number",
        "committee_name",
        "date_signed",
        "coverage_from_date",
        "coverage_through_date",
    ):
        results[field] = secondlineparsed.get(field, '')

    writer.writerow(results)
if __name__ == '__main__':
    # Walk RAW_ELECTRONIC_DIR, write one header row per .fec file to
    # HEADER_DUMP_FILE, and log every file that fails to parse to
    # header_read_errors.csv.
    #
    # Both outputs are opened with newline='' as the csv module requires
    # (otherwise rows end in "\r\r\n" on Windows), and inside `with` blocks
    # so they are flushed and closed even if the walk is interrupted.
    with open(HEADER_DUMP_FILE, 'w', newline='') as outfile:
        dw = csv.DictWriter(outfile, fieldnames=MASTER_HEADER_ROW, extrasaction='ignore')
        dw.writeheader()
        print("Writing output to %s" % HEADER_DUMP_FILE)

        with open("header_read_errors.csv", 'w', newline='') as errorfile:
            error_writer = csv.DictWriter(errorfile, fieldnames=ERROR_HEADERS, extrasaction='ignore')
            error_writer.writeheader()

            for dirName, subdirList, fileList in os.walk(RAW_ELECTRONIC_DIR, topdown=False):
                # The first four characters of the directory name are read
                # as a year; basename() keeps this working with any path
                # separator. Directories without a numeric prefix are
                # skipped silently.
                try:
                    directory_year = int(os.path.basename(dirName)[0:4])
                except ValueError:
                    continue
                if directory_year < START_YEAR:
                    print("Ignoring directory %s" % dirName)
                    continue

                for fname in fileList:
                    if not fname.endswith(".fec"):
                        continue
                    full_path = os.path.join(dirName, fname)
                    # Best effort: one unreadable filing must not abort the
                    # whole run, so record the failure and carry on.
                    try:
                        readfile(full_path, dw)
                    except Exception as e:
                        print("error reading %s: %s" % (full_path, e))
                        error_writer.writerow({
                            'path': full_path,
                            'error': str(e),
                        })