-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathDownload_House_Financial_Disclosure_Reports.py
93 lines (84 loc) · 2.41 KB
/
Download_House_Financial_Disclosure_Reports.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
import datetime
import wget
import pandas as pd
from zipfile import ZipFile
from xmlutils.xml2csv import xml2csv
import os.path
from pathlib import Path
from os.path import expanduser
# ---------------------------------------------------------------------------
# Script body.
#
# 1. Download this year's House financial-disclosure index (<year>FD.ZIP).
# 2. Extract the XML it contains and convert the <Member> records to CSV.
# 3. Keep the filings whose FilingDate falls within the last LOOKBACK_DAYS
#    days, newest first.
# 4. Download the PDF for each of those DocIDs into ~/TheHouse, skipping the
#    ones that are already there.
# 5. Remove the temporary files created in the working directory.
# ---------------------------------------------------------------------------
now = datetime.datetime.now()
year = now.year
fdurl = ('https://disclosures-clerk.house.gov/public_disc/financial-pdfs/' + str(year) + 'FD.ZIP')
fdzip = str(year) + 'FD.ZIP'
fdxml = str(year) + 'FD.xml'
fd = 'FilingDate'

# Size of the look-back window in days; also used in the status message so
# the text can no longer drift from the filter that is actually applied.
LOOKBACK_DAYS = 40

# Single definition of the PDF folder, used both for the "already downloaded"
# check and as the download target. expanduser() is used instead of
# os.environ.get("HOME") because the latter is None when HOME is unset.
docD = os.path.join(expanduser("~"), 'TheHouse')
# wget only treats its second argument as a folder if that folder exists.
os.makedirs(docD, exist_ok=True)

print()
print(now.year, now.month, now.day, now.hour, now.minute, now.second)
print()
print('Downloading ' + fdzip)
print(fdzip)
# A leftover archive from an interrupted run would make wget store the fresh
# download under a different (" (1)"-suffixed) name, and the stale archive
# would then be the one opened below.
if os.path.exists(fdzip):
    os.remove(fdzip)
wget.download(fdurl)

print('Extracting XML File...')
print()
with ZipFile(fdzip, 'r') as zipObj:
    # Only the XML member(s) are needed; anything else in the archive is ignored.
    for fileName in zipObj.namelist():
        if fileName.endswith('.xml'):
            zipObj.extract(fileName)

print('Converting to CSV....')
print()
converter = xml2csv(fdxml, "FDHouse.csv", encoding="utf-8")
converter.convert(tag="Member")

print("Getting last " + str(LOOKBACK_DAYS) + " days & sorting list...")
print()
df = pd.read_csv('FDHouse.csv')
df[fd] = pd.to_datetime(df[fd])
mask = df[fd] >= (pd.to_datetime('now') - pd.DateOffset(days=LOOKBACK_DAYS))
# Filter and sort in memory; no intermediate CSV files are needed.
sortedlist = df[mask].sort_values(by=[fd], ascending=False)
print(sortedlist)

print()
print('Getting DocIDs...')
print()
docids = sortedlist['DocID'].tolist()

for ids in docids:
    pdf = os.path.join(docD, str(ids) + '.pdf')
    if Path(pdf).is_file():
        print()
        print("PDF " + str(ids) + " exist")
        print()
        continue
    try:
        # The year segment follows the index that was downloaded above
        # rather than a fixed year.
        wg = wget.download(
            'https://disclosures-clerk.house.gov/public_disc/ptr-pdfs/'
            + str(year) + '/' + str(ids) + '.pdf',
            docD,
        )
    except Exception as err:
        # Best effort: a DocID without a PDF at this URL must not abort the
        # run. Exception (not a bare except) keeps Ctrl-C working.
        print()
        print('DocID ' + str(ids) + ' not Found on disclosures-clerk.house.gov')
        print('Reason: ' + str(err))
        print()
        continue
    print()
    print('Downloaded: ' + str(ids))
    print(wg)
    print()

print()
print('Cleaning up files...')
print()
# Temporary artifacts created in the current working directory.
for leftover in (fdzip, fdxml, "FDHouse.csv"):
    os.remove(leftover)
print()
print('DONE!!!')
print()