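"""pdfscraper.py

Scrape a company website for annual report PDFs in three stages: crawl the
site for PDF links, use an LLM to pick out the annual reports among them, and
download the reports into per-year folders.

Example invocations (the id "acme" is a hypothetical key in config.json):

    python pdfscraper.py --id acme        # run all three stages for one company
    python pdfscraper.py --all --verbose  # run all stages for every configured company
    python pdfscraper.py --id acme -l     # only crawl for PDF links
    python pdfscraper.py --id acme -p     # only parse the links into annual reports
    python pdfscraper.py --id acme -d     # only download the annual reports
"""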
import argparse
import asyncio
import json
import ssl
import urllib.request  # a bare "import urllib" does not expose urllib.request
from pathlib import Path

import pandas as pd

from classes import PDFScraper


def run():
    # Hack for Windows: certificate verification can fail there, so fall back
    # to an unverified HTTPS context for every request in this process.
    ssl._create_default_https_context = ssl._create_unverified_context

    # Parse command-line arguments
    parser = argparse.ArgumentParser(prog="pdfscraper", description="Scrapes a website for annual report PDFs.")
    parser.add_argument(
        "-i", "--id",
        type=str,
        required=False,
        help="The ID of the company to scrape."
    )
    parser.add_argument(
        "-a", "--all",
        action="store_true",
        required=False,
        help="Scrape all companies found in config.json. Ignored when --id is present."
    )
    parser.add_argument(
        "-l", "--get_links",
        action="store_true",
        required=False,
        help="Only crawl for PDF links from the website."
    )
    parser.add_argument(
        "-p", "--parse_links",
        action="store_true",
        required=False,
        help="Only find annual reports from the PDF links."
    )
    parser.add_argument(
        "-d", "--download_pdfs",
        action="store_true",
        required=False,
        help="Only download annual reports from the PDF links."
    )
    parser.add_argument(
        "-v", "--verbose",
        action="store_true",
        required=False,
        help="Enable verbose mode."
    )
    args = vars(parser.parse_args())
    verbose = args.get('verbose')

    # Figure out which companies to run
    if args.get('id') is None and not args.get('all'):
        print("Must specify a company to scrape using --id or --all")
        return
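
    # The expected shape of config.json is not shown in this file; the sketch
    # below is inferred from the keys accessed later and may omit fields the
    # real config uses ("acme" and the URL are placeholders):
    #
    # {
    #   "acme": {
    #     "crawler": {
    #       "root_url": "https://example.com",
    #       "deep": true,
    #       "url_depth": 2
    #     },
    #     "link_parser": {
    #       "special_instructions": ""
    #     }
    #   }
    # }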
    with open('config.json', 'r') as f:
        config = json.load(f)
    to_run = []
    if args.get('id'):
        if args.get('id') in config:
            to_run.append(args.get('id'))
        else:
            print(f"No config found for {args.get('id')}")
    elif args.get('all'):
        to_run = list(config.keys())
    if verbose:
        print(f"Running scraper on ids: {to_run}")

    # Figure out which operation we're running; with no stage flags, run all stages
    get_links = args.get('get_links')
    parse_links = args.get('parse_links')
    download_pdfs = args.get('download_pdfs')
    run_all = not (get_links or parse_links or download_pdfs)

    # Run each stage and save intermediate data per company
    pdfscraper = PDFScraper(verbose=verbose)
    for company_id in to_run:
        id_config = config[company_id]
        id_path = Path("data") / company_id
        id_path.mkdir(parents=True, exist_ok=True)

        # Scrape all PDF links from the site and save them
        if run_all or get_links:
            links = asyncio.run(pdfscraper.get_pdf_links_from_url(
                id_config['crawler']['root_url'],
                id_config['crawler']['deep'],
                id_config['crawler']['url_depth']
            ))
            links.to_csv(id_path / "links.csv")

        # Use an LLM to build a table of annual reports from the candidate PDF links
        if run_all or parse_links:
            try:
                links = pd.read_csv(id_path / "links.csv", index_col=0).to_dict('records')
                pdfs = pdfscraper.find_annual_reports_from_pdf_links(
                    links=links,
                    special_instructions=id_config['link_parser']['special_instructions']
                )
                pdfs.to_csv(id_path / "pdfs.csv")
            except FileNotFoundError:
                print(f"Links not scraped for {company_id}")

        # Download the annual report PDFs, one subfolder per year
        if run_all or download_pdfs:
            try:
                pdfs = pd.read_csv(id_path / "pdfs.csv", index_col=0)
                for year, row in pdfs.iterrows():
                    report_path = id_path / str(year)
                    report_path.mkdir(parents=True, exist_ok=True)
                    try:
                        urllib.request.urlretrieve(row['url'], report_path / row['filename'])
                    except Exception as e:
                        print(f"Error getting data for year {year}:")
                        print(e)
            except FileNotFoundError:
                print(f"PDF links not parsed for {company_id}")
    return
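

# The scraped file ends at run()'s return, so this entry-point guard is an
# assumption added to make the script executable directly.
if __name__ == "__main__":
    run()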