encarda.py
# encarda.py -- scrapes link data from locally saved www.glyphweb.com/ARDA pages
# (Python 2 / BeautifulSoup 3).
import csv
import sys
import urlparse
from glob import glob
from re import compile as re

from BeautifulSoup import BeautifulSoup

base_url = "http://www.glyphweb.com/ARDA/"
base_url_re = re(r"^http://www\.glyphweb\.com/ARDA/\w/.*")


def scrape_file(file_name):
    """Yield (href, link text, title attribute) for each entry link in one saved page."""
    soup = BeautifulSoup(open(file_name).read())
    out = csv.writer(sys.stdout)  # set up but unused; rows are yielded rather than written
    for tag in soup.findAll('a', title=True):
        # Only consider links whose contents are plain strings (no nested markup).
        if all(isinstance(content, basestring) for content in tag.contents):
            href = tag['href']
            try:
                href = urlparse.urljoin(base_url, href)
                if base_url_re.match(href):
                    yield href, " ".join(tag.contents), tag['title']
            except UnicodeEncodeError:
                pass


def scrape_files():
    """Collect every matching link from the saved pages, keyed by absolute href.

    Each value is a (set of link texts, title attribute) pair; the title kept
    is the first one seen for that href.
    """
    hrefs = {}
    for file_name in glob("build/encarda/*"):
        for href, title, description in scrape_file(file_name):
            titles, description = hrefs.setdefault(href, (set(), description))
            titles.add(title)
    return hrefs


def main():
    from pprint import pprint
    # Dispatch the first command-line argument (dashes become underscores) to a
    # module-level function, passing any remaining arguments through.
    command = sys.argv[1].replace('-', '_')
    result = globals()[command](*sys.argv[2:])
    if result is not None:
        pprint(result)


if __name__ == "__main__":
    main()
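
A minimal usage sketch, assuming saved pages sit under build/encarda/ as the glob pattern in scrape_files expects; the dispatcher in main maps a dashed command name to the matching module-level function, so the scraper can also be imported and called directly from Python 2:

# Sketch only: either run the CLI dispatcher, e.g.
#   python encarda.py scrape-files
# or call the scraper from another Python 2 module:
from encarda import scrape_files

hrefs = scrape_files()
for href, (titles, description) in sorted(hrefs.items()):
    print href, "|", ", ".join(sorted(titles)), "|", description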