-
Notifications
You must be signed in to change notification settings - Fork 0
/
hnscrape.py
64 lines (58 loc) · 2.03 KB
/
hnscrape.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
import requests
from lxml import html
def _yield_tr_rows(trs):
    """Group consecutive rows, splitting on rows that have a ``style`` attribute.

    ``trs`` is an iterable of objects exposing ``keys()`` (for lxml elements
    this is the list of attribute names).  A row whose keys include
    ``'style'`` acts as a separator: it is never part of a group, and it
    closes the group collected so far.  Each group is yielded as a list.

    Empty groups are never yielded, so leading, trailing or back-to-back
    separator rows do not produce ``[]`` entries that a consumer would fail
    to unpack.
    """
    group = []
    for tr in trs:
        if 'style' in tr.keys():
            # Separator row: emit the pending group, but only if something
            # was actually collected since the previous separator.
            if group:
                yield group
                group = []
        else:
            group.append(tr)
    # Rows after the last separator form a final group.
    if group:
        yield group
def _scrape_from_string(s):
    """Parse the HTML of a Hacker News listing page into story dicts.

    ``s`` is the page markup as a string.  Returns a list with one dict per
    story, each holding the keys ``rank``, ``title``, ``url``, ``points``,
    ``comments``, ``id``, ``user`` and ``time``.

    Rows whose subtext cell contains no links (presumably job postings --
    confirm against the page markup) get ``id`` 0, ``comments`` 0 and
    ``None`` for ``user``, ``time`` and ``points``.

    Element lookups and ``int()`` conversions other than the comment count
    are not guarded, so markup that does not match the expected layout
    surfaces as ``IndexError``, ``KeyError`` or ``ValueError``.
    """
    stories = []
    tree = html.fromstring(s)
    # The story rows live in a table nested inside the third row of the
    # page's outer table.
    rows = tree.cssselect('center > table > tr')[2].cssselect('table > tr')
    # The final two rows are dropped before grouping (presumably a trailing
    # spacer and the "More" link row -- confirm against the page markup).
    rows = rows[:-2]
    for first, second in _yield_tr_rows(rows):
        result = {}
        cells = first.cssselect('td')
        # The rank text has one trailing character after the number
        # (presumably the "." in "1."); drop it before converting.
        result['rank'] = int(cells[0].text_content().strip()[:-1])
        title_link = cells[2].cssselect('a')[0]
        result['title'] = title_link.text_content()
        result['url'] = title_link.attrib['href']
        # Reserve the key here so the dict keeps its established key order.
        # The real value is read from the score <span> further down; parsing
        # it from the raw subtext text could raise on rows without a score.
        result['points'] = None

        subtext = second.cssselect('.subtext')[0]
        links = subtext.cssselect('a')
        if not links:
            # No user or discussion link: nothing more can be extracted.
            result['id'] = 0
            result['user'] = None
            result['time'] = None
            result['comments'] = 0
            stories.append(result)
            continue

        # The last link in the subtext is the discussion link.
        discussion = links[-1]
        label = discussion.text_content()
        if label == 'discuss':
            result['comments'] = 0
        else:
            try:
                result['comments'] = int(label.split()[0])
            except (ValueError, IndexError):
                # Link text is empty or does not start with a number.
                result['comments'] = 0
        result['id'] = int(discussion.attrib['href'].split('=')[-1])

        user_link = links[0]
        result['user'] = user_link.text
        # The age is the text that follows the user link, e.g.
        # " 2 hours ago  | ".
        age = user_link.tail.strip().strip('|').strip()
        # Remove the literal "ago" suffix.  str.strip('ago') would strip any
        # of the characters a/g/o from *both* ends and mangle text such as
        # "an hour ago".
        if age.endswith('ago'):
            age = age[:-len('ago')].strip()
        result['time'] = age
        result['points'] = int(
            subtext.cssselect('span')[0].text_content().split()[0])
        stories.append(result)
    return stories
# Listing pages fetched by get_stories(), in order: the front page followed
# by page two.
_urls = (
    'https://news.ycombinator.com/',
    'https://news.ycombinator.com/news?p=2',
)
def get_stories(timeout=10):
    """Download every page listed in ``_urls`` and return the scraped stories.

    ``timeout`` is passed to ``requests.get`` for each page (seconds).
    Without it a stalled connection would block the caller indefinitely,
    because requests applies no timeout by default.

    Returns a single list containing the story dicts of all pages, in the
    order the pages appear in ``_urls``.

    Raises whatever ``requests.get`` raises on connection failure or
    timeout, and the error raised by ``raise_for_status()`` when a page
    responds with an HTTP error status.
    """
    stories = []
    for url in _urls:
        response = requests.get(url, timeout=timeout)
        # Fail loudly on 4xx/5xx instead of trying to parse an error page.
        response.raise_for_status()
        stories.extend(_scrape_from_string(response.text))
    return stories