-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsoup.py
54 lines (35 loc) · 1.09 KB
/
soup.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
from __future__ import print_function
from celery import Celery
from celery import chord
from bs4 import BeautifulSoup
from collections import Counter
import requests
# Celery application object for this module, created under the name 'soup';
# the tasks below are registered on it via @app.task.
app = Celery('soup')
# Load the Celery settings from the object named 'config'
# (presumably a config.py module next to this file -- confirm).
app.config_from_object('config')
@app.task
def map(url, timeout=10):
    """Fetch *url* and count the whitespace-separated words in its text.

    NOTE: the name shadows the built-in ``map`` inside this module; it is
    kept because the ``__main__`` section refers to this task by that name.

    Args:
        url: Address of the HTML page to download.
        timeout: Seconds to wait for the server before ``requests`` gives
            up. Without a timeout a stalled server would block the worker
            indefinitely.

    Returns:
        A ``Counter`` mapping each word to its number of occurrences.

    Raises:
        requests.exceptions.RequestException: If the page cannot be fetched
            (connection failure, timeout, ...).
    """
    r = requests.get(url, timeout=timeout)
    # Name the parser explicitly so the extracted text does not depend on
    # which optional parsers happen to be installed on the worker.
    soup = BeautifulSoup(r.text, 'html.parser')
    # Counter tallies an iterable directly; no manual membership test needed.
    return Counter(soup.get_text().split())
@app.task
def reduce(counters):
    """Merge a list of per-page word counts into one combined ``Counter``.

    Args:
        counters: Sequence of word -> count mappings. Plain dicts are
            accepted as well as ``Counter`` objects, so the task still works
            if the results were converted to dicts in transit (presumably
            what a JSON serializer would do -- confirm against the config).

    Returns:
        A new ``Counter`` holding the summed counts; empty when *counters*
        is empty. The input mappings are left unmodified.
    """
    total = Counter()
    for counts in counters:
        # update() adds counts and accepts any mapping. Unlike ``+=`` it
        # keeps non-positive totals, which cannot occur here because the
        # map task only produces counts of at least 1.
        total.update(counts)
    return total
if __name__ == "__main__":
    # NOTE(review): the nesting below was reconstructed from a copy of the
    # file that had lost its indentation -- confirm the per-table report loop
    # matches the original layout.
    base_url = 'http://en.wikipedia.org'
    r = requests.get(base_url + '/wiki/Wikipedia:Top_25_Report', timeout=10)
    # Explicit parser keeps the result independent of installed extras.
    d = BeautifulSoup(r.text, 'html.parser')

    for table in d.find_all('table', class_='wikitable'):
        hrefs = [link.get('href') for link in table.find_all('a')]
        # Keep only site-relative links: anchors without an href yield None,
        # and absolute ('http://...'), protocol-relative ('//...') or
        # fragment ('#...') hrefs would form invalid URLs once prefixed.
        header = [
            map.s(base_url + href)
            for href in hrefs
            if href and href.startswith('/') and not href.startswith('//')
        ]
        if not header:
            # Nothing to count for this table; skip dispatching a chord.
            continue

        # Run every page count in parallel, then merge them with reduce.
        res = chord(header)(reduce.s())
        m = res.get()

        # Wrap in Counter so this works whether the backend returned a
        # Counter or a plain dict; print the 25 most frequent words.
        for word, count in Counter(m).most_common(25):
            print(word, count)