scraper.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
Download Cricket Data

Walks ESPNcricinfo's match-search results and saves the raw HTML of each
first-class match page into ./espncricinfo-fc/.
'''
import urllib2
import csv
import sys
import time
import os
import unicodedata
from urlparse import urlparse
from BeautifulSoup import BeautifulSoup, SoupStrainer

BASE_URL = 'http://www.espncricinfo.com'

# Create the output directory on first run.
if not os.path.exists('./espncricinfo-fc'):
    os.mkdir('./espncricinfo-fc')

# Walk every page of the search results. Swap in one of the commented-out
# queries to scrape a different match format instead of first-class.
for i in range(0, 6019):
    #odi: soupy = BeautifulSoup(urllib2.urlopen('http://search.espncricinfo.com/ci/content/match/search.html?search=odi;all=1;page=' + str(i)).read())
    #test: soupy = BeautifulSoup(urllib2.urlopen('http://search.espncricinfo.com/ci/content/match/search.html?search=test;all=1;page=' + str(i)).read())
    #t20i: soupy = BeautifulSoup(urllib2.urlopen('http://search.espncricinfo.com/ci/content/match/search.html?search=t20i;all=1;page=' + str(i)).read())
    #t20: soupy = BeautifulSoup(urllib2.urlopen('http://search.espncricinfo.com/ci/content/match/search.html?search=t20;all=1;page=' + str(i)).read())
    #list a: soupy = BeautifulSoup(urllib2.urlopen('http://search.espncricinfo.com/ci/content/match/search.html?search=list%20a;all=1;page=' + str(i)).read())
    #fc:
    soupy = BeautifulSoup(urllib2.urlopen('http://search.espncricinfo.com/ci/content/match/search.html?search=first%20class;all=1;page=' + str(i)).read())
    time.sleep(1)  # be polite: pause between result pages
    # Each search hit is an anchor with class 'srchPlyrNmTxt'.
    for new_host in soupy.findAll('a', {'class': 'srchPlyrNmTxt'}):
        try:
            new_host = new_host['href']
        except KeyError:  # anchor without an href attribute
            continue
        odiurl = BASE_URL + urlparse(new_host).geturl()
        new_host = unicodedata.normalize('NFKD', new_host).encode('ascii', 'ignore')
        print new_host
        print new_host.split("/")[4]  # the match ID is the fifth path segment
        html = urllib2.urlopen(odiurl).read()
        if html:
            # Save the raw match page, named by its match ID.
            with open('espncricinfo-fc/{0!s}'.format(new_host.split("/")[4]), "wb") as f:
                f.write(html)
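
Note that the script as written requires Python 2 (urllib2, urlparse, the print statement) and the long-unmaintained BeautifulSoup 3. A minimal Python 3 sketch of the same fetch-and-save loop follows; it assumes the third-party requests and bs4 packages, while the search URL, the srchPlyrNmTxt selector, and the output layout are carried over from the original.

#!/usr/bin/env python3
"""Python 3 sketch of the same scrape loop (hypothetical port)."""
import os
import time
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

BASE_URL = 'http://www.espncricinfo.com'
SEARCH_URL = ('http://search.espncricinfo.com/ci/content/match/search.html'
              '?search=first%20class;all=1;page={0}')

os.makedirs('espncricinfo-fc', exist_ok=True)

for page in range(6019):
    soup = BeautifulSoup(requests.get(SEARCH_URL.format(page)).text,
                         'html.parser')
    time.sleep(1)  # stay polite between result pages
    for link in soup.find_all('a', class_='srchPlyrNmTxt'):
        href = link.get('href')
        if not href:
            continue
        match_id = href.split('/')[4]  # match ID is the fifth path segment
        html = requests.get(urljoin(BASE_URL, href)).content
        if html:
            with open(os.path.join('espncricinfo-fc', match_id), 'wb') as f:
                f.write(html)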