# sports_wscraper.py
'''
Author: Jess Summerhill
Project: A sports betting analytics web scraper
Date: 2-29-2020
Project Specs:
This is a web scraper.
1. The client wants all the team names from the first column.
2. Ignore team rankings.
3. I am now scraping the whole site, and doing this in an OOP way.
4. First, get the team names, and then save them to a list.
'''
# In v0.001, this used selenium instead of requests.
import requests as req
from collections import defaultdict as dd
from bs4 import BeautifulSoup as bsoup

import sports_fout as spf
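# Note: sports_fout is a local helper module (not shown here). Judging only
# from the calls at the bottom of main(), it is assumed to expose roughly the
# interface sketched below; the actual names and signatures may differ:
#
#   class sfout:
#       @staticmethod
#       def save_file_format(): ...   # returns an output filename
#       @staticmethod
#       def output_csvfile(fname, key1, key_list, sdict_list): ...  # writes CSV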


class SportsWebScraper:
    def __init__(self):
        pass

    def main(self):
        # Root URL and the NCAA basketball team-stats index page
        rurl = "https://www.teamrankings.com"
        burl = "https://www.teamrankings.com/ncb/team-stats/"
        wsrc = req.get(burl)
        # Parse the page with BeautifulSoup (lxml parser, not XPath)
        bsoupy = bsoup(wsrc.content, 'lxml')

        # Helper functions, kept local to main()
        def get_team_names(soupy, root):
            # Find the "Points per Game" link in the stats menu, follow it,
            # and save the team names from its first column to a list.
            teams = []
            new_url = ""
            expanded = soupy.find_all("ul", {"class": "expand-content hidden"})
            for e in expanded:
                tname = e.find("a", href=True).get_text()
                if tname == "Points per Game":
                    turl = e.find("a", href=True)
                    new_url = root + turl.get('href')
                    break
            nreq = req.get(new_url)
            nnsoupy = bsoup(nreq.content, 'lxml')
            nno_wrap = nnsoupy.find_all("td", {"class": "text-left nowrap"})
            for n in nno_wrap:
                tname = n.find("a", href=True).get_text()
                teams.append(tname)
            return teams
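        # The selectors in get_team_names assume markup roughly like this
        # (a hypothetical sketch of the site's stat menu; the real HTML may
        # differ):
        #   <ul class="expand-content hidden">
        #     <li><a href="/ncb/stat/points-per-game">Points per Game</a></li>
        #   </ul>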
        def get_key_names(soupy):
            # Get the title of every stat listed in the chooser menu
            key_names = []
            choosey = soupy.find("ul", {"class": "chooser-list"})
            expanded = choosey.find_all("li")
            for e in expanded:
                ehyper = e.find("a", href=True)
                if ehyper.get('href') != '#':
                    key_names.append(ehyper.get_text())
            return key_names
        def get_stat_urls(soupy, root):
            # Find the URL of every stat listed in the chooser menu
            surls = []
            choosey = soupy.find("ul", {"class": "chooser-list"})
            expanded = choosey.find_all("li")
            for e in expanded:
                ehyper = e.find("a", href=True)
                if ehyper.get('href') != '#':
                    surls.append(root + ehyper.get('href'))
            return surls
        def req_get_wrapper(l, urlsl):
            # Request the l-th stat URL and return its first-column cells
            surl = urlsl[l]
            rsrc = req.get(surl)
            ssoup = bsoup(rsrc.content, 'lxml')
            nwrapy = ssoup.find_all("td", {"class": "text-left nowrap"})
            return nwrapy
        key1 = "Team Names"
        team_list = get_team_names(bsoupy, rurl)
        key_list = get_key_names(bsoupy)
        urlstat_list = get_stat_urls(bsoupy, rurl)
        sdict_list = []
        data_dict = dd(dict)
        for t in team_list:
            data_dict[t][key1] = t
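        # data_dict maps team name -> {column title: value}, so after the
        # scraping loop below an entry looks roughly like (values are
        # illustrative, not real scraped data):
        #   data_dict["Some Team"] == {"Team Names": "Some Team",
        #                              "Points per Game": "82.1", ...}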
        wrappers = []
        for idx in range(len(urlstat_list)):
            wrappers.append(req_get_wrapper(idx, urlstat_list))
            for wr in wrappers[idx]:
                team_name = wr.find("a", href=True).get_text()
                data = wr.find_next("td").contents[0]
                colname = key_list[idx]
                data_dict[team_name][colname] = data
        for val in data_dict.values():
            sdict_list.append(val)
        fname = spf.sfout.save_file_format()
        spf.sfout.output_csvfile(fname, key1, key_list, sdict_list)


if __name__ == "__main__":
    s_scrap = SportsWebScraper()
    s_scrap.main()
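
# For reference, a minimal sketch of how sports_fout.sfout.output_csvfile
# could be implemented with the standard library csv module (hypothetical;
# the real sports_fout implementation may differ):
#
#   import csv
#
#   def output_csvfile(fname, key1, key_list, sdict_list):
#       fieldnames = [key1] + key_list      # "Team Names" first, then stats
#       with open(fname, "w", newline="") as f:
#           writer = csv.DictWriter(f, fieldnames=fieldnames, restval="")
#           writer.writeheader()
#           writer.writerows(sdict_list)    # one row per team dict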