-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathclassify-players.py
192 lines (160 loc) · 7.13 KB
/
classify-players.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
import requests
import pandas as pd
import numpy as np
from random import randrange
from matplotlib import pyplot
import matplotlib.patches as mpatches
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import plot_confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
# Request headers sent with every stats.nba.com call made in this file
# (passed as ``headers=`` to ``requests.get`` by getData and getBioData).
# NOTE(review): 'br' is advertised in Accept-Encoding; requests presumably
# needs a Brotli package installed to decode such a response -- confirm.
headers = {
    "Host": "stats.nba.com",
    "Connection": "keep-alive",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:94.0) Gecko/20100101 Firefox/94.0",
    "Referer": "https://stats.nba.com/",
    "Accept-Encoding": "gzip, deflate, br",
    "Accept-Language": "en-US,en;q=0.5",
}
def getData(pos, ssn, timeout=30):
    """Fetch season-total player stats from stats.nba.com as a DataFrame.

    Args:
        pos: Value placed in the ``PlayerPosition`` query parameter. An empty
            string leaves the parameter blank (no position filter).
        ssn: Value placed in the ``Season`` query parameter, e.g. '2020-21'.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block forever on a stalled
            connection.

    Returns:
        A DataFrame built from the first result set of the response: one row
        per ``rowSet`` entry, columns named by the result set's ``headers``,
        and indexed by the first field of each row (presumably the player id).

    Raises:
        requests.RequestException: on connection failure, timeout, or a
            non-2xx HTTP status.
    """
    url = (
        'https://stats.nba.com/stats/leaguedashplayerstats?College=&Conference=&Country=&DateFrom=&DateTo=&Division=&DraftPick=&DraftYear=&GameScope=&GameSegment=&Height=&LastNGames=0&LeagueID=00&Location=&MeasureType=Base&Month=0&OpponentTeamID=0&Outcome=&PORound=0&PaceAdjust=N&PerMode=Totals&Period=0&PlayerExperience=&PlayerPosition='
        + str(pos)
        + '&PlusMinus=N&Rank=N&Season='
        + str(ssn)
        + '&SeasonSegment=&SeasonType=Regular+Season&ShotClockRange=&StarterBench=&TeamID=0&TwoWay=0&VsConference=&VsDivision=&Weight='
    )
    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly on an HTTP error instead of trying to parse an error page as JSON
    response.raise_for_status()
    resultSet = response.json()['resultSets'][0]
    rows = resultSet['rowSet']
    # The first field of every row becomes that row's index label
    index = [row[0] for row in rows]
    return pd.DataFrame.from_records(rows, index=index, columns=resultSet['headers'])
def getBioData(df, ssn, timeout=30):
    """Add HEIGHT and WEIGHT columns to ``df`` (in place) from the bio-stats endpoint.

    Args:
        df: DataFrame whose index labels match the first field of each
            bio-stats row (presumably player ids). It is modified in place.
        ssn: Value placed in the ``Season`` query parameter, e.g. '2020-21'.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        None. ``df`` gains a 'HEIGHT' column (from PLAYER_HEIGHT_INCHES) and
        a 'WEIGHT' column (PLAYER_WEIGHT converted to int64).

    Raises:
        requests.RequestException: on connection failure, timeout, or a
            non-2xx HTTP status.
        KeyError: if an index label of ``df`` is absent from the bio data.
    """
    url = (
        'https://stats.nba.com/stats/leaguedashplayerbiostats?College=&Conference=&Country=&DateFrom=&DateTo=&Division=&DraftPick=&DraftYear=&GameScope=&GameSegment=&Height=&LastNGames=&LeagueID=00&Location=&Month=&OpponentTeamID=&Outcome=&PORound=&PerMode=Totals&Period=&PlayerExperience=&PlayerPosition=&Season='
        + str(ssn)
        + '&SeasonSegment=&SeasonType=Regular+Season&ShotClockRange=&StarterBench=&TeamID=&VsConference=&VsDivision=&Weight='
    )
    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly on an HTTP error instead of trying to parse an error page as JSON
    response.raise_for_status()
    resultSet = response.json()['resultSets'][0]
    rows = resultSet['rowSet']
    # Index the bio table by the first field of each row so it lines up with df
    ids = [row[0] for row in rows]
    dfBio = pd.DataFrame.from_records(rows, index=ids, columns=resultSet['headers'])
    # Keep only the rows for players present in df and only the two columns
    # needed; a single .loc selection yields an independent frame, so nothing
    # below writes into a slice of dfBio.
    bio = dfBio.loc[list(df.index), ['PLAYER_HEIGHT_INCHES', 'PLAYER_WEIGHT']]
    df['HEIGHT'] = bio['PLAYER_HEIGHT_INCHES']
    # The weight column arrives as object dtype, so convert it explicitly
    df['WEIGHT'] = bio['PLAYER_WEIGHT'].astype(np.int64)
def filterGameData(playerStats, gameStats, ssn):
    """Build the per-36-minute stats table with a position and plot color per player.

    Args:
        playerStats: Column names copied through unchanged.
        gameStats: Column names of game statistics. Counting stats are
            rescaled to per-36-minutes; names ending in '_PCT' are left as
            they are.
        ssn: Season string forwarded to getData, e.g. '2020-21'.

    Returns:
        A DataFrame restricted to players with more than 300 minutes,
        containing ``playerStats + gameStats`` plus 'POS' ('C', 'F' or 'G')
        and 'COL' (the matching color name). Players found in none of the
        per-position queries keep a missing value in both columns; a player
        found in several keeps the last match in C, F, G order.
    """
    allPlayers = getData("", ssn)
    regulars = allPlayers.query('MIN > 300')
    # Read the minutes before narrowing the columns so the conversion does
    # not depend on 'MIN' being listed in playerStats.
    minutes = regulars['MIN']
    # .copy() gives an independent frame, so the column writes below do not
    # target a slice of the query result.
    df = regulars[playerStats + gameStats].copy()
    for stat in gameStats:
        # Percentages are already rates; dividing them by minutes played
        # would make them meaningless, so only counting stats are rescaled.
        if stat.endswith('_PCT'):
            continue
        df[stat] = df[stat] / minutes * 36
    # Position code -> color used later when graphing
    colorMap = {
        'C': 'Red',
        'F': 'Blue',
        'G': 'Green',
    }
    # Create both label columns up front (object dtype, all missing) so they
    # exist even if a position query matches nobody.
    df['POS'] = pd.Series(index=df.index, dtype=object)
    df['COL'] = pd.Series(index=df.index, dtype=object)
    for pos, color in colorMap.items():
        # Index of the per-position query = the players returned for that position
        matched = df.index.intersection(getData(pos, ssn).index)
        df.loc[matched, 'POS'] = pos
        df.loc[matched, 'COL'] = color
    return df
def makeGraph(df, xVal, yVal):
    """Show a scatter plot of two columns of ``df`` with a position legend.

    Args:
        df: DataFrame holding the columns ``xVal`` and ``yVal`` plus 'COL'
            (point color) and 'SZ' (point size).
        xVal: Name of the column plotted on the x axis.
        yVal: Name of the column plotted on the y axis.

    Returns:
        None. The figure is displayed with ``pyplot.show()``.
    """
    df.plot.scatter(x=xVal, y=yVal, c='COL', s='SZ')
    # One legend patch per position, in the order Center / Forward / Guard
    legendEntries = [
        mpatches.Patch(color=shade, label=title)
        for shade, title in (('red', 'Center'), ('blue', 'Forward'), ('green', 'Guard'))
    ]
    pyplot.legend(handles=legendEntries)
    pyplot.show()
def getModel(df, totalGameStats):
    """Train three position classifiers on ``df`` and print how each one did.

    A logistic regression, a random forest and a linear-kernel SVM are fitted
    on a random 80/20 train/test split. For the logistic regression the three
    features with the largest absolute coefficient are printed per class, and
    its confusion matrix on the test split is shown.

    Args:
        df: DataFrame containing the feature columns ``totalGameStats`` and
            the label column 'POS'.
        totalGameStats: Names of the feature columns.

    Returns:
        None. Results are printed and a confusion-matrix figure is displayed.
    """
    # Rows without a position label cannot serve as classification targets
    labelled = df.dropna(subset=['POS'])
    dataX = labelled[totalGameStats]
    dataY = labelled['POS']
    # Splits data into training and testing
    trainX, testX, trainY, testY = train_test_split(
        dataX, dataY, test_size=0.2, shuffle=True
    )

    # Logistic regression
    classifierLog = LogisticRegression(max_iter=100)
    classifierLog.fit(trainX, trainY)
    predsLog = classifierLog.predict(testX)

    # Coefficient magnitudes as a rough (not perfect) measure of which
    # features matter most for each class; abs() is taken once up front.
    coefMagnitudes = pd.DataFrame(
        classifierLog.coef_, index=classifierLog.classes_, columns=totalGameStats
    ).abs()
    for pos in coefMagnitudes.index:
        strongest = coefMagnitudes.loc[pos].sort_values(ascending=False).index[:3]
        print("For %s, The strongest indicators are %s" % (pos, ", ".join(map(str, strongest))))

    # Random forest
    classifierRandom = RandomForestClassifier(n_estimators=100)
    classifierRandom.fit(trainX, trainY)
    predsRandom = classifierRandom.predict(testX)

    # Linear support vector machine
    classifierSVM = svm.SVC(kernel="linear")
    classifierSVM.fit(trainX, trainY)
    predsSVM = classifierSVM.predict(testX)

    # Compare every model's predictions with the true labels, position by position
    actual = np.asarray(testY)
    total = len(actual)
    results = (
        ("Logistic Regression", predsLog),
        ("Random Forest Classifier", predsRandom),
        ("Support Vector Machine", predsSVM),
    )
    for label, preds in results:
        correct = int((np.asarray(preds) == actual).sum())
        incorrect = total - correct
        print(f"\nResults for a {label}:")
        # Reported as an actual percentage so the value matches its "% Correct" label
        print(f"Correct: {correct}, Incorrect: {incorrect}, % Correct: {100 * correct / total:5.1f}")

    # NOTE(review): plot_confusion_matrix was deprecated in scikit-learn 1.0 and
    # removed in 1.2 (replacement: ConfusionMatrixDisplay.from_estimator). It is
    # kept because the file-level import still names it; migrate both together.
    plot_confusion_matrix(classifierLog, testX, testY)
    pyplot.show()
def main():
    """Run the whole pipeline for the 2020-21 season: fetch, graph, classify.

    Fetches the filtered per-36 stats, adds height and weight, shows several
    scatter plots of the unscaled data, then standardizes the feature columns
    and hands them to getModel.
    """
    ssn = '2020-21'
    playerStats = ['PLAYER_NAME', 'AGE', 'GP', 'MIN']
    gameStats = ['FGM', 'FGA', 'FG_PCT', 'FG3M', 'FG3A', 'FG3_PCT', 'FTM', 'FTA', 'FT_PCT',
                 'OREB', 'DREB', 'REB', 'AST', 'TOV', 'STL', 'BLK', 'BLKA', 'PF', 'PFD', 'PTS']
    totalGameStats = gameStats + ['HEIGHT', 'WEIGHT']

    df = filterGameData(playerStats, gameStats, ssn)
    getBioData(df, ssn)  # adds HEIGHT and WEIGHT to df in place

    # Dot size for the graphs: grows with minutes played without getting too
    # big or too small. filterGameData keeps only MIN > 300, so the base of
    # the fractional power stays positive.
    df['SZ'] = (df['MIN'] - 250) ** (0.75) / 3

    # Standardize the model features, keeping the original index and column names
    scaledValues = StandardScaler().fit_transform(df[totalGameStats])
    dfScaled = pd.DataFrame(scaledValues, index=df.index, columns=totalGameStats)
    # Carry the descriptive columns and the label across unscaled; one
    # whole-column assignment per column is enough.
    for stat in playerStats + ['POS']:
        dfScaled[stat] = df[stat]

    # Makes a bunch of graphs (from the unscaled data)
    graphAxes = (
        ('REB', 'AST'),
        ('HEIGHT', 'WEIGHT'),
        ('FGM', 'FG3M'),
        ('STL', 'BLK'),
        ('AGE', 'PTS'),  # A graph that tells you nothing
    )
    for xVal, yVal in graphAxes:
        makeGraph(df, xVal, yVal)

    getModel(dfScaled, totalGameStats)
# Run the pipeline only when executed as a script, so importing this module
# does not trigger network requests and plot windows.
if __name__ == "__main__":
    main()