import re
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns
from bs4 import BeautifulSoup

%matplotlib inline
1. Getting the data
# Get all the links to the gamelogs for Jordan
url = 'https://www.basketball-reference.com/players/j/jordami01.html'
response = requests.get(url)
# Fail here on an HTTP error instead of parsing an error page below
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')

# The per-game summary table holds one anchor per season pointing at that
# season's gamelog page; keep only anchors whose href contains 'gamelog'.
table = soup.find(id="per_game")
links = table.find_all('a', href=re.compile('gamelog'))
# Extract all the gamelogs from all the links and create dataframe
dfs = []
base_url = 'https://www.basketball-reference.com'

for link in links:
    # Each href is site-relative; read only the table with id 'pgl_basic'
    df = pd.read_html(base_url + link['href'], attrs={'id': 'pgl_basic'})[0]
    dfs.append(df)
    # Pause between requests (presumably to stay under the site's rate limit)
    time.sleep(3.2)

# Stack the per-season tables into one frame with a fresh 0..n-1 index
result = pd.concat(dfs, ignore_index=True)
result.head()
Rk | G | Date | Age | Tm | Unnamed: 5 | Opp | Unnamed: 7 | GS | MP | ... | DRB | TRB | AST | STL | BLK | TOV | PF | PTS | GmSc | +/- | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 1 | 1984-10-26 | 21-252 | CHI | NaN | WSB | W (+16) | 1 | 40:00 | ... | 5 | 6 | 7 | 2 | 4 | 5 | 2 | 16 | 12.5 | NaN |
1 | 2 | 2 | 1984-10-27 | 21-253 | CHI | @ | MIL | L (-2) | 1 | 34:00 | ... | 2 | 5 | 5 | 2 | 1 | 3 | 4 | 21 | 19.4 | NaN |
2 | 3 | 3 | 1984-10-29 | 21-255 | CHI | NaN | MIL | W (+6) | 1 | 34:00 | ... | 2 | 4 | 5 | 6 | 2 | 3 | 4 | 37 | 32.9 | NaN |
3 | 4 | 4 | 1984-10-30 | 21-256 | CHI | @ | KCK | W (+5) | 1 | 36:00 | ... | 2 | 4 | 5 | 3 | 1 | 6 | 5 | 25 | 14.7 | NaN |
4 | 5 | 5 | 1984-11-01 | 21-258 | CHI | @ | DEN | L (-16) | 1 | 33:00 | ... | 2 | 5 | 5 | 1 | 1 | 2 | 4 | 17 | 13.2 | NaN |
5 rows × 30 columns
# Save the dataframe to a csv file (without the index column)
result.to_csv('jordan.csv', index=False)

# Load the data back from the csv file
result = pd.read_csv('jordan.csv')
2. Clean the data + basic feature engineering
# Check dtypes and non-null counts of every column before cleaning
result.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1135 entries, 0 to 1134
Data columns (total 30 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Rk 1135 non-null object
1 G 1121 non-null object
2 Date 1135 non-null object
3 Age 1135 non-null object
4 Tm 1135 non-null object
5 Unnamed: 5 545 non-null object
6 Opp 1135 non-null object
7 Unnamed: 7 1086 non-null object
8 GS 1135 non-null object
9 MP 1135 non-null object
10 FG 1135 non-null object
11 FGA 1135 non-null object
12 FG% 1135 non-null object
13 3P 1135 non-null object
14 3PA 1135 non-null object
15 3P% 805 non-null object
16 FT 1135 non-null object
17 FTA 1135 non-null object
18 FT% 1105 non-null object
19 ORB 1135 non-null object
20 DRB 1135 non-null object
21 TRB 1135 non-null object
22 AST 1135 non-null object
23 STL 1135 non-null object
24 BLK 1135 non-null object
25 TOV 1135 non-null object
26 PF 1135 non-null object
27 PTS 1135 non-null object
28 GmSc 1135 non-null object
29 +/- 335 non-null object
dtypes: object(30)
memory usage: 266.1+ KB
# Drop unnecessary rows: rows whose Date/PTS cells repeat the column header
# text, and rows whose PTS cell holds a status string instead of a number.
result.drop(result[result.Date == "Date"].index, inplace=True)
result.drop(result[result.PTS == "PTS"].index, inplace=True)
result.drop(result[result.PTS == "Did Not Dress"].index, inplace=True)
result.drop(result[result.PTS == "Not With Team"].index, inplace=True)
# Convert the 'Date' column to a datetime object
result['Date'] = pd.to_datetime(result['Date'])

# Convert PTS to int
result['PTS'] = result['PTS'].astype(int)

# Split the MP column on ':' and keep the first element (the part after
# the colon is discarded), then convert it to float
result['MP'] = result['MP'].str.split(':').str[0]
result['MP'] = result['MP'].astype(float)

# Convert Goals % to float
result['FG%'] = result['FG%'].astype(float)
result['3P%'] = result['3P%'].astype(float)

# Convert other stats to correct dtype
for int_col in ['TRB', 'AST', 'STL', 'BLK', 'FGA', 'FTA']:
    result[int_col] = result[int_col].astype(int)
result['GmSc'] = result['GmSc'].astype(float)

# Fix the Age column: keep only the part before the '-' and convert to int
result['Age'] = result['Age'].str.split('-').str[0]
result['Age'] = result['Age'].astype(int)
# Keep only Jordan's seasons with the Bulls (drop rows whose team is 'WAS')
result.drop(result[result.Tm == 'WAS'].index, inplace=True)


def _season_label(game_date):
    """Return the season label (e.g. '1984-85') for a game date.

    A game played before October is assigned to the season that started
    in the previous calendar year; otherwise the season starts in the
    game's own calendar year.
    """
    start_year = game_date.year - 1 if game_date.month < 10 else game_date.year
    return f"{start_year}-{str(start_year + 1)[2:]}"


# Create a Season column
result['Season'] = result['Date'].apply(_season_label)
3. Visualize
# How many games did Jordan play in each season?
# Count rows per season and order the seasons by label
games_played = result.Season.value_counts().sort_index()

fig = plt.figure(figsize=(10, 5))

# Create a bar plot
plt.bar(games_played.index, games_played.values)
plt.title("Jordan's Total Games Played by Season")
plt.xlabel('Season')
plt.ylabel('Games Played')
plt.xticks(rotation=45)
plt.show()
# How many minutes did Jordan play per game in each season?
# Group the data by season and average the minutes per game
minutes_played = result.groupby('Season')['MP'].mean()

fig = plt.figure(figsize=(10, 5))

# Create a bar plot
plt.bar(minutes_played.index, minutes_played.values)
plt.title("Jordan's Average Minutes Played by Season")
plt.xlabel('Season')
plt.ylabel('Average Minutes')
plt.xticks(rotation=45)
plt.show()
# How much did he score in each season?
# Group the data by season and calculate the average points per game
season_avg_pts = result.groupby('Season')['PTS'].mean()

fig = plt.figure(figsize=(10, 5))

# Create a line plot with a marker on every season
plt.plot(season_avg_pts.index, season_avg_pts.values, marker='o')
plt.title("Jordan's Average Points per Game by Season")
plt.xlabel('Season')
plt.ylabel('Average Points per Game')
plt.xticks(rotation=45)
plt.show()
# How were Jordan's per-game points distributed in each season - box plot
sns.set_style('whitegrid')

fig = plt.figure(figsize=(10, 5))

# One box per season, built from that season's per-game PTS values
sns.boxplot(x='Season', y='PTS', data=result)

plt.title("Jordan's Points by Season")
plt.xlabel('Season')
plt.ylabel('Points')
plt.xticks(rotation=45)
plt.show()
# What do his per-game scores look like across his career?
# Define the bin edges: 0, 10, ..., 70
bins = list(range(0, 71, 10))

fig = plt.figure(figsize=(10, 5))

plt.hist(result['PTS'], bins=bins, edgecolor='black')

plt.title("Jordan's Career Points Frequency")
plt.xlabel('Points')
plt.ylabel('Frequency')
plt.show()
# Calculate FG% and 3P% for each season
# NOTE: this is the unweighted mean of the per-game percentages, not
# season makes divided by season attempts.
fg_pct_by_season = result.groupby('Season')['FG%'].mean()
three_pct_by_season = result.groupby('Season')['3P%'].mean()

fig = plt.figure(figsize=(10, 5))

# Create a multi-line plot (values scaled by 100 to show percentages)
plt.plot(fg_pct_by_season.index, fg_pct_by_season.values * 100, label='Field goals %')
plt.plot(three_pct_by_season.index, three_pct_by_season.values * 100, label='3 point %')
plt.title("Jordan's Field Goal Percentage and Three-Point Percentage by Season")
plt.xlabel('Season')
plt.ylabel('Percentage')
plt.xticks(rotation=45)
plt.legend()
plt.show()
# Create a density plot of Jordan's points per game
fig = plt.figure(figsize=(10, 5))

# One filled curve per season; common_norm=False normalizes each season's
# density independently of the others
sns.kdeplot(data=result, x="PTS", hue="Season", fill=True, alpha=0.5, common_norm=False, palette='tab20')

plt.title("Distribution of Jordan's Points per Game by Season")
plt.xlabel("Points per Game")
plt.ylabel("Density")
plt.show()
# Calculate average points per opponent, sorted ascending
avg_pts_by_opp = result.groupby('Opp')['PTS'].mean().sort_values()

fig, ax = plt.subplots(figsize=(8, 8))

# Horizontal bars: one per opponent
ax.barh(avg_pts_by_opp.index, avg_pts_by_opp.values, color='blue')

ax.set_title("Jordan's Average Points by Opponent")
ax.set_xlabel('Average Points')
ax.set_ylabel('Opponent')
plt.show()
# What was his True Shooting Percentage (TS%)? (formula from the internet)
# TS% = PTS / (2 * (FGA + 0.44 * FTA)), computed per game
result['TS%'] = result['PTS'] / (2 * (result['FGA'] + 0.44 * result['FTA']))

# Calculate true shooting percentage for each season (mean of per-game values)
ts_pct_by_season = result.groupby('Season')['TS%'].mean()

fig = plt.figure(figsize=(10, 5))

# Create a line plot (values scaled by 100 to show percentages)
plt.plot(ts_pct_by_season.index, ts_pct_by_season.values * 100)

plt.title("Jordan's True Shooting Percentage by Season")
plt.xlabel('Season')
plt.ylabel('True Shooting %')
plt.xticks(rotation=45)
plt.show()
# Group the data by season and sum the TRB, AST, STL and BLK columns
defensive_stats_by_season = result.groupby('Season')[['TRB', 'AST', 'STL', 'BLK']].sum()

# Create a stacked bar plot
defensive_stats_by_season.plot(kind='bar', stacked=True, figsize=(10, 5))

plt.title("Jordan's Defensive Stats by Season")
plt.xlabel('Season')
plt.ylabel('Total')
plt.xticks(rotation=45)
# Legend labels are given in the same order as the columns selected above
plt.legend(['Rebounds', 'Assists', 'Steals', 'Blocks'])
plt.show()
import pygal

# Create a radar chart
radar_chart = pygal.Radar()
radar_chart.title = "Jordan's Defensive Abilities by Season"

# Add the defensive statistics as axes to the chart
radar_chart.x_labels = ['TRB', 'AST', 'STL', 'BLK']

# One series per season; values follow the same column order as x_labels
for season in defensive_stats_by_season.index:
    radar_chart.add(season, defensive_stats_by_season.loc[season].tolist())

# Render the chart
radar_chart.render_in_browser()
file://C:/Users/nisan/AppData/Local/Temp/tmp9_pao9ut.html
This will open in your browser:
The following two plots cannot be displayed on this website… you can run them on your own machine.
# His "game score" along the seasons.
"""
Game Score = Points Scored + (0.4 x Field Goals) –
(0.7 x Field Goal Attempts) – (0.4 x (Free Throw Attempts – Free Throws)) +
(0.7 x Offensive Rebounds) + (0.3 x Defensive Rebounds) + Steals + (0.7 x Assists) +
(0.7 x Blocks) – (0.4 x Personal Fouls) – Turnovers
From: https://captaincalculator.com/sports/basketball/game-score-calculator/
"""
import plotly.graph_objs as go
from plotly.subplots import make_subplots

# Get the unique seasons
seasons = result['Season'].unique()

# Create the subplot grid: one row per season, sharing the y axis
fig = make_subplots(rows=len(seasons), cols=82, shared_yaxes=True,
                    horizontal_spacing=0.01, vertical_spacing=0.01)

# Loop through each season and add the trace to the corresponding subplot
for i, season in enumerate(seasons):
    season_data = result[result['Season'] == season]

    trace = go.Scatter(x=season_data['Date'], y=season_data['GmSc'], mode='lines', fill='tozeroy',
                       showlegend=False)

    # Subplot rows are 1-based; every trace goes in the first column
    fig.add_trace(trace, row=i+1, col=1)

fig.update_layout(height=800, width=300000, title="",
                  xaxis_title='', yaxis_title='GmSc',
                  xaxis=dict(tickvals=[], ticktext=[]),
                  margin=dict(l=0, r=0, t=30, b=0, pad=0),
                  paper_bgcolor='rgba(0,0,0,0)', plot_bgcolor='rgba(0,0,0,0)',
                  font=dict(color='#FFFFFF', size=8))

# Label the x axis on the bottom row only
fig.update_xaxes(title_text='Date', row=len(seasons), col=1)

fig.show()
Unable to display output for mime type(s): application/vnd.plotly.v1+json
It looks like this:
import plotly.graph_objs as go

# Encode each opponent as an integer code so it can drive the marker color
result['Opp'] = result['Opp'].astype('category')
result['Opp_codes'] = result['Opp'].cat.codes

# Create the trace: one marker per game
trace = go.Scatter3d(
    x=result['GmSc'],
    y=result['FG%'],
    z=result['STL'],
    mode='markers',
    marker=dict(
        color=result['Opp_codes'],
        colorscale='Viridis',
        size=5,
        opacity=0.8,
    ),
    text=result['Opp'],
    name='Jordan'
)

# Create the layout
layout = go.Layout(
    title='Jordan Performance',
    scene=dict(
        xaxis=dict(title='Game Score (GmSc)'),
        yaxis=dict(title='Field goals %'),
        zaxis=dict(title='Steals'),
    ),
    margin=dict(l=0, r=0, b=0, t=50),
    legend=dict(
        title='',
        font=dict(
            size=10,
        )
    )
)

fig = go.Figure(data=[trace], layout=layout)
fig.show()
Unable to display output for mime type(s): application/vnd.plotly.v1+json
And this one looks like this: