orig_df.head(n=2).T
play_id | 20150301CLEHOU-0 | 20150301CLEHOU-1 |
---|---|---|
period | Q4 | Q4 |
seconds_left | 112 | 103 |
call_type | Foul: Shooting | Foul: Shooting |
committing_player | Josh Smith | J.R. Smith |
disadvantaged_player | Kevin Love | James Harden |
review_decision | CNC | CC |
away | CLE | CLE |
home | HOU | HOU |
date | 2015-03-01 00:00:00 | 2015-03-01 00:00:00 |
score_away | 103 | 103 |
score_home | 105 | 105 |
disadvantaged_team | CLE | HOU |
committing_team | HOU | CLE |
(orig_df.call_type
.str.split(':', expand=True).iloc[:, 0]
.value_counts()
.plot(kind='bar', color=blue, logy=True, title="Call types")
.set_ylabel("Frequency"));
(foul_df.call_type
.str.split(': ', expand=True).iloc[:, 1]
.value_counts()
.plot(kind='bar', color=blue, logy=True, title="Foul Types")
.set_ylabel("Frequency"));
df.head(n=2).T
play_id | 20151028INDTOR-1 | 20151028INDTOR-2 |
---|---|---|
seconds_left | 89.0 | 73.0 |
call_type | 4.0 | 4.0 |
foul_called | 1.0 | 0.0 |
player_committing | 162.0 | 36.0 |
player_disadvantaged | 98.0 | 358.0 |
score_committing | 99.0 | 106.0 |
score_disadvantaged | 106.0 | 99.0 |
season | 0.0 | 0.0 |
George Box (via Dustin Tran):
|
make_foul_rate_yaxis(
df.pivot_table('foul_called', 'season')
.rename(index=season_enc.inverse_transform)
.rename_axis("Season")
.plot(kind='bar', rot=0, legend=False)
);
import pymc3 as pm
with pm.Model() as base_model:
β_season = pm.Normal('β_season', 0., 5., shape=n_season)
p = pm.Deterministic('p', pm.math.sigmoid(β_season))
with base_model:
y = pm.Bernoulli(
'y', p[season],
observed=df.foul_called.values
)
with base_model:
base_trace = pm.sample(**SAMPLE_KWARGS)
Auto-assigning NUTS sampler... Initializing NUTS using jitter+adapt_diag... 100%|██████████| 1500/1500 [00:06<00:00, 235.48it/s]
The folk theorem [of statistical computing] is this: When you have computational problems, often there’s a problem with your model.
(pm.energyplot(base_trace, legend=False, figsize=(6, 4))
.set_title(CONVERGENCE_TITLE()));
resid_df = (df.assign(p_hat=base_trace['p'][:, df.season].mean(axis=0))
.assign(resid=lambda df: df.foul_called - df.p_hat))
resid_df[['foul_called', 'p_hat', 'resid']].head()
foul_called | p_hat | resid | |
---|---|---|---|
play_id | |||
20151028INDTOR-1 | 1.0 | 0.403732 | 0.596268 |
20151028INDTOR-2 | 0.0 | 0.403732 | -0.403732 |
20151028INDTOR-3 | 1.0 | 0.403732 | 0.596268 |
20151028INDTOR-4 | 0.0 | 0.403732 | -0.403732 |
20151028INDTOR-6 | 0.0 | 0.403732 | -0.403732 |
(resid_df.pivot_table('resid', 'season')
.rename(index=season_enc.inverse_transform))
resid | |
---|---|
season | |
2015-2016 | -0.000019 |
2016-2017 | 0.000026 |
make_time_axes(
resid_df.pivot_table('resid', 'seconds_left')
.reset_index()
.plot('seconds_left', 'resid', kind='scatter'),
ylabel="Residual"
);
make_time_axes(
df.pivot_table('foul_called', 'seconds_left', 'trailing_committing')
.rolling(20).mean()
.rename(columns={0: "No", 1: "Yes"})
.rename_axis("Committing team is trailing", axis=1)
.plot()
);
ax = (df.pivot_table('foul_called', 'call_type')
.rename(index=call_type_enc.inverse_transform)
.rename_axis("Call type", axis=0)
.plot(kind='barh', legend=False))
ax.xaxis.set_major_formatter(pct_formatter);
ax.set_xlabel("Observed foul call rate");
with poss_model:
β_call = hierarchical_normal('β_call', n_call_type)
make_time_axes(
df.pivot_table('foul_called', 'seconds_left', 'trailing_poss')
.loc[:, 1:3]
.rolling(20).mean()
.rename_axis(
"Trailing possessions\n(committing team)",
axis=1
)
.plot()
);
make_time_axes(
df.pivot_table('foul_called', 'seconds_left', 'call_type')
.rolling(20).mean()
.rename(columns=call_type_enc.inverse_transform)
.rename_axis(None, axis=1)
.plot()
);
ax = sns.heatmap(
df.pivot_table('foul_called', 'trailing_poss', 'remaining_poss')
.rename_axis(
"Trailing possessions\n(committing team)", axis=0
)
.rename_axis("Remaining possessions", axis=1),
cmap='seismic', cbar_kws={'format': pct_formatter}
)
ax.invert_yaxis();
ax.set_title("Observed foul call rate");
(sns.FacetGrid(diff_df, col='call_type', col_wrap=3, aspect=1.5)
.map_dataframe(plot_foul_diff_heatmap)
.set_axis_labels(
"Remaining possessions",
"Trailing possessions\n(committing team)"
)
.set_titles("{col_name}"));
with poss_model:
β_poss = hierarchical_normal(
'β_poss',
(n_trailing_poss, n_remaining_poss, n_call_type),
σ_shape=(1, 1, n_call_type)
)
with poss_model:
η_game = β_season[season] \
+ β_call[call_type] \
+ β_poss[
trailing_poss, remaining_poss, call_type
]
with poss_model:
poss_trace = pm.sample(**SAMPLE_KWARGS)
Auto-assigning NUTS sampler... Initializing NUTS using jitter+adapt_diag... 100%|██████████| 1500/1500 [06:34<00:00, 3.80it/s]
ax = sns.heatmap(
resid_df.pivot_table('resid', 'trailing_poss', 'remaining_poss')
.rename_axis("Trailing possessions\n(committing team)", axis=0)
.rename_axis("Remaining possessions", axis=1)
.loc[-3:3],
cmap='seismic', cbar_kws={'format': pct_formatter}
)
ax.invert_yaxis();
ax.set_title("Observed foul call rate");
ax = (resid_df.groupby(bins[bin_ix])
.resid.mean()
.rename_axis('p_hat', axis=0)
.reset_index()
.plot('p_hat', 'resid', kind='scatter'))
ax.xaxis.set_major_formatter(pct_formatter);
ax.set_xlabel(r"Binned $\hat{p}$");
make_foul_rate_yaxis(ax, label="Residual");
ax = (resid_df.groupby('seconds_left')
.resid.mean()
.reset_index()
.plot('seconds_left', 'resid', kind='scatter'))
make_time_axes(ax, ylabel="Residual");
comp_df = (pm.compare(
(base_trace, poss_trace),
(base_model, poss_model)
)
.rename(index=MODEL_NAME_MAP)
.loc[MODEL_NAME_MAP.values()])
comp_df
WAIC | pWAIC | dWAIC | weight | SE | dSE | warning | |
---|---|---|---|---|---|---|---|
Base | 11609.8 | 1.98 | 1543.34 | 0 | 56.98 | 73.35 | 1 |
Possession | 10066.5 | 81.92 | 0 | 1 | 87.93 | 0 | 1 |
fig, ax = plt.subplots()
ax.errorbar(
np.arange(len(MODEL_NAME_MAP)), comp_df.WAIC,
yerr=comp_df.SE, fmt='o'
);
ax.set_xticks(np.arange(len(MODEL_NAME_MAP)));
ax.set_xticklabels(comp_df.index);
ax.set_xlabel("Model");
ax.set_ylabel("WAIC");
fig
with irt_model:
θ_player = hierarchical_normal(
'θ_player', (n_player, n_season)
)
θ = θ_player[player_disadvantaged, season]
with irt_model:
b_player = hierarchical_normal(
'b_player', (n_player, n_season)
)
b = b_player[player_committing, season]
with irt_model:
η_player = θ - b
with irt_model:
η = η_game + η_player
with irt_model:
irt_trace = pm.sample(**SAMPLE_KWARGS)
Auto-assigning NUTS sampler... Initializing NUTS using jitter+adapt_diag... 100%|██████████| 1500/1500 [08:45<00:00, 2.85it/s]
ax = (resid_df.groupby(bins[bin_ix])
.resid.mean()
.rename_axis('p_hat', axis=0)
.reset_index()
.plot('p_hat', 'resid', kind='scatter'))
ax.xaxis.set_major_formatter(pct_formatter);
ax.set_xlabel(r"Binned $\hat{p}$");
make_foul_rate_yaxis(ax, label="Residual");
fig, ax = plt.subplots()
ax.errorbar(
np.arange(len(MODEL_NAME_MAP)), comp_df.WAIC,
yerr=comp_df.SE, fmt='o'
);
ax.set_xticks(np.arange(len(MODEL_NAME_MAP)));
ax.set_xticklabels(comp_df.index);
ax.set_xlabel("Model");
ax.set_ylabel("WAIC");
fig
fig, ax = plot_latent_params(
top_bot_irt_df[top_bot_irt_df.param == 'θ']
.sort_values('mean')
)
ax.set_xlabel(r"$\hat{\theta}$");
ax.set_title("Top and bottom ten");
fig, ax = plot_latent_params(
top_bot_irt_df[top_bot_irt_df.param == 'b']
.sort_values('mean')
)
ax.set_xlabel(r"$\hat{b}$");
ax.set_title("Top and bottom ten");
player_irt_df.head()
param | b | θ | ||
---|---|---|---|---|
season | 0 | 1 | 0 | 1 |
player | ||||
0 | -0.064689 | 0.006621 | -0.015132 | -0.003946 |
1 | -0.006142 | 0.004387 | 0.010513 | 0.037012 |
2 | 0.090107 | 0.092868 | -0.034668 | -0.003519 |
3 | -0.022949 | -0.002456 | 0.005822 | -0.003961 |
4 | -0.006999 | 0.288897 | 0.068338 | 0.007508 |
style_grid(sns.PairGrid(player_irt_df.loc[player_all_season], size=1.75)
.map_upper(plt.scatter, alpha=0.5)
.map_diag(plt.hist)
.map_lower(plot_corr));
grid.fig
import dash
import dash_html_components as html
app = dash.Dash()
app.layout = html.Div(children=[
html.H1(
"Understanding NBA Foul Calls with Python"
),
html.Center(TABLE)
])
@app.callback(
dash.dependencies.Output(
'irt-param-scatter', 'figure'
),
[
dash.dependencies.Input(
'season-dropdown','value'
),
dash.dependencies.Input(
'player-committing-dropdown','value'
),
dash.dependencies.Input(
'player-disadvantaged-dropdown','value'
)
]
)
def update_irt_scatter(season_,
player_committing_,
player_disadvantaged_):
irt_df = get_irt_df(
player_committing_=player_committing_,
player_disadvantaged_=player_disadvantaged_,
season_=season_,
)
return get_irt_scatter(irt_df)