{ "cells": [ { "cell_type": "code", "execution_count": 49, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "\n", "from nba_stats.ingest import get_all_games_for_year\n", "from play_by_play_with_lineups.api import HOME_COLS, AWAY_COLS" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": true }, "outputs": [], "source": [ "yr_string = '2017-18'" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": true }, "outputs": [], "source": [ "pbp = pd.read_msgpack('pbp_{}.mp'.format(yr_string))" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": true }, "outputs": [], "source": [ "shots = pd.read_msgpack('player_shot_logs_{}.mp'.format(yr_string))\n", "shots['GAME_EVENT_ID'] = shots['GAME_EVENT_ID'].astype(int)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": true }, "outputs": [], "source": [ "games = get_all_games_for_year(yr_string)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": true }, "outputs": [], "source": [ "home_team_dict = games[~games['MATCHUP'].str.contains('@')].set_index('GAME_ID')['TEAM_ID'].to_dict()" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(562006, 44)" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pbp.shape" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(211708, 24)" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "shots.shape" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
GAME_IDEVENTNUMEVENTMSGTYPEEVENTMSGACTIONTYPEPERIODWCTIMESTRINGPCTIMESTRINGHOMEDESCRIPTIONNEUTRALDESCRIPTIONVISITORDESCRIPTION...HOME_PLAYER_2HOME_PLAYER_3HOME_PLAYER_4HOME_PLAYER_5AWAY_PLAYER_1AWAY_PLAYER_2AWAY_PLAYER_3AWAY_PLAYER_4AWAY_PLAYER_5seconds
00021700001212018:04 PM12:00NaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaN720
10021700001410018:04 PM12:00Jump Ball Love vs. Horford: Tip to IrvingNaNNaN...203109.02544.0201567.0201565.01628369.01627759.0202330.0201143.0202681.0720
200217000017110118:04 PM11:44NaNNaNIrving 10' Driving Floating Jump Shot (2 PTS) ......203109.02544.0201567.0201565.01628369.01627759.0202330.0201143.0202681.0704
3002170000192518:04 PM11:27MISS Rose 2' LayupNaNHorford BLOCK (1 BLK)...203109.02544.0201567.0201565.01628369.01627759.0202330.0201143.0202681.0687
40021700001114018:05 PM11:23NaNNaNHorford REBOUND (Off:0 Def:1)...203109.02544.0201567.0201565.01628369.01627759.0202330.0201143.0202681.0683
\n", "

5 rows × 44 columns

\n", "
" ], "text/plain": [ " GAME_ID EVENTNUM EVENTMSGTYPE EVENTMSGACTIONTYPE PERIOD \\\n", "0 0021700001 2 12 0 1 \n", "1 0021700001 4 10 0 1 \n", "2 0021700001 7 1 101 1 \n", "3 0021700001 9 2 5 1 \n", "4 0021700001 11 4 0 1 \n", "\n", " WCTIMESTRING PCTIMESTRING HOMEDESCRIPTION \\\n", "0 8:04 PM 12:00 NaN \n", "1 8:04 PM 12:00 Jump Ball Love vs. Horford: Tip to Irving \n", "2 8:04 PM 11:44 NaN \n", "3 8:04 PM 11:27 MISS Rose 2' Layup \n", "4 8:05 PM 11:23 NaN \n", "\n", " NEUTRALDESCRIPTION VISITORDESCRIPTION \\\n", "0 NaN NaN \n", "1 NaN NaN \n", "2 NaN Irving 10' Driving Floating Jump Shot (2 PTS) ... \n", "3 NaN Horford BLOCK (1 BLK) \n", "4 NaN Horford REBOUND (Off:0 Def:1) \n", "\n", " ... HOME_PLAYER_2 HOME_PLAYER_3 HOME_PLAYER_4 HOME_PLAYER_5 \\\n", "0 ... NaN NaN NaN NaN \n", "1 ... 203109.0 2544.0 201567.0 201565.0 \n", "2 ... 203109.0 2544.0 201567.0 201565.0 \n", "3 ... 203109.0 2544.0 201567.0 201565.0 \n", "4 ... 203109.0 2544.0 201567.0 201565.0 \n", "\n", " AWAY_PLAYER_1 AWAY_PLAYER_2 AWAY_PLAYER_3 AWAY_PLAYER_4 AWAY_PLAYER_5 \\\n", "0 NaN NaN NaN NaN NaN \n", "1 1628369.0 1627759.0 202330.0 201143.0 202681.0 \n", "2 1628369.0 1627759.0 202330.0 201143.0 202681.0 \n", "3 1628369.0 1627759.0 202330.0 201143.0 202681.0 \n", "4 1628369.0 1627759.0 202330.0 201143.0 202681.0 \n", "\n", " seconds \n", "0 720 \n", "1 720 \n", "2 704 \n", "3 687 \n", "4 683 \n", "\n", "[5 rows x 44 columns]" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pbp.head()" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
GRID_TYPEGAME_IDGAME_EVENT_IDPLAYER_IDPLAYER_NAMETEAM_IDTEAM_NAMEPERIODMINUTES_REMAININGSECONDS_REMAINING...SHOT_ZONE_AREASHOT_ZONE_RANGESHOT_DISTANCELOC_XLOC_YSHOT_ATTEMPTED_FLAGSHOT_MADE_FLAGGAME_DATEHTMVTM
0Shot Chart Detail002170001574203518Alex Abrines1610612760Oklahoma City Thunder1647...Right Side(R)24+ ft.23232401120171019OKCNYK
1Shot Chart Detail002170001578203518Alex Abrines1610612760Oklahoma City Thunder1622...Left Side Center(LC)24+ ft.24-1412001020171019OKCNYK
2Shot Chart Detail0021700015644203518Alex Abrines1610612760Oklahoma City Thunder4317...Center(C)24+ ft.26-672541020171019OKCNYK
3Shot Chart Detail0021700015678203518Alex Abrines1610612760Oklahoma City Thunder422...Center(C)24+ ft.25512511020171019OKCNYK
4Shot Chart Detail0021700015700203518Alex Abrines1610612760Oklahoma City Thunder4018...Left Side(L)24+ ft.23-231371020171019OKCNYK
\n", "

5 rows × 24 columns

\n", "
" ], "text/plain": [ " GRID_TYPE GAME_ID GAME_EVENT_ID PLAYER_ID PLAYER_NAME \\\n", "0 Shot Chart Detail 0021700015 74 203518 Alex Abrines \n", "1 Shot Chart Detail 0021700015 78 203518 Alex Abrines \n", "2 Shot Chart Detail 0021700015 644 203518 Alex Abrines \n", "3 Shot Chart Detail 0021700015 678 203518 Alex Abrines \n", "4 Shot Chart Detail 0021700015 700 203518 Alex Abrines \n", "\n", " TEAM_ID TEAM_NAME PERIOD MINUTES_REMAINING \\\n", "0 1610612760 Oklahoma City Thunder 1 6 \n", "1 1610612760 Oklahoma City Thunder 1 6 \n", "2 1610612760 Oklahoma City Thunder 4 3 \n", "3 1610612760 Oklahoma City Thunder 4 2 \n", "4 1610612760 Oklahoma City Thunder 4 0 \n", "\n", " SECONDS_REMAINING ... SHOT_ZONE_AREA SHOT_ZONE_RANGE SHOT_DISTANCE \\\n", "0 47 ... Right Side(R) 24+ ft. 23 \n", "1 22 ... Left Side Center(LC) 24+ ft. 24 \n", "2 17 ... Center(C) 24+ ft. 26 \n", "3 2 ... Center(C) 24+ ft. 25 \n", "4 18 ... Left Side(L) 24+ ft. 23 \n", "\n", " LOC_X LOC_Y SHOT_ATTEMPTED_FLAG SHOT_MADE_FLAG GAME_DATE HTM VTM \n", "0 232 40 1 1 20171019 OKC NYK \n", "1 -141 200 1 0 20171019 OKC NYK \n", "2 -67 254 1 0 20171019 OKC NYK \n", "3 51 251 1 0 20171019 OKC NYK \n", "4 -231 37 1 0 20171019 OKC NYK \n", "\n", "[5 rows x 24 columns]" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "shots.head()" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": true }, "outputs": [], "source": [ "joined = pd.merge(pbp, shots, left_on=['GAME_ID', 'EVENTNUM'], right_on=['GAME_ID', 'GAME_EVENT_ID'], how='left')" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "collapsed": true }, "outputs": [], "source": [ "joined['HOME_TEAM_ID'] = joined['GAME_ID'].apply(home_team_dict.get)" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "collapsed": true }, "outputs": [], "source": [ "joined['HOME_TEAM_ACTION'] = joined['HOME_TEAM_ID'] == joined['PLAYER1_TEAM_ID']" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "collapsed": true }, "outputs": [], "source": [ "joined = joined[joined['EVENTMSGTYPE'] != 12]" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "collapsed": true }, "outputs": [], "source": [ "joined['Home'] = joined[HOME_COLS].astype(int).astype(str).values.tolist()\n", "joined['Away'] = joined[AWAY_COLS].astype(int).astype(str).values.tolist()" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "collapsed": true }, "outputs": [], "source": [ "joined['SHOOTER'] = joined['PLAYER1_ID'].astype(str)" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "collapsed": true, "scrolled": true }, "outputs": [], "source": [ "joined['OFF_all'] = joined.apply(lambda x: x['Home'] if x['HOME_TEAM_ACTION'] else x['Away'], axis=1)" ] }, { "cell_type": "code", "execution_count": 20, "metadata": { "collapsed": true }, "outputs": [], "source": [ "joined['DEF'] = joined.apply(lambda x: x['Away'] if x['HOME_TEAM_ACTION'] else x['Home'], axis=1)" ] }, { "cell_type": "code", "execution_count": 21, "metadata": { "collapsed": true }, "outputs": [], "source": [ "joined['OFF'] = joined.apply(lambda x: [player for player in x['OFF_all'] if player != x['SHOOTER']], axis=1)" ] }, { "cell_type": "code", "execution_count": 22, "metadata": { "collapsed": true }, "outputs": [], "source": [ "full = joined.copy()\n", "has_shot_mask = joined['LOC_X'].notnull()\n", "joined = joined[has_shot_mask]" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 11min 22s, sys: 24.3 s, total: 11min 46s\n", "Wall time: 11min 55s\n" ] } ], "source": [ "%%time\n", "off_df=joined['OFF'].str.join(sep='*').str.get_dummies(sep='*')\n", "def_df=joined['DEF'].str.join(sep='*').str.get_dummies(sep='*')" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 58.3 ms, sys: 72.6 ms, total: 131 ms\n", "Wall time: 143 ms\n" ] } ], "source": [ "%%time\n", "shooter_df=pd.get_dummies(joined['SHOOTER'])" ] }, { "cell_type": "code", "execution_count": 25, "metadata": { "collapsed": true }, "outputs": [], "source": [ "off_df.columns ='off' + off_df.columns.astype(str)\n", "def_df.columns ='def' + def_df.columns.astype(str)\n", "shooter_df.columns='shoot' + shooter_df.columns" ] }, { "cell_type": "code", "execution_count": 26, "metadata": { "collapsed": true }, "outputs": [], "source": [ "joined_columns = ['GAME_ID', 'GAME_EVENT_ID', 'SHOT_ZONE_BASIC', 'SHOT_MADE_FLAG', 'SHOT_TYPE']" ] }, { "cell_type": "code", "execution_count": 27, "metadata": { "collapsed": true }, "outputs": [], "source": [ "preprocessed_df = pd.concat([joined[joined_columns], off_df, def_df, shooter_df], axis=1)" ] }, { "cell_type": "code", "execution_count": 29, "metadata": { "collapsed": true }, "outputs": [], "source": [ "preprocessed_df.to_msgpack('preprocessed_df.mp')" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(557017, 75)" ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "full.shape" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 47min 16s, sys: 5min 30s, total: 52min 46s\n", "Wall time: 1h 10min 56s\n" ] } ], "source": [ "%%time\n", "off_full_df=full['OFF_all'].str.join(sep='*').str.get_dummies(sep='*')\n", "def_full_df=full['DEF'].str.join(sep='*').str.get_dummies(sep='*')" ] }, { "cell_type": "code", "execution_count": 33, "metadata": { "collapsed": true }, "outputs": [], "source": [ "off_full_df.columns ='off' + off_full_df.columns.astype(str)\n", "def_full_df.columns ='def' + def_full_df.columns.astype(str)" ] }, { "cell_type": "code", "execution_count": 34, "metadata": { "collapsed": true }, "outputs": [], "source": [ "full['turnover'] = full['EVENTMSGTYPE'] == 5" ] }, { "cell_type": "code", "execution_count": 35, "metadata": { "collapsed": true }, "outputs": [], "source": [ "turnover_columns = ['GAME_ID', 'GAME_EVENT_ID', 'turnover', 'LOC_X']\n", "prep_turnover_df = pd.concat([full[turnover_columns], off_full_df, def_full_df], axis=1)" ] }, { "cell_type": "code", "execution_count": 36, "metadata": { "collapsed": true }, "outputs": [], "source": [ "turnover_mask = prep_turnover_df['turnover']\n", "shot_mask = prep_turnover_df['LOC_X'].notnull()\n", "prep_turnover_df[turnover_mask | shot_mask].to_msgpack('prep_turnover_df.mp')" ] }, { "cell_type": "code", "execution_count": 74, "metadata": { "collapsed": true }, "outputs": [], "source": [ "rebound_columns = ['GAME_ID', 'GAME_EVENT_ID', 'EVENTMSGTYPE', 'EVENTMSGACTIONTYPE','HOME_TEAM_ACTION', 'prev_miss_team_ffill']\n", "full['miss'] = full['HOMEDESCRIPTION'].str.contains('MISS').fillna(False) | full['VISITORDESCRIPTION'].str.contains('MISS').fillna(False)\n", "full['prev_miss_team'] = np.nan\n", "full.loc[full['miss'], 'prev_miss_team'] = full.loc[full['miss'], 'HOME_TEAM_ACTION']\n", "full['prev_miss_team_ffill'] = full['prev_miss_team'].fillna(method='ffill')\n", "prep_reb_df = pd.concat([full[rebound_columns], off_full_df, def_full_df], axis=1)\n" ] }, { "cell_type": "code", "execution_count": 75, "metadata": { "collapsed": true }, "outputs": [], "source": [ "reb_mask = prep_reb_df['EVENTMSGTYPE'] == 4\n", "prep_reb_df[reb_mask].to_msgpack('prep_reb_df.mp')\n" ] } ], "metadata": { "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.10" } }, "nbformat": 4, "nbformat_minor": 2 }