In [49]:
import pandas as pd
import numpy as np

from nba_stats.ingest import get_all_games_for_year
from play_by_play_with_lineups.api import HOME_COLS, AWAY_COLS

In [2]:
# Season identifier; it is interpolated into the input file names and passed
# to get_all_games_for_year in the cells that follow.
yr_string = '2017-18'

In [3]:
# Load the season's play-by-play frame from its msgpack file.
# NOTE(review): pd.read_msgpack was deprecated in pandas 0.25 and removed in
# pandas 1.0, so this cell only runs on an older pandas.
pbp = pd.read_msgpack('pbp_{}.mp'.format(yr_string))

In [4]:
# Load the season's shot logs and cast GAME_EVENT_ID to int in the same step;
# that column is the right-hand join key against the play-by-play EVENTNUM.
shots = (
    pd.read_msgpack('player_shot_logs_{}.mp'.format(yr_string))
    .astype({'GAME_EVENT_ID': int})
)

In [5]:
# Game-level table for the season from the project's ingest helper. The cells
# below read its MATCHUP, GAME_ID and TEAM_ID columns.
games = get_all_games_for_year(yr_string)

In [6]:
# GAME_ID -> TEAM_ID lookup built only from rows whose MATCHUP has no '@'
# (presumably the home side's row for each game — confirm the MATCHUP format).
is_home_row = ~games['MATCHUP'].str.contains('@')
home_team_dict = games.loc[is_home_row].set_index('GAME_ID')['TEAM_ID'].to_dict()

In [7]:
# Sanity check: (rows, columns) of the play-by-play frame.
pbp.shape

(562006, 44)

In [8]:
# Sanity check: (rows, columns) of the shot-log frame.
shots.shape

(211708, 24)

In [9]:
# Preview the first play-by-play rows (event codes, descriptions, lineup ids).
pbp.head()

Unnamed: 0,GAME_ID,EVENTNUM,EVENTMSGTYPE,EVENTMSGACTIONTYPE,PERIOD,WCTIMESTRING,PCTIMESTRING,HOMEDESCRIPTION,NEUTRALDESCRIPTION,VISITORDESCRIPTION,...,HOME_PLAYER_2,HOME_PLAYER_3,HOME_PLAYER_4,HOME_PLAYER_5,AWAY_PLAYER_1,AWAY_PLAYER_2,AWAY_PLAYER_3,AWAY_PLAYER_4,AWAY_PLAYER_5,seconds
0,21700001,2,12,0,1,8:04 PM,12:00,,,,...,,,,,,,,,,720
1,21700001,4,10,0,1,8:04 PM,12:00,Jump Ball Love vs. Horford: Tip to Irving,,,...,203109.0,2544.0,201567.0,201565.0,1628369.0,1627759.0,202330.0,201143.0,202681.0,720
2,21700001,7,1,101,1,8:04 PM,11:44,,,Irving 10' Driving Floating Jump Shot (2 PTS) ...,...,203109.0,2544.0,201567.0,201565.0,1628369.0,1627759.0,202330.0,201143.0,202681.0,704
3,21700001,9,2,5,1,8:04 PM,11:27,MISS Rose 2' Layup,,Horford BLOCK (1 BLK),...,203109.0,2544.0,201567.0,201565.0,1628369.0,1627759.0,202330.0,201143.0,202681.0,687
4,21700001,11,4,0,1,8:05 PM,11:23,,,Horford REBOUND (Off:0 Def:1),...,203109.0,2544.0,201567.0,201565.0,1628369.0,1627759.0,202330.0,201143.0,202681.0,683


In [10]:
# Preview the first shot-log rows (shooter, zone, location, made flag).
shots.head()

Unnamed: 0,GRID_TYPE,GAME_ID,GAME_EVENT_ID,PLAYER_ID,PLAYER_NAME,TEAM_ID,TEAM_NAME,PERIOD,MINUTES_REMAINING,SECONDS_REMAINING,...,SHOT_ZONE_AREA,SHOT_ZONE_RANGE,SHOT_DISTANCE,LOC_X,LOC_Y,SHOT_ATTEMPTED_FLAG,SHOT_MADE_FLAG,GAME_DATE,HTM,VTM
0,Shot Chart Detail,21700015,74,203518,Alex Abrines,1610612760,Oklahoma City Thunder,1,6,47,...,Right Side(R),24+ ft.,23,232,40,1,1,20171019,OKC,NYK
1,Shot Chart Detail,21700015,78,203518,Alex Abrines,1610612760,Oklahoma City Thunder,1,6,22,...,Left Side Center(LC),24+ ft.,24,-141,200,1,0,20171019,OKC,NYK
2,Shot Chart Detail,21700015,644,203518,Alex Abrines,1610612760,Oklahoma City Thunder,4,3,17,...,Center(C),24+ ft.,26,-67,254,1,0,20171019,OKC,NYK
3,Shot Chart Detail,21700015,678,203518,Alex Abrines,1610612760,Oklahoma City Thunder,4,2,2,...,Center(C),24+ ft.,25,51,251,1,0,20171019,OKC,NYK
4,Shot Chart Detail,21700015,700,203518,Alex Abrines,1610612760,Oklahoma City Thunder,4,0,18,...,Left Side(L),24+ ft.,23,-231,37,1,0,20171019,OKC,NYK


In [11]:
# Left-join the shot logs onto the play-by-play events, matching on game and
# event number. Events with no shot-log entry keep NaN in the shot columns.
joined = pbp.merge(
    shots,
    how='left',
    left_on=['GAME_ID', 'EVENTNUM'],
    right_on=['GAME_ID', 'GAME_EVENT_ID'],
)

In [14]:
# Look up each event's home team id from its GAME_ID; ids missing from the
# lookup come back as None from dict.get.
joined['HOME_TEAM_ID'] = joined['GAME_ID'].map(home_team_dict.get)

In [15]:
# True when the event's PLAYER1 team is the home team. Rows where
# PLAYER1_TEAM_ID is missing compare unequal and are therefore flagged False.
joined['HOME_TEAM_ACTION'] = joined['PLAYER1_TEAM_ID'].eq(joined['HOME_TEAM_ID'])

In [16]:
# Drop EVENTMSGTYPE 12 rows. In the pbp preview above, that row carries no
# lineup ids, and the int cast of the lineup columns in the next cell needs
# them to be present.
# .copy() detaches the filtered frame from its parent, so the column
# assignments in the following cells write to an independent frame instead of
# a slice (this avoids SettingWithCopyWarning).
joined = joined[joined['EVENTMSGTYPE'] != 12].copy()

In [17]:
# Collapse the lineup columns named by HOME_COLS / AWAY_COLS into one list of
# player-id strings per row. The int cast removes float formatting
# (203109.0 -> '203109') and fails if any lineup id is missing.
for side_name, lineup_cols in (('Home', HOME_COLS), ('Away', AWAY_COLS)):
    joined[side_name] = joined[lineup_cols].astype(int).astype(str).values.tolist()

In [18]:
# String form of PLAYER1_ID; the OFF cell below compares it against the
# lineup id strings to drop the shooter from the offensive lineup.
# NOTE(review): assumes PLAYER1_ID is an integer column — if it were float,
# the strings would look like '203109.0' and never match; confirm the dtype.
joined['SHOOTER'] = joined['PLAYER1_ID'].astype(str)

In [19]:
# Lineup of the acting team: the home lineup when HOME_TEAM_ACTION is set,
# otherwise the away lineup.
joined['OFF_all'] = pd.Series(
    [
        home_lineup if home_acts else away_lineup
        for home_lineup, away_lineup, home_acts in zip(
            joined['Home'], joined['Away'], joined['HOME_TEAM_ACTION']
        )
    ],
    index=joined.index,
)

In [20]:
# Lineup of the opposing team: the away lineup when HOME_TEAM_ACTION is set,
# otherwise the home lineup.
joined['DEF'] = pd.Series(
    [
        away_lineup if home_acts else home_lineup
        for home_lineup, away_lineup, home_acts in zip(
            joined['Home'], joined['Away'], joined['HOME_TEAM_ACTION']
        )
    ],
    index=joined.index,
)

In [21]:
# Acting team's lineup with the SHOOTER id removed, so the shooter is encoded
# separately from the other players on the floor.
joined['OFF'] = pd.Series(
    [
        [player for player in lineup if player != shooter]
        for lineup, shooter in zip(joined['OFF_all'], joined['SHOOTER'])
    ],
    index=joined.index,
)

In [22]:
# Keep a copy of every event row for the turnover and rebound tables built
# later, then narrow `joined` to rows that matched a shot-log entry
# (LOC_X populated by the left merge).
full = joined.copy()
has_shot_mask = ~joined['LOC_X'].isnull()
joined = joined.loc[has_shot_mask]

In [23]:
%%time
off_df=joined['OFF'].str.join(sep='*').str.get_dummies(sep='*')
def_df=joined['DEF'].str.join(sep='*').str.get_dummies(sep='*')

CPU times: user 11min 22s, sys: 24.3 s, total: 11min 46s
Wall time: 11min 55s


In [24]:
%%time
# One indicator column per distinct SHOOTER id, one row per shot.
shooter_df=pd.get_dummies(joined['SHOOTER'])

CPU times: user 58.3 ms, sys: 72.6 ms, total: 131 ms
Wall time: 143 ms


In [25]:
# Prefix each indicator column with its role so the three sets of player-id
# columns stay distinct once they are concatenated side by side.
for frame, prefix in ((off_df, 'off'), (def_df, 'def'), (shooter_df, 'shoot')):
    frame.columns = [prefix + str(player_id) for player_id in frame.columns]

In [26]:
# Shot-level columns carried into the preprocessed table next to the player
# indicator columns.
joined_columns = ['GAME_ID', 'GAME_EVENT_ID', 'SHOT_ZONE_BASIC', 'SHOT_MADE_FLAG', 'SHOT_TYPE']

In [27]:
# Index-aligned, side-by-side concatenation: shot metadata, then offensive
# teammates, defenders and shooter indicators.
shot_feature_blocks = [joined[joined_columns], off_df, def_df, shooter_df]
preprocessed_df = pd.concat(shot_feature_blocks, axis=1)

In [29]:
# Persist the shot table to the working directory.
# NOTE(review): DataFrame.to_msgpack was deprecated in pandas 0.25 and removed
# in pandas 1.0, so this cell only runs on an older pandas.
preprocessed_df.to_msgpack('preprocessed_df.mp')

In [30]:
# Sanity check: (rows, columns) of the all-events frame copied before the
# shot filter was applied.
full.shape

(557017, 75)

In [32]:
%%time
off_full_df=full['OFF_all'].str.join(sep='*').str.get_dummies(sep='*')
def_full_df=full['DEF'].str.join(sep='*').str.get_dummies(sep='*')

CPU times: user 47min 16s, sys: 5min 30s, total: 52min 46s
Wall time: 1h 10min 56s


In [33]:
# Prefix the all-events indicator columns with their role so the two sets of
# player-id columns stay distinct after concatenation.
for frame, prefix in ((off_full_df, 'off'), (def_full_df, 'def')):
    frame.columns = [prefix + str(player_id) for player_id in frame.columns]

In [34]:
# Flag rows whose EVENTMSGTYPE is 5, the code this notebook treats as a turnover.
full['turnover'] = full['EVENTMSGTYPE'].eq(5)

In [35]:
# Event metadata plus lineup indicators for every event row.
# NOTE(review): GAME_EVENT_ID comes from the shot-log side of the left merge,
# so it is presumably NaN on turnover rows. EVENTNUM may be the identifier
# wanted here — confirm with downstream consumers.
turnover_columns = ['GAME_ID', 'GAME_EVENT_ID', 'turnover', 'LOC_X']
turnover_blocks = [full[turnover_columns], off_full_df, def_full_df]
prep_turnover_df = pd.concat(turnover_blocks, axis=1)

In [36]:
# Keep only rows that are turnovers or matched a shot-log entry (LOC_X
# present), then persist them.
turnover_mask = prep_turnover_df['turnover']
shot_mask = ~prep_turnover_df['LOC_X'].isnull()
kept_rows = prep_turnover_df.loc[shot_mask | turnover_mask]
kept_rows.to_msgpack('prep_turnover_df.mp')

In [74]:
# Columns carried into the rebound table next to the lineup indicators.
# NOTE(review): GAME_EVENT_ID is shot-log sourced, so it is presumably NaN on
# rebound rows — EVENTNUM may be the identifier wanted here; confirm.
rebound_columns = ['GAME_ID', 'GAME_EVENT_ID', 'EVENTMSGTYPE', 'EVENTMSGACTIONTYPE','HOME_TEAM_ACTION', 'prev_miss_team_ffill']

# A row counts as a miss when either team's description mentions 'MISS'.
# na=False makes empty descriptions plain False, so the mask is a true
# boolean Series with no NaN to fill afterwards.
full['miss'] = (
    full['HOMEDESCRIPTION'].str.contains('MISS', na=False)
    | full['VISITORDESCRIPTION'].str.contains('MISS', na=False)
)

# On miss rows, record whether the acting (missing) team was the home team;
# every other row stays NaN so it can be forward-filled.
full['prev_miss_team'] = np.nan
full.loc[full['miss'], 'prev_miss_team'] = full.loc[full['miss'], 'HOME_TEAM_ACTION']

# Forward-fill within each game only. A global fill would carry the last miss
# of one game into the opening rows of the next game in row order.
full['prev_miss_team_ffill'] = full.groupby('GAME_ID')['prev_miss_team'].ffill()

# On these rows the 'off*' indicator columns hold the lineup of PLAYER1's
# team (OFF_all) and the 'def*' columns hold the opposing lineup.
prep_reb_df = pd.concat([full[rebound_columns], off_full_df, def_full_df], axis=1)


In [75]:
# Keep only EVENTMSGTYPE 4 rows (the code this notebook treats as rebounds)
# and persist them.
reb_mask = prep_reb_df['EVENTMSGTYPE'].eq(4)
prep_reb_df.loc[reb_mask].to_msgpack('prep_reb_df.mp')
