In [1]:
# Notebook for performing the analysis. Please excuse any messiness.

In [2]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegressionCV
import tqdm
from sklearn.metrics import log_loss

### Load any and all relevant data

In [3]:
# Load the preprocessed shot table and shuffle its rows. A fixed random_state
# keeps the shuffle, and anything downstream that depends on row order,
# repeatable across kernel restarts.
# NOTE(review): pd.read_msgpack was deprecated in pandas 0.25 and removed in
# 1.0, so this cell needs an older pandas -- confirm the pinned version.
preprocessed_df = pd.read_msgpack('preprocessed_df.mp').sample(frac=1, random_state=0)
# Player metadata; the PERSON_ID and POSITION columns are read further down.
player_info = pd.read_msgpack('player_info_2017-18.mp')

### Get X and Y dataframes

In [4]:
# Inspect the column labels from position 4 onward: SHOT_TYPE followed by the
# per-player indicator columns (off<ID>, shoot<ID>, ...).
preprocessed_df.columns[4:]

Index([u'SHOT_TYPE', u'off101106', u'off101107', u'off101108', u'off101109',
       u'off101112', u'off101123', u'off101127', u'off101133', u'off101139',
       ...
       u'shoot2734', u'shoot2736', u'shoot2738', u'shoot2744', u'shoot2746',
       u'shoot2747', u'shoot2749', u'shoot2754', u'shoot2772', u'shoot2863'],
      dtype='object', length=1615)

In [5]:
# Targets: the shot zone and the make/miss flag as an integer.
y_loc = preprocessed_df['SHOT_ZONE_BASIC']
y_make = preprocessed_df['SHOT_MADE_FLAG'].astype(int)
# Three-way outcome label: 'Miss' where the flag equals 0, otherwise the
# shot's SHOT_TYPE value.
y_type = pd.Series(
    np.where(preprocessed_df['SHOT_MADE_FLAG'] == 0, 'Miss', preprocessed_df['SHOT_TYPE']),
    index=preprocessed_df.index,
)
# Features: every column from position 5 onward.
feature_cols = preprocessed_df.columns[5:]
x_df = preprocessed_df[feature_cols]

### Add player positions

In [6]:
def add_pos(x_train_pos):
    """Add per-role position totals to a frame of per-player indicator columns.

    For every role prefix ('shoot', 'off', 'def') and every value found in
    ``player_info['pos']``, a ``<role><position>`` column is created. It holds
    the row-wise sum of that role's player columns whose player id maps to that
    position in ``player_pos_dict``.

    The frame is modified in place and also returned; pass a copy to keep the
    caller's frame untouched.

    Parameters
    ----------
    x_train_pos : pandas.DataFrame
        Frame whose columns are named ``<role><player id>``. A column that does
        not start with 'o' or 's' is treated as a 'def' column.

    Returns
    -------
    pandas.DataFrame
        The same object, with the position columns added (or reset and
        recomputed if they were already present).

    Raises
    ------
    KeyError
        If a player id is not present in ``player_pos_dict``.
    ValueError
        If the part of a column name after the role prefix is not an integer.
    """
    roles = ['shoot', 'off', 'def']
    positions = player_info['pos'].unique()
    # Same creation order as nested role/position loops.
    pos_cols = ['{}{}'.format(type_, pos) for type_ in roles for pos in positions]

    # Take the player columns from the frame that was passed in rather than
    # from a notebook-level frame, so the function works on any compatible
    # frame. Excluding the position columns keeps a second call on an
    # already-augmented frame from trying to parse them as player ids.
    pos_col_set = set(pos_cols)
    player_cols = [col for col in x_train_pos.columns if col not in pos_col_set]

    for pos_col in pos_cols:
        x_train_pos[pos_col] = 0

    for col in tqdm.tqdm(player_cols):
        if col[0] == 'o':
            type_ = 'off'
        elif col[0] == 's':
            type_ = 'shoot'
        else:
            type_ = 'def'
        # What remains after stripping the role prefix is the player id.
        player = col.replace(type_, '')
        pos = player_pos_dict[int(player)]
        target = '{}{}'.format(type_, pos)
        x_train_pos[target] = x_train_pos[target] + x_train_pos[col]
    return x_train_pos

In [7]:
# Primary position = the text before the first '-' in POSITION.
player_info['pos'] = (
    player_info['POSITION']
    .str.split('-')
    .map(lambda parts: parts[0])
)
# Players per primary position; the blank label counts players whose POSITION
# string is empty.
player_info['pos'].value_counts()

Guard      224
Forward    187
Center      84
            45
Name: pos, dtype: int64

In [8]:
# Hand-entered position assignments, keyed by position name, each holding a
# list of player ids. The three lists contain 21 + 19 + 5 = 45 ids, the same
# number as the blank-position players counted in the previous cell, so this
# apparently fills in exactly those players.
# NOTE(review): the source of these assignments is not recorded here -- confirm.
position_filler = {'Guard': [2754, 203505, 201628, 203477, 1628492, 1627982, 204066,
                             1628455, 1627215, 202714, 203590, 1628513, 1626242, 1628506,
                             1626643, 1627817, 1628504, 1626173, 2863, 1628475, 203489],
                   'Forward': [1627852, 201956, 203816, 1626205, 1627744, 204179,1627851,
                               203949, 201177, 203948, 203940, 202347, 203186, 2746,
                              203141, 1627866, 202682, 203966, 203923],
                   'Center': [101106, 1627762, 1628451, 203481, 201148]}


In [9]:
# Base lookup: PERSON_ID -> primary position, straight from player_info.
player_pos_dict = dict(player_info[['PERSON_ID','pos']].values)
# Overlay the hand-entered assignments; an id listed in position_filler
# replaces whatever the base lookup held for it.
player_pos_dict.update(
    (player, pos)
    for pos, player_list in position_filler.items()
    for player in player_list
)

In [10]:
# Sanity check: players per position after the position_filler overrides.
pd.Series(player_pos_dict).value_counts()

Guard      245
Forward    206
Center      89
dtype: int64

In [11]:
# Build the position-augmented features from a copy: add_pos writes into the
# frame it receives, and x_df is still needed without position columns.
x_df_pos = add_pos(x_df.copy())

100%|██████████| 1614/1614 [00:02<00:00, 614.80it/s]


### Split into time-respecting train and test indices

In [12]:
# Split on GAME_ID, compared as strings: rows up to and including game
# '0021701100' go to train, rows after it to test.
# NOTE(review): presumably GAME_ID increases chronologically -- confirm.
train_index = preprocessed_df.index[(preprocessed_df['GAME_ID'] <= '0021701100').values]
test_index = preprocessed_df.index[(preprocessed_df['GAME_ID'] > '0021701100').values]

In [13]:
# Training features: x_train carries the position columns, x_base does not.
x_train, x_base = x_df_pos.loc[train_index], x_df.loc[train_index]
# Training targets restricted to the same rows.
y_loc_train, y_make_train, y_type_train = (
    target.loc[train_index] for target in (y_loc, y_make, y_type)
)

In [14]:
# Test features are sliced from x_df, i.e. without position columns.
x_test = x_df.loc[test_index]
# Test targets restricted to the same rows.
y_loc_test, y_make_test, y_type_test = (
    target.loc[test_index] for target in (y_loc, y_make, y_type)
)

In [15]:
# Shape check of the position-augmented training features: (rows, columns).
x_train.shape

(189002, 1626)

### Make alternate test dfs

In [16]:
# For every test row, build alternate versions in which an on-court teammate
# takes the shot instead of the recorded shooter. x_test_copies[k] collects the
# version where the k-th active 'off' player (in column order) is the shooter;
# shooters records [recorded shooter, teammates...] per row.
# NOTE(review): four slots are hard-coded. A row with more than four active
# 'off' columns would raise IndexError, and one with fewer would leave the
# lists with unequal lengths -- confirm every row has exactly four.
x_test_copies = [[],[],[],[]]
shooters = []
for index, row in tqdm.tqdm(x_test.iterrows(), total=len(x_test)):
    # Labels of the columns equal to 1 in this row, computed once and reused
    # instead of building a temporary frame for each lookup.
    active_cols = row.index[(row == 1).values]
    offs = [col.replace('off', '') for col in active_cols if 'off' in col]
    shooter = [col.replace('shoot', '') for col in active_cols if 'shoot' in col][0]
    shooters.append([shooter] + offs)
    # Demote the recorded shooter to an ordinary offensive player.
    row_base = row.copy()
    row_base['shoot{}'.format(shooter)] = 0
    row_base['off{}'.format(shooter)] = 1
    for ix, off in enumerate(offs):
        # Promote teammate `off` to shooter in a fresh copy.
        row_new = row_base.copy()
        row_new['off{}'.format(off)] = 0
        row_new['shoot{}'.format(off)] = 1
        x_test_copies[ix].append(row_new)

22704it [01:58, 192.34it/s]


In [17]:
# Position-augmented test frames: x_test_0 is the recorded data, x_test_1..4
# are the alternate-shooter versions.
x_test_0 = add_pos(x_test.copy())
x_test_1, x_test_2, x_test_3, x_test_4 = [
    # Each list of row Series is concatenated as columns and transposed back
    # so that every Series becomes one row.
    add_pos(pd.concat(alt_rows, axis=1).T)
    for alt_rows in x_test_copies
]

100%|██████████| 1614/1614 [00:00<00:00, 2157.55it/s]
100%|██████████| 1614/1614 [00:00<00:00, 1813.56it/s]
100%|██████████| 1614/1614 [00:01<00:00, 1594.84it/s]
100%|██████████| 1614/1614 [00:00<00:00, 1684.58it/s]
100%|██████████| 1614/1614 [00:00<00:00, 1774.07it/s]


### Get baseline

In [19]:
# Indicator frame of the true test outcome: one column per outcome label.
test_type_df = pd.get_dummies(y_type_test)

In [20]:
# Baseline prediction: every test row gets the training-set frequency of each
# outcome. The frequencies do not change inside the loop, so compute them once.
train_type_freq = y_type_train.value_counts() / len(y_type_train)
test_type_df_pred_base = test_type_df.copy()
for type_ in test_type_df.columns:
    test_type_df_pred_base[type_] = train_type_freq[type_]

In [24]:
def score_preds(pct_make_test):
    """Print the log loss of each outcome column and their unweighted mean.

    For every outcome label in ``y_type_train``, the predicted probabilities
    in ``pct_make_test[label]`` are scored against the matching indicator
    column of the notebook-level ``test_type_df``.

    Parameters
    ----------
    pct_make_test : pandas.DataFrame
        Predicted probabilities with one column per outcome label, one row per
        test shot.

    Returns
    -------
    None
        Results are printed: one "<label> <log loss>" line per outcome, then
        the mean of those values.
    """
    m_logloss = []
    for type_ in y_type_train.unique():
        # Score once and reuse the value for both the printout and the mean.
        type_logloss = log_loss(test_type_df[type_], pct_make_test[type_])
        m_logloss.append(type_logloss)
        # Single-argument print works as a Python 2 statement and as a
        # Python 3 function call, with the same output.
        print('%s %s' % (type_, type_logloss))
    print(np.average(m_logloss))

In [25]:
# Baseline score: constant training-frequency predictions.
score_preds(test_type_df_pred_base)

Miss 0.689638514677234
3PT Field Goal 0.3682935288331273
2PT Field Goal 0.6392885472094791
0.5657401969066135


### Fit initial models

In [26]:
%%time
model_train_loc = LogisticRegressionCV(multi_class='multinomial', scoring='log_loss').fit(x_train, y_loc_train)

  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)


CPU times: user 2h 54min 45s, sys: 13min, total: 3h 7min 46s
Wall time: 34min 19s


In [27]:
# Stage 2: one binary make/miss model per shot zone, each fit only on the
# training shots taken from that zone. Zones come from the training labels, so
# a zone that appears only in the test rows cannot yield an empty fit subset.
models_train = {}
for class_ in tqdm.tqdm(y_loc_train.unique()):
    shot_index = y_loc_train[lambda x: x==class_].index
    y_make_train_sub = y_make_train.loc[shot_index]
    x_train_sub = x_train.loc[shot_index]
    models_train[class_] = LogisticRegressionCV(scoring='neg_log_loss').fit(x_train_sub, y_make_train_sub)

100%|██████████| 7/7 [10:03<00:00, 86.19s/it] 


### Do not include position dummies

In [28]:
%%time
model_train_loc_base = LogisticRegressionCV(multi_class='multinomial', scoring='log_loss').fit(x_base, y_loc_train)

  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)


CPU times: user 2h 23min 48s, sys: 9min 8s, total: 2h 32min 56s
Wall time: 23min 57s


In [29]:
# Per-zone make/miss models fit on the features without position columns.
# Zones come from the training labels, so a zone that appears only in the test
# rows cannot yield an empty fit subset.
models_train_base = {}
for class_ in tqdm.tqdm(y_loc_train.unique()):
    shot_index = y_loc_train[lambda x: x==class_].index
    y_make_train_sub = y_make_train.loc[shot_index]
    x_train_sub = x_base.loc[shot_index]
    models_train_base[class_] = LogisticRegressionCV(scoring='neg_log_loss').fit(x_train_sub, y_make_train_sub)

100%|██████████| 7/7 [07:40<00:00, 65.83s/it]


### Helper function to combine models

In [30]:
def make_pct_make(x_test, model_loc, models):
    """Combine a zone model and per-zone make models into outcome probabilities.

    For each row, the zone probabilities from ``model_loc`` are multiplied by
    the second ``predict_proba`` column of the matching model in ``models``.
    The products are summed over the two-point zones and the three-point
    zones; 'Miss' is one minus the sum over all zones.

    Parameters
    ----------
    x_test : pandas.DataFrame
        Feature rows to score; its index becomes the index of the result.
    model_loc : fitted classifier
        Must provide ``predict_proba`` and ``classes_`` (the zone labels).
    models : dict
        Zone label -> fitted binary classifier providing ``predict_proba``.

    Returns
    -------
    pandas.DataFrame
        Columns 'Miss', '2PT Field Goal', '3PT Field Goal'.
    """
    two_pt_zones = ['In The Paint (Non-RA)','Mid-Range', 'Restricted Area']
    three_pt_zones = ['Above the Break 3', 'Backcourt', 'Left Corner 3', 'Right Corner 3']

    # Per-row probability of each zone.
    zone_prob = pd.DataFrame(model_loc.predict_proba(x_test), columns=model_loc.classes_)
    # Per-row second-class probability from each zone's own model.
    make_given_zone = pd.DataFrame(
        {class_: models[class_].predict_proba(x_test)[:, 1] for class_ in models.keys()}
    )
    # Element-wise product, matched on zone label.
    make_prob_df = zone_prob * make_given_zone

    # .values drops the positional index so the rows take x_test's index.
    return pd.DataFrame(
        {
            'Miss': (1 - make_prob_df.sum(axis=1)).values,
            '2PT Field Goal': make_prob_df[two_pt_zones].sum(axis=1).values,
            '3PT Field Goal': make_prob_df[three_pt_zones].sum(axis=1).values,
        },
        index=x_test.index,
        columns=['Miss', '2PT Field Goal', '3PT Field Goal'],
    )

In [31]:


# Two-stage prediction (zone model combined with the per-zone make models) on
# the position-augmented test features.
pct_make_test = make_pct_make(x_test_0, model_train_loc, models_train)

In [32]:
# Score the two-stage model that uses position columns.
score_preds(pct_make_test)

Miss 0.6855066836129877
3PT Field Goal 0.34271533085487427
2PT Field Goal 0.6143327423598205
0.5475182522758941


In [33]:
# Same two-stage prediction, using the models and test features that have no
# position columns.
pct_make_test_base = make_pct_make(x_test, model_train_loc_base, models_train_base)

In [34]:
# Score the two-stage model without position columns.
score_preds(pct_make_test_base)

Miss 0.6857800119712321
3PT Field Goal 0.34347870561885346
2PT Field Goal 0.6153357453584926
0.5481981543161928


### Compare to log loss if just modeling miss, 2pt, or 3pt

In [35]:
%%time
model_train_make = LogisticRegressionCV(multi_class='multinomial', scoring='log_loss').fit(x_train, y_type_train)

  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)


CPU times: user 2h 59min 36s, sys: 12min 19s, total: 3h 11min 55s
Wall time: 28min 51s


In [36]:
# Outcome probabilities from the direct model, one column per class label.
# No index is passed, so the frame gets a default positional index rather than
# x_test_0's index.
test_preds_make = pd.DataFrame(model_train_make.predict_proba(x_test_0), columns=model_train_make.classes_)

In [37]:
# Score the direct three-outcome model.
score_preds(test_preds_make)

Miss 0.6932640472159506
3PT Field Goal 0.34896992669706967
2PT Field Goal 0.6202050881365443
0.5541463540165216


### Do not condition on shooter

In [38]:
# Remove the shooter's identity from the training features: each shoot<ID>
# indicator is added into the matching off<ID> column, which is created when it
# does not exist yet.
x_off_only = x_base.copy()
shoot_cols = [col for col in x_off_only.columns if 'shoot' in col]
for col in shoot_cols:
    new_col = col.replace('shoot', 'off')
    if new_col in x_off_only.columns:
        x_off_only[new_col] = x_off_only[new_col] + x_off_only[col]
    else:
        x_off_only[new_col] = x_off_only[col]

In [40]:
# Keep only the columns whose name does not contain 'shoot'.
x_off_only = x_off_only.drop(
    [col for col in x_off_only.columns if 'shoot' in col],
    axis=1,
)

In [41]:
%%time
model_off_only_make = LogisticRegressionCV(multi_class='multinomial', scoring='log_loss').fit(x_off_only, y_type_train)

  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)


CPU times: user 1h 25min 38s, sys: 10min 22s, total: 1h 36min 1s
Wall time: 17min 39s


In [42]:
# Apply the same shooter-removal transform to the test features: add each
# shoot<ID> indicator into off<ID> (creating it when absent), then drop every
# column whose name contains 'shoot'.
x_off_only_test = x_test.copy()
test_shoot_cols = [col for col in x_off_only_test.columns if 'shoot' in col]
for col in test_shoot_cols:
    new_col = col.replace('shoot', 'off')
    if new_col in x_off_only_test.columns:
        x_off_only_test[new_col] = x_off_only_test[new_col] + x_off_only_test[col]
    else:
        x_off_only_test[new_col] = x_off_only_test[col]
x_off_only_test = x_off_only_test.drop(
    [col for col in x_off_only_test.columns if 'shoot' in col],
    axis=1,
)

In [43]:
# Outcome probabilities from the shooter-agnostic model, one column per class
# label (default positional index).
preds_off_only = pd.DataFrame(model_off_only_make.predict_proba(x_off_only_test), columns=model_off_only_make.classes_)

In [44]:
# Score the shooter-agnostic model.
score_preds(preds_off_only)

Miss 0.6919973278838878
3PT Field Goal 0.36818879560421525
2PT Field Goal 0.6395346782914838
0.5665736005931956


### Get average prediction over the actual and alternate shooters

In [45]:
# Two-stage predictions for each alternate-shooter version of the test set.
pct_make_test_1, pct_make_test_2, pct_make_test_3, pct_make_test_4 = [
    make_pct_make(alt_x_test, model_train_loc, models_train)
    for alt_x_test in (x_test_1, x_test_2, x_test_3, x_test_4)
]

In [46]:
# Unweighted mean of the recorded-shooter prediction and the four
# alternate-shooter predictions. DataFrame addition aligns on index and column
# labels.
average_test_pred = (pct_make_test + pct_make_test_1 + pct_make_test_2 + pct_make_test_3 + pct_make_test_4) / 5

In [47]:
# Score the prediction averaged over all five shooter assignments.
score_preds(average_test_pred)

Miss 0.6892712827707342
3PT Field Goal 0.3669448863667952
2PT Field Goal 0.6385106658837749
0.5649089450071014


### Full Model

In [None]:
# Final shot-zone model, fit on every row (train and test together).
# 'neg_log_loss' replaces the deprecated 'log_loss' scorer alias, which warns
# on every scoring call.
model_loc = LogisticRegressionCV(multi_class='multinomial', scoring='neg_log_loss').fit(x_df_pos, y_loc)

In [None]:
# Final per-zone make/miss models, each fit on every shot from its zone.
models = {}
for class_ in tqdm.tqdm(y_loc.unique()):
    # Index labels of the shots taken from this zone.
    shot_index = y_loc.index[(y_loc == class_).values]
    x_train_sub, y_make_train_sub = x_df_pos.loc[shot_index], y_make.loc[shot_index]
    models[class_] = LogisticRegressionCV(scoring='neg_log_loss').fit(x_train_sub, y_make_train_sub)

In [None]:
# Zone-model coefficients: one row per zone class, one column per feature, with
# the intercepts added as an extra 'int' column; saved in msgpack format.
loc_coefs = pd.DataFrame(model_loc.coef_, columns=x_df_pos.columns, index=model_loc.classes_)
loc_coefs['int'] = model_loc.intercept_
loc_coefs.to_msgpack('full_loc_coefs.mp')

In [None]:
# Collect each per-zone model's coefficient vector (as a Series named after the
# zone) and its intercept, in the order the zones appear in y_loc.
model_ints = []
model_coefs = []
for class_ in tqdm.tqdm(y_loc.unique()):
    zone_model = models[class_]
    zone_coefs = pd.Series(zone_model.coef_[0], index=x_df_pos.columns, name=class_)
    model_coefs.append(zone_coefs)
    model_ints.append(zone_model.intercept_[0])

In [None]:
# Stack the per-zone coefficient Series side by side and transpose so each zone
# is a row, add the intercepts as an 'int' column, and save in msgpack format.
pct_coefs = pd.concat(model_coefs,axis=1).T
pct_coefs['int'] = model_ints
pct_coefs.to_msgpack('full_pct_coefs.mp')