# Import Statements
import pandas as pd
import time
import numpy as np
from sklearn.model_selection import train_test_split
import lightgbm as lgb
import gc
import matplotlib.pyplot as plt
import os
# Extract time delta features
def do_next_click(df, agg_suffix='nextClick', agg_type='float32'):
    print("\nExtracting time-to-next-click features...\n")
GROUP_BY_NEXT_CLICKS = [
{'groupby': ['ip', 'app', 'device', 'os', 'channel']},
{'groupby': ['ip', 'os', 'device', 'app']},
{'groupby': ['app', 'device', 'channel']}
]
# Calculate the time to next click for each group
for spec in GROUP_BY_NEXT_CLICKS:
# Name of new feature
new_feature = '{}_{}'.format('_'.join(spec['groupby']),agg_suffix)
# Unique list of features to select
all_features = spec['groupby'] + ['click_time']
        # Time to the next click within the group; total_seconds() avoids the
        # 24-hour wraparound that .dt.seconds would give for gaps longer than a day
        df[new_feature] = (df[all_features].groupby(spec['groupby']).click_time.shift(-1)
                           - df.click_time).dt.total_seconds().astype(agg_type)
gc.collect()
    return df
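
# Illustrative sketch (toy frame, not part of the pipeline): two clicks from the
# same (ip, app, device, os, channel) seven seconds apart give the first row a
# nextClick of 7.0; the last click in each group has no successor, hence NaN.
#   toy = pd.DataFrame({'ip': [1, 1], 'app': [3, 3], 'device': [1, 1],
#                       'os': [0, 0], 'channel': [5, 5],
#                       'click_time': pd.to_datetime(['2017-11-06 14:00:00',
#                                                     '2017-11-06 14:00:07'])})
#   toy = do_next_click(toy)
#   toy['ip_app_device_os_channel_nextClick']  # -> [7.0, NaN]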
# Extract aggregate features
# Extract count feature using different columns
def count_feat( df, group_cols, agg_type='uint16', show_max=False, show_agg=True ):
agg_name='{}count'.format('_'.join(group_cols))
if show_agg:
        print("\nAggregating by", group_cols, '... and saved in', agg_name)
    gp = df[group_cols].groupby(group_cols).size().rename(agg_name).to_frame().reset_index()
df = df.merge(gp, on=group_cols, how='left')
del gp
if show_max:
print( agg_name + " max value = ", df[agg_name].max() )
df[agg_name] = df[agg_name].astype(agg_type)
gc.collect()
    return df
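
# Illustrative sketch (toy values): three clicks, two sharing the same (ip, app).
#   toy = pd.DataFrame({'ip': [1, 1, 2], 'app': [3, 3, 4]})
#   toy = count_feat(toy, ['ip', 'app'])
#   toy['ip_appcount']  # -> [2, 2, 1]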
# Extract unique count feature using different cols
def count_unique( df, group_cols, counted, agg_type='uint8', show_max=False, show_agg=True ):
agg_name= '{}_by_{}_countuniq'.format(('_'.join(group_cols)),(counted))
if show_agg:
        print("\nCounting unique", counted, "by", group_cols, '... and saved in', agg_name)
gp = df[group_cols+[counted]].groupby(group_cols)[counted].nunique().reset_index().rename(columns={counted:agg_name})
df = df.merge(gp, on=group_cols, how='left')
del gp
if show_max:
print( agg_name + " max value = ", df[agg_name].max() )
df[agg_name] = df[agg_name].astype(agg_type)
gc.collect()
    return df
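
# Illustrative sketch (toy values): ip 1 clicks on two distinct channels.
#   toy = pd.DataFrame({'ip': [1, 1, 1, 2], 'channel': [5, 5, 7, 5]})
#   toy = count_unique(toy, ['ip'], 'channel')
#   toy['ip_by_channel_countuniq']  # -> [2, 2, 2, 1]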
# Extract cumulative count feature from different cols
def cumulative_count( df, group_cols, counted,agg_type='uint16', show_max=False, show_agg=True ):
agg_name= '{}_by_{}_cumcount'.format(('_'.join(group_cols)),(counted))
if show_agg:
        print("\nCumulative count by", group_cols, '... and saved in', agg_name)
gp = df[group_cols+[counted]].groupby(group_cols)[counted].cumcount()
df[agg_name]=gp.values
del gp
if show_max:
print( agg_name + " max value = ", df[agg_name].max() )
df[agg_name] = df[agg_name].astype(agg_type)
gc.collect()
    return df
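
# Illustrative sketch (toy values): the feature is the running index of each row
# within its (ip, device, os) group; the 'counted' column only picks the series,
# its values do not affect the result.
#   toy = pd.DataFrame({'ip': [1, 1, 1], 'device': [0, 0, 0],
#                       'os': [0, 0, 0], 'app': [3, 3, 4]})
#   toy = cumulative_count(toy, ['ip', 'device', 'os'], 'app')
#   toy['ip_device_os_by_app_cumcount']  # -> [0, 1, 2]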
# LightGBM Model
def lgb_modelfit_nocv(params, dtrain, dvalid, predictors, target='target', objective='binary', metrics='auc',
feval=None, early_stopping_rounds=50, num_boost_round=3000, verbose_eval=10, categorical_features=None):
lgb_params = {
'boosting_type': 'gbdt',
'objective': objective,
'metric':metrics,
'learning_rate': 0.05,
'num_leaves': 31, # we should let it be smaller than 2^(max_depth)
'max_depth': -1, # -1 means no limit
        'min_child_samples': 20,  # Minimum number of data points needed in a child (min_data_in_leaf)
        'max_bin': 255,  # Number of bucketed bins for feature values
        'subsample': 0.6,  # Subsample ratio of the training instances
        'subsample_freq': 0,  # Frequency for bagging; <= 0 disables it
        'colsample_bytree': 0.3,  # Subsample ratio of columns when constructing each tree
        'min_child_weight': 5,  # Minimum sum of instance weight (hessian) needed in a child (leaf)
        'subsample_for_bin': 200000,  # Number of samples used to construct bins
        'min_split_gain': 0,  # Minimum gain to make a split (min_gain_to_split)
'reg_alpha': 0, # L1 regularization term on weights
'reg_lambda': 0, # L2 regularization term on weights
'nthread': 8,
'verbose': 0,
}
lgb_params.update(params)
print("preparing validation datasets")
xgtrain = lgb.Dataset(dtrain[predictors].values, label=dtrain[target].values,
feature_name=predictors,
categorical_feature=categorical_features
)
xgvalid = lgb.Dataset(dvalid[predictors].values, label=dvalid[target].values,
feature_name=predictors,
categorical_feature=categorical_features
)
del dtrain
del dvalid
gc.collect()
evals_results = {}
    bst1 = lgb.train(lgb_params,
                     xgtrain,
                     valid_sets=[xgvalid],
                     valid_names=['valid'],
                     evals_result=evals_results,
                     num_boost_round=num_boost_round,
                     early_stopping_rounds=early_stopping_rounds,
                     verbose_eval=verbose_eval,
                     feval=feval)
print("\nModel Report")
print("bst1.best_iteration: ", bst1.best_iteration)
print(metrics+":", evals_results['valid'][metrics][bst1.best_iteration-1])
return (bst1,bst1.best_iteration)
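
# Note: lgb.train dropped the early_stopping_rounds, verbose_eval and
# evals_result keyword arguments in LightGBM 4.x. On newer versions the
# equivalent call (a sketch assuming the 4.x callback API, untested here) is:
#   bst1 = lgb.train(lgb_params, xgtrain,
#                    valid_sets=[xgvalid], valid_names=['valid'],
#                    num_boost_round=num_boost_round, feval=feval,
#                    callbacks=[lgb.early_stopping(early_stopping_rounds),
#                               lgb.log_evaluation(verbose_eval),
#                               lgb.record_evaluation(evals_results)])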
# Main Script File
def DO(frm, to, nchunk):
    # Note: nchunk is currently unused inside the function, and val_size is
    # read from module scope (set in the final-run block at the bottom)
dtypes = {
'ip' : 'uint32',
'app' : 'uint16',
'device' : 'uint8',
'os' : 'uint16',
'channel' : 'uint16',
'is_attributed' : 'uint8',
'click_id' : 'uint32',
}
print('loading train data...',frm,to)
train_df = pd.read_csv("../input/train.csv", parse_dates=['click_time'], skiprows=range(1,frm), nrows=to-frm, dtype=dtypes, usecols=['ip','app','device','os', 'channel', 'click_time', 'is_attributed'])
print('loading test data...')
test_df = pd.read_csv("../input/test_supplement.csv", parse_dates=['click_time'], dtype=dtypes, usecols=['ip','app','device','os', 'channel', 'click_time', 'click_id'])
df_converted = pd.DataFrame()
# Load chunks of 5 million
chunksize = (10 ** 6)*5
chunk_ct = 0
# Filter values that have 'is_attributed'==1, and merge these values into one dataframe
for chunk in pd.read_csv('../input/train.csv', chunksize=chunksize, parse_dates=['click_time'], dtype=dtypes, usecols=['ip','app','device','os', 'channel', 'click_time', 'is_attributed']):
        filtered = chunk[chunk['is_attributed'] == 1]
        df_converted = pd.concat([df_converted, filtered], ignore_index=True)
        chunk_ct = chunk_ct + 5  # each chunk covers 5 million rows
        if chunk_ct == 135:  # stop after scanning 135 million rows (27 chunks)
            break
print("\nEntries with attr=1 size: ", len(df_converted))
    # DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent
    train_df = pd.concat([df_converted, train_df], ignore_index=True)
    len_train = len(train_df)
    train_df = pd.concat([train_df, test_df], ignore_index=True)
del test_df,df_converted
gc.collect()
    # click_time was already parsed by read_csv, so no to_datetime round trip is needed
    train_df['hour'] = train_df.click_time.dt.hour.astype('int8')
    train_df['day'] = train_df.click_time.dt.day.astype('int8')
    train_df = do_next_click(train_df, agg_suffix='nextClick', agg_type='float32'); gc.collect()
# Count unique features
train_df = count_unique( train_df, ['ip'], 'channel' ); gc.collect()
train_df = count_unique( train_df, ['ip'], 'app'); gc.collect()
train_df = count_unique( train_df, ['ip'], 'device'); gc.collect()
# Cumulative count features
train_df = cumulative_count( train_df, ['ip', 'device', 'os'], 'app'); gc.collect()
# Count features
train_df = count_feat( train_df, ['ip', 'day', 'hour'] ); gc.collect()
train_df = count_feat( train_df, ['ip', 'app']); gc.collect()
train_df = count_feat( train_df, ['ip', 'app', 'os']); gc.collect()
del train_df['day']
gc.collect()
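    # The concatenated frame is laid out positionally as
    #   [converted clicks + sampled train | last val_size train rows | test rows],
    # so the three pieces can be recovered with plain positional slices: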
test_df = train_df[len_train:]
val_df = train_df[(len_train-val_size):len_train]
train_df = train_df[:(len_train-val_size)]
    print("\nCompleted feature extraction")
predictors = ['ip_by_app_countuniq', 'ip_appcount', 'hour', 'ip_device_os_by_app_cumcount', 'app', 'ip_app_device_os_channel_nextClick', 'ip_by_channel_countuniq', 'ip_by_device_countuniq', 'ip_day_hourcount', 'channel', 'device', 'ip_app_oscount', 'os', 'app_device_channel_nextClick', 'ip_os_device_app_nextClick']
print("\n Predictor size: ", len(predictors))
target = 'is_attributed'
categorical = ['app', 'device', 'os', 'channel', 'hour']
sub = pd.DataFrame()
sub['click_id'] = test_df['click_id'].astype('int')
gc.collect()
print("Training...")
start_time = time.time()
params = {
        'learning_rate': 0.03,
        'num_leaves': 31,  # should stay below 2^max_depth
        'max_depth': -1,  # -1 means no limit
        'min_child_samples': 100,  # Minimum number of data points needed in a child (min_data_in_leaf)
        'max_bin': 100,  # Number of bucketed bins for feature values
        'subsample': 0.7,  # Subsample ratio of the training instances
        'subsample_freq': 1,  # Frequency for bagging; <= 0 disables it
        'colsample_bytree': 0.9,  # Subsample ratio of columns when constructing each tree
        'min_child_weight': 0,  # Minimum sum of instance weight (hessian) needed in a child (leaf)
        'scale_pos_weight': 200  # the training data is extremely unbalanced
}
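    # Note (commentary, not from the original): scale_pos_weight compensates for
    # the very low positive rate; it helps ranking metrics such as AUC but skews
    # the raw scores, so the submission values should not be read as calibrated
    # click-conversion probabilities.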
(bst,best_iteration) = lgb_modelfit_nocv(params,
train_df,
val_df,
predictors,
target,
objective='binary',
metrics='auc',
early_stopping_rounds=50,
verbose_eval=True,
num_boost_round=10000,
categorical_features=categorical)
print('[{}]: model training time'.format(time.time() - start_time))
del train_df
del val_df
gc.collect()
print("\nPredicting...")
sub['is_attributed'] = bst.predict(test_df[predictors],num_iteration=best_iteration)
del test_df
gc.collect()
# Merge test supplement and test
    test_supp_df = pd.read_csv("../input/test_supplement.csv", parse_dates=['click_time'], dtype=dtypes, usecols=['ip','app','device','os', 'channel', 'click_time', 'click_id'])
    test_df = pd.read_csv("../input/test.csv", parse_dates=['click_time'], dtype=dtypes, usecols=['ip','app','device','os', 'channel', 'click_time', 'click_id'])
    # Attach the predictions computed above; sub rows are in supplement order
    test_supp_df['is_attributed'] = sub['is_attributed'].values
print(test_supp_df.head(5))
del sub
gc.collect()
    print('\nProjecting predictions onto test')
join_cols = ['ip', 'app', 'device', 'os', 'channel', 'click_time']
all_cols = join_cols + ['is_attributed']
test_df = test_df.merge(test_supp_df[all_cols], how='left', on=join_cols)
test_df = test_df.drop_duplicates(subset=['click_id'])
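    # The supplement can contain several rows identical on all join columns, so
    # the left merge may fan out; keeping one row per click_id restores exactly
    # one prediction per test click.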
print("\nWriting the submission data into a csv file...")
test_df[['click_id', 'is_attributed']].to_csv('sub_ft_v20.csv', index=False)
print("\nDone...")
# Final Run
# train.csv holds 184,903,890 data rows; train on the last 150M and hold out
# the final 15M of those for validation
nrows = 184903891 - 1
nchunk = 150000000
val_size = 15000000
frm = nrows - nchunk
to = frm + nchunk
DO(frm, to, nchunk)
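
# Hypothetical smoke test (not from the original script): exercise the whole
# pipeline on a small tail slice before committing to the full 150M-row run,
# with val_size shrunk to match:
#   val_size = 100000
#   DO(frm=nrows - 1000000, to=nrows, nchunk=1000000)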