Ad Fraud Detection using LightGBM (Light Gradient Boosting Machine)
In [1]:
# Import Statements
import pandas as pd
import time
import numpy as np
from sklearn.model_selection import train_test_split
import lightgbm as lgb
import gc
import matplotlib.pyplot as plt
import os
In [2]:
# Extract time delta features
def do_next_Click( df,agg_suffix='nextClick', agg_type='float32'):
    print(">> \nExtracting time calculation features...\n")
    GROUP_BY_NEXT_CLICKS = [
    {'groupby': ['ip', 'app', 'device', 'os', 'channel']},
    {'groupby': ['ip', 'os', 'device', 'app']},    
    {'groupby': ['app', 'device', 'channel']}
    ]

    # Calculate the time to next click for each group
    for spec in GROUP_BY_NEXT_CLICKS:   
       # Name of new feature
        new_feature = '{}_{}'.format('_'.join(spec['groupby']),agg_suffix)    
        # Unique list of features to select
        all_features = spec['groupby'] + ['click_time']
        # Run calculation
        df[new_feature] = (df[all_features].groupby(spec[
            'groupby']).click_time.shift(-1) - df.click_time).dt.seconds.astype(agg_type)        
        gc.collect()
    return (df)
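As an illustration (toy data, not part of the training run), the next-click delta gives each click the number of seconds until the same group clicks again, with the last click of a group left as NaN. A minimal sketch:

# Hypothetical sanity check of do_next_Click on a tiny frame
toy = pd.DataFrame({
    'ip':      [1, 1, 1, 2],
    'app':     [3, 3, 3, 3],
    'device':  [1, 1, 1, 1],
    'os':      [13, 13, 13, 13],
    'channel': [497, 497, 497, 497],
    'click_time': pd.to_datetime(['2017-11-07 09:00:00', '2017-11-07 09:00:05',
                                  '2017-11-07 09:01:00', '2017-11-07 09:00:00'])
})
toy = do_next_Click(toy)
print(toy[['ip', 'ip_os_device_app_nextClick']])
# ip=1 rows get 5.0 and 55.0 seconds; the last click of each group is NaN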
In [3]:
# Extract aggregate features

# Extract count feature using different columns
def count_feat( df, group_cols, agg_type='uint16', show_max=False, show_agg=True ):
    agg_name='{}count'.format('_'.join(group_cols))  
    if show_agg:
        print( "\nAggregating by ", group_cols ,  '... and saved in', agg_name )
    gp = df[group_cols].groupby(group_cols).size().rename(agg_name).to_frame().reset_index()
    df = df.merge(gp, on=group_cols, how='left')
    del gp
    if show_max:
        print( agg_name + " max value = ", df[agg_name].max() )
    df[agg_name] = df[agg_name].astype(agg_type)
    gc.collect()
    return( df )
    

# Extract unique count feature using different cols
def count_unique( df, group_cols, counted, agg_type='uint8', show_max=False, show_agg=True ):
    agg_name= '{}_by_{}_countuniq'.format(('_'.join(group_cols)),(counted))  
    if show_agg:
        print( "\nCounting unqiue ", counted, " by ", group_cols ,  '... and saved in', agg_name )
    gp = df[group_cols+[counted]].groupby(group_cols)[counted].nunique().reset_index().rename(columns={counted:agg_name})
    df = df.merge(gp, on=group_cols, how='left')
    del gp
    if show_max:
        print( agg_name + " max value = ", df[agg_name].max() )
    df[agg_name] = df[agg_name].astype(agg_type)
    gc.collect()
    return( df )


# Extract cumulative count feature  from different cols    
def cumulative_count( df, group_cols, counted,agg_type='uint16', show_max=False, show_agg=True ):
    agg_name= '{}_by_{}_cumcount'.format(('_'.join(group_cols)),(counted)) 
    if show_agg:
        print( "\nCumulative count by ", group_cols , '... and saved in', agg_name  )
    gp = df[group_cols+[counted]].groupby(group_cols)[counted].cumcount()
    df[agg_name]=gp.values
    del gp
    if show_max:
        print( agg_name + " max value = ", df[agg_name].max() )
    df[agg_name] = df[agg_name].astype(agg_type)
    gc.collect()
    return( df )
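A quick toy example (made-up rows) of what the three aggregate helpers produce:

# Hypothetical illustration of the count / unique-count / cumulative-count features
toy = pd.DataFrame({'ip': [1, 1, 1, 2], 'app': [10, 10, 20, 10]})
toy = count_feat(toy, ['ip', 'app'], show_agg=False)        # clicks per (ip, app)
toy = count_unique(toy, ['ip'], 'app', show_agg=False)      # distinct apps per ip
toy = cumulative_count(toy, ['ip'], 'app', show_agg=False)  # running click index per ip
print(toy)
# ip_appcount         -> [2, 2, 1, 1]
# ip_by_app_countuniq -> [2, 2, 2, 1]
# ip_by_app_cumcount  -> [0, 1, 2, 0]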
In [4]:
# LightGBM Model
def lgb_modelfit_nocv(params, dtrain, dvalid, predictors, target='target', objective='binary', metrics='auc',
                 feval=None, early_stopping_rounds=50, num_boost_round=3000, verbose_eval=10, categorical_features=None):
    lgb_params = {
        'boosting_type': 'gbdt',
        'objective': objective,
        'metric':metrics,
        'learning_rate': 0.05,
        'num_leaves': 31,  # we should let it be smaller than 2^(max_depth)
        'max_depth': -1,  # -1 means no limit
        'min_child_samples': 20,  # Minimum number of samples needed in a leaf (min_data_in_leaf)
        'max_bin': 255,  # Number of bins used to bucket feature values
        'subsample': 0.6,  # Subsample ratio of the training instances
        'subsample_freq': 0,  # Frequency of subsampling; <= 0 disables it
        'colsample_bytree': 0.3,  # Subsample ratio of columns when constructing each tree
        'min_child_weight': 5,  # Minimum sum of instance weight (hessian) needed in a leaf
        'subsample_for_bin': 200000,  # Number of samples used to construct bins
        'min_split_gain': 0,  # Minimum gain required to make a split (min_gain_to_split)
        'reg_alpha': 0,  # L1 regularization term on weights
        'reg_lambda': 0,  # L2 regularization term on weights
        'nthread': 8,
        'verbose': 0,
    }

    lgb_params.update(params)

    print("preparing validation datasets")

    xgtrain = lgb.Dataset(dtrain[predictors].values, label=dtrain[target].values,
                          feature_name=predictors,
                          categorical_feature=categorical_features
                          )
    xgvalid = lgb.Dataset(dvalid[predictors].values, label=dvalid[target].values,
                          feature_name=predictors,
                          categorical_feature=categorical_features
                          )
    del dtrain
    del dvalid
    gc.collect()

    evals_results = {}

    bst1 = lgb.train(lgb_params, 
                     xgtrain, 
                     valid_sets=[ xgvalid], 
                     valid_names=['valid'], 
                     evals_result=evals_results, 
                     num_boost_round=num_boost_round,
                     early_stopping_rounds=early_stopping_rounds,
                     verbose_eval=verbose_eval, 
                     feval=feval)

    print("\nModel Report")
    print("bst1.best_iteration: ", bst1.best_iteration)
    print(metrics+":", evals_results['valid'][metrics][bst1.best_iteration-1])
    return (bst1,bst1.best_iteration)
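A minimal smoke test of the wrapper on synthetic data (the f1/f2 columns and the random target are illustrative only; the real call below uses the click features):

# Hypothetical smoke test: the wrapper expects train/valid DataFrames, a predictor list and a target column
rng = np.random.RandomState(0)
demo = pd.DataFrame({'f1': rng.rand(1000), 'f2': rng.rand(1000)})
demo['target'] = (demo['f1'] + 0.1 * rng.randn(1000) > 0.5).astype('uint8')
demo_train, demo_valid = train_test_split(demo, test_size=0.2, random_state=0)
bst, best_it = lgb_modelfit_nocv({'learning_rate': 0.1}, demo_train, demo_valid,
                                 predictors=['f1', 'f2'], target='target',
                                 num_boost_round=50, early_stopping_rounds=10,
                                 categorical_features=[])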
In [5]:
# Main Script File
def DO(frm,to,nchunk):
    dtypes = {
            'ip'            : 'uint32',
            'app'           : 'uint16',
            'device'        : 'uint8',
            'os'            : 'uint16',
            'channel'       : 'uint16',
            'is_attributed' : 'uint8',
            'click_id'      : 'uint32',
            }

    print('loading train data...',frm,to)
    train_df = pd.read_csv("../input/train.csv", parse_dates=['click_time'], skiprows=range(1,frm), nrows=to-frm, dtype=dtypes, usecols=['ip','app','device','os', 'channel', 'click_time', 'is_attributed'])

    print('loading test data...')
    test_df = pd.read_csv("../input/test_supplement.csv", parse_dates=['click_time'], dtype=dtypes, usecols=['ip','app','device','os', 'channel', 'click_time', 'click_id'])

    df_converted = pd.DataFrame()
    
    # Load chunks of 5 million
    chunksize = (10 ** 6)*5
    chunk_ct = 0
    # Filter values that have 'is_attributed'==1, and merge these values into one dataframe
    for chunk in pd.read_csv('../input/train.csv', chunksize=chunksize, parse_dates=['click_time'], dtype=dtypes, usecols=['ip','app','device','os', 'channel', 'click_time', 'is_attributed']):
        filtered = chunk[chunk['is_attributed'] == 1]
        df_converted = pd.concat([df_converted, filtered], ignore_index=True)
        chunk_ct = chunk_ct + 5
        if chunk_ct == 135:  # stop after the first 135 million rows
            break


    print("\nEntries with attr=1 size: ", len(df_converted))        
    train_df = pd.concat([df_converted, train_df])

    len_train = len(train_df)
    train_df = pd.concat([train_df, test_df])
    
    del test_df,df_converted        
    gc.collect()

    train_df['hour'] = pd.to_datetime(train_df.click_time).dt.hour.astype('int8')
    train_df['day'] = pd.to_datetime(train_df.click_time).dt.day.astype('int8') 
    train_df = do_next_Click( train_df,agg_suffix='nextClick', agg_type='float32'  ); gc.collect()
    
    # Count unique features
    train_df = count_unique( train_df, ['ip'], 'channel' ); gc.collect() 
    train_df = count_unique( train_df, ['ip'], 'app'); gc.collect() 
    train_df = count_unique( train_df, ['ip'], 'device'); gc.collect() 
    
    # Cumulative count features
    train_df = cumulative_count( train_df, ['ip', 'device', 'os'], 'app'); gc.collect() 
    
    # Count features
    train_df = count_feat( train_df, ['ip', 'day', 'hour'] ); gc.collect() 
    train_df = count_feat( train_df, ['ip', 'app']); gc.collect() 
    train_df = count_feat( train_df, ['ip', 'app', 'os']); gc.collect() 

    del train_df['day']
    gc.collect()
    
    test_df = train_df[len_train:]
    val_df = train_df[(len_train-val_size):len_train]
    train_df = train_df[:(len_train-val_size)]
    
    print("\n Completed Feature extraction")
        
    predictors = ['ip_by_app_countuniq', 'ip_appcount', 'hour', 'ip_device_os_by_app_cumcount',
                  'app', 'ip_app_device_os_channel_nextClick', 'ip_by_channel_countuniq',
                  'ip_by_device_countuniq', 'ip_day_hourcount', 'channel', 'device',
                  'ip_app_oscount', 'os', 'app_device_channel_nextClick', 'ip_os_device_app_nextClick']
    print("\n Predictor size: ", len(predictors))
    target = 'is_attributed'
    categorical = ['app', 'device', 'os', 'channel', 'hour']
    
    sub = pd.DataFrame()
    sub['click_id'] = test_df['click_id'].astype('int')

    gc.collect()

    print("Training...")
    start_time = time.time()

    params = {
        'learning_rate': 0.03, # was 0.01
        'num_leaves': 31,  # was 7; should stay below 2^max_depth
        'max_depth': -1,  # -1 means no limit
        'min_child_samples': 100,  # Minimum number of samples needed in a leaf (min_data_in_leaf)
        'max_bin': 100,  # Number of bins used to bucket feature values
        'subsample': 0.7,  # Subsample ratio of the training instances
        'subsample_freq': 1,  # Frequency of subsampling; <= 0 disables it
        'colsample_bytree': 0.9,  # Subsample ratio of columns when constructing each tree
        'min_child_weight': 0,  # Minimum sum of instance weight (hessian) needed in a leaf
        'scale_pos_weight': 200  # the training data is extremely imbalanced
    }
    (bst,best_iteration) = lgb_modelfit_nocv(params, 
                            train_df, 
                            val_df, 
                            predictors, 
                            target, 
                            objective='binary', 
                            metrics='auc',
                            early_stopping_rounds=50,
                            verbose_eval=10, 
                            num_boost_round=10000,
                            categorical_features=categorical)

    print('[{}]: model training time'.format(time.time() - start_time))
    del train_df
    del val_df
    gc.collect()

    print("\nPredicting...")
    sub['is_attributed'] = bst.predict(test_df[predictors],num_iteration=best_iteration)
    
    del test_df
    gc.collect()
    
    # Merge test supplement and test
    test_supp_df = pd.read_csv("../input/test_supplement.csv", parse_dates=['click_time'], dtype=dtypes, usecols=['ip','app','device','os', 'channel', 'click_time', 'click_id'])
    test_df = pd.read_csv("../input/test.csv", parse_dates=['click_time'], dtype=dtypes, usecols=['ip','app','device','os', 'channel', 'click_time', 'click_id'])

    # Load previously saved test_supplement predictions from disk (replaces the 'sub' built above)
    sub = pd.read_pickle('../input/new_features_v8/sub_test_suppl_v1.pkl')

    test_supp_df['is_attributed'] = sub['is_attributed'].values
    print(test_supp_df.head(5))

    del sub
    gc.collect()

    print('\nprojecting prediction onto test')

    join_cols = ['ip', 'app', 'device', 'os', 'channel', 'click_time']
    all_cols = join_cols + ['is_attributed']

    test_df = test_df.merge(test_supp_df[all_cols], how='left', on=join_cols)

    test_df = test_df.drop_duplicates(subset=['click_id'])

    print("\nWriting the submission data into a csv file...")

    test_df[['click_id', 'is_attributed']].to_csv('sub_ft_v20.csv', index=False)
    
    print("\nDone...")
    
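A toy sketch (hypothetical rows) of how DO projects the supplement predictions onto test.csv: a left merge on the raw click columns, then de-duplication by click_id so each test row keeps a single prediction.

# Hypothetical rows illustrating the merge + drop_duplicates step above
supp = pd.DataFrame({'ip': [1, 1], 'app': [3, 3], 'device': [1, 1], 'os': [13, 13],
                     'channel': [497, 497],
                     'click_time': pd.to_datetime(['2017-11-10 04:00:00'] * 2),
                     'is_attributed': [0.02, 0.02]})
test_toy = pd.DataFrame({'click_id': [0], 'ip': [1], 'app': [3], 'device': [1],
                         'os': [13], 'channel': [497],
                         'click_time': pd.to_datetime(['2017-11-10 04:00:00'])})
cols = ['ip', 'app', 'device', 'os', 'channel', 'click_time']
merged = test_toy.merge(supp[cols + ['is_attributed']], how='left', on=cols)
merged = merged.drop_duplicates(subset=['click_id'])  # duplicate supplement matches collapse to one row
print(merged[['click_id', 'is_attributed']])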
In [6]:
# Final Run
nrows=184903891-1
nchunk=150000000 
val_size=15000000

frm=nrows-nchunk
to=frm+nchunk

DO(frm,to,nchunk)
loading train data... 34903890 184903890
loading test data...

Entries with attr=1 size:  329016
>> 
Extracting time calculation features...


Counting unique  channel  by  ['ip'] ... and saved in ip_by_channel_countuniq

Counting unique  app  by  ['ip'] ... and saved in ip_by_app_countuniq

Counting unique  device  by  ['ip'] ... and saved in ip_by_device_countuniq

Cumulative count by  ['ip', 'device', 'os'] ... and saved in ip_device_os_by_app_cumcount

Aggregating by  ['ip', 'day', 'hour'] ... and saved in ip_day_hourcount

Aggregating by  ['ip', 'app'] ... and saved in ip_appcount

Aggregating by  ['ip', 'app', 'os'] ... and saved in ip_app_oscount

 Completed Feature extraction

 Predictor size:  15
Training...
preparing validation datasets
Training until validation scores don't improve for 50 rounds.
[10]  valid's auc: 0.974725
[20]  valid's auc: 0.976168
[30]  valid's auc: 0.976537
[40]  valid's auc: 0.977305
[50]  valid's auc: 0.97866
[60]  valid's auc: 0.979369
[70]  valid's auc: 0.979895
[80]  valid's auc: 0.980684
[90]  valid's auc: 0.981349
[100] valid's auc: 0.98191
[110] valid's auc: 0.982401
[120] valid's auc: 0.982872
[130] valid's auc: 0.98325
[140] valid's auc: 0.983483
[150] valid's auc: 0.98382
[160] valid's auc: 0.984082
[170] valid's auc: 0.98429
[180] valid's auc: 0.984545
[190] valid's auc: 0.98467
[200] valid's auc: 0.984681
[210] valid's auc: 0.98432
[220] valid's auc: 0.984075
[230] valid's auc: 0.983857
[240] valid's auc: 0.983308
Early stopping, best iteration is:
[197] valid's auc: 0.984723

Model Report
bst1.best_iteration:  197
auc: 0.9847231310436954
[1329.5972707271576]: model training time

projecting prediction onto test

Writing the submission data into a csv file...

Done...