Bi-LSTM Model with GloVe Embedding for Comment Classification [Keras - TensorFlow]
In [1]:
# Import statements
import sys
import os
import re
import csv
import codecs
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Conv1D, Embedding, Dropout, Activation
from keras.layers import Bidirectional, GlobalMaxPooling1D, GlobalAveragePooling1D, concatenate, SpatialDropout1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers
Using TensorFlow backend.
In [2]:
# Load train and test files
train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv')
GLOVE_EMBEDDING_FILE = '../../library/glove/glove.840B.300d.txt'
In [3]:
train.head()
Out[3]:
id comment_text toxic severe_toxic obscene threat insult identity_hate
0 0000997932d777bf Explanation\nWhy the edits made under my usern... 0 0 0 0 0 0
1 000103f0d9cfb60f D'aww! He matches this background colour I'm s... 0 0 0 0 0 0
2 000113f07ec002fd Hey man, I'm really not trying to edit war. It... 0 0 0 0 0 0
3 0001b41b1c6bb37e "\nMore\nI can't make any real suggestions on ... 0 0 0 0 0 0
4 0001d958c54c6e35 You, sir, are my hero. Any chance you remember... 0 0 0 0 0 0
In [4]:
# Check for null values 
train.isnull().any(), test.isnull().any()
Out[4]:
(id               False
 comment_text     False
 toxic            False
 severe_toxic     False
 obscene          False
 threat           False
 insult           False
 identity_hate    False
 dtype: bool, id              False
 comment_text    False
 dtype: bool)
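Both files come back clean, so no imputation is needed here. If the input files could ever contain missing comments, a defensive fill is a one-liner; a minimal sketch (a no-op on this data, since the null check above came back all-False):

# Replace any missing comments with a placeholder string
# (no-op here: the null check above showed no missing values)
train["comment_text"] = train["comment_text"].fillna("_na_")
test["comment_text"] = test["comment_text"].fillna("_na_")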
In [5]:
# Split labels and data 
list_classes = ["toxic","severe_toxic","obscene","threat","insult","identity_hate"]
train_labels = train[list_classes].values
train_data = train["comment_text"]
test_data = test["comment_text"]
In [6]:
# Approach: Bidirectional LSTM with GloVe embeddings
# To prepare the data for the LSTM, we use the following steps:
# 1. Tokenization
#    Break each comment down into its individual words
# 2. Indexing
#    Put the words in a dictionary-like structure and give every word an index
# 3. Index representation
#    Represent each comment as its sequence of word indices and feed
#    that sequence to the LSTM
#    (this turns every comment into a numeric feature vector)


# Define max number of words in dictionary
max_features = 20000 
tokenizer = Tokenizer(num_words=max_features, lower=True)
tokenizer.fit_on_texts(list(train_data))
feature_tokenized_train = tokenizer.texts_to_sequences(train_data)
feature_tokenized_test = tokenizer.texts_to_sequences(test_data)

# Pad all feature vectors to the same length.
# To pick a sensible length, look at the distribution of comment lengths:

totalNumWords = [len(feature) for feature in feature_tokenized_train]
plt.hist(totalNumWords, bins=np.arange(0, 400, 10))
plt.show()

# Based on the histogram, the vast majority of comments fit within 150 tokens
maxLen = 150
final_train_data = pad_sequences(feature_tokenized_train, maxlen=maxLen)
final_test_data = pad_sequences(feature_tokenized_test, maxlen=maxLen)


# GloVe Embedding
# Build a word -> 300-d vector lookup from the pretrained GloVe file.
# glove.840B.300d contains a few multi-token keys, so take the last
# embedding_size fields as the vector rather than assuming the word
# is the single field at position 0
embedding_size = 300
embedding_index = {}
with open(GLOVE_EMBEDDING_FILE, encoding='utf8') as f:
    for line in f:
        values = line.rstrip().rsplit(' ')
        word = ' '.join(values[:-embedding_size])
        coefs = np.asarray(values[-embedding_size:], dtype='float32')
        embedding_index[word] = coefs
# Build the embedding matrix for the kept vocabulary;
# words missing from GloVe keep an all-zero row
word_index = tokenizer.word_index
num_words = min(max_features, len(word_index) + 1)
embedding_matrix = np.zeros((num_words, embedding_size))

for word, i in word_index.items():
    if i >= max_features:
        continue
    embedding_vector = embedding_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
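To make the pipeline above concrete, here is a small illustration (not part of the original run) of what tokenization, indexing, and padding produce for one comment, plus a rough check of how much of the kept vocabulary GloVe actually covers; the printed index values are hypothetical:

# A short comment becomes a list of word indices, then is left-padded
# with zeros to length maxLen (Keras pads at the front by default)
sample = tokenizer.texts_to_sequences(["you sir are my hero"])
print(sample)                                # e.g. [[7, 2814, 33, 58, 2347]]
print(pad_sequences(sample, maxlen=maxLen))  # 145 zeros, then those 5 indices

# Rough embedding coverage over the kept vocabulary: words missing from
# GloVe keep their all-zero rows in embedding_matrix
covered = sum(1 for w, i in word_index.items()
              if i < max_features and w in embedding_index)
print(covered / (num_words - 1))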
In [7]:
# Bidirectional LSTM Model
# Input Layer
ip = Input(shape=(maxLen, ))
# Embedding Layer
# Maps each word index to its pretrained GloVe vector; trainable=True
# lets the embedding weights be fine-tuned during training.
# input_dim must match the number of rows in embedding_matrix, i.e. num_words
x = Embedding(num_words, embedding_size, weights=[embedding_matrix], trainable=True)(ip)

# Bidirectional LSTM Layer; output shape is (None, 150, 256) = (batch, maxLen, 2*128)
x = Bidirectional(LSTM(128, return_sequences=True, dropout=0.15, recurrent_dropout=0.15))(x)
# 1-D convolution over the LSTM output sequence
x = Conv1D(64, kernel_size=3, padding='valid', kernel_initializer='glorot_uniform')(x)

# Global average and max pooling --> reduce the sequence dimension;
# their concatenation gives a single 128-d feature vector
avg_pool = GlobalAveragePooling1D()(x)
max_pool = GlobalMaxPooling1D()(x)
x = concatenate([avg_pool, max_pool])

# Final Dense layer with sigmoid activation; output size is num_classes=6
# (sigmoid rather than softmax since the six labels are not mutually exclusive)
x = Dense(6, activation='sigmoid')(x)


# Define model and optimization process
model = Model(inputs=ip, outputs=x)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
batch_size = 32
epochs = 1
model.fit(final_train_data, train_labels, batch_size=batch_size, epochs=epochs, verbose=1)

model.summary()
Epoch 1/1
159571/159571 [==============================] - 1317s 8ms/step - loss: 0.0490 - acc: 0.9820
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
==================================================================================================
input_1 (InputLayer)            (None, 150)          0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 150, 300)     6000000     input_1[0][0]                    
__________________________________________________________________________________________________
bidirectional_1 (Bidirectional) (None, 150, 256)     439296      embedding_1[0][0]                
__________________________________________________________________________________________________
conv1d_1 (Conv1D)               (None, 148, 64)      49216       bidirectional_1[0][0]            
__________________________________________________________________________________________________
global_average_pooling1d_1 (Glo (None, 64)           0           conv1d_1[0][0]                   
__________________________________________________________________________________________________
global_max_pooling1d_1 (GlobalM (None, 64)           0           conv1d_1[0][0]                   
__________________________________________________________________________________________________
concatenate_1 (Concatenate)     (None, 128)          0           global_average_pooling1d_1[0][0] 
                                                                 global_max_pooling1d_1[0][0]     
__________________________________________________________________________________________________
dense_1 (Dense)                 (None, 6)            774         concatenate_1[0][0]              
==================================================================================================
Total params: 6,489,286
Trainable params: 6,489,286
Non-trainable params: 0
__________________________________________________________________________________________________
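One epoch at 0.982 training accuracy looks strong, but accuracy is a weak signal on these heavily imbalanced labels (most comments carry no flags), and the competition metric is mean column-wise ROC AUC. A sketch of a more informative check, assuming you carve a validation split off the training set and fit on X_tr/y_tr instead of the full data:

# Hold out 10% of the training data and score mean column-wise ROC AUC
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

X_tr, X_val, y_tr, y_val = train_test_split(
    final_train_data, train_labels, test_size=0.1, random_state=42)
# ... fit the model on (X_tr, y_tr) instead of the full-data fit above ...
val_pred = model.predict(X_val, batch_size=batch_size)
print(np.mean([roc_auc_score(y_val[:, j], val_pred[:, j]) for j in range(6)]))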
In [8]:
# Predictions
predictions = model.predict(final_test_data, batch_size=batch_size, verbose=1)
submission = pd.read_csv('../input/sample_submission.csv')
submission[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']] = predictions
submission.to_csv('submission_bilstm_glove.csv', index=False)
# Reported prediction accuracy on the test set: 98.46%
153164/153164 [==============================] - 250s 2ms/step
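A quick sanity check on the submission file before uploading (an extra inspection step, not part of the original run):

# Confirm the scores are valid probabilities and the file looks sane
print(submission.head())
assert submission[list_classes].values.min() >= 0.0
assert submission[list_classes].values.max() <= 1.0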