# Import statements
import sys
import os
import re
import csv
import codecs
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Conv1D, Embedding, Dropout, Activation
from keras.layers import Bidirectional, GlobalMaxPooling1D, GlobalAveragePooling1D, concatenate, SpatialDropout1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers
# --- Data loading -----------------------------------------------------------
# Read the training and test CSVs and remember where the pre-trained GloVe
# vectors live (the GloVe file itself is read later).
train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv')
GLOVE_EMBEDDING_FILE = '../../library/glove/glove.840B.300d.txt'

# Notebook-style inspection: preview the first rows of the training frame.
train.head()

# Notebook-style inspection: per-column flags for missing values in both frames.
(train.isnull().any(), test.isnull().any())

# --- Labels and raw text ----------------------------------------------------
# The six label columns; each comment can carry any subset of them.
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
train_labels = train[list_classes].values

# Raw comment text for both splits.
train_data, test_data = train["comment_text"], test["comment_text"]
# Approach: bidirectional LSTM on top of GloVe embeddings.
#
# Preparing the text for the LSTM takes three steps:
#   1. Tokenization         - break each comment into words.
#   2. Indexing             - build a word -> integer index dictionary.
#   3. Index representation - rewrite each comment as the sequence of its
#                             word indices; that numeric sequence is the
#                             feature vector fed to the LSTM.

# Maximum number of words kept in the dictionary.
max_features = 20000

# The tokenizer is fitted on the training comments only, then applied to both
# the training and the test comments.
tokenizer = Tokenizer(lower=True, num_words=max_features)
tokenizer.fit_on_texts(list(train_data))
feature_tokenized_train, feature_tokenized_test = (
    tokenizer.texts_to_sequences(texts) for texts in (train_data, test_data)
)
# All feature vectors must have the same length. To choose that length, look
# at the distribution of tokens per training comment.
totalNumWords = list(map(len, feature_tokenized_train))
length_bins = np.arange(0, 400, 10)  # bin edges 0, 10, ..., 390
plt.hist(totalNumWords, bins=length_bins)
plt.show()

# Sequence length chosen after inspecting the histogram: 150 tokens.
maxLen = 150

# Bring every tokenized comment (train and test) to exactly maxLen entries.
final_train_data, final_test_data = (
    pad_sequences(sequences, maxlen=maxLen)
    for sequences in (feature_tokenized_train, feature_tokenized_test)
)
# GloVe embedding: read the pre-trained vectors into a word -> vector dict.
# Each line of the file is "<token> <v1> <v2> ... <v_embedding_size>".
embedding_size = 300
embedding_index = {}
with open(GLOVE_EMBEDDING_FILE, encoding='utf8') as f:
    for line in f:
        # Split from the right exactly `embedding_size` times so that the
        # last `embedding_size` fields are the coefficients and everything
        # before them is the token. A plain split on ' ' breaks on tokens
        # that themselves contain spaces: their pieces end up in the
        # coefficient list and the float conversion raises ValueError.
        values = line.rstrip().rsplit(' ', embedding_size)
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embedding_index[word] = coefs
# Build the initial embedding weights: row i holds the GloVe vector of the
# word whose tokenizer index is i.
word_index = tokenizer.word_index

# Never allocate more rows than the dictionary limit. The "+ 1" presumably
# accounts for tokenizer indices starting at 1 -- TODO confirm.
num_words = min(max_features, len(word_index) + 1)
embedding_matrix = np.zeros((num_words, embedding_size))

for word, i in word_index.items():
    # Only words inside the dictionary limit get a row.
    if i < max_features:
        embedding_vector = embedding_index.get(word)
        # Words missing from GloVe keep their all-zero row.
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
# Bidirectional LSTM model

# Input layer: one fixed-length sequence of word indices per comment.
ip = Input(shape=(maxLen,))

# Embedding layer, initialised from the GloVe-based embedding_matrix and
# fine-tuned during training (trainable=True).
# The vocabulary size is taken from the weight matrix itself: the matrix has
# min(max_features, len(word_index) + 1) rows, so hard-coding max_features
# here would mismatch the weights whenever the vocabulary is smaller than
# max_features.
x = Embedding(
    embedding_matrix.shape[0],
    embedding_size,
    weights=[embedding_matrix],
    trainable=True,
)(ip)

# Bidirectional LSTM layer. With return_sequences=True and the default
# concatenating merge, the output is (batch, maxLen, 2 * 128).
x = Bidirectional(LSTM(128, return_sequences=True, dropout=0.15, recurrent_dropout=0.15))(x)

# 1D convolution over the sequence axis (64 filters, window of 3, no padding).
x = Conv1D(64, kernel_size=3, padding='valid', kernel_initializer='glorot_uniform')(x)

# Global average and max pooling collapse the sequence axis to reduce
# dimensionality; the two pooled vectors are concatenated.
avg_pool = GlobalAveragePooling1D()(x)
max_pool = GlobalMaxPooling1D()(x)
x = concatenate([avg_pool, max_pool])

# Final dense layer with sigmoid activation: one independent probability per
# class (num_classes = 6).
x = Dense(6, activation='sigmoid')(x)

# Define the model and the optimization process.
model = Model(inputs=ip, outputs=x)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# Training hyper-parameters: mini-batches of 32 comments, one pass over the
# training data.
batch_size, epochs = 32, 1

# Fit on the padded training sequences, then print the layer-by-layer summary.
model.fit(
    final_train_data,
    train_labels,
    epochs=epochs,
    batch_size=batch_size,
    verbose=1,
)
model.summary()
# Predictions: score the padded test sequences with the trained model.
predictions = model.predict(final_test_data, verbose=1, batch_size=batch_size)

# Fill the six label columns of the sample submission with the predicted
# values and write the result to disk without the index column.
submission = pd.read_csv('../input/sample_submission.csv')
target_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
submission[target_columns] = predictions
submission.to_csv('submission_bilstm_glove.csv', index=False)
# Prediction accuracy (test set), as reported by the author: 98.46%