Download Links:
Jupyter Notebook (.ipynb)
Input Data (.tar.gz)
a. Divide the dataset into train, development, and test sets
import numpy as np
import pandas as pd
import tarfile
import matplotlib.pyplot as plt
import scipy as sp
import csv
import random
import math
import operator
import os
from collections import Counter
import nltk
labels = {'pos': 1, 'neg': 0}

# Collect rows in plain lists and build the DataFrames once at the end;
# DataFrame.append was deprecated and then removed in pandas 2.0.
train_rows = []
dev_test_rows = []

tar = tarfile.open("aclImdb_v1.tar.gz", "r:gz")
for member in tar.getmembers():
    f = tar.extractfile(member)
    if f is not None:
        content = f.read().decode('utf-8')
        if 'aclImdb/train/pos/' in member.name:
            train_rows.append([content, labels['pos']])
        elif 'aclImdb/train/neg/' in member.name:
            train_rows.append([content, labels['neg']])
        elif 'aclImdb/test/pos/' in member.name:
            dev_test_rows.append([content, labels['pos']])
        elif 'aclImdb/test/neg/' in member.name:
            dev_test_rows.append([content, labels['neg']])

df_train = pd.DataFrame(train_rows, columns=['review', 'sentiment'])
df_dev_test = pd.DataFrame(dev_test_rows, columns=['review', 'sentiment'])

# Route roughly 60% of the original IMDB test split to dev and the rest to
# test; the mask is unseeded, so the exact dev/test sizes vary per run.
spdf = np.random.rand(len(df_dev_test)) < 0.6
df_dev = df_dev_test[spdf]
df_test = df_dev_test[~spdf]
df_train_len = len(df_train)
print('total train records:', df_train_len)
df_train_pos_len = len(df_train[df_train['sentiment'] == 1])
df_train_neg_len = len(df_train[df_train['sentiment'] == 0])
prob_pos_train = df_train_pos_len / df_train_len
prob_neg_train = df_train_neg_len / df_train_len
print('positive records:', df_train_pos_len)
print('negative records:', df_train_neg_len)
print('prob positive records:', prob_pos_train)
print('prob negative records:', prob_neg_train)
print()
print('total dev records:', len(df_dev))
print('total test records:', len(df_test))
total train records: 25000
positive records: 12500
negative records: 12500
prob positive records: 0.5
prob negative records: 0.5
total dev records: 15059
total test records: 9941
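The Bernoulli mask above lands near a 60/40 split but not exactly, and since it is unseeded the dev/test sizes change on every run. A seeded permutation split is a reproducible alternative; a minimal sketch (the 0.6 fraction matches the notebook, the seed value is an arbitrary choice, and this is not the code that produced the counts above):

rng = np.random.RandomState(42)  # arbitrary seed, for reproducibility only
shuffled = rng.permutation(df_dev_test.index)
cut = int(0.6 * len(shuffled))  # exact 60/40 boundary
df_dev = df_dev_test.loc[shuffled[:cut]]
df_test = df_dev_test.loc[shuffled[cut:]]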
df_train.groupby('sentiment').count()
| sentiment | review |
|---|---|
| 0 | 12500 |
| 1 | 12500 |
df_dev.groupby('sentiment').count()
| sentiment | review |
|---|---|
| 0 | 7443 |
| 1 | 7616 |
df_test.groupby('sentiment').count()
| sentiment | review |
|---|---|
| 0 | 5057 |
| 1 | 4884 |
# Shuffle each split with a fixed seed so the saved CSVs are reproducible.
np.random.seed(0)
df_train = df_train.reindex(np.random.permutation(df_train.index))
df_train.to_csv('movie_data_train.csv', index=False, encoding='utf-8')

np.random.seed(0)
df_dev = df_dev.reindex(np.random.permutation(df_dev.index))
df_dev.to_csv('movie_data_dev.csv', index=False, encoding='utf-8')

np.random.seed(0)
df_test = df_test.reindex(np.random.permutation(df_test.index))
df_test.to_csv('movie_data_test.csv', index=False, encoding='utf-8')
df_train = pd.read_csv('movie_data_train.csv', encoding='utf-8')
df_train.head(5)

| | review | sentiment |
|---|---|---|
| 0 | Fräulein Doktor is as good a demonstration as ... | 1 |
| 1 | I watched this knowing almost nothing about it... | 0 |
| 2 | I must give How She Move a near-perfect rating... | 1 |
| 3 | The storyline is absurd and lame,also sucking ... | 0 |
| 4 | I watched Grendel the other night and am compe... | 0 |
df_dev = pd.read_csv('movie_data_dev.csv', encoding='utf-8')
df_dev.head(5)

| | review | sentiment |
|---|---|---|
| 0 | Scary Movie 1-4, Epic Movie, Date Movie, Meet ... | 0 |
| 1 | This is a funny, intelligent and, in a sense, ... | 1 |
| 2 | I give this movie 2 stars purely because of it... | 0 |
| 3 | When at the very start of the film Paleontolog... | 0 |
| 4 | I saw this movie awhile back and can't seem to... | 1 |
df_test = pd.read_csv('movie_data_test.csv', encoding='utf-8')
df_test.head(5)

| | review | sentiment |
|---|---|---|
| 0 | I've been a fan of Larry King's show for awhil... | 1 |
| 1 | THE FEELING of the need to have someone play t... | 1 |
| 2 | "Bride of Chucky" is one of the better horror ... | 1 |
| 3 | I just purchased this movie because I love to ... | 0 |
| 4 | This film is great - well written and very ent... | 1 |
doc_len = len(df_train)
print(doc_len)
25000
# Normalize the column names once per frame: strip whitespace, then drop any
# character that is not alphanumeric or an underscore. (Newer pandas requires
# regex=True explicitly for pattern-based str.replace.)
for frame in (df_train, df_dev, df_test):
    frame.columns = frame.columns.str.strip()
    frame.columns = frame.columns.str.replace(r"[^a-zA-Z\d\_]+", "", regex=True)
# Strip punctuation and leftover "/br" fragments; raw strings avoid invalid-
# escape warnings, and "$" is escaped so the literal character is removed.
punct = [";", ":", "=", r"\+", "<", ">", r"\?", "!", r"\\", "@", "#", r"\$",
         r"\*", "%", ",", r"\.", r"\(", r"\)", r"\[", r"\]", r"\{", r"\}",
         "\"", "/br"]
df_train = df_train.replace(punct, "", regex=True)
df_dev = df_dev.replace(punct, "", regex=True)
df_test = df_test.replace(punct, "", regex=True)
# Drop quote characters that sit at word boundaries.
df_train = df_train.replace(["' ", " '"], " ", regex=True)
df_dev = df_dev.replace(["' ", " '"], " ", regex=True)
df_test = df_test.replace(["' ", " '"], " ", regex=True)
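The pattern lists above work, but the same cleanup can be sketched as one compiled regex applied to the review column; this is an assumed-equivalent alternative, not the code the results below were produced with:

import re

# One character class covering the punctuation set above, plus the "/br" fragment.
PUNCT_RE = re.compile(r"[;:=+<>?!\\@#$*%,.()\[\]{}\"]|/br")

for frame in (df_train, df_dev, df_test):
    frame['review'] = frame['review'].str.replace(PUNCT_RE, "", regex=True)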
b. Build a vocabulary as a list
# Document frequencies: each word is counted at most once per review,
# overall and separately per sentiment class.
wordfreq = dict()
wordfreq_pos = dict()
wordfreq_neg = dict()

for ind in df_train.index:
    review_set = set(df_train['review'][ind].lower().split())
    for word in review_set:
        if word in wordfreq:
            wordfreq[word] += 1
        else:
            wordfreq[word] = 1
        if df_train['sentiment'][ind] == 1:
            if word in wordfreq_pos:
                wordfreq_pos[word] += 1
            else:
                wordfreq_pos[word] = 1
        else:
            if word in wordfreq_neg:
                wordfreq_neg[word] += 1
            else:
                wordfreq_neg[word] = 1

# Keep only words that appear in more than five reviews.
final_vocab = dict()
final_vocab_pos = dict()
final_vocab_neg = dict()

for word in wordfreq:
    if wordfreq[word] > 5:
        final_vocab[word] = wordfreq[word]
    if word in wordfreq_pos:
        if wordfreq_pos[word] > 5:
            final_vocab_pos[word] = wordfreq_pos[word]
    if word in wordfreq_neg:
        if wordfreq_neg[word] > 5:
            final_vocab_neg[word] = wordfreq_neg[word]

print("Total Vocab:", len(final_vocab))
Total Vocab: 27681
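Since collections.Counter is already imported, the same document-frequency counts can be written more compactly. An equivalent sketch of the counting loop above:

wordfreq = Counter()
wordfreq_pos = Counter()
wordfreq_neg = Counter()

for ind in df_train.index:
    review_set = set(df_train['review'][ind].lower().split())
    wordfreq.update(review_set)  # counts each word once per review
    if df_train['sentiment'][ind] == 1:
        wordfreq_pos.update(review_set)
    else:
        wordfreq_neg.update(review_set)

final_vocab = {w: c for w, c in wordfreq.items() if c > 5}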
c. Calculate the probabilities
# P(word): fraction of all training reviews containing the word.
# P(word | class): fraction of that class's reviews containing the word.
prob_word = dict()
prob_word_g_pos = dict()
prob_word_g_neg = dict()

for word in final_vocab:
    prob_word[word] = final_vocab[word] / doc_len
    if word in final_vocab_pos:
        prob_word_g_pos[word] = final_vocab_pos[word] / df_train_pos_len
    if word in final_vocab_neg:
        prob_word_g_neg[word] = final_vocab_neg[word] / df_train_neg_len
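For reference, the evaluation below applies the standard Naive Bayes decision rule; the per-word probabilities are accumulated as logarithms to limit floating-point underflow, then exponentiated and weighted by the class prior:

$$P(c \mid \text{review}) \;\propto\; P(c) \prod_{w \in \text{review}} P(w \mid c) \;=\; P(c)\,\exp\Big(\sum_{w \in \text{review}} \log P(w \mid c)\Big)$$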
d. Calculate accuracy using the dev dataset
accuracy_normal = []
df_dev_arr = np.array_split(df_dev, 5)
ctr = 0

print("Accuracy over 5 dev subsets:")
for df in df_dev_arr:
    df = df.copy()  # work on a copy so adding a column avoids SettingWithCopyWarning
    count = 0
    ctr += 1
    predicted_sentiments = []
    prob_pos_g_wir = dict()
    prob_neg_g_wir = dict()
    for ind in df.index:
        numPos = 0.00
        numNeg = 0.00
        review_set = set(df['review'][ind].lower().split())
        for word in review_set:
            if word in prob_word:
                # A word absent from one class's vocabulary resets the other
                # class's log-sum to 0 (a product of 1), crudely treating it
                # as decisive evidence and discarding the terms so far.
                if word not in prob_word_g_pos:
                    numNeg = 0
                elif word not in prob_word_g_neg:
                    numPos = 0
                else:
                    numPos = numPos + math.log(prob_word_g_pos[word])
                    numNeg = numNeg + math.log(prob_word_g_neg[word])
        # Leave log space and weight by the class prior.
        prob_pos_g_wir[ind] = pow(math.e, numPos) * prob_pos_train
        prob_neg_g_wir[ind] = pow(math.e, numNeg) * prob_neg_train
        if prob_pos_g_wir[ind] < prob_neg_g_wir[ind]:
            predicted_sentiments.append(0)
        else:
            predicted_sentiments.append(1)
    df['prediction'] = predicted_sentiments
    for ind in df.index:
        if df['sentiment'][ind] == df['prediction'][ind]:
            count += 1
    accuracy = count / len(df)
    accuracy_normal.append(accuracy)
    print(ctr, ": Accuracy df_dev:", accuracy * 100, "%")
Accuracy over 5 dev subsets:
1 : Accuracy df_dev: 61.254980079681275 %
2 : Accuracy df_dev: 61.75298804780876 %
3 : Accuracy df_dev: 63.081009296148736 %
4 : Accuracy df_dev: 62.35059760956175 %
5 : Accuracy df_dev: 62.570574559946856 %
e.1 Compare the effect of Smoothing
# Laplace (add-one) smoothing over the filtered vocabulary.
prob_word_g_pos_smooth = dict()
prob_word_g_neg_smooth = dict()

for word in final_vocab:
    if word in final_vocab_pos:
        prob_word_g_pos_smooth[word] = (final_vocab_pos[word] + 1) / (df_train_pos_len + len(final_vocab))
    if word in final_vocab_neg:
        prob_word_g_neg_smooth[word] = (final_vocab_neg[word] + 1) / (df_train_neg_len + len(final_vocab))
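This is the usual add-one (Laplace) estimate: one is added to each count and the vocabulary size to the denominator, so no in-vocabulary word receives an extreme probability:

$$P_{\text{smooth}}(w \mid c) = \frac{\text{count}(w, c) + 1}{N_c + |V|}$$

where $N_c$ is the number of training reviews of class $c$ and $|V|$ is the vocabulary size.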
accuracy_smooth = []
df_dev_arr = np.array_split(df_dev, 5)
ctr = 0

print("Accuracy after smoothing over 5 dev subsets:")
for df in df_dev_arr:
    df = df.copy()
    count = 0
    ctr += 1
    predicted_sentiments = []
    prob_pos_g_wir = dict()
    prob_neg_g_wir = dict()
    for ind in df.index:
        numPos = 0.00
        numNeg = 0.00
        review_set = set(df['review'][ind].lower().split())
        for word in review_set:
            if word in prob_word:
                # The smoothed tables share their keys with the unsmoothed
                # ones, so the membership checks can stay on the originals.
                if word not in prob_word_g_pos:
                    numNeg = 0
                elif word not in prob_word_g_neg:
                    numPos = 0
                else:
                    numPos = numPos + math.log(prob_word_g_pos_smooth[word])
                    numNeg = numNeg + math.log(prob_word_g_neg_smooth[word])
        prob_pos_g_wir[ind] = pow(math.e, numPos) * prob_pos_train
        prob_neg_g_wir[ind] = pow(math.e, numNeg) * prob_neg_train
        if prob_pos_g_wir[ind] < prob_neg_g_wir[ind]:
            predicted_sentiments.append(0)
        else:
            predicted_sentiments.append(1)
    df['prediction'] = predicted_sentiments
    for ind in df.index:
        if df['sentiment'][ind] == df['prediction'][ind]:
            count += 1
    accuracy = count / len(df)
    accuracy_smooth.append(accuracy)
    print(ctr, ": Accuracy df_dev:", accuracy * 100, "%")
Accuracy after smoothing over 5 dev subsets:
1 : Accuracy df_dev: 61.05577689243028 %
2 : Accuracy df_dev: 61.68658698539177 %
3 : Accuracy df_dev: 63.01460823373174 %
4 : Accuracy df_dev: 62.21779548472776 %
5 : Accuracy df_dev: 62.3380936565925 %
# Compare subset by subset: count which variant scored higher on each.
betterNormal = 0
betterSmoothing = 0

for i in range(len(accuracy_normal)):
    if accuracy_normal[i] > accuracy_smooth[i]:
        betterNormal += 1
    else:
        betterSmoothing += 1

if betterNormal > betterSmoothing:
    print("For the given dev dataset, accuracy is better without smoothing")
else:
    print("For the given dev dataset, accuracy is better with smoothing")
For the given dev dataset, accuracy is better without smoothing
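Subset-wise voting is one way to compare the two variants; averaging the subset accuracies is another, and is less sensitive to a single lopsided subset. A quick check using the two lists built above:

print("mean accuracy, no smoothing:", np.mean(accuracy_normal))
print("mean accuracy, smoothing:   ", np.mean(accuracy_smooth))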
e.2 Derive the top 10 words that predict the positive and negative classes
# Invert the class-conditional probabilities with Bayes' rule to get
# P(class | word) for every vocabulary word.
prob_pos_given_word = dict()
prob_neg_given_word = dict()

for word in final_vocab:
    if word in final_vocab_pos:
        prob_pos_given_word[word] = (prob_word_g_pos[word] * prob_pos_train) / prob_word[word]
    if word in final_vocab_neg:
        prob_neg_given_word[word] = (prob_word_g_neg[word] * prob_neg_train) / prob_word[word]
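The ranking score is a direct application of Bayes' rule to the estimates from part c:

$$P(c \mid w) = \frac{P(w \mid c)\,P(c)}{P(w)}$$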
print("Top 10 words predicting positive class:")
prob_pos_given_word = sorted(prob_pos_given_word.items(), key=operator.itemgetter(1), reverse=True)
prob_pos_given_word[:10]
Top 10 words predicting positive class:
[('doktor', 1.0),
('mccartney', 1.0),
('brownstone', 1.0),
('unwillingly', 1.0),
('nord', 1.0),
("gilliam's", 1.0),
('stitzer', 1.0),
('apatow', 1.0),
('edie', 1.0),
('shimmering', 1.0)]
print("Top 10 words predicting negative class:")
prob_neg_given_word = sorted(prob_neg_given_word.items(), key=operator.itemgetter(1), reverse=True)
prob_neg_given_word[:10]
Top 10 words predicting negative class:
[('recoil', 1.0),
('clowns', 1.0),
('unintended', 1.0),
('dorff', 1.0),
('slater', 1.0),
('kareena', 1.0),
('atari', 1.0),
('kargil', 1.0),
('weisz', 1.0),
('2/10', 1.0)]
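Every word listed scores exactly 1.0, which just means all of its occurrences fall in a single class; rare class-exclusive words (mostly proper nouns) always saturate the ratio. One way to surface more informative words is to rank only words above a higher document-frequency floor; the cutoff below is an arbitrary illustration, not part of the original notebook:

min_df = 50  # hypothetical cutoff, chosen only for illustration
common_neg = [(w, p) for w, p in prob_neg_given_word if wordfreq[w] >= min_df]
print(common_neg[:10])  # prob_neg_given_word is already sorted descending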
f. Calculate accuracy using the test dataset
predicted_sentiments = []
prob_pos_g_wir = dict()
prob_neg_g_wir = dict()

# Use the smoothed estimates only if they won the dev comparison; both
# class-conditional tables have to be swapped together.
if betterNormal < betterSmoothing:
    prob_word_g_pos = prob_word_g_pos_smooth
    prob_word_g_neg = prob_word_g_neg_smooth

for ind in df_test.index:
    numPos = 0.00
    numNeg = 0.00
    review_set = set(df_test['review'][ind].lower().split())
    for word in review_set:
        if word in prob_word:
            if word not in prob_word_g_pos:
                numNeg = 0
            elif word not in prob_word_g_neg:
                numPos = 0
            else:
                numPos = numPos + math.log(prob_word_g_pos[word])
                numNeg = numNeg + math.log(prob_word_g_neg[word])
    prob_pos_g_wir[ind] = pow(math.e, numPos) * prob_pos_train
    prob_neg_g_wir[ind] = pow(math.e, numNeg) * prob_neg_train
    if prob_pos_g_wir[ind] < prob_neg_g_wir[ind]:
        predicted_sentiments.append(0)
    else:
        predicted_sentiments.append(1)

df_test['prediction'] = predicted_sentiments

count = 0
for ind in df_test.index:
    if df_test['sentiment'][ind] == df_test['prediction'][ind]:
        count += 1

accuracy = count / len(df_test)
print("Accuracy df_test:", accuracy * 100, "%")
Accuracy df_test: 61.41233276330349 %
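Beyond the single accuracy number, a confusion table shows which class the model misses more often. A short sketch using the prediction column just added:

# Rows are the true labels, columns the predicted labels.
print(pd.crosstab(df_test['sentiment'], df_test['prediction'],
                  rownames=['actual'], colnames=['predicted']))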