This notebook explains how to perform document classification using the scikit-learn and Pandas libraries. Make sure the latest versions of both are installed in your Anaconda environment with the following commands:
# conda install scikit-learn
# conda install pandas
We use a variety of vectorizers to turn text documents into feature vectors and compare different classifier algorithms on these features.
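To make the idea concrete, here is a minimal sketch (with made-up sentences, assuming a recent scikit-learn) of how a CountVectorizer turns raw text into a bag-of-words count matrix:

from sklearn.feature_extraction.text import CountVectorizer

# Three toy documents, made up for illustration only
docs = ["free money now", "meeting at noon", "free meeting invite"]

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(docs)          # sparse count matrix, one row per document
print(vectorizer.get_feature_names_out())   # the learned vocabulary
print(X.toarray())                          # word counts per document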
The code is derived from notebooks published by Zac Stewart (http://zacstewart.com/2015/04/28/document-classification-with-scikit-learn.html) and Radim Rehurek (http://radimrehurek.com/data_science_python/).
We will work on two datasets: one of email messages classified as spam or ham (ham = legitimate, non-spam messages), and one of SMS messages, also classified as spam or ham.
The email spam messages are collected from:
To make the setup simpler, the two datasets are packaged into a single zip file at http://www.cs.bgu.ac.il/~elhadad/nlp16/spam.zip (107MB, containing about 60K files).
The SMS dataset is from:
%matplotlib inline
import os
import sys
import numpy
from pandas import DataFrame, concat
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold  # sklearn.cross_validation was removed in recent scikit-learn versions
from sklearn.metrics import confusion_matrix, f1_score
def progress(i, end_val, bar_length=50):
'''
Print a progress bar of the form: Percent: [##### ]
i is the current progress value expected in a range [0..end_val]
bar_length is the width of the progress bar on the screen.
'''
percent = float(i) / end_val
hashes = '#' * int(round(percent * bar_length))
spaces = ' ' * (bar_length - len(hashes))
sys.stdout.write("\rPercent: [{0}] {1}%".format(hashes + spaces, int(round(percent * 100))))
sys.stdout.flush()
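As a usage sketch, a loop over 1,000 items could report its progress every 100 iterations like this:

# Usage sketch for the helper above: report progress every 100 items out of 1000
for i in range(1000):
    if i % 100 == 0:
        progress(i, 1000)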
NEWLINE = '\n'
The email files are organized in folders, each containing only ham or only spam files. The following code loads the whole dataset into a Pandas DataFrame.
You should learn about Pandas by running the following notebooks:
HAM = 'ham'
SPAM = 'spam'
SOURCES = [
('data/spam', SPAM),
('data/easy_ham', HAM),
('data/hard_ham', HAM),
('data/beck-s', HAM),
('data/farmer-d', HAM),
('data/kaminski-v', HAM),
('data/kitchen-l', HAM),
('data/lokay-m', HAM),
('data/williams-w3', HAM),
('data/BG', SPAM),
('data/GP', SPAM),
('data/SH', SPAM)
]
SKIP_FILES = {'cmds'}
def read_files(path):
'''
Generator of pairs (filename, filecontent)
for all files below path whose name is not in SKIP_FILES.
The content of the file is of the form:
header....
<emptyline>
body...
This skips the headers and returns body only.
'''
    for root, dir_names, file_names in os.walk(path):
        # os.walk already descends into subdirectories, so no explicit recursion is needed
for file_name in file_names:
if file_name not in SKIP_FILES:
file_path = os.path.join(root, file_name)
if os.path.isfile(file_path):
past_header, lines = False, []
                    with open(file_path, encoding="latin-1") as f:
                        for line in f:
                            if past_header:
                                lines.append(line)
                            elif line == NEWLINE:
                                past_header = True
content = NEWLINE.join(lines)
yield file_path, content
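As a quick sanity check, the generator can be used to pull the first message out of one of the folders (this assumes the zip file has been extracted into a local data/ directory, as listed in SOURCES above):

# Sanity check: print the path and the first 200 characters of the first spam message found
first_path, first_body = next(read_files('data/spam'))
print(first_path)
print(first_body[:200])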
def build_data_frame(l, path, classification):
rows = []
index = []
for i, (file_name, text) in enumerate(read_files(path)):
        if ((i+l) % 100 == 0):
            # 58910 is the total number of files in the full dataset; it only scales the progress bar
            progress(i+l, 58910, 50)
rows.append({'text': text, 'class': classification})
index.append(file_name)
data_frame = DataFrame(rows, index=index)
return data_frame, len(rows)
def load_data():
    frames = []
    l = 0
    for path, classification in SOURCES:
        data_frame, nrows = build_data_frame(l, path, classification)
        frames.append(data_frame)
        l += nrows
    # DataFrame.append has been removed from recent pandas versions, so collect the frames and concatenate them
    data = concat(frames)
    # Shuffle the rows so that ham and spam messages are interleaved before cross-validation
    data = data.reindex(numpy.random.permutation(data.index))
    return data
# This should take about 2 minutes
data=load_data()
len(data)
data.describe()
def build_pipeline():
pipeline = Pipeline([
('count_vectorizer', CountVectorizer(ngram_range=(1, 2))),
('classifier', MultinomialNB())
])
return pipeline
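The pipeline chains the vectorizer and the classifier, so fit and predict both take raw strings end to end. A minimal sketch on made-up examples:

# Minimal sketch: the pipeline accepts raw strings for both fitting and prediction
toy_pipeline = build_pipeline()
toy_pipeline.fit(["win a free prize now", "are we still meeting for lunch?"], [SPAM, HAM])
print(toy_pipeline.predict(["free prize now"]))  # expected to print ['spam'] on this toy data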
def train(data = None, n_folds = 6):
if data is None:
print("Loading data...")
data = load_data()
print("Data loaded")
    k_fold = KFold(n_splits=n_folds)
pipeline = build_pipeline()
scores = []
confusion = numpy.array([[0, 0], [0, 0]])
print("Training with %d folds" % n_folds)
    for i, (train_indices, test_indices) in enumerate(k_fold.split(data)):
train_text = data.iloc[train_indices]['text'].values
train_y = data.iloc[train_indices]['class'].values.astype(str)
test_text = data.iloc[test_indices]['text'].values
test_y = data.iloc[test_indices]['class'].values.astype(str)
print("Training for fold %d" % i)
pipeline.fit(train_text, train_y)
print("Testing for fold %d" % i)
predictions = pipeline.predict(test_text)
confusion += confusion_matrix(test_y, predictions)
score = f1_score(test_y, predictions, pos_label=SPAM)
scores.append(score)
print("Score for %d: %2.2f" % (i, score))
print("Confusion matrix for %d: " % i)
print(confusion)
print('Total emails classified:', len(data))
print('Score:', sum(scores)/len(scores))
print('Confusion matrix:')
print(confusion)
return pipeline
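Besides F1, precision and recall for the spam class can be read directly off the aggregated confusion matrix. Since confusion_matrix sorts the string labels, row/column 0 is 'ham' and row/column 1 is 'spam'; a small sketch:

def spam_precision_recall(confusion):
    # Rows are true labels, columns are predicted labels, in sorted label order ('ham', 'spam')
    tp = confusion[1, 1]   # spam correctly flagged as spam
    fp = confusion[0, 1]   # ham incorrectly flagged as spam
    fn = confusion[1, 0]   # spam that slipped through as ham
    return tp / (tp + fp), tp / (tp + fn)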
from sklearn.linear_model import LogisticRegression
def build_pipeline2():
pipeline = Pipeline([
('count_vectorizer', CountVectorizer(ngram_range=(1, 2))),
('classifier', LogisticRegression())
])
return pipeline
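A natural further variation (not in the original notebooks) is to reweight the raw counts with TF-IDF before the classifier; the resulting pipeline can be plugged into the same k-fold training loop:

from sklearn.feature_extraction.text import TfidfTransformer

def build_pipeline_tfidf():
    # Sketch of a third variant: unigram/bigram counts -> TF-IDF weighting -> logistic regression
    return Pipeline([
        ('count_vectorizer', CountVectorizer(ngram_range=(1, 2))),
        ('tfidf_transformer', TfidfTransformer()),
        ('classifier', LogisticRegression())
    ])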
def train2(data = None, n_folds = 4):
if data is None:
print("Loading data...")
data = load_data()
print("Data loaded")
    k_fold = KFold(n_splits=n_folds)
pipeline = build_pipeline2()
scores = []
confusion = numpy.array([[0, 0], [0, 0]])
print("Training with %d folds" % n_folds)
    for i, (train_indices, test_indices) in enumerate(k_fold.split(data)):
train_text = data.iloc[train_indices]['text'].values
train_y = data.iloc[train_indices]['class'].values.astype(str)
test_text = data.iloc[test_indices]['text'].values
test_y = data.iloc[test_indices]['class'].values.astype(str)
print("Training for fold %d" % i)
pipeline.fit(train_text, train_y)
print("Testing for fold %d" % i)
predictions = pipeline.predict(test_text)
confusion += confusion_matrix(test_y, predictions)
score = f1_score(test_y, predictions, pos_label=SPAM)
scores.append(score)
print("Score for %d: %2.2f" % (i, score))
print("Confusion matrix for %d: " % i)
print(confusion)
print('Total emails classified:', len(data))
print('Score:', sum(scores)/len(scores))
print('Confusion matrix:')
print(confusion)
return pipeline
# This trains the two pipelines on our data (about 60K email messages)
# using count vectors over unigrams and bigrams, with k-fold cross-validation
# (4 folds by default for Logistic Regression, 6 folds for Multinomial Naive Bayes).
# Training takes about 5 minutes for Multinomial Naive Bayes and about 30 minutes for Logistic Regression.
pipeline = train2(data)
pipeline_nb = train(data)
data.describe()
# How many ham and spam messages do we have? (value_counts is available as a Series method)
data['class'].value_counts()
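The same counts can be expressed as a proportion to get a quick view of the class balance:

# Fraction of all messages labeled as spam
(data['class'] == SPAM).mean()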
We add a new column to our dataframe to represent the length of the messages.
data['length'] = data['text'].map(lambda text: len(text))
Let us explore the distribution of the message lengths:
data.length[data.length < 10000].plot(bins=100, kind='hist')
dsl = data.length[(data['class'] == 'spam') & (data.length < 10000)]
dhl = data.length[(data['class'] == 'ham') & (data.length < 10000)]
dsl.plot(bins=100, kind='hist')
dhl.plot(bins=100, kind='hist')
data.groupby('class').describe()
# All empty messages are marked as spam.
data[data.length == 0]['class'].value_counts()
data[(data.length > 20) & (data.length < 10000)].groupby('class').describe()