This is an example showing how the scikit-learn can be used to classify documents by topics using a bag-of-words approach. This example uses a scipy.sparse matrix to store the features instead of standard numpy arrays.
The dataset used in this example is the 20 newsgroups dataset and should be downloaded from the http://mlcomp.org (free registration required): http://mlcomp.org/datasets/379
Once downloaded unzip the archive somewhere on your filesystem. For instance in:
% mkdir -p ~/data/mlcomp
% cd ~/data/mlcomp
% unzip /path/to/dataset-379-20news-18828_XXXXX.zip
You should get a folder ~/data/mlcomp/379 with a file named metadata and subfolders raw, train and test holding the text documents organized by newsgroups.
Then set the MLCOMP_DATASETS_HOME environment variable pointing to the root folder holding the uncompressed archive:
% export MLCOMP_DATASETS_HOME="~/data/mlcomp"
Alternatively, you can execute this code (assuming the Data folder is located under the folder in which your notebook is running):
import os
os.environ['MLCOMP_DATASETS_HOME']='Data/mlcomp'
The statistics of the dataset are:
20 newsgroups dataset for document classification
Source: http://people.csail.mit.edu/jrennie/20Newsgroups
Format: DocumentClassification
User: pliang
Size: 64M
#train 13180
#test 5648
#labels 20
# Author: Olivier Grisel <olivier.grisel@ensta.org>
# License: BSD 3 clause
%matplotlib inline
from __future__ import print_function
from time import time
import sys
import os
import numpy as np
import scipy.sparse as sp
import matplotlib.pyplot as plt
from sklearn.datasets import load_mlcomp
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB
SKLearn has built-in utilities to load datasets from the MLCOMP repository.
# Load the training set
print("Loading 20 newsgroups training set... ")
news_train = load_mlcomp('20news-18828', 'train')
print(news_train.DESCR)
print("%d documents" % len(news_train.filenames))
print("%d categories" % len(news_train.target_names))
We now vectorize the dataset using SKLearn vectorizers. This is the stage of feature extraction. We use the TF-IDF feature model.
print("Extracting features from the dataset using a sparse vectorizer")
t0 = time()
vectorizer = TfidfVectorizer(encoding='latin1')
X_train = vectorizer.fit_transform((open(f, encoding='latin1').read() for f in news_train.filenames))
print("done in %fs" % (time() - t0))
print("n_samples: %d, n_features: %d" % X_train.shape)
assert sp.issparse(X_train)
y_train = news_train.target
Let us load the test dataset:
print("Loading 20 newsgroups test set... ")
news_test = load_mlcomp('20news-18828', 'test')
t0 = time()
print("done in %fs" % (time() - t0))
print("Predicting the labels of the test set...")
print("%d documents" % len(news_test.filenames))
print("%d categories" % len(news_test.target_names))
We also extract features from the test dataset using the same vectorizer as for training:
print("Extracting features from the dataset using the same vectorizer")
t0 = time()
X_test = vectorizer.transform((open(f, encoding='latin1').read() for f in news_test.filenames))
y_test = news_test.target
print("done in %fs" % (time() - t0))
print("n_samples: %d, n_features: %d" % X_test.shape)
Let us now benchmark different classifiers:
def benchmark(clf_class, params, name):
print("parameters:", params)
t0 = time()
clf = clf_class(**params).fit(X_train, y_train)
print("done in %fs" % (time() - t0))
if hasattr(clf, 'coef_'):
print("Percentage of non zeros coef: %f" % (np.mean(clf.coef_ != 0) * 100))
print("Predicting the outcomes of the testing set")
t0 = time()
pred = clf.predict(X_test)
print("done in %fs" % (time() - t0))
print("Classification report on test set for classifier:")
print(clf)
print()
print(classification_report(y_test, pred, target_names=news_test.target_names))
cm = confusion_matrix(y_test, pred)
print("Confusion matrix:")
print(cm)
# Show confusion matrix
plt.matshow(cm)
plt.title('Confusion matrix of the %s classifier' % name)
plt.colorbar()
print("Testbenching a linear classifier...")
parameters = {
'loss': 'hinge',
'penalty': 'l2',
'n_iter': 50,
'alpha': 0.00001,
'fit_intercept': True,
}
benchmark(SGDClassifier, parameters, 'SGD')
print("Testbenching a MultinomialNB classifier...")
parameters = {'alpha': 0.01}
benchmark(MultinomialNB, parameters, 'MultinomialNB')