Classification

We apply classification tools to solve NLP tasks.

We start with a very simple task that looks at words in isolation and assigns each of them one of two labels: gender identification. The task consists of guessing whether a given name is masculine or feminine.

The classification method consists of taking as input an observation, turning this observation into a feature vector, then predicting the label of this feature vector by applying a trained classifier model.

To prepare for this procedure, we must train a classifier. In supervised learning, a classifier is learned by generalizing from a set of observed pairs $(\text{observation}_i, \text{label}_i)$ for $i = 1, \dots, N$.

In [1]:
%matplotlib inline

def gender_features(word):
    """Return the feature dict for a name: its final character.

    Args:
        word: The name to featurize.

    Returns:
        A dict with the single key 'last_letter' mapped to the last
        character of ``word``, or to '' when ``word`` is empty.
    """
    # Slice instead of index so an empty string yields '' rather than raising
    # IndexError; for non-empty input the result equals word[-1].
    return {'last_letter': word[-1:]}
# Try the extractor on a single name.
gender_features('Shrek')
Out[1]:
{'last_letter': 'k'}
In [2]:
from nltk.corpus import names
import random

# Pair every name from the corpus with the label of the file it came from:
# first the male names, then the female names appended after them.
labeled_names = [(name, 'male') for name in names.words('male.txt')]
labeled_names += [(name, 'female') for name in names.words('female.txt')]

# Shuffle in place so the two labels are mixed before slicing into splits.
random.shuffle(labeled_names)
In [3]:
# Report how many (name, label) pairs were collected.
print("There are %s samples in the dataset." % (len(labeled_names)))
There are 7944 samples in the dataset.
In [4]:
import nltk

# Convert each (name, label) pair into a (feature dict, label) pair.
featuresets = [(gender_features(name), label) for name, label in labeled_names]

# Hold out the first 500 examples for testing; train on everything after them.
test_set = featuresets[:500]
train_set = featuresets[500:]
classifier = nltk.NaiveBayesClassifier.train(train_set)
In [5]:
# Classify two example names with the trained model; each name goes through
# the same feature extractor that was used to build the training set.
print("Neo is classified as %s" % (classifier.classify(gender_features('Neo'))))

print("Trinity is classified as %s" % (classifier.classify(gender_features('Trinity'))))
Neo is classified as male
Trinity is classified as female
In [6]:
# Evaluate the trained classifier on the held-out test set.
print(nltk.classify.accuracy(classifier, test_set))
0.788
In [7]:
# Print the 5 features the model reports as most informative.
classifier.show_most_informative_features(5)
Most Informative Features
             last_letter = 'a'            female : male   =     34.5 : 1.0
             last_letter = 'k'              male : female =     31.7 : 1.0
             last_letter = 'f'              male : female =     15.9 : 1.0
             last_letter = 'p'              male : female =     11.9 : 1.0
             last_letter = 'v'              male : female =      9.8 : 1.0
In [8]:
def gender_features2(name):
    """Return a richer feature dict for a name.

    The features are the lowercased first and last characters plus, for each
    letter a-z, how many times it occurs in the lowercased name and whether
    it occurs at all.

    Args:
        name: The name to featurize.

    Returns:
        A dict with the keys 'first_letter' and 'last_letter', and a
        'count(x)' / 'has(x)' pair for every lowercase ASCII letter x.
        For an empty name the first/last letters are '', every count is 0
        and every 'has' flag is False.
    """
    # Lowercase once up front instead of twice per letter inside the loop.
    lowered = name.lower()
    features = {}
    # Slice rather than index so an empty name does not raise IndexError;
    # for non-empty names this equals name[0].lower() / name[-1].lower().
    features["first_letter"] = name[:1].lower()
    features["last_letter"] = name[-1:].lower()
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        occurrences = lowered.count(letter)
        features["count(%s)" % letter] = occurrences
        # A single letter is contained exactly when it occurs at least once.
        features["has(%s)" % letter] = occurrences > 0
    return features
In [9]:
# Inspect the full feature dict produced for one name.
gender_features2('John') 
Out[9]:
{'first_letter': 'j',
 'last_letter': 'n',
 'count(a)': 0,
 'has(a)': False,
 'count(b)': 0,
 'has(b)': False,
 'count(c)': 0,
 'has(c)': False,
 'count(d)': 0,
 'has(d)': False,
 'count(e)': 0,
 'has(e)': False,
 'count(f)': 0,
 'has(f)': False,
 'count(g)': 0,
 'has(g)': False,
 'count(h)': 1,
 'has(h)': True,
 'count(i)': 0,
 'has(i)': False,
 'count(j)': 1,
 'has(j)': True,
 'count(k)': 0,
 'has(k)': False,
 'count(l)': 0,
 'has(l)': False,
 'count(m)': 0,
 'has(m)': False,
 'count(n)': 1,
 'has(n)': True,
 'count(o)': 1,
 'has(o)': True,
 'count(p)': 0,
 'has(p)': False,
 'count(q)': 0,
 'has(q)': False,
 'count(r)': 0,
 'has(r)': False,
 'count(s)': 0,
 'has(s)': False,
 'count(t)': 0,
 'has(t)': False,
 'count(u)': 0,
 'has(u)': False,
 'count(v)': 0,
 'has(v)': False,
 'count(w)': 0,
 'has(w)': False,
 'count(x)': 0,
 'has(x)': False,
 'count(y)': 0,
 'has(y)': False,
 'count(z)': 0,
 'has(z)': False}
In [10]:
# Rebuild the feature sets with the richer extractor, keeping the same
# 500-example test split, then retrain and report test accuracy.
featuresets = [(gender_features2(name), label) for name, label in labeled_names]
test_set = featuresets[:500]
train_set = featuresets[500:]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))
0.784
In [11]:
# Three-way split of the labeled names, listed in slice order:
#   [0, 500)      -> final test set
#   [500, 1500)   -> dev-test set used for error analysis
#   [1500, end)   -> training set
test_names = labeled_names[:500]
devtest_names = labeled_names[500:1500]
train_names = labeled_names[1500:]
In [12]:
# Featurize the three name splits the same way, one list per split.
train_set, devtest_set, test_set = (
    [(gender_features(name), label) for name, label in split]
    for split in (train_names, devtest_names, test_names)
)

# Train on the training split only and measure accuracy on the dev-test split.
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, devtest_set))
0.747
In [13]:
# Gather every dev-test name the classifier gets wrong, stored as
# (correct label, predicted label, name).
errors = []
for name, tag in devtest_names:
    guess = classifier.classify(gender_features(name))
    if guess == tag:
        continue  # correctly classified, nothing to record
    errors.append((tag, guess, name))
In [14]:
# Print the misclassified names. Sorting the (tag, guess, name) tuples groups
# the rows by correct label first, then by guess, then by name.
for (tag, guess, name) in sorted(errors):
    # %-8s / %-30s left-justify each value in a fixed-width column.
    print('correct=%-8s guess=%-8s name=%-30s' % (tag, guess, name))
correct=female   guess=male     name=Abagail                       
correct=female   guess=male     name=Abigael                       
correct=female   guess=male     name=Adriaens                      
correct=female   guess=male     name=Annabal                       
correct=female   guess=male     name=Annabel                       
correct=female   guess=male     name=April                         
correct=female   guess=male     name=Arden                         
correct=female   guess=male     name=Arleen                        
correct=female   guess=male     name=Arlyn                         
correct=female   guess=male     name=Astrid                        
correct=female   guess=male     name=Avril                         
correct=female   guess=male     name=Beatriz                       
correct=female   guess=male     name=Bell                          
correct=female   guess=male     name=Brit                          
correct=female   guess=male     name=Caitrin                       
correct=female   guess=male     name=Cam                           
correct=female   guess=male     name=Candis                        
correct=female   guess=male     name=Carlyn                        
correct=female   guess=male     name=Carmel                        
correct=female   guess=male     name=Carol-Jean                    
correct=female   guess=male     name=Carolynn                      
correct=female   guess=male     name=Caryn                         
correct=female   guess=male     name=Cat                           
correct=female   guess=male     name=Chantal                       
correct=female   guess=male     name=Charis                        
correct=female   guess=male     name=Charleen                      
correct=female   guess=male     name=Charmian                      
correct=female   guess=male     name=Cher                          
correct=female   guess=male     name=Chriss                        
correct=female   guess=male     name=Christan                      
correct=female   guess=male     name=Cloris                        
correct=female   guess=male     name=Coleen                        
correct=female   guess=male     name=Crystal                       
correct=female   guess=male     name=Cybal                         
correct=female   guess=male     name=Cybil                         
correct=female   guess=male     name=Dael                          
correct=female   guess=male     name=Darb                          
correct=female   guess=male     name=Daryn                         
correct=female   guess=male     name=Demeter                       
correct=female   guess=male     name=Dot                           
correct=female   guess=male     name=Drew                          
correct=female   guess=male     name=Emmalyn                       
correct=female   guess=male     name=Erinn                         
correct=female   guess=male     name=Esther                        
correct=female   guess=male     name=Farand                        
correct=female   guess=male     name=Farrand                       
correct=female   guess=male     name=Francis                       
correct=female   guess=male     name=Glen                          
correct=female   guess=male     name=Glynis                        
correct=female   guess=male     name=Guendolen                     
correct=female   guess=male     name=Gwendolin                     
correct=female   guess=male     name=Harriot                       
correct=female   guess=male     name=Helyn                         
correct=female   guess=male     name=Hildegaard                    
correct=female   guess=male     name=Inger                         
correct=female   guess=male     name=Ingrid                        
correct=female   guess=male     name=Iris                          
correct=female   guess=male     name=Iseabal                       
correct=female   guess=male     name=Isobel                        
correct=female   guess=male     name=Jasmin                        
correct=female   guess=male     name=Jen                           
correct=female   guess=male     name=Jenn                          
correct=female   guess=male     name=Jessalyn                      
correct=female   guess=male     name=Jessamyn                      
correct=female   guess=male     name=Joell                         
correct=female   guess=male     name=Jolynn                        
correct=female   guess=male     name=Jonis                         
correct=female   guess=male     name=Juliet                        
correct=female   guess=male     name=Karalynn                      
correct=female   guess=male     name=Kristal                       
correct=female   guess=male     name=Kristen                       
correct=female   guess=male     name=Laural                        
correct=female   guess=male     name=Leanor                        
correct=female   guess=male     name=Lian                          
correct=female   guess=male     name=Lilias                        
correct=female   guess=male     name=Lillian                       
correct=female   guess=male     name=Lind                          
correct=female   guess=male     name=Lorrin                        
correct=female   guess=male     name=Lurleen                       
correct=female   guess=male     name=Lust                          
correct=female   guess=male     name=Mab                           
correct=female   guess=male     name=Mabel                         
correct=female   guess=male     name=Madalyn                       
correct=female   guess=male     name=Magdalen                      
correct=female   guess=male     name=Manon                         
correct=female   guess=male     name=Marie-Ann                     
correct=female   guess=male     name=Mariel                        
correct=female   guess=male     name=Marigold                      
correct=female   guess=male     name=Marion                        
correct=female   guess=male     name=Maryl                         
correct=female   guess=male     name=Marylin                       
correct=female   guess=male     name=Marylou                       
correct=female   guess=male     name=Mercedes                      
correct=female   guess=male     name=Merilyn                       
correct=female   guess=male     name=Mildred                       
correct=female   guess=male     name=Millicent                     
correct=female   guess=male     name=Morgan                        
correct=female   guess=male     name=Morgen                        
correct=female   guess=male     name=Phil                          
correct=female   guess=male     name=Philis                        
correct=female   guess=male     name=Phylis                        
correct=female   guess=male     name=Rahel                         
correct=female   guess=male     name=Raven                         
correct=female   guess=male     name=Rhiamon                       
correct=female   guess=male     name=Robbyn                        
correct=female   guess=male     name=Rosaleen                      
correct=female   guess=male     name=Saraann                       
correct=female   guess=male     name=Shaun                         
correct=female   guess=male     name=Shell                         
correct=female   guess=male     name=Sherilyn                      
correct=female   guess=male     name=Shirleen                      
correct=female   guess=male     name=Sibel                         
correct=female   guess=male     name=Sigrid                        
correct=female   guess=male     name=Sydel                         
correct=female   guess=male     name=Teryl                         
correct=female   guess=male     name=Theo                          
correct=female   guess=male     name=Viv                           
correct=female   guess=male     name=Wileen                        
correct=female   guess=male     name=Yoshiko                       
correct=male     guess=female   name=Aditya                        
correct=male     guess=female   name=Adolphe                       
correct=male     guess=female   name=Alex                          
correct=male     guess=female   name=Alexei                        
correct=male     guess=female   name=Alfie                         
correct=male     guess=female   name=Artie                         
correct=male     guess=female   name=Ashley                        
correct=male     guess=female   name=Ave                           
correct=male     guess=female   name=Bailey                        
correct=male     guess=female   name=Barri                         
correct=male     guess=female   name=Bary                          
correct=male     guess=female   name=Berkley                       
correct=male     guess=female   name=Billie                        
correct=male     guess=female   name=Billy                         
correct=male     guess=female   name=Blaine                        
correct=male     guess=female   name=Brady                         
correct=male     guess=female   name=Cammy                         
correct=male     guess=female   name=Cary                          
correct=male     guess=female   name=Chaddie                       
correct=male     guess=female   name=Chancey                       
correct=male     guess=female   name=Chase                         
correct=male     guess=female   name=Clarke                        
correct=male     guess=female   name=Clay                          
correct=male     guess=female   name=Cobbie                        
correct=male     guess=female   name=Cody                          
correct=male     guess=female   name=Connolly                      
correct=male     guess=female   name=Corby                         
correct=male     guess=female   name=Corky                         
correct=male     guess=female   name=Davidde                       
correct=male     guess=female   name=Deane                         
correct=male     guess=female   name=Dennie                        
correct=male     guess=female   name=Duane                         
correct=male     guess=female   name=Durante                       
correct=male     guess=female   name=Dwaine                        
correct=male     guess=female   name=Dwayne                        
correct=male     guess=female   name=Elisha                        
correct=male     guess=female   name=Elmore                        
correct=male     guess=female   name=Emmy                          
correct=male     guess=female   name=Fonsie                        
correct=male     guess=female   name=Fonzie                        
correct=male     guess=female   name=Frederich                     
correct=male     guess=female   name=Gabriele                      
correct=male     guess=female   name=Garcia                        
correct=male     guess=female   name=Garey                         
correct=male     guess=female   name=Garry                         
correct=male     guess=female   name=Gary                          
correct=male     guess=female   name=George                        
correct=male     guess=female   name=Germaine                      
correct=male     guess=female   name=Gerry                         
correct=male     guess=female   name=Giovanni                      
correct=male     guess=female   name=Goose                         
correct=male     guess=female   name=Hadleigh                      
correct=male     guess=female   name=Hartley                       
correct=male     guess=female   name=Heath                         
correct=male     guess=female   name=Hermy                         
correct=male     guess=female   name=Hersch                        
correct=male     guess=female   name=Hewie                         
correct=male     guess=female   name=Hilary                        
correct=male     guess=female   name=Hodge                         
correct=male     guess=female   name=Horace                        
correct=male     guess=female   name=Howie                         
correct=male     guess=female   name=Hugh                          
correct=male     guess=female   name=Jeffie                        
correct=male     guess=female   name=Jeremie                       
correct=male     guess=female   name=Jerrome                       
correct=male     guess=female   name=Jose                          
correct=male     guess=female   name=Judah                         
correct=male     guess=female   name=Judith                        
correct=male     guess=female   name=Kelly                         
correct=male     guess=female   name=Kyle                          
correct=male     guess=female   name=Lane                          
correct=male     guess=female   name=Laurie                        
correct=male     guess=female   name=Lee                           
correct=male     guess=female   name=Leroy                         
correct=male     guess=female   name=Locke                         
correct=male     guess=female   name=Lonnie                        
correct=male     guess=female   name=Luce                          
correct=male     guess=female   name=Martie                        
correct=male     guess=female   name=Maurie                        
correct=male     guess=female   name=Maurise                       
correct=male     guess=female   name=Maxie                         
correct=male     guess=female   name=Meade                         
correct=male     guess=female   name=Mike                          
correct=male     guess=female   name=Mischa                        
correct=male     guess=female   name=Mitch                         
correct=male     guess=female   name=Moe                           
correct=male     guess=female   name=Montague                      
correct=male     guess=female   name=Monte                         
correct=male     guess=female   name=Morly                         
correct=male     guess=female   name=Moshe                         
correct=male     guess=female   name=Munroe                        
correct=male     guess=female   name=Mustafa                       
correct=male     guess=female   name=Noe                           
correct=male     guess=female   name=Orville                       
correct=male     guess=female   name=Paige                         
correct=male     guess=female   name=Parke                         
correct=male     guess=female   name=Rabbi                         
correct=male     guess=female   name=Ramsay                        
correct=male     guess=female   name=Ramsey                        
correct=male     guess=female   name=Richy                         
correct=male     guess=female   name=Rickie                        
correct=male     guess=female   name=Rocky                         
correct=male     guess=female   name=Rory                          
correct=male     guess=female   name=Rube                          
correct=male     guess=female   name=Rutledge                      
correct=male     guess=female   name=Samuele                       
correct=male     guess=female   name=Sandy                         
correct=male     guess=female   name=Sasha                         
correct=male     guess=female   name=Scarface                      
correct=male     guess=female   name=Serge                         
correct=male     guess=female   name=Sheffie                       
correct=male     guess=female   name=Siddhartha                    
correct=male     guess=female   name=Skye                          
correct=male     guess=female   name=Sonny                         
correct=male     guess=female   name=Spike                         
correct=male     guess=female   name=Taite                         
correct=male     guess=female   name=Teddie                        
correct=male     guess=female   name=Terence                       
correct=male     guess=female   name=Terry                         
correct=male     guess=female   name=Thaine                        
correct=male     guess=female   name=Thorndike                     
correct=male     guess=female   name=Thornie                       
correct=male     guess=female   name=Tonnie                        
correct=male     guess=female   name=Tyrone                        
correct=male     guess=female   name=Verne                         
correct=male     guess=female   name=Virgie                        
correct=male     guess=female   name=Walsh                         
correct=male     guess=female   name=Ware                          
correct=male     guess=female   name=Way                           
correct=male     guess=female   name=Wesley                        
correct=male     guess=female   name=Yancy                         
correct=male     guess=female   name=Yehudi                        
correct=male     guess=female   name=Zane                          
correct=male     guess=female   name=Zolly                         
In [15]:
def gender_features(word):
    """Return suffix features for a name.

    The result maps 'suffix1' to the last character of ``word`` and
    'suffix2' to its last two characters. Slicing is used, so a word
    shorter than the requested suffix yields whatever characters exist.
    """
    last_one = word[-1:]
    last_two = word[-2:]
    return {'suffix1': last_one, 'suffix2': last_two}
In [16]:
# Re-featurize the dev-test and training splits with the suffix-based
# extractor, retrain, and report accuracy on the dev-test split.
devtest_set = [(gender_features(name), label) for name, label in devtest_names]
train_set = [(gender_features(name), label) for name, label in train_names]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, devtest_set))
0.774

Document Classification

We now turn our attention to classifying full documents as opposed to single words in isolation.

The task seems more challenging, but simple methods can achieve surprisingly good results when the task is well defined. Consider the task of predicting whether a movie review is positive or negative. This is a task called sentiment analysis and is a hot practical task in the era of user-generated content (UGC) on the Web.

A good dataset is available in NLTK to experiment with this task.

In [17]:
from nltk.corpus import movie_reviews
# Build one (token list, category) pair per review file, walking the corpus
# category by category.
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]
# Shuffle in place so the categories are interleaved.
random.shuffle(documents)
In [18]:
# Frequency distribution over the lowercased tokens of the whole corpus.
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
# Keep the 2000 most frequent tokens as the feature vocabulary; the counts
# returned by most_common are discarded.
word_features = [w for (w, c) in all_words.most_common(2000)]

def document_features(document, vocabulary=None):
    """Map a tokenized document to boolean bag-of-words features.

    Args:
        document: iterable of string tokens making up one document.
        vocabulary: iterable of lower-cased words to test for. When
            omitted, the module-level ``word_features`` list is used.

    Returns:
        A dict with one entry per vocabulary word, keyed
        ``'contains(<word>)'``, whose value is True when the word occurs
        in the document and False otherwise.
    """
    if vocabulary is None:
        vocabulary = word_features
    # word_features is built from lower-cased tokens, so the document's
    # tokens are lower-cased as well; otherwise "The" would never match
    # "the". A set makes each membership test constant-time.
    document_words = {token.lower() for token in document}
    return {'contains(%s)' % word: (word in document_words)
            for word in vocabulary}
In [19]:
# Summary statistics: number of reviews, vocabulary size, and token count.
print("There are %d documents in the movie reviews dataset." % len(documents))
print("There are %d distinct words in the dataset." % all_words.B())
print("There are %d tokens in the dataset." % all_words.N())
# Table of the 20 most frequent tokens with their counts.
all_words.tabulate(20)
# Frequency plot limited to 100 samples; kept as the cell's last expression
# so that its return value is what the notebook displays.
all_words.plot(100)
There are 2000 documents in the movie reviews dataset.
There are 39768 distinct words in the dataset.
There are 1583820 tokens in the dataset.
    ,   the     .     a   and    of    to     '    is    in     s     "    it  that     -     )     (    as  with   for 
77717 76529 65876 38106 35576 34123 31937 30585 25195 21822 18513 17612 16107 15924 15595 11781 11664 11378 10792  9961 
Out[19]:
<matplotlib.axes._subplots.AxesSubplot at 0x1ee6fd3ea20>
In [20]:
# Print the feature dict of one positive review: one boolean
# 'contains(<word>)' entry for every word in word_features.
print(document_features(movie_reviews.words('pos/cv957_8737.txt'))) 
{'contains(,)': True, 'contains(the)': True, 'contains(.)': True, 'contains(a)': True, 'contains(and)': True, 'contains(of)': True, 'contains(to)': True, "contains(')": True, 'contains(is)': True, 'contains(in)': True, 'contains(s)': True, 'contains(")': True, 'contains(it)': True, 'contains(that)': True, 'contains(-)': True, 'contains())': True, 'contains(()': True, 'contains(as)': True, 'contains(with)': True, 'contains(for)': True, 'contains(his)': True, 'contains(this)': True, 'contains(film)': False, 'contains(i)': False, 'contains(he)': True, 'contains(but)': True, 'contains(on)': True, 'contains(are)': True, 'contains(t)': False, 'contains(by)': True, 'contains(be)': True, 'contains(one)': True, 'contains(movie)': True, 'contains(an)': True, 'contains(who)': True, 'contains(not)': True, 'contains(you)': True, 'contains(from)': True, 'contains(at)': False, 'contains(was)': False, 'contains(have)': True, 'contains(they)': True, 'contains(has)': True, 'contains(her)': False, 'contains(all)': True, 'contains(?)': False, 'contains(there)': True, 'contains(like)': True, 'contains(so)': False, 'contains(out)': True, 'contains(about)': True, 'contains(up)': False, 'contains(more)': False, 'contains(what)': True, 'contains(when)': True, 'contains(which)': True, 'contains(or)': False, 'contains(she)': True, 'contains(their)': False, 'contains(:)': True, 'contains(some)': False, 'contains(just)': True, 'contains(can)': False, 'contains(if)': True, 'contains(we)': False, 'contains(him)': True, 'contains(into)': True, 'contains(even)': False, 'contains(only)': True, 'contains(than)': False, 'contains(no)': False, 'contains(good)': False, 'contains(time)': False, 'contains(most)': True, 'contains(its)': False, 'contains(will)': True, 'contains(story)': False, 'contains(would)': False, 'contains(been)': False, 'contains(much)': False, 'contains(character)': False, 'contains(also)': True, 'contains(get)': True, 'contains(other)': True, 'contains(do)': True, 'contains(two)': 
True, 'contains(well)': True, 'contains(them)': True, 'contains(very)': True, 'contains(characters)': False, 'contains(;)': False, 'contains(first)': False, 'contains(--)': True, 'contains(after)': False, 'contains(see)': False, 'contains(!)': True, 'contains(way)': True, 'contains(because)': False, 'contains(make)': True, 'contains(life)': False, 'contains(off)': False, 'contains(too)': False, 'contains(any)': False, 'contains(does)': False, 'contains(really)': False, 'contains(had)': False, 'contains(while)': True, 'contains(films)': False, 'contains(how)': True, 'contains(plot)': True, 'contains(little)': True, 'contains(where)': True, 'contains(people)': False, 'contains(over)': False, 'contains(could)': False, 'contains(then)': True, 'contains(me)': True, 'contains(scene)': True, 'contains(man)': False, 'contains(bad)': False, 'contains(my)': False, 'contains(never)': True, 'contains(being)': False, 'contains(best)': True, 'contains(these)': False, 'contains(don)': False, 'contains(new)': False, 'contains(doesn)': False, 'contains(scenes)': False, 'contains(many)': True, 'contains(director)': False, 'contains(such)': False, 'contains(know)': False, 'contains(were)': False, 'contains(movies)': True, 'contains(through)': False, 'contains(here)': True, 'contains(action)': True, 'contains(great)': True, 'contains(re)': True, 'contains(another)': False, 'contains(love)': False, 'contains(go)': False, 'contains(made)': False, 'contains(us)': True, 'contains(big)': False, 'contains(end)': False, 'contains(something)': False, 'contains(back)': False, 'contains(*)': True, 'contains(still)': False, 'contains(world)': True, 'contains(seems)': False, 'contains(work)': False, 'contains(those)': False, 'contains(makes)': False, 'contains(now)': False, 'contains(before)': False, 'contains(however)': True, 'contains(between)': True, 'contains(few)': False, 'contains(/)': False, 'contains(down)': False, 'contains(every)': False, 'contains(though)': False, 'contains(better)': 
False, 'contains(real)': False, 'contains(audience)': False, 'contains(enough)': False, 'contains(seen)': False, 'contains(take)': False, 'contains(around)': False, 'contains(both)': False, 'contains(going)': False, 'contains(year)': False, 'contains(performance)': False, 'contains(why)': False, 'contains(should)': False, 'contains(role)': False, 'contains(isn)': False, 'contains(same)': True, 'contains(old)': False, 'contains(gets)': True, 'contains(your)': False, 'contains(may)': False, 'contains(things)': True, 'contains(think)': False, 'contains(years)': False, 'contains(last)': False, 'contains(comedy)': True, 'contains(funny)': True, 'contains(actually)': True, 'contains(ve)': False, 'contains(long)': False, 'contains(look)': True, 'contains(almost)': False, 'contains(own)': True, 'contains(thing)': False, 'contains(fact)': False, 'contains(nothing)': False, 'contains(say)': False, 'contains(right)': False, 'contains(john)': False, 'contains(although)': False, 'contains(played)': True, 'contains(find)': False, 'contains(script)': False, 'contains(come)': False, 'contains(ever)': True, 'contains(cast)': False, 'contains(since)': False, 'contains(did)': False, 'contains(star)': False, 'contains(plays)': False, 'contains(young)': False, 'contains(show)': False, 'contains(comes)': False, 'contains(m)': False, 'contains(part)': False, 'contains(original)': False, 'contains(actors)': False, 'contains(screen)': True, 'contains(without)': False, 'contains(again)': False, 'contains(acting)': False, 'contains(three)': False, 'contains(day)': True, 'contains(each)': True, 'contains(point)': False, 'contains(lot)': False, 'contains(least)': True, 'contains(takes)': False, 'contains(guy)': True, 'contains(quite)': False, 'contains(himself)': False, 'contains(away)': False, 'contains(during)': False, 'contains(family)': False, 'contains(effects)': False, 'contains(course)': True, 'contains(goes)': False, 'contains(minutes)': False, 'contains(interesting)': False, 
'contains(might)': False, 'contains(far)': False, 'contains(high)': False, 'contains(rather)': False, 'contains(once)': True, 'contains(must)': False, 'contains(anything)': False, 'contains(place)': True, 'contains(set)': False, 'contains(yet)': False, 'contains(watch)': True, 'contains(d)': False, 'contains(making)': True, 'contains(our)': False, 'contains(wife)': True, 'contains(hard)': False, 'contains(always)': False, 'contains(fun)': True, 'contains(didn)': False, 'contains(ll)': False, 'contains(seem)': False, 'contains(special)': False, 'contains(bit)': False, 'contains(times)': False, 'contains(trying)': False, 'contains(hollywood)': False, 'contains(instead)': False, 'contains(give)': False, 'contains(want)': False, 'contains(picture)': False, 'contains(kind)': True, 'contains(american)': False, 'contains(job)': False, 'contains(sense)': False, 'contains(woman)': True, 'contains(home)': False, 'contains(having)': False, 'contains(series)': True, 'contains(actor)': False, 'contains(probably)': False, 'contains(help)': True, 'contains(half)': False, 'contains(along)': True, 'contains(men)': False, 'contains(everything)': True, 'contains(pretty)': False, 'contains(becomes)': False, 'contains(sure)': False, 'contains(black)': False, 'contains(together)': False, 'contains(dialogue)': False, 'contains(money)': False, 'contains(become)': False, 'contains(gives)': False, 'contains(given)': False, 'contains(looking)': False, 'contains(whole)': False, 'contains(watching)': False, 'contains(father)': False, 'contains(`)': False, 'contains(feel)': False, 'contains(everyone)': False, 'contains(music)': False, 'contains(wants)': False, 'contains(sex)': False, 'contains(less)': False, 'contains(done)': False, 'contains(horror)': False, 'contains(got)': True, 'contains(death)': False, 'contains(perhaps)': False, 'contains(city)': False, 'contains(next)': False, 'contains(especially)': True, 'contains(play)': False, 'contains(girl)': False, 'contains(mind)': False, 
'contains(10)': False, 'contains(moments)': False, 'contains(looks)': True, 'contains(completely)': False, 'contains(2)': False, 'contains(reason)': False, 'contains(mother)': False, 'contains(whose)': False, 'contains(line)': False, 'contains(night)': False, 'contains(human)': False, 'contains(until)': False, 'contains(rest)': False, 'contains(performances)': False, 'contains(different)': False, 'contains(evil)': False, 'contains(small)': False, 'contains(james)': False, 'contains(simply)': False, 'contains(couple)': False, 'contains(put)': False, 'contains(let)': False, 'contains(anyone)': False, 'contains(ending)': False, 'contains(case)': False, 'contains(several)': False, 'contains(dead)': False, 'contains(michael)': False, 'contains(left)': False, 'contains(thought)': False, 'contains(school)': False, 'contains(shows)': False, 'contains(humor)': False, 'contains(true)': False, 'contains(lost)': False, 'contains(written)': False, 'contains(itself)': False, 'contains(friend)': False, 'contains(entire)': False, 'contains(getting)': True, 'contains(town)': False, 'contains(turns)': False, 'contains(soon)': False, 'contains(someone)': False, 'contains(second)': False, 'contains(main)': False, 'contains(stars)': False, 'contains(found)': False, 'contains(use)': False, 'contains(problem)': False, 'contains(friends)': True, 'contains(tv)': False, 'contains(top)': True, 'contains(name)': False, 'contains(begins)': False, 'contains(called)': False, 'contains(based)': False, 'contains(comic)': False, 'contains(david)': False, 'contains(head)': False, 'contains(else)': False, 'contains(idea)': True, 'contains(either)': False, 'contains(wrong)': True, 'contains(unfortunately)': False, 'contains(later)': False, 'contains(final)': False, 'contains(hand)': False, 'contains(alien)': False, 'contains(house)': False, 'contains(group)': False, 'contains(full)': False, 'contains(used)': True, 'contains(tries)': True, 'contains(often)': True, 'contains(against)': False, 
'contains(war)': False, 'contains(sequence)': False, 'contains(keep)': False, 'contains(turn)': False, 'contains(playing)': True, 'contains(boy)': False, 'contains(behind)': False, 'contains(named)': False, 'contains(certainly)': False, 'contains(live)': False, 'contains(believe)': False, 'contains(under)': False, 'contains(works)': False, 'contains(relationship)': False, 'contains(face)': False, 'contains(hour)': False, 'contains(run)': False, 'contains(style)': False, 'contains(said)': False, 'contains(despite)': False, 'contains(person)': False, 'contains(finally)': False, 'contains(shot)': False, 'contains(book)': False, 'contains(doing)': False, 'contains(tell)': False, 'contains(maybe)': False, 'contains(nice)': False, 'contains(son)': False, 'contains(perfect)': False, 'contains(side)': False, 'contains(seeing)': True, 'contains(able)': False, 'contains(finds)': False, 'contains(children)': False, 'contains(days)': False, 'contains(past)': False, 'contains(summer)': False, 'contains(camera)': False, 'contains(won)': False, 'contains(including)': False, 'contains(mr)': False, 'contains(kids)': False, 'contains(lives)': False, 'contains(directed)': False, 'contains(moment)': False, 'contains(game)': False, 'contains(running)': False, 'contains(fight)': True, 'contains(supposed)': False, 'contains(video)': False, 'contains(car)': False, 'contains(matter)': False, 'contains(kevin)': True, 'contains(joe)': False, 'contains(lines)': False, 'contains(worth)': True, 'contains(=)': False, 'contains(daughter)': False, 'contains(earth)': False, 'contains(starts)': False, 'contains(need)': False, 'contains(entertaining)': False, 'contains(white)': False, 'contains(start)': True, 'contains(writer)': False, 'contains(dark)': False, 'contains(short)': False, 'contains(self)': False, 'contains(worst)': False, 'contains(nearly)': False, 'contains(opening)': False, 'contains(try)': False, 'contains(upon)': False, 'contains(care)': False, 'contains(early)': True, 
'contains(violence)': False, 'contains(throughout)': False, 'contains(team)': False, 'contains(production)': False, 'contains(example)': False, 'contains(beautiful)': False, 'contains(title)': False, 'contains(exactly)': False, 'contains(jack)': False, 'contains(review)': False, 'contains(major)': False, 'contains(drama)': False, 'contains(&)': False, 'contains(problems)': True, 'contains(sequences)': False, 'contains(obvious)': False, 'contains(version)': False, 'contains(screenplay)': False, 'contains(known)': True, 'contains(killer)': False, 'contains(wasn)': False, 'contains(robert)': False, 'contains(disney)': False, 'contains(already)': False, 'contains(close)': False, 'contains(classic)': False, 'contains(others)': True, 'contains(hit)': False, 'contains(kill)': False, 'contains(deep)': True, 'contains(five)': False, 'contains(order)': False, 'contains(act)': False, 'contains(simple)': False, 'contains(fine)': False, 'contains(themselves)': False, 'contains(heart)': False, 'contains(roles)': False, 'contains(jackie)': True, 'contains(direction)': False, 'contains(eyes)': False, 'contains(four)': False, 'contains(question)': False, 'contains(sort)': False, 'contains(sometimes)': False, 'contains(knows)': False, 'contains(supporting)': False, 'contains(coming)': False, 'contains(voice)': False, 'contains(women)': False, 'contains(truly)': False, 'contains(save)': False, 'contains(jokes)': False, 'contains(computer)': False, 'contains(child)': False, 'contains(o)': False, 'contains(boring)': False, 'contains(tom)': False, 'contains(level)': False, 'contains(1)': False, 'contains(body)': False, 'contains(guys)': False, 'contains(genre)': False, 'contains(brother)': False, 'contains(strong)': False, 'contains(stop)': True, 'contains(room)': False, 'contains(space)': False, 'contains(lee)': False, 'contains(ends)': False, 'contains(beginning)': False, 'contains(ship)': False, 'contains(york)': False, 'contains(attempt)': False, 'contains(thriller)': False, 
'contains(scream)': True, 'contains(peter)': False, 'contains(aren)': False, 'contains(husband)': False, 'contains(fiction)': False, 'contains(happens)': False, 'contains(hero)': False, 'contains(novel)': False, 'contains(note)': False, 'contains(hope)': False, 'contains(king)': False, 'contains(yes)': False, 'contains(says)': False, 'contains(tells)': False, 'contains(quickly)': False, 'contains(romantic)': False, 'contains(dog)': False, 'contains(oscar)': False, 'contains(stupid)': False, 'contains(possible)': False, 'contains(saw)': False, 'contains(lead)': True, 'contains(career)': False, 'contains(murder)': False, 'contains(extremely)': False, 'contains(manages)': False, 'contains(god)': False, 'contains(mostly)': False, 'contains(wonder)': False, 'contains(particularly)': False, 'contains(future)': False, 'contains(fans)': False, 'contains(sound)': False, 'contains(worse)': False, 'contains(piece)': False, 'contains(involving)': False, 'contains(de)': False, 'contains(appears)': False, 'contains(planet)': False, 'contains(paul)': False, 'contains(involved)': False, 'contains(mean)': False, 'contains(none)': False, 'contains(taking)': False, 'contains(hours)': False, 'contains(laugh)': True, 'contains(police)': False, 'contains(sets)': False, 'contains(attention)': False, 'contains(co)': False, 'contains(hell)': False, 'contains(eventually)': False, 'contains(single)': False, 'contains(fall)': False, 'contains(falls)': False, 'contains(material)': False, 'contains(emotional)': False, 'contains(power)': False, 'contains(late)': False, 'contains(lack)': False, 'contains(dr)': False, 'contains(van)': False, 'contains(result)': False, 'contains(elements)': False, 'contains(meet)': False, 'contains(smith)': False, 'contains(science)': False, 'contains(experience)': False, 'contains(bring)': False, 'contains(wild)': False, 'contains(living)': False, 'contains(theater)': False, 'contains(interest)': False, 'contains(leads)': False, 'contains(word)': False, 
'contains(feature)': False, 'contains(battle)': False, 'contains(girls)': False, 'contains(alone)': False, 'contains(obviously)': False, 'contains(george)': False, 'contains(within)': False, 'contains(usually)': False, 'contains(enjoy)': False, 'contains(guess)': False, 'contains(among)': True, 'contains(taken)': False, 'contains(feeling)': False, 'contains(laughs)': False, 'contains(aliens)': False, 'contains(talk)': True, 'contains(chance)': False, 'contains(talent)': False, 'contains(3)': False, 'contains(middle)': False, 'contains(number)': False, 'contains(easy)': False, 'contains(across)': False, 'contains(needs)': False, 'contains(attempts)': False, 'contains(happen)': False, 'contains(television)': False, 'contains(chris)': False, 'contains(deal)': False, 'contains(poor)': False, 'contains(form)': False, 'contains(girlfriend)': True, 'contains(viewer)': False, 'contains(release)': False, 'contains(killed)': False, 'contains(forced)': False, 'contains(whether)': False, 'contains(wonderful)': False, 'contains(feels)': False, 'contains(oh)': False, 'contains(tale)': False, 'contains(serious)': False, 'contains(expect)': False, 'contains(except)': False, 'contains(light)': False, 'contains(success)': False, 'contains(features)': True, 'contains(premise)': False, 'contains(happy)': False, 'contains(words)': False, 'contains(leave)': False, 'contains(important)': False, 'contains(meets)': False, 'contains(history)': False, 'contains(giving)': False, 'contains(crew)': False, 'contains(type)': False, 'contains(call)': False, 'contains(turned)': False, 'contains(released)': False, 'contains(parents)': False, 'contains(art)': False, 'contains(impressive)': False, 'contains(mission)': False, 'contains(working)': False, 'contains(seemed)': False, 'contains(score)': False, 'contains(told)': False, 'contains(recent)': False, 'contains(robin)': False, 'contains(basically)': False, 'contains(entertainment)': False, 'contains(america)': False, 'contains($)': False, 
'contains(surprise)': False, 'contains(apparently)': False, 'contains(easily)': False, 'contains(ryan)': False, 'contains(cool)': False, 'contains(stuff)': False, 'contains(cop)': False, 'contains(change)': False, 'contains(williams)': False, 'contains(crime)': False, 'contains(office)': False, 'contains(parts)': False, 'contains(somehow)': False, 'contains(sequel)': False, 'contains(william)': False, 'contains(cut)': False, 'contains(die)': False, 'contains(jones)': False, 'contains(credits)': False, 'contains(batman)': False, 'contains(suspense)': False, 'contains(brings)': False, 'contains(events)': False, 'contains(reality)': False, 'contains(whom)': False, 'contains(local)': False, 'contains(talking)': False, 'contains(difficult)': True, 'contains(using)': False, 'contains(went)': False, 'contains(writing)': False, 'contains(remember)': False, 'contains(near)': False, 'contains(straight)': False, 'contains(hilarious)': True, 'contains(ago)': False, 'contains(certain)': False, 'contains(ben)': False, 'contains(kid)': False, 'contains(wouldn)': False, 'contains(slow)': True, 'contains(blood)': False, 'contains(mystery)': False, 'contains(complete)': False, 'contains(red)': False, 'contains(popular)': False, 'contains(effective)': False, 'contains(am)': False, 'contains(fast)': True, 'contains(flick)': False, 'contains(due)': False, 'contains(runs)': False, 'contains(gone)': False, 'contains(return)': False, 'contains(presence)': False, 'contains(quality)': False, 'contains(dramatic)': False, 'contains(filmmakers)': False, 'contains(age)': False, 'contains(brothers)': False, 'contains(business)': False, 'contains(general)': False, 'contains(rock)': False, 'contains(sexual)': False, 'contains(present)': False, 'contains(surprisingly)': False, 'contains(anyway)': False, 'contains(uses)': False, 'contains(4)': False, 'contains(personal)': False, 'contains(figure)': False, 'contains(smart)': False, 'contains(ways)': False, 'contains(decides)': False, 
'contains(annoying)': False, 'contains(begin)': False, 'contains(couldn)': False, 'contains(somewhat)': False, 'contains(shots)': False, 'contains(rich)': False, 'contains(minute)': False, 'contains(law)': False, 'contains(previous)': False, 'contains(jim)': False, 'contains(successful)': False, 'contains(harry)': False, 'contains(water)': False, 'contains(similar)': False, 'contains(absolutely)': False, 'contains(motion)': False, 'contains(former)': False, 'contains(strange)': False, 'contains(came)': False, 'contains(follow)': False, 'contains(read)': False, 'contains(project)': False, 'contains(million)': True, 'contains(secret)': False, 'contains(starring)': False, 'contains(clear)': False, 'contains(familiar)': False, 'contains(romance)': False, 'contains(intelligent)': False, 'contains(third)': True, 'contains(excellent)': False, 'contains(amazing)': False, 'contains(party)': False, 'contains(budget)': False, 'contains(eye)': False, 'contains(actress)': False, 'contains(prison)': False, 'contains(latest)': False, 'contains(means)': True, 'contains(company)': False, 'contains(towards)': False, 'contains(predictable)': False, 'contains(powerful)': False, 'contains(nor)': False, 'contains(bob)': False, 'contains(beyond)': False, 'contains(visual)': False, 'contains(leaves)': False, 'contains(r)': False, 'contains(nature)': False, 'contains(following)': False, 'contains(villain)': False, 'contains(leaving)': False, 'contains(animated)': False, 'contains(low)': False, 'contains(myself)': False, 'contains(b)': False, 'contains(bill)': False, 'contains(sam)': False, 'contains(filled)': False, 'contains(wars)': False, 'contains(questions)': False, 'contains(cinema)': False, 'contains(message)': False, 'contains(box)': False, 'contains(moving)': True, 'contains(herself)': False, 'contains(country)': False, 'contains(usual)': False, 'contains(martin)': False, 'contains(definitely)': False, 'contains(add)': False, 'contains(large)': False, 'contains(clever)': False, 
'contains(create)': False, 'contains(felt)': False, 'contains(stories)': False, 'contains(brilliant)': False, 'contains(ones)': False, 'contains(giant)': False, 'contains(situation)': False, 'contains(murphy)': False, 'contains(break)': False, 'contains(opens)': False, 'contains(scary)': False, 'contains(doubt)': False, 'contains(drug)': True, 'contains(bunch)': False, 'contains(thinking)': False, 'contains(solid)': False, 'contains(effect)': False, 'contains(learn)': False, 'contains(move)': False, 'contains(force)': False, 'contains(potential)': False, 'contains(seriously)': False, 'contains(follows)': False, 'contains(above)': False, 'contains(saying)': False, 'contains(huge)': False, 'contains(class)': False, 'contains(plan)': False, 'contains(agent)': False, 'contains(created)': False, 'contains(unlike)': False, 'contains(pay)': False, 'contains(non)': True, 'contains(married)': False, 'contains(mark)': False, 'contains(sweet)': False, 'contains(perfectly)': False, 'contains(ex)': False, 'contains(realize)': False, 'contains(audiences)': False, 'contains(took)': False, 'contains(decent)': False, 'contains(likely)': False, 'contains(dream)': False, 'contains(view)': False, 'contains(scott)': False, 'contains(subject)': False, 'contains(understand)': False, 'contains(happened)': False, 'contains(enjoyable)': True, 'contains(studio)': False, 'contains(immediately)': False, 'contains(open)': False, 'contains(e)': False, 'contains(points)': False, 'contains(heard)': False, 'contains(viewers)': False, 'contains(cameron)': False, 'contains(truman)': False, 'contains(bruce)': False, 'contains(frank)': False, 'contains(private)': False, 'contains(stay)': False, 'contains(fails)': False, 'contains(impossible)': False, 'contains(cold)': False, 'contains(richard)': False, 'contains(overall)': False, 'contains(merely)': False, 'contains(exciting)': False, 'contains(mess)': False, 'contains(chase)': True, 'contains(free)': False, 'contains(ten)': False, 'contains(neither)': 
False, 'contains(wanted)': False, 'contains(gun)': True, 'contains(appear)': False, 'contains(carter)': False, 'contains(escape)': False, 'contains(ultimately)': False, 'contains(+)': False, 'contains(fan)': False, 'contains(inside)': False, 'contains(favorite)': False, 'contains(haven)': False, 'contains(modern)': False, 'contains(l)': False, 'contains(wedding)': False, 'contains(stone)': False, 'contains(trek)': False, 'contains(brought)': False, 'contains(trouble)': True, 'contains(otherwise)': False, 'contains(tim)': False, 'contains(5)': False, 'contains(allen)': False, 'contains(bond)': False, 'contains(society)': False, 'contains(liked)': False, 'contains(dumb)': False, 'contains(musical)': False, 'contains(stand)': False, 'contains(political)': False, 'contains(various)': False, 'contains(talented)': False, 'contains(particular)': False, 'contains(west)': False, 'contains(state)': False, 'contains(keeps)': True, 'contains(english)': False, 'contains(silly)': False, 'contains(u)': False, 'contains(situations)': False, 'contains(park)': False, 'contains(teen)': False, 'contains(rating)': False, 'contains(slightly)': False, 'contains(steve)': False, 'contains(truth)': False, 'contains(air)': False, 'contains(element)': False, 'contains(joke)': False, 'contains(spend)': False, 'contains(key)': True, 'contains(biggest)': False, 'contains(members)': False, 'contains(effort)': False, 'contains(government)': False, 'contains(focus)': False, 'contains(eddie)': False, 'contains(soundtrack)': False, 'contains(hands)': False, 'contains(earlier)': False, 'contains(chan)': True, 'contains(purpose)': False, 'contains(today)': True, 'contains(showing)': False, 'contains(memorable)': False, 'contains(six)': False, 'contains(cannot)': False, 'contains(max)': False, 'contains(offers)': False, 'contains(rated)': False, 'contains(mars)': False, 'contains(heavy)': False, 'contains(totally)': False, 'contains(control)': False, 'contains(credit)': False, 'contains(fi)': False, 
'contains(woody)': False, 'contains(ideas)': False, 'contains(sci)': False, 'contains(wait)': False, 'contains(sit)': False, 'contains(female)': False, 'contains(ask)': False, 'contains(waste)': False, 'contains(terrible)': False, 'contains(depth)': False, 'contains(simon)': False, 'contains(aspect)': False, 'contains(list)': False, 'contains(mary)': False, 'contains(sister)': False, 'contains(animation)': False, 'contains(entirely)': False, 'contains(fear)': False, 'contains(steven)': False, 'contains(moves)': False, 'contains(actual)': False, 'contains(army)': False, 'contains(british)': False, 'contains(constantly)': False, 'contains(fire)': False, 'contains(convincing)': False, 'contains(setting)': False, 'contains(gave)': False, 'contains(tension)': False, 'contains(street)': False, 'contains(8)': False, 'contains(brief)': True, 'contains(ridiculous)': False, 'contains(cinematography)': False, 'contains(typical)': False, 'contains(nick)': False, 'contains(screenwriter)': False, 'contains(ability)': False, 'contains(spent)': False, 'contains(quick)': True, 'contains(violent)': False, 'contains(atmosphere)': False, 'contains(subtle)': False, 'contains(expected)': False, 'contains(fairly)': True, 'contains(seven)': False, 'contains(killing)': False, 'contains(tone)': False, 'contains(master)': False, 'contains(disaster)': False, 'contains(lots)': False, 'contains(thinks)': False, 'contains(song)': False, 'contains(cheap)': False, 'contains(suddenly)': False, 'contains(background)': False, 'contains(club)': False, 'contains(willis)': False, 'contains(whatever)': False, 'contains(highly)': False, 'contains(sees)': True, 'contains(complex)': False, 'contains(greatest)': False, 'contains(impact)': False, 'contains(beauty)': False, 'contains(front)': False, 'contains(humans)': False, 'contains(indeed)': False, 'contains(flat)': False, 'contains(grace)': False, 'contains(wrote)': False, 'contains(amusing)': False, 'contains(ii)': False, 'contains(mike)': False, 
'contains(further)': False, 'contains(cute)': False, 'contains(dull)': False, 'contains(minor)': False, 'contains(recently)': False, 'contains(hate)': False, 'contains(outside)': False, 'contains(plenty)': False, 'contains(wish)': False, 'contains(godzilla)': False, 'contains(college)': False, 'contains(titanic)': False, 'contains(sounds)': False, 'contains(telling)': False, 'contains(sight)': False, 'contains(double)': False, 'contains(cinematic)': False, 'contains(queen)': False, 'contains(hold)': False, 'contains(meanwhile)': False, 'contains(awful)': False, 'contains(clearly)': False, 'contains(theme)': False, 'contains(hear)': False, 'contains(x)': False, 'contains(amount)': False, 'contains(baby)': False, 'contains(approach)': False, 'contains(dreams)': False, 'contains(shown)': False, 'contains(island)': False, 'contains(reasons)': False, 'contains(charm)': False, 'contains(miss)': True, 'contains(longer)': False, 'contains(common)': False, 'contains(sean)': False, 'contains(carry)': False, 'contains(believable)': False, 'contains(realistic)': False, 'contains(chemistry)': True, 'contains(possibly)': False, 'contains(casting)': False, 'contains(carrey)': False, 'contains(french)': False, 'contains(trailer)': False, 'contains(tough)': False, 'contains(produced)': False, 'contains(imagine)': False, 'contains(choice)': False, 'contains(ride)': False, 'contains(somewhere)': False, 'contains(hot)': False, 'contains(race)': False, 'contains(road)': False, 'contains(leader)': False, 'contains(thin)': False, 'contains(jerry)': False, 'contains(slowly)': False, 'contains(delivers)': False, 'contains(detective)': False, 'contains(brown)': False, 'contains(jackson)': False, 'contains(member)': False, 'contains(provide)': False, 'contains(president)': False, 'contains(puts)': False, 'contains(asks)': False, 'contains(critics)': False, 'contains(appearance)': False, 'contains(famous)': False, 'contains(okay)': False, 'contains(intelligence)': False, 'contains(energy)': 
False, 'contains(sent)': False, 'contains(spielberg)': False, 'contains(development)': False, 'contains(etc)': False, 'contains(language)': False, 'contains(blue)': False, 'contains(proves)': False, 'contains(vampire)': False, 'contains(seemingly)': False, 'contains(basic)': False, 'contains(caught)': False, 'contains(decide)': False, 'contains(opportunity)': False, 'contains(incredibly)': False, 'contains(images)': False, 'contains(band)': False, 'contains(j)': False, 'contains(writers)': False, 'contains(knew)': False, 'contains(interested)': False, 'contains(considering)': False, 'contains(boys)': False, 'contains(thanks)': False, 'contains(remains)': False, 'contains(climax)': True, 'contains(event)': False, 'contains(directing)': False, 'contains(conclusion)': False, 'contains(leading)': False, 'contains(ground)': False, 'contains(lies)': False, 'contains(forget)': False, 'contains(alive)': False, 'contains(tarzan)': False, 'contains(century)': False, 'contains(provides)': False, 'contains(trip)': False, 'contains(partner)': False, 'contains(central)': False, 'contains(tarantino)': False, 'contains(period)': False, 'contains(pace)': False, 'contains(yourself)': False, 'contains(worked)': False, 'contains(ready)': False, 'contains(date)': False, 'contains(thus)': False, 'contains(1998)': False, 'contains(terrific)': False, 'contains(write)': False, 'contains(average)': False, 'contains(onto)': False, 'contains(songs)': False, 'contains(occasionally)': False, 'contains(doctor)': False, 'contains(stands)': False, 'contains(hardly)': False, 'contains(monster)': False, 'contains(led)': False, 'contains(mysterious)': False, 'contains(details)': False, 'contains(wasted)': False, 'contains(apart)': False, 'contains(aside)': False, 'contains(store)': False, 'contains(billy)': False, 'contains(boss)': True, 'contains(travolta)': False, 'contains(producer)': False, 'contains(pull)': False, 'contains(consider)': False, 'contains(pictures)': False, 'contains(becoming)': 
False, 'contains(cage)': False, 'contains(loud)': False, 'contains(looked)': False, 'contains(officer)': False, 'contains(twenty)': False, 'contains(system)': False, 'contains(contains)': False, 'contains(julia)': False, 'contains(subplot)': False, 'contains(missing)': False, 'contains(personality)': False, 'contains(building)': False, 'contains(learns)': False, 'contains(hong)': True, 'contains(la)': False, 'contains(apartment)': False, 'contains(7)': False, 'contains(bizarre)': False, 'contains(powers)': False, 'contains(flaws)': False, 'contains(catch)': False, 'contains(lawyer)': False, 'contains(shoot)': False, 'contains(student)': False, 'contains(unique)': True, 'contains(000)': False, 'contains(admit)': False, 'contains(concept)': False, 'contains(needed)': False, 'contains(thrown)': False, 'contains(christopher)': False, 'contains(laughing)': False, 'contains(green)': False, 'contains(twists)': False, 'contains(matthew)': False, 'contains(touch)': False, 'contains(waiting)': False, 'contains(victim)': False, 'contains(cover)': False, 'contains(machine)': False, 'contains(danny)': False, 'contains(mention)': False, 'contains(search)': False, 'contains(1997)': False, 'contains(win)': False, 'contains(door)': False, 'contains(manner)': False, 'contains(train)': True, 'contains(saving)': False, 'contains(share)': False, 'contains(image)': False, 'contains(discovers)': False, 'contains(normal)': False, 'contains(cross)': False, 'contains(fox)': False, 'contains(returns)': False, 'contains(adult)': False, 'contains(adds)': False, 'contains(answer)': False, 'contains(adventure)': False, 'contains(lame)': False, 'contains(male)': False, 'contains(odd)': False, 'contains(singer)': False, 'contains(deserves)': False, 'contains(gore)': False, 'contains(states)': False, 'contains(include)': False, 'contains(equally)': False, 'contains(months)': False, 'contains(barely)': False, 'contains(directors)': False, 'contains(introduced)': False, 'contains(fashion)': False, 
'contains(social)': False, 'contains(1999)': False, 'contains(news)': False, 'contains(hair)': False, 'contains(dance)': False, 'contains(innocent)': False, 'contains(camp)': False, 'contains(teacher)': False, 'contains(became)': False, 'contains(sad)': False, 'contains(witch)': False, 'contains(includes)': False, 'contains(nights)': False, 'contains(jason)': False, 'contains(julie)': False, 'contains(latter)': False, 'contains(food)': True, 'contains(jennifer)': False, 'contains(land)': False, 'contains(menace)': False, 'contains(rate)': False, 'contains(storyline)': False, 'contains(contact)': False, 'contains(jean)': False, 'contains(elizabeth)': False, 'contains(fellow)': False, 'contains(changes)': False, 'contains(henry)': False, 'contains(hill)': False, 'contains(pulp)': False, 'contains(gay)': False, 'contains(tried)': False, 'contains(surprised)': False, 'contains(literally)': False, 'contains(walk)': False, 'contains(standard)': False, 'contains(90)': False, 'contains(forward)': False, 'contains(wise)': False, 'contains(enjoyed)': False, 'contains(discover)': False, 'contains(pop)': False, 'contains(anderson)': False, 'contains(offer)': False, 'contains(recommend)': False, 'contains(public)': False, 'contains(drive)': False, 'contains(c)': False, 'contains(toy)': False, 'contains(charming)': False, 'contains(fair)': False, 'contains(chinese)': True, 'contains(rescue)': False, 'contains(terms)': False, 'contains(mouth)': False, 'contains(lucas)': False, 'contains(accident)': False, 'contains(dies)': False, 'contains(decided)': False, 'contains(edge)': False, 'contains(footage)': False, 'contains(culture)': False, 'contains(weak)': False, 'contains(presented)': False, 'contains(blade)': False, 'contains(younger)': False, 'contains(douglas)': False, 'contains(natural)': False, 'contains(born)': False, 'contains(generally)': False, 'contains(teenage)': False, 'contains(older)': False, 'contains(horrible)': False, 'contains(addition)': False, 
'contains(sadly)': False, 'contains(creates)': False, 'contains(disturbing)': False, 'contains(roger)': False, 'contains(detail)': False, 'contains(devil)': False, 'contains(debut)': False, 'contains(track)': False, 'contains(developed)': False, 'contains(week)': False, 'contains(russell)': False, 'contains(attack)': False, 'contains(explain)': False, 'contains(rarely)': False, 'contains(fully)': False, 'contains(prove)': False, 'contains(exception)': False, 'contains(jeff)': False, 'contains(twist)': False, 'contains(gang)': False, 'contains(winning)': False, 'contains(jr)': False, 'contains(species)': False, 'contains(issues)': False, 'contains(fresh)': False, 'contains(rules)': False, 'contains(meaning)': False, 'contains(inspired)': False, 'contains(heroes)': False, 'contains(desperate)': False, 'contains(fighting)': False, 'contains(filmed)': False, 'contains(faces)': False, 'contains(alan)': False, 'contains(bright)': False, 'contains(ass)': True, 'contains(flying)': False, 'contains(kong)': True, 'contains(rush)': False, 'contains(forces)': False, 'contains(charles)': False, 'contains(numerous)': False, 'contains(emotions)': False, 'contains(involves)': True, 'contains(patrick)': False, 'contains(weird)': False, 'contains(apparent)': False, 'contains(information)': False, 'contains(revenge)': False, 'contains(jay)': False, 'contains(toward)': False, 'contains(surprising)': False, 'contains(twice)': False, 'contains(editing)': False, 'contains(calls)': False, 'contains(lose)': False, 'contains(vegas)': False, 'contains(stage)': False, 'contains(intended)': False, 'contains(gags)': False, 'contains(opinion)': False, 'contains(likes)': False, 'contains(crazy)': False, 'contains(owner)': False, 'contains(places)': False, 'contains(pair)': False, 'contains(genuine)': False, 'contains(epic)': False, 'contains(speak)': False, 'contains(throw)': False, 'contains(appeal)': False, 'contains(gibson)': False, 'contains(captain)': False, 'contains(military)': False, 
'contains(20)': False, 'contains(blair)': False, 'contains(nowhere)': False, 'contains(length)': False, 'contains(nicely)': False, 'contains(cause)': False, 'contains(pass)': False, 'contains(episode)': False, 'contains(kiss)': False, 'contains(arnold)': True, 'contains(please)': False, 'contains(hasn)': False, 'contains(phone)': False, 'contains(filmmaking)': False, 'contains(formula)': False, 'contains(boyfriend)': False, 'contains(talents)': False, 'contains(creating)': False, 'contains(kelly)': False, 'contains(buy)': False, 'contains(wide)': False, 'contains(fantasy)': False, 'contains(mood)': False, 'contains(heads)': False, 'contains(pathetic)': False, 'contains(lacks)': False, 'contains(loved)': False, 'contains(asked)': False, 'contains(mrs)': False, 'contains(witty)': False, 'contains(shakespeare)': False, 'contains(mulan)': False, 'contains(generation)': False, 'contains(affair)': False, 'contains(pieces)': False, 'contains(task)': False, 'contains(rare)': False, 'contains(kept)': False, 'contains(cameo)': False, 'contains(fascinating)': False, 'contains(ed)': False, 'contains(fbi)': False, 'contains(burton)': False, 'contains(incredible)': False, 'contains(accent)': False, 'contains(artist)': False, 'contains(superior)': False, 'contains(academy)': False, 'contains(thomas)': False, 'contains(spirit)': False, 'contains(technical)': False, 'contains(confusing)': False, 'contains(poorly)': False, 'contains(target)': False, 'contains(lover)': False, 'contains(woo)': False, 'contains(mentioned)': False, 'contains(theaters)': False, 'contains(plane)': False, 'contains(confused)': False, 'contains(dennis)': False, 'contains(rob)': False, 'contains(appropriate)': False, 'contains(christmas)': False, 'contains(considered)': False, 'contains(legend)': False, 'contains(shame)': False, 'contains(soul)': False, 'contains(matt)': False, 'contains(campbell)': False, 'contains(process)': False, 'contains(bottom)': False, 'contains(sitting)': False, 'contains(brain)': 
False, 'contains(creepy)': False, 'contains(13)': False, 'contains(forever)': False, 'contains(dude)': False, 'contains(crap)': False, 'contains(superb)': False, 'contains(speech)': False, 'contains(ice)': False, 'contains(journey)': False, 'contains(masterpiece)': False, 'contains(intriguing)': False, 'contains(names)': False, 'contains(pick)': False, 'contains(speaking)': False, 'contains(virtually)': False, 'contains(award)': False, 'contains(worthy)': False, 'contains(marriage)': False, 'contains(deliver)': False, 'contains(cash)': False, 'contains(magic)': False, 'contains(respect)': False, 'contains(product)': False, 'contains(necessary)': False, 'contains(suppose)': False, 'contains(silent)': False, 'contains(pointless)': False, 'contains(station)': False, 'contains(affleck)': False, 'contains(dimensional)': False, 'contains(charlie)': False, 'contains(allows)': False, 'contains(avoid)': False, 'contains(meant)': False, 'contains(cops)': False, 'contains(attitude)': False, 'contains(relationships)': False, 'contains(hits)': False, 'contains(stephen)': False, 'contains(spends)': False, 'contains(relief)': False, 'contains(physical)': True, 'contains(count)': False, 'contains(reviews)': False, 'contains(appreciate)': False, 'contains(cliches)': False, 'contains(holds)': False, 'contains(pure)': False, 'contains(plans)': False, 'contains(limited)': False, 'contains(failed)': False, 'contains(pain)': False, 'contains(impression)': False, 'contains(unless)': False, 'contains(sub)': False, 'contains([)': False, 'contains(total)': False, 'contains(creature)': False, 'contains(viewing)': False, 'contains(loves)': False, 'contains(princess)': False, 'contains(kate)': False, 'contains(rising)': False, 'contains(woods)': False, 'contains(baldwin)': False, 'contains(angry)': False, 'contains(drawn)': False, 'contains(step)': False, 'contains(matrix)': False, 'contains(themes)': False, 'contains(satire)': False, 'contains(arts)': False, 'contains(])': False, 
'contains(remake)': False, 'contains(wall)': False, 'contains(moral)': False, 'contains(color)': False, 'contains(ray)': False, 'contains(stuck)': False, 'contains(touching)': False, 'contains(wit)': False, 'contains(tony)': False, 'contains(hanks)': False, 'contains(continues)': False, 'contains(damn)': False, 'contains(nobody)': False, 'contains(cartoon)': False, 'contains(keeping)': False, 'contains(realized)': False, 'contains(criminal)': False, 'contains(unfunny)': False, 'contains(comedic)': False, 'contains(martial)': False, 'contains(disappointing)': False, 'contains(anti)': False, 'contains(graphic)': False, 'contains(stunning)': False, 'contains(actions)': False, 'contains(floor)': False, 'contains(emotion)': False, 'contains(soldiers)': False, 'contains(edward)': False, 'contains(comedies)': False, 'contains(driver)': False, 'contains(expectations)': False, 'contains(added)': False, 'contains(mad)': False, 'contains(angels)': False, 'contains(shallow)': False, 'contains(suspect)': False, 'contains(humorous)': False, 'contains(phantom)': False, 'contains(appealing)': False, 'contains(device)': False, 'contains(design)': False, 'contains(industry)': False, 'contains(reach)': False, 'contains(fat)': False, 'contains(blame)': False, 'contains(united)': False, 'contains(sign)': False, 'contains(portrayal)': False, 'contains(rocky)': False, 'contains(finale)': False, 'contains(grand)': False, 'contains(opposite)': False, 'contains(hotel)': False, 'contains(match)': False, 'contains(damme)': False, 'contains(speed)': False, 'contains(ok)': False, 'contains(loving)': False, 'contains(field)': True, 'contains(larry)': False, 'contains(urban)': False, 'contains(troopers)': False, 'contains(compared)': False, 'contains(apes)': False, 'contains(rose)': False, 'contains(falling)': False, 'contains(era)': False, 'contains(loses)': False, 'contains(adults)': False, 'contains(managed)': False, 'contains(dad)': False, 'contains(therefore)': False, 'contains(pg)': False, 
'contains(results)': False, 'contains(guns)': False, 'contains(radio)': False, 'contains(lady)': False, 'contains(manage)': False, 'contains(spice)': False, 'contains(naked)': False, 'contains(started)': False, 'contains(intense)': False, 'contains(humanity)': False, 'contains(wonderfully)': False, 'contains(slasher)': False, 'contains(bland)': False, 'contains(imagination)': False, 'contains(walking)': False, 'contains(willing)': False, 'contains(horse)': False, 'contains(rent)': False, 'contains(mix)': False, 'contains(generated)': False, 'contains(g)': False, 'contains(utterly)': False, 'contains(scientist)': False, 'contains(washington)': False, 'contains(notice)': False, 'contains(players)': False, 'contains(teenagers)': False, 'contains(moore)': False, 'contains(board)': False, 'contains(price)': False, 'contains(frightening)': False, 'contains(tommy)': False, 'contains(spectacular)': False, 'contains(bored)': False, 'contains(jane)': False, 'contains(join)': False, 'contains(producers)': False, 'contains(johnny)': False, 'contains(zero)': False, 'contains(vampires)': False, 'contains(adaptation)': False, 'contains(dollars)': False, 'contains(parody)': False, 'contains(documentary)': False, 'contains(dvd)': False, 'contains(wayne)': False, 'contains(post)': False, 'contains(exist)': False, 'contains(matters)': False, 'contains(chosen)': False, 'contains(mel)': False, 'contains(attractive)': True, 'contains(plain)': False, 'contains(trust)': False, 'contains(safe)': False, 'contains(reading)': False, 'contains(hoping)': False, 'contains(protagonist)': False, 'contains(feelings)': False, 'contains(fate)': False, 'contains(finding)': False, 'contains(feet)': False, 'contains(visuals)': False, 'contains(spawn)': False, 'contains(compelling)': False, 'contains(hall)': False, 'contains(sympathetic)': False, 'contains(featuring)': False, 'contains(difference)': False, 'contains(professional)': False, 'contains(drugs)': False, 'contains(ford)': False, 
'contains(shooting)': False, 'contains(gold)': False, 'contains(patch)': False, 'contains(build)': False, 'contains(boat)': False, 'contains(cruise)': False, 'contains(honest)': False, 'contains(media)': False, 'contains(flicks)': False, 'contains(bug)': False, 'contains(bringing)': False, 'contains(dangerous)': True, 'contains(watched)': False, 'contains(grant)': False, 'contains(smile)': False, 'contains(plus)': False, 'contains(shouldn)': False, 'contains(decision)': False, 'contains(visually)': False, 'contains(allow)': False, 'contains(starship)': False, 'contains(roberts)': False, 'contains(dying)': False, 'contains(portrayed)': False, 'contains(turning)': False, 'contains(believes)': False, 'contains(changed)': False, 'contains(shock)': False, 'contains(destroy)': False, 'contains(30)': False, 'contains(crowd)': False, 'contains(broken)': False, 'contains(tired)': False, 'contains(fail)': False, 'contains(south)': False, 'contains(died)': False, 'contains(cult)': False, 'contains(fake)': False, 'contains(vincent)': False, 'contains(identity)': False, 'contains(sexy)': False, 'contains(hunt)': False, 'contains(jedi)': False, 'contains(flynt)': False, 'contains(alex)': False, 'contains(engaging)': False, 'contains(serve)': False, 'contains(snake)': False, 'contains(yeah)': False, 'contains(expecting)': False, 'contains(100)': False, 'contains(decade)': False, 'contains(ups)': False, 'contains(constant)': False, 'contains(current)': False, 'contains(survive)': False, 'contains(jimmy)': False, 'contains(buddy)': False, 'contains(send)': False, 'contains(brooks)': False, 'contains(goofy)': False, 'contains(likable)': False, 'contains(humour)': False, 'contains(technology)': False, 'contains(files)': False, 'contains(babe)': False, 'contains(aspects)': False, 'contains(presents)': False, 'contains(kills)': False, 'contains(supposedly)': False, 'contains(eight)': True, 'contains(sandler)': False, 'contains(hospital)': False, 'contains(test)': False, 
'contains(hidden)': False, 'contains(brian)': False, 'contains(books)': False, 'contains(promise)': False, 'contains(determined)': False, 'contains(professor)': False, 'contains(welcome)': False, 'contains(pleasure)': False, 'contains(succeeds)': False, 'contains(individual)': False, 'contains(annie)': False, 'contains(mob)': False, 'contains(ted)': False, 'contains(virus)': False, 'contains(content)': False, 'contains(gary)': False, 'contains(direct)': False, 'contains(contrived)': False, 'contains(carpenter)': False, 'contains(scale)': False, 'contains(sick)': False, 'contains(nasty)': False, 'contains(conflict)': False, 'contains(haunting)': False, 'contains(ghost)': False, 'contains(filmmaker)': False, 'contains(japanese)': False, 'contains(helps)': False, 'contains(fare)': False, 'contains(lucky)': False, 'contains(ultimate)': False, 'contains(window)': False, 'contains(support)': False, 'contains(goal)': False, 'contains(provided)': False, 'contains(genius)': False, 'contains(winner)': False, 'contains(taylor)': False, 'contains(fantastic)': False, 'contains(faith)': False, 'contains(lynch)': False, 'contains(fit)': False, 'contains(catherine)': False, 'contains(ms)': False, 'contains(paced)': False, 'contains(breaks)': False, 'contains(al)': False, 'contains(frame)': False, 'contains(travel)': False, 'contains(badly)': False, 'contains(available)': False, 'contains(cares)': False, 'contains(reeves)': False, 'contains(crash)': False, 'contains(driving)': False, 'contains(press)': False, 'contains(seagal)': False, 'contains(amy)': False, 'contains(9)': False, 'contains(headed)': False, 'contains(instance)': False, 'contains(excuse)': False, 'contains(offensive)': False, 'contains(narrative)': False, 'contains(fault)': False, 'contains(bus)': False, 'contains(f)': False, 'contains(extreme)': False, 'contains(miller)': False, 'contains(guilty)': False, 'contains(grows)': False, 'contains(overly)': False, 'contains(liners)': False, 'contains(forgotten)': False, 
'contains(ahead)': False, 'contains(accept)': False, 'contains(porn)': False, 'contains(directly)': False, 'contains(helen)': False, 'contains(began)': False, 'contains(lord)': False, 'contains(folks)': False, 'contains(mediocre)': False, 'contains(bar)': False, 'contains(surface)': False, 'contains(super)': False, 'contains(failure)': False, 'contains(6)': False, 'contains(acted)': False, 'contains(quiet)': False, 'contains(laughable)': False, 'contains(sheer)': False, 'contains(security)': True, 'contains(emotionally)': False, 'contains(season)': False, 'contains(stuart)': False, 'contains(jail)': True, 'contains(deals)': False, 'contains(cheesy)': False, 'contains(court)': False, 'contains(beach)': False, 'contains(austin)': False, 'contains(model)': False, 'contains(outstanding)': False, 'contains(substance)': False, 'contains(nudity)': False, 'contains(slapstick)': False, 'contains(joan)': False, 'contains(reveal)': False, 'contains(placed)': False, 'contains(check)': False, 'contains(beast)': False, 'contains(hurt)': False, 'contains(bloody)': False, 'contains(acts)': False, 'contains(fame)': False, 'contains(meeting)': False, 'contains(nuclear)': False, 'contains(1996)': False, 'contains(strength)': False, 'contains(center)': False, 'contains(funniest)': False, 'contains(standing)': True, 'contains(damon)': False, 'contains(clich)': False, 'contains(position)': False, 'contains(desire)': False, 'contains(driven)': False, 'contains(seat)': False, 'contains(stock)': False, 'contains(wondering)': True, 'contains(realizes)': False, 'contains(dealing)': False, 'contains(taste)': False, 'contains(routine)': False, 'contains(comparison)': False, 'contains(cinematographer)': False, 'contains(seconds)': False, 'contains(singing)': False, 'contains(gangster)': True, 'contains(responsible)': False, 'contains(football)': False, 'contains(remarkable)': False, 'contains(hunting)': False, 'contains(adams)': False, 'contains(fly)': False, 'contains(suspects)': False, 
'contains(treat)': False, 'contains(hopes)': False, 'contains(heaven)': False, 'contains(myers)': False, 'contains(struggle)': False, 'contains(costumes)': False, 'contains(beat)': False, 'contains(happening)': False, 'contains(skills)': False, 'contains(ugly)': False, 'contains(figures)': False, 'contains(thoroughly)': False, 'contains(ill)': False, 'contains(surprises)': False, 'contains(player)': False, 'contains(rival)': False, 'contains(guard)': True, 'contains(anthony)': False, 'contains(strike)': False, 'contains(community)': False, 'contains(streets)': False, 'contains(hopkins)': False, 'contains(ended)': False, 'contains(originally)': False, 'contains(sarah)': False, 'contains(creative)': False, 'contains(characterization)': False, 'contains(thankfully)': False, 'contains(growing)': False, 'contains(sharp)': False, 'contains(williamson)': False, 'contains(eccentric)': False, 'contains(explained)': False, 'contains(hey)': False, 'contains(claire)': False, 'contains(steal)': False, 'contains(inevitable)': False, 'contains(joel)': False, 'contains(core)': False, 'contains(weren)': False, 'contains(sorry)': False, 'contains(built)': False, 'contains(anne)': False, 'contains(breaking)': False, 'contains(villains)': False, 'contains(critic)': False, 'contains(lets)': False, 'contains(visit)': False, 'contains(followed)': False}
In [21]:
# Pair the feature dict of every entry in `documents` with its label.
featuresets = [
    (document_features(doc), label)
    for doc, label in documents
]
# The first 100 examples are held out for testing; the rest are used for training.
test_set = featuresets[:100]
train_set = featuresets[100:]
classifier = nltk.NaiveBayesClassifier.train(train_set)
In [22]:
# Score the trained classifier on the held-out test_set, then list the
# five features that discriminate most strongly between the labels.
accuracy = nltk.classify.accuracy(classifier, test_set)
print("Naive Bayes accuracy with 2000 bag of words features is %s" % accuracy)
classifier.show_most_informative_features(5)
Naive Bayes accuracy with 2000 bag of words features is 0.81
Most Informative Features
   contains(outstanding) = True              pos : neg    =     10.8 : 1.0
         contains(mulan) = True              pos : neg    =      9.0 : 1.0
        contains(seagal) = True              neg : pos    =      8.2 : 1.0
   contains(wonderfully) = True              pos : neg    =      7.1 : 1.0
          contains(lame) = True              neg : pos    =      6.1 : 1.0
In [23]:
# Show the 20 most informative features of the trained classifier.
classifier.show_most_informative_features(20) 
Most Informative Features
   contains(outstanding) = True              pos : neg    =     10.8 : 1.0
         contains(mulan) = True              pos : neg    =      9.0 : 1.0
        contains(seagal) = True              neg : pos    =      8.2 : 1.0
   contains(wonderfully) = True              pos : neg    =      7.1 : 1.0
          contains(lame) = True              neg : pos    =      6.1 : 1.0
         contains(damon) = True              pos : neg    =      6.1 : 1.0
         contains(flynt) = True              pos : neg    =      5.6 : 1.0
         contains(awful) = True              neg : pos    =      5.3 : 1.0
        contains(poorly) = True              neg : pos    =      5.3 : 1.0
        contains(wasted) = True              neg : pos    =      5.3 : 1.0
    contains(ridiculous) = True              neg : pos    =      5.1 : 1.0
         contains(waste) = True              neg : pos    =      4.8 : 1.0
           contains(era) = True              pos : neg    =      4.8 : 1.0
         contains(damme) = True              neg : pos    =      4.7 : 1.0
         contains(worst) = True              neg : pos    =      4.3 : 1.0
       contains(unfunny) = True              neg : pos    =      4.3 : 1.0
         contains(bland) = True              neg : pos    =      4.2 : 1.0
        contains(allows) = True              pos : neg    =      4.1 : 1.0
        contains(stupid) = True              neg : pos    =      3.9 : 1.0
     contains(portrayal) = True              pos : neg    =      3.9 : 1.0

Back to Part of Speech Tagging

We can consider the task of POS tagging as a classification task and use the classifier methodology described here. Let us revisit the POS tagging task discussed in the first lecture using the new tools we have developed.

In [24]:
from nltk.corpus import brown
# Count how often each 1-, 2- and 3-character word ending occurs in the
# Brown corpus (case-insensitively).
suffix_fdist = nltk.FreqDist()
for word in brown.words():
    word = word.lower()
    # word[-k:] is the whole word when it has fewer than k characters, so
    # short tokens are counted several times under the same key.
    for k in (1, 2, 3):
        suffix_fdist[word[-k:]] += 1
In [25]:
# Keep only the suffix strings of the 100 most frequent (suffix, count) pairs.
common_suffixes = [pair[0] for pair in suffix_fdist.most_common(100)]
print(common_suffixes)
['e', ',', '.', 's', 'd', 't', 'he', 'n', 'a', 'of', 'the', 'y', 'r', 'to', 'in', 'f', 'o', 'ed', 'nd', 'is', 'on', 'l', 'g', 'and', 'ng', 'er', 'as', 'ing', 'h', 'at', 'es', 'or', 're', 'it', '``', 'an', "''", 'm', ';', 'i', 'ly', 'ion', 'en', 'al', '?', 'nt', 'be', 'hat', 'st', 'his', 'th', 'll', 'le', 'ce', 'by', 'ts', 'me', 've', "'", 'se', 'ut', 'was', 'for', 'ent', 'ch', 'k', 'w', 'ld', '`', 'rs', 'ted', 'ere', 'her', 'ne', 'ns', 'ith', 'ad', 'ry', ')', '(', 'te', '--', 'ay', 'ty', 'ot', 'p', 'nce', "'s", 'ter', 'om', 'ss', ':', 'we', 'are', 'c', 'ers', 'uld', 'had', 'so', 'ey']
In [26]:
def pos_features(word, suffixes=None):
    """Build suffix-based features for classifying a single word.

    Args:
        word: The word to describe. It is lowercased before matching.
        suffixes: Iterable of suffix strings to test. When omitted, the
            module-level ``common_suffixes`` list is used, which is the
            original behaviour of this function.

    Returns:
        A dict with one ``'endswith(<suffix>)'`` key per suffix, in the order
        the suffixes were given, mapped to True when the lowercased word
        ends with that suffix and False otherwise.
    """
    if suffixes is None:
        suffixes = common_suffixes
    # Lowercase once instead of once per suffix.
    lowered = word.lower()
    return {
        'endswith(%s)' % suffix: lowered.endswith(suffix)
        for suffix in suffixes
    }
In [27]:
# Tagged words from the 'news' category of the Brown corpus, requested with
# the 'universal' tagset; each one becomes a (suffix features, tag) example.
tagged_words = brown.tagged_words(categories='news', tagset='universal')
featuresets = [
    (pos_features(token), tag)
    for token, tag in tagged_words
]
In [28]:
# Hold out the first 10% of the examples for testing and train a Naive Bayes
# classifier on the remaining 90%.
size = int(len(featuresets) * 0.1)
test_set = featuresets[:size]
train_set = featuresets[size:]
classifier = nltk.NaiveBayesClassifier.train(train_set)
# Last expression of the cell: the accuracy on the held-out examples.
nltk.classify.accuracy(classifier, test_set)
Out[28]:
0.7011437095972153
In [29]:
# Predict the tag of a single word from its suffix features.
classifier.classify(pos_features('cats'))
Out[29]:
'NOUN'
In [30]:
# Predict the tag of another word from its suffix features.
classifier.classify(pos_features('books'))
Out[30]:
'NOUN'
In [31]:
# Show the 20 suffix features that best separate the tags.
classifier.show_most_informative_features(20) 
Most Informative Features
           endswith(the) = True              DET : NOUN   =   3416.9 : 1.0
             endswith(.) = True                . : ADP    =   2481.6 : 1.0
            endswith(to) = True              PRT : ADJ    =   2138.0 : 1.0
             endswith(f) = True              ADP : VERB   =   2050.5 : 1.0
            endswith(he) = True              DET : NOUN   =   1808.9 : 1.0
           endswith(and) = True             CONJ : ADV    =   1642.0 : 1.0
             endswith(a) = True              DET : VERB   =   1597.1 : 1.0
            endswith(of) = True              ADP : NOUN   =   1406.9 : 1.0
           endswith(his) = True              DET : NOUN   =    728.0 : 1.0
            endswith(ut) = True             CONJ : DET    =    694.7 : 1.0
            endswith(nd) = True             CONJ : NUM    =    636.1 : 1.0
           endswith(hat) = True             PRON : NOUN   =    570.6 : 1.0
            endswith(ey) = True             PRON : VERB   =    549.0 : 1.0
             endswith(i) = True             PRON : ADP    =    547.2 : 1.0
             endswith(') = True                . : VERB   =    503.7 : 1.0
             endswith(o) = True              PRT : ADJ    =    493.4 : 1.0
            endswith(es) = True             NOUN : ADP    =    427.0 : 1.0
           endswith(uld) = True             VERB : NOUN   =    422.5 : 1.0
            endswith(we) = True             PRON : NOUN   =    353.5 : 1.0
           endswith(ted) = True             VERB : NOUN   =    337.9 : 1.0

Testing Different Classifiers

NLTK provides a common interface to different classifier algorithms. This is illustrated in the following examples.

In [32]:
import nltk
# Toy training set: (feature dict, label) pairs over three binary features.
train = [
    ({'a': 1, 'b': 1, 'c': 1}, 'y'),
    ({'a': 1, 'b': 1, 'c': 1}, 'x'),
    ({'a': 1, 'b': 1, 'c': 0}, 'y'),
    ({'a': 0, 'b': 1, 'c': 1}, 'x'),
    ({'a': 0, 'b': 1, 'c': 1}, 'y'),
    ({'a': 0, 'b': 0, 'c': 1}, 'y'),
    ({'a': 0, 'b': 1, 'c': 0}, 'x'),
    ({'a': 0, 'b': 0, 'c': 0}, 'x'),
    ({'a': 0, 'b': 1, 'c': 1}, 'y'),
]
# Unlabelled feature dicts to classify.
test = [
    {'a': 1, 'b': 0, 'c': 1},  # unseen
    {'a': 1, 'b': 0, 'c': 0},  # unseen
    {'a': 0, 'b': 1, 'c': 1},  # seen 3 times, labels=x,y,y
    {'a': 0, 'b': 1, 'c': 0},  # seen 1 time, label=x
]

Naive Bayes Classifier

In [33]:
# Train an NLTK Naive Bayes classifier on the toy data, then list its labels
# in sorted order.
classifier = nltk.classify.NaiveBayesClassifier.train(train)
sorted(classifier.labels())
Out[33]:
['x', 'y']
In [34]:
# Predict one label for each feature dict in `test`.
classifier.classify_many(test)
Out[34]:
['y', 'x', 'y', 'x']
In [35]:
# For each test item, print the probability assigned to label 'x' and to
# label 'y', with four decimals each.
for pdist in classifier.prob_classify_many(test):
    p_x = pdist.prob('x')
    p_y = pdist.prob('y')
    print('%.4f %.4f' % (p_x, p_y))
0.3203 0.6797
0.5857 0.4143
0.3792 0.6208
0.6470 0.3530
In [36]:
# No count is passed, so the method's default limit applies.
classifier.show_most_informative_features()
Most Informative Features
                       c = 0                   x : y      =      2.0 : 1.0
                       c = 1                   y : x      =      1.5 : 1.0
                       a = 1                   y : x      =      1.4 : 1.0
                       b = 0                   x : y      =      1.2 : 1.0
                       a = 0                   x : y      =      1.2 : 1.0
                       b = 1                   y : x      =      1.1 : 1.0

Decision Tree Classifier

In [37]:
# Train an NLTK decision tree on the same toy data and list its labels.
# NOTE(review): both cutoffs are set to 0, presumably so that this tiny
# training set is still split as far as possible - confirm against the
# DecisionTreeClassifier.train documentation.
classifier = nltk.classify.DecisionTreeClassifier.train(
    train, entropy_cutoff=0, support_cutoff=0)
sorted(classifier.labels())
Out[37]:
['x', 'y']
In [38]:
# Printing the classifier renders the learned tree as indented text.
print(classifier)
c=0? .................................................. x
  a=0? ................................................ x
  a=1? ................................................ y
c=1? .................................................. y

In [39]:
# Predict one label for each feature dict in `test` with the decision tree.
classifier.classify_many(test)
Out[39]:
['y', 'y', 'y', 'x']

Decision tree classifiers do not implement prob_classify() (so there are no prob() values to inspect, unlike the Naive Bayes classifier above), as they do not provide a probability interpretation.

Scikit-Learn Classifiers

NLTK provides an interface to the Scikit-learn (sklearn) classifiers - including maximum entropy and SVM.

In [40]:
from nltk.classify import SklearnClassifier
# Labelled examples: (feature dict, label) pairs with integer-valued features.
train_data = [
    (dict(a=4, b=1, c=0), "ham"),
    (dict(a=5, b=2, c=1), "ham"),
    (dict(a=0, b=3, c=4), "spam"),
    (dict(a=5, b=1, c=1), "ham"),
    (dict(a=1, b=4, c=3), "spam"),
]
# Unlabelled feature dicts to classify.
test_data = [
    dict(a=3, b=2, c=1),
    dict(a=0, b=3, c=7),
]
In [59]:
from sklearn.naive_bayes import BernoulliNB
# Wrap a scikit-learn BernoulliNB estimator in NLTK's SklearnClassifier,
# train it on the (feature dict, label) pairs, and classify the test dicts.
classif = SklearnClassifier(BernoulliNB()).train(train_data)
classif.classify_many(test_data)
Out[59]:
['ham', 'spam']
In [60]:
from sklearn.svm import SVC
# Same wrapper, now around an SVC estimator.
# NOTE(review): sparse=False presumably makes the wrapper hand dense arrays
# to the estimator - confirm against the SklearnClassifier documentation.
classif = SklearnClassifier(SVC(gamma='scale'), sparse=False).train(train_data)
classif.classify_many(test_data)
Out[60]:
['ham', 'spam']
In [61]:
# Using the sklearn classifier directly, without the NLTK wrapper: fit a
# linear-kernel SVC on four one-dimensional points, each with its own label.
X = [[0], [1], [2], [3]]
Y = [0, 1, 2, 3]
clf = SVC(kernel='linear', C=1.0)
clf.fit(X, Y)
Out[61]:
SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)
In [63]:
# Train an RBF-kernel SVC (C=1.0) through the NLTK wrapper.
classifr = SklearnClassifier(SVC(kernel='rbf', C=1.0, gamma='scale'), sparse=False).train(train_data)
In [64]:
# Classify the test dicts with the RBF-kernel SVC.
classifr.classify_many(test_data)
Out[64]:
['ham', 'spam']
In [65]:
from sklearn.svm import LinearSVC
# Train a LinearSVC (C=1.0) through the NLTK wrapper.
classif_ova = SklearnClassifier(LinearSVC(C=1.0), sparse=False).train(train_data)
In [47]:
# Classify the test dicts with the LinearSVC model.
classif_ova.classify_many(test_data)
Out[47]:
['ham', 'spam']

The key parameter to optimize for a given SVM kernel is the C parameter. Here is example code from sklearn that shows how to choose C by comparing 5-fold cross-validation scores over a range of candidate values.

In [66]:
%matplotlib inline

import matplotlib.pyplot as plt
import numpy as np
from sklearn import model_selection, datasets, svm

# Features and targets of scikit-learn's bundled digits dataset.
digits = datasets.load_digits()
X, y = digits.data, digits.target

# Candidate values for C: 10 points log-spaced between 1e-10 and 1.
C_s = np.logspace(-10, 0, 10)
svc = svm.SVC(kernel='linear')

# For every candidate C, record the mean and the standard deviation of the
# 5-fold cross-validation scores.
scores = []
scores_std = []
for C in C_s:
    svc.C = C
    this_scores = model_selection.cross_val_score(svc, X, y, n_jobs=1, cv=5)
    scores.append(this_scores.mean())
    scores_std.append(this_scores.std())

# Plot the mean score against C on a log-scaled x axis, with dashed curves
# one standard deviation above and below it.
mean_curve = np.array(scores)
std_curve = np.array(scores_std)
plt.figure(1, figsize=(4, 3))
plt.clf()
plt.semilogx(C_s, scores)
plt.semilogx(C_s, mean_curve + std_curve, 'b--')
plt.semilogx(C_s, mean_curve - std_curve, 'b--')
# Re-label the existing y ticks using compact %g formatting.
locs, labels = plt.yticks()
plt.yticks(locs, ["%g" % tick for tick in locs])
plt.ylabel('CV score')
plt.xlabel('Parameter C')
plt.ylim(0, 1.1)
plt.show()
In [ ]: