Classification

In this notebook, we demonstrate how to apply classification tools to solve NLP tasks.

We start with a very simple task, gender identification, which looks at words in isolation and classifies each one into one of two labels: the task consists of guessing whether a name is masculine or feminine.

The classification method consists of taking as input an observation, turning this observation into a feature vector, then predicting the label of this feature vector by applying a trained classifier model.

To prepare for this procedure, we must train a classifier. In supervised learning, a classifier is learned by generalizing from a set of observed pairs $(\text{observation}_i, \text{label}_i)$, where $i = 1..N$.

In [1]:
%matplotlib inline

def gender_features(word):
    """Return the feature dict used to classify a first name by gender.

    The only feature is ``'last_letter'``: the final character of ``word``.

    A slice is used instead of ``word[-1]`` so that an empty string yields
    ``{'last_letter': ''}`` rather than raising ``IndexError``.
    """
    return {'last_letter': word[-1:]}

gender_features('Shrek')
Out[1]:
{'last_letter': 'k'}
In [2]:
import random

from nltk.corpus import names

# Seed the RNG so the shuffle below -- and therefore every train/test split
# and accuracy figure derived from it -- is reproducible on a fresh kernel.
random.seed(42)

# One (name, label) pair per entry of the two name lists in the corpus.
labeled_names = (
    [(name, 'male') for name in names.words('male.txt')]
    + [(name, 'female') for name in names.words('female.txt')]
)
# Mix the two classes so that positional slices contain both labels.
random.shuffle(labeled_names)
In [5]:
# Report how many labeled names were loaded.
print("There are %s samples in the dataset." % len(labeled_names))
There are 7944 samples in the dataset.
In [4]:
import nltk

# Pair each name's feature dict with its gold label.
featuresets = [(gender_features(name), label) for name, label in labeled_names]
# The first 500 shuffled examples are held out for testing; the rest train the model.
test_set = featuresets[:500]
train_set = featuresets[500:]
classifier = nltk.NaiveBayesClassifier.train(train_set)
In [6]:
# Try the trained classifier on two names.
neo_label = classifier.classify(gender_features('Neo'))
print("Neo is classified as %s" % neo_label)

trinity_label = classifier.classify(gender_features('Trinity'))
print("Trinity is classified as %s" % trinity_label)
Neo is classified as male
Trinity is classified as female
In [7]:
# Accuracy of the current classifier on the held-out test set.
test_accuracy = nltk.classify.accuracy(classifier, test_set)
print(test_accuracy)
0.77
In [8]:
# List the five features the classifier found most informative.
classifier.show_most_informative_features(n=5)
Most Informative Features
             last_letter = 'a'            female : male   =     34.2 : 1.0
             last_letter = 'k'              male : female =     32.7 : 1.0
             last_letter = 'f'              male : female =     15.9 : 1.0
             last_letter = 'p'              male : female =     12.6 : 1.0
             last_letter = 'v'              male : female =     11.2 : 1.0
In [9]:
def gender_features2(name):
    """Return a richer feature dict for classifying a first name by gender.

    All features are computed case-insensitively:

    * ``first_letter`` / ``last_letter`` -- the first and last character,
      lower-cased (``''`` when ``name`` is empty).
    * ``count(x)`` -- number of occurrences of letter ``x``, for ``a``..``z``.
    * ``has(x)`` -- ``True`` when letter ``x`` occurs at least once.
    """
    # Lower-case once instead of re-lowering the name on every loop iteration.
    lowered = name.lower()
    # Slices (not name[0] / name[-1]) so an empty name does not raise IndexError.
    features = {
        "first_letter": name[:1].lower(),
        "last_letter": name[-1:].lower(),
    }
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        occurrences = lowered.count(letter)
        features["count(%s)" % letter] = occurrences
        features["has(%s)" % letter] = occurrences > 0
    return features
In [10]:
# Show the full feature dict produced for a single name.
gender_features2(name='John')
Out[10]:
{'count(a)': 0,
 'count(b)': 0,
 'count(c)': 0,
 'count(d)': 0,
 'count(e)': 0,
 'count(f)': 0,
 'count(g)': 0,
 'count(h)': 1,
 'count(i)': 0,
 'count(j)': 1,
 'count(k)': 0,
 'count(l)': 0,
 'count(m)': 0,
 'count(n)': 1,
 'count(o)': 1,
 'count(p)': 0,
 'count(q)': 0,
 'count(r)': 0,
 'count(s)': 0,
 'count(t)': 0,
 'count(u)': 0,
 'count(v)': 0,
 'count(w)': 0,
 'count(x)': 0,
 'count(y)': 0,
 'count(z)': 0,
 'first_letter': 'j',
 'has(a)': False,
 'has(b)': False,
 'has(c)': False,
 'has(d)': False,
 'has(e)': False,
 'has(f)': False,
 'has(g)': False,
 'has(h)': True,
 'has(i)': False,
 'has(j)': True,
 'has(k)': False,
 'has(l)': False,
 'has(m)': False,
 'has(n)': True,
 'has(o)': True,
 'has(p)': False,
 'has(q)': False,
 'has(r)': False,
 'has(s)': False,
 'has(t)': False,
 'has(u)': False,
 'has(v)': False,
 'has(w)': False,
 'has(x)': False,
 'has(y)': False,
 'has(z)': False,
 'last_letter': 'n'}
In [11]:
# Rebuild the feature sets with gender_features2 and repeat the same
# 500-example hold-out evaluation.
featuresets = [(gender_features2(name), label) for name, label in labeled_names]
test_set = featuresets[:500]
train_set = featuresets[500:]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))
0.768
In [12]:
# Three-way split of the shuffled names:
#   [0, 500)     -> final test set
#   [500, 1500)  -> dev-test set used for error analysis
#   [1500, end)  -> training set
test_names = labeled_names[:500]
devtest_names = labeled_names[500:1500]
train_names = labeled_names[1500:]
In [13]:
# Featurize each split with gender_features, train on the training split,
# and score on the dev-test split.
train_set = [(gender_features(name), label) for name, label in train_names]
devtest_set = [(gender_features(name), label) for name, label in devtest_names]
test_set = [(gender_features(name), label) for name, label in test_names]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, devtest_set))
0.761
In [14]:
# Gather every dev-test name the classifier mislabels, stored as
# (correct label, predicted label, name).
errors = []
for name, tag in devtest_names:
    guess = classifier.classify(gender_features(name))
    if guess == tag:
        continue
    errors.append((tag, guess, name))
In [15]:
# Each entry is a (correct label, guess, name) triple; sorting groups the
# output by correct label, then guess, then name.
for error in sorted(errors):
    print('correct=%-8s guess=%-8s name=%-30s' % error)
correct=female   guess=male     name=Allsun                        
correct=female   guess=male     name=Allys                         
correct=female   guess=male     name=Alyson                        
correct=female   guess=male     name=Arleen                        
correct=female   guess=male     name=Aryn                          
correct=female   guess=male     name=Bette-Ann                     
correct=female   guess=male     name=Bird                          
correct=female   guess=male     name=Birgit                        
correct=female   guess=male     name=Brigid                        
correct=female   guess=male     name=Brynn                         
correct=female   guess=male     name=Carolin                       
correct=female   guess=male     name=Carrol                        
correct=female   guess=male     name=Charis                        
correct=female   guess=male     name=Charyl                        
correct=female   guess=male     name=Cherilyn                      
correct=female   guess=male     name=Christel                      
correct=female   guess=male     name=Chrysler                      
correct=female   guess=male     name=Coreen                        
correct=female   guess=male     name=Cristabel                     
correct=female   guess=male     name=Cristen                       
correct=female   guess=male     name=Crystal                       
correct=female   guess=male     name=Cybill                        
correct=female   guess=male     name=Daloris                       
correct=female   guess=male     name=Damaris                       
correct=female   guess=male     name=Daniel                        
correct=female   guess=male     name=Darb                          
correct=female   guess=male     name=Demeter                       
correct=female   guess=male     name=Devon                         
correct=female   guess=male     name=Dido                          
correct=female   guess=male     name=Doralin                       
correct=female   guess=male     name=Easter                        
correct=female   guess=male     name=Eden                          
correct=female   guess=male     name=Eleanor                       
correct=female   guess=male     name=Elinor                        
correct=female   guess=male     name=Em                            
correct=female   guess=male     name=Estel                         
correct=female   guess=male     name=Evangelin                     
correct=female   guess=male     name=Fawn                          
correct=female   guess=male     name=Flo                           
correct=female   guess=male     name=Flor                          
correct=female   guess=male     name=Floris                        
correct=female   guess=male     name=Frances                       
correct=female   guess=male     name=Gayleen                       
correct=female   guess=male     name=Gen                           
correct=female   guess=male     name=Gillian                       
correct=female   guess=male     name=Gilligan                      
correct=female   guess=male     name=Glynnis                       
correct=female   guess=male     name=Gwen                          
correct=female   guess=male     name=Gwyn                          
correct=female   guess=male     name=Idell                         
correct=female   guess=male     name=Ingeborg                      
correct=female   guess=male     name=Ivett                         
correct=female   guess=male     name=Jasmin                        
correct=female   guess=male     name=Jerrilyn                      
correct=female   guess=male     name=Jewell                        
correct=female   guess=male     name=Jilleen                       
correct=female   guess=male     name=Joellyn                       
correct=female   guess=male     name=Jonell                        
correct=female   guess=male     name=Jordan                        
correct=female   guess=male     name=Josselyn                      
correct=female   guess=male     name=Joyan                         
correct=female   guess=male     name=Kaster                        
correct=female   guess=male     name=Katheleen                     
correct=female   guess=male     name=Kaylyn                        
correct=female   guess=male     name=Kellen                        
correct=female   guess=male     name=Keren                         
correct=female   guess=male     name=Kip                           
correct=female   guess=male     name=Kirstin                       
correct=female   guess=male     name=Kristal                       
correct=female   guess=male     name=Kym                           
correct=female   guess=male     name=Leann                         
correct=female   guess=male     name=Lillian                       
correct=female   guess=male     name=Linet                         
correct=female   guess=male     name=Linn                          
correct=female   guess=male     name=Loreen                        
correct=female   guess=male     name=Lurleen                       
correct=female   guess=male     name=Lyn                           
correct=female   guess=male     name=Magdalen                      
correct=female   guess=male     name=Margalit                      
correct=female   guess=male     name=Marget                        
correct=female   guess=male     name=Marion                        
correct=female   guess=male     name=Marit                         
correct=female   guess=male     name=Marylin                       
correct=female   guess=male     name=Mavis                         
correct=female   guess=male     name=Melisent                      
correct=female   guess=male     name=Meridel                       
correct=female   guess=male     name=Mureil                        
correct=female   guess=male     name=Nadeen                        
correct=female   guess=male     name=Nariko                        
correct=female   guess=male     name=Nell                          
correct=female   guess=male     name=Nichol                        
correct=female   guess=male     name=Noel                          
correct=female   guess=male     name=Noell                         
correct=female   guess=male     name=Noelyn                        
correct=female   guess=male     name=Norean                        
correct=female   guess=male     name=Noreen                        
correct=female   guess=male     name=Persis                        
correct=female   guess=male     name=Phil                          
correct=female   guess=male     name=Phyllis                       
correct=female   guess=male     name=Phyllys                       
correct=female   guess=male     name=Pris                          
correct=female   guess=male     name=Rahal                         
correct=female   guess=male     name=Rakel                         
correct=female   guess=male     name=Rayshell                      
correct=female   guess=male     name=Rhianon                       
correct=female   guess=male     name=Rosabel                       
correct=female   guess=male     name=Roz                           
correct=female   guess=male     name=Rozalin                       
correct=female   guess=male     name=Sam                           
correct=female   guess=male     name=Sara-Ann                      
correct=female   guess=male     name=Sharon                        
correct=female   guess=male     name=Shirleen                      
correct=female   guess=male     name=Stoddard                      
correct=female   guess=male     name=Storm                         
correct=female   guess=male     name=Sydel                         
correct=female   guess=male     name=Teriann                       
correct=female   guess=male     name=Viviyan                       
correct=female   guess=male     name=Wren                          
correct=male     guess=female   name=Abby                          
correct=male     guess=female   name=Alix                          
correct=male     guess=female   name=Amory                         
correct=male     guess=female   name=Andie                         
correct=male     guess=female   name=Baillie                       
correct=male     guess=female   name=Baily                         
correct=male     guess=female   name=Bela                          
correct=male     guess=female   name=Boniface                      
correct=male     guess=female   name=Brady                         
correct=male     guess=female   name=Brice                         
correct=male     guess=female   name=Cary                          
correct=male     guess=female   name=Case                          
correct=male     guess=female   name=Chane                         
correct=male     guess=female   name=Chase                         
correct=male     guess=female   name=Chauncey                      
correct=male     guess=female   name=Chrisy                        
correct=male     guess=female   name=Cobbie                        
correct=male     guess=female   name=Connie                        
correct=male     guess=female   name=Corey                         
correct=male     guess=female   name=Danie                         
correct=male     guess=female   name=Dannie                        
correct=male     guess=female   name=Darcy                         
correct=male     guess=female   name=Darth                         
correct=male     guess=female   name=Davie                         
correct=male     guess=female   name=Dietrich                      
correct=male     guess=female   name=Dominique                     
correct=male     guess=female   name=Dory                          
correct=male     guess=female   name=Durante                       
correct=male     guess=female   name=Eugene                        
correct=male     guess=female   name=Fonzie                        
correct=male     guess=female   name=Gay                           
correct=male     guess=female   name=Geoffrey                      
correct=male     guess=female   name=Gerri                         
correct=male     guess=female   name=Giovanni                      
correct=male     guess=female   name=Giuseppe                      
correct=male     guess=female   name=Godfree                       
correct=male     guess=female   name=Guy                           
correct=male     guess=female   name=Hadleigh                      
correct=male     guess=female   name=Haley                         
correct=male     guess=female   name=Herby                         
correct=male     guess=female   name=Herculie                      
correct=male     guess=female   name=Hezekiah                      
correct=male     guess=female   name=Hillery                       
correct=male     guess=female   name=Hirsch                        
correct=male     guess=female   name=Hurley                        
correct=male     guess=female   name=Hy                            
correct=male     guess=female   name=Hymie                         
correct=male     guess=female   name=Isa                           
correct=male     guess=female   name=Isidore                       
correct=male     guess=female   name=Jean-Christophe               
correct=male     guess=female   name=Jefferey                      
correct=male     guess=female   name=Jeffie                        
correct=male     guess=female   name=Jermaine                      
correct=male     guess=female   name=Jessey                        
correct=male     guess=female   name=Jimmy                         
correct=male     guess=female   name=Johny                         
correct=male     guess=female   name=Jonah                         
correct=male     guess=female   name=Joseph                        
correct=male     guess=female   name=Kennedy                       
correct=male     guess=female   name=Lanny                         
correct=male     guess=female   name=Larry                         
correct=male     guess=female   name=Lorne                         
correct=male     guess=female   name=Lorrie                        
correct=male     guess=female   name=Maurice                       
correct=male     guess=female   name=Maurise                       
correct=male     guess=female   name=Mischa                        
correct=male     guess=female   name=Monty                         
correct=male     guess=female   name=Moore                         
correct=male     guess=female   name=Mordecai                      
correct=male     guess=female   name=Morlee                        
correct=male     guess=female   name=Morley                        
correct=male     guess=female   name=Nichole                       
correct=male     guess=female   name=Nicky                         
correct=male     guess=female   name=Niki                          
correct=male     guess=female   name=Nikita                        
correct=male     guess=female   name=Noble                         
correct=male     guess=female   name=Pace                          
correct=male     guess=female   name=Pearce                        
correct=male     guess=female   name=Pepe                          
correct=male     guess=female   name=Petey                         
correct=male     guess=female   name=Pierce                        
correct=male     guess=female   name=Prentice                      
correct=male     guess=female   name=Price                         
correct=male     guess=female   name=Pryce                         
correct=male     guess=female   name=Reece                         
correct=male     guess=female   name=Reza                          
correct=male     guess=female   name=Rickey                        
correct=male     guess=female   name=Riley                         
correct=male     guess=female   name=Ritch                         
correct=male     guess=female   name=Rory                          
correct=male     guess=female   name=Ruddy                         
correct=male     guess=female   name=Rudolph                       
correct=male     guess=female   name=Sayre                         
correct=male     guess=female   name=Scotti                        
correct=male     guess=female   name=Sheffie                       
correct=male     guess=female   name=Shelby                        
correct=male     guess=female   name=Sloane                        
correct=male     guess=female   name=Solly                         
correct=male     guess=female   name=Spense                        
correct=male     guess=female   name=Stacy                         
correct=male     guess=female   name=Tabbie                        
correct=male     guess=female   name=Tally                         
correct=male     guess=female   name=Tedie                         
correct=male     guess=female   name=Terrance                      
correct=male     guess=female   name=Terri                         
correct=male     guess=female   name=Tony                          
correct=male     guess=female   name=Torrance                      
correct=male     guess=female   name=Uri                           
correct=male     guess=female   name=Virge                         
correct=male     guess=female   name=Vite                          
correct=male     guess=female   name=Walsh                         
correct=male     guess=female   name=Welbie                        
correct=male     guess=female   name=Westley                       
correct=male     guess=female   name=Winny                         
correct=male     guess=female   name=Worth                         
correct=male     guess=female   name=Yancy                         
correct=male     guess=female   name=Zacherie                      
correct=male     guess=female   name=Zebadiah                      
correct=male     guess=female   name=Zechariah                     
correct=male     guess=female   name=Zedekiah                      
correct=male     guess=female   name=Zollie                        
In [16]:
def gender_features(word):
    """Return suffix features for classifying a first name by gender.

    This definition replaces the single-feature ``gender_features`` defined
    earlier in the notebook. It produces two features:

    * ``suffix1`` -- the last character of ``word``.
    * ``suffix2`` -- the last two characters of ``word``.

    Slices are used, so a name shorter than the suffix yields whatever
    characters are available.
    """
    last_one = word[-1:]
    last_two = word[-2:]
    return {'suffix1': last_one, 'suffix2': last_two}
In [17]:
# Re-featurize the training and dev-test names with the suffix-based
# gender_features, retrain, and score on the dev-test split again.
train_set = [(gender_features(name), label) for name, label in train_names]
devtest_set = [(gender_features(name), label) for name, label in devtest_names]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, devtest_set))
0.773

Document Classification

We now turn our attention to classifying full documents as opposed to single words in isolation.

The task seems more challenging, but simple methods can achieve surprisingly good results when the task is well defined. Consider the task of predicting whether a movie review is positive or negative. This is a task called sentiment analysis and is a hot practical task in the era of user-generated content (UGC) on the Web.

A good dataset is available in NLTK to experiment with this task.

In [18]:
from nltk.corpus import movie_reviews

# One (token list, category) pair per review file, built category by category.
documents = [
    (list(movie_reviews.words(review_id)), label)
    for label in movie_reviews.categories()
    for review_id in movie_reviews.fileids(label)
]
# Shuffle so the documents are no longer grouped by category.
random.shuffle(documents)
In [19]:
# Frequency distribution of every lower-cased token in the corpus.
all_words = nltk.FreqDist(token.lower() for token in movie_reviews.words())
# Feature vocabulary: the 2000 most frequent tokens, in most_common() order.
word_features = [token for token, _count in all_words.most_common(2000)]

def document_features(document, vocabulary=None):
    """Build bag-of-words presence features for one document.

    Args:
        document: iterable of string tokens (e.g. a list of words or an
            NLTK corpus view).
        vocabulary: optional iterable of words to test for. When omitted,
            the module-level ``word_features`` list is used, so existing
            one-argument calls behave as before. Entries are expected to be
            lower-case, as ``word_features`` is.

    Returns:
        dict mapping 'contains(<word>)' to True/False, with one entry per
        vocabulary word, telling whether that word occurs in the document.
    """
    if vocabulary is None:
        vocabulary = word_features
    # The vocabulary is built from lower-cased tokens, so case-fold the
    # document tokens too; otherwise a capitalised token such as 'The'
    # would never match its vocabulary entry 'the'.
    document_words = {token.lower() for token in document}
    return {
        'contains(%s)' % word: (word in document_words)
        for word in vocabulary
    }
In [20]:
# Corpus size: number of (tokens, category) pairs built above.
print("There are %d documents in the movie reviews dataset." % len(documents))
# B() and N() are read off the frequency distribution; they are reported
# below as the distinct-word count and the total token count respectively.
print("There are %d distinct words in the dataset." % all_words.B())
print("There are %d tokens in the dataset." % all_words.N())
# Table of the 20 most frequent entries, then a plot of the top 100.
all_words.tabulate(20)
all_words.plot(100)
There are 2000 documents in the movie reviews dataset.
There are 39768 distinct words in the dataset.
There are 1583820 tokens in the dataset.
   ,  the    .    a  and   of   to    '   is   in    s    "   it that    -    )    (   as with  for 
77717 76529 65876 38106 35576 34123 31937 30585 25195 21822 18513 17612 16107 15924 15595 11781 11664 11378 10792 9961 
In [21]:
# Print the complete feature dict for a single review (fileid under 'pos/').
# The dict has one 'contains(<word>)' entry per word in word_features, so the
# printed output is very long.
print(document_features(movie_reviews.words('pos/cv957_8737.txt'))) 
{'contains(hill)': False, 'contains(reason)': False, 'contains(music)': False, 'contains(impact)': False, 'contains(calls)': False, 'contains(roberts)': False, 'contains(lucas)': False, 'contains(taken)': False, 'contains(mad)': False, 'contains(jackson)': False, 'contains(m)': False, 'contains(band)': False, 'contains(what)': True, 'contains(addition)': False, 'contains(various)': False, 'contains(vincent)': False, 'contains(press)': False, 'contains(perhaps)': False, 'contains(return)': False, 'contains(appear)': False, 'contains(among)': True, 'contains(why)': False, 'contains(recommend)': False, 'contains(sequences)': False, 'contains(william)': False, 'contains(creating)': False, 'contains(died)': False, 'contains(state)': False, 'contains(literally)': False, 'contains(common)': False, 'contains(suddenly)': False, 'contains(predictable)': False, 'contains(continue)': False, 'contains(called)': False, 'contains(fellow)': False, 'contains(jokes)': False, 'contains(open)': False, 'contains(hits)': False, 'contains(killer)': False, 'contains(pretty)': False, 'contains(lord)': False, 'contains(serious)': False, 'contains(accept)': False, 'contains(follow)': False, 'contains(police)': False, 'contains(giant)': False, 'contains(husband)': False, 'contains(wondering)': True, 'contains(act)': False, 'contains(spawn)': False, 'contains(other)': True, 'contains(nobody)': False, 'contains(city)': False, 'contains(gary)': False, 'contains(provide)': False, 'contains(talk)': True, 'contains(blade)': False, 'contains(hero)': False, 'contains(mouth)': False, 'contains(affleck)': False, 'contains(whom)': False, 'contains(nature)': False, 'contains(so)': False, 'contains(plays)': False, 'contains(turning)': False, 'contains(culture)': False, 'contains(classic)': False, 'contains(fire)': False, 'contains(reviews)': False, 'contains(long)': False, 'contains(mix)': False, 'contains(fairly)': True, 'contains(neither)': False, 'contains(earlier)': False, 'contains(be)': True, 
'contains(apes)': False, 'contains(title)': False, 'contains(annoying)': False, 'contains(dull)': False, 'contains(search)': False, 'contains(oh)': False, 'contains(person)': False, 'contains(truth)': False, 'contains(taking)': False, 'contains(breaking)': False, 'contains(attractive)': True, 'contains(babe)': False, 'contains(trek)': False, 'contains(important)': False, 'contains(writing)': False, 'contains(comedy)': True, 'contains(responsible)': False, 'contains(expect)': False, 'contains(family)': False, 'contains(routine)': False, 'contains(immediately)': False, 'contains(pleasure)': False, 'contains(starship)': False, 'contains(entertainment)': False, 'contains(sexual)': False, 'contains(relationship)': False, 'contains(super)': False, 'contains(meeting)': False, 'contains(whose)': False, 'contains(desire)': False, 'contains(sub)': False, 'contains(guard)': True, 'contains(sharp)': False, 'contains(appealing)': False, 'contains(visually)': False, 'contains(credits)': False, 'contains(front)': False, 'contains(plus)': False, 'contains(stupid)': False, 'contains(hoping)': False, 'contains(clear)': False, 'contains(sitting)': False, 'contains(smith)': False, 'contains(growing)': False, 'contains(looking)': False, 'contains(scary)': False, 'contains(weird)': False, 'contains(cartoon)': False, 'contains(contrived)': False, 'contains(married)': False, 'contains(gore)': False, 'contains(without)': False, 'contains(college)': False, 'contains(terrific)': False, 'contains(documentary)': False, 'contains(run)': False, 'contains(emotional)': False, 'contains(after)': False, 'contains(david)': False, 'contains(tony)': False, 'contains(flaws)': False, 'contains(sandler)': False, 'contains(track)': False, 'contains(country)': False, 'contains(pain)': False, 'contains(suspects)': False, 'contains(engaging)': False, 'contains(mind)': False, 'contains(door)': False, 'contains(set)': False, 'contains(flicks)': False, 'contains(before)': False, 'contains(older)': False, 
'contains(age)': False, 'contains(who)': True, 'contains(exciting)': False, 'contains(cross)': False, 'contains(blame)': False, 'contains(cinematographer)': False, 'contains(f)': False, 'contains(small)': False, 'contains(product)': False, 'contains(released)': False, 'contains(unfortunately)': False, 'contains(review)': False, 'contains(stories)': False, 'contains(100)': False, 'contains(partner)': False, 'contains(drug)': True, 'contains(call)': False, 'contains(ray)': False, 'contains(buddy)': False, 'contains(effects)': False, 'contains(book)': False, 'contains(academy)': False, 'contains(affair)': False, 'contains(air)': False, 'contains(stephen)': False, 'contains(loves)': False, 'contains(law)': False, 'contains(course)': True, 'contains(check)': False, 'contains(screen)': True, 'contains(breaks)': False, 'contains(1996)': False, 'contains(gives)': False, 'contains(computer)': False, 'contains(disappointing)': False, 'contains(angry)': False, 'contains(chris)': False, 'contains(rose)': False, 'contains(alex)': False, 'contains(humour)': False, 'contains(james)': False, 'contains(type)': False, 'contains(way)': True, 'contains(shows)': False, 'contains(full)': False, 'contains(exception)': False, 'contains(visual)': False, 'contains(private)': False, 'contains(certain)': False, 'contains(hit)': False, 'contains(thought)': False, 'contains(creative)': False, 'contains(manner)': False, 'contains(script)': False, 'contains(hopes)': False, 'contains(worth)': True, 'contains(main)': False, 'contains(magic)': False, 'contains(headed)': False, 'contains(half)': False, 'contains(hilarious)': True, 'contains(forget)': False, 'contains(brief)': True, 'contains(plane)': False, 'contains(feeling)': False, 'contains(dvd)': False, 'contains(minutes)': False, 'contains(creepy)': False, 'contains(sets)': False, 'contains(picture)': False, 'contains(did)': False, 'contains(science)': False, 'contains())': True, 'contains(lives)': False, 'contains(animals)': False, 
'contains($)': False, 'contains(speed)': False, 'contains(just)': True, 'contains(tarantino)': False, 'contains(extreme)': False, 'contains(trust)': False, 'contains(involves)': True, 'contains(cheesy)': False, 'contains(there)': True, 'contains(especially)': True, 'contains(smart)': False, 'contains(spends)': False, 'contains(spice)': False, 'contains(dennis)': False, 'contains(rather)': False, 'contains(actions)': False, 'contains(sometimes)': False, 'contains(funniest)': False, 'contains(play)': False, 'contains(experience)': False, 'contains(constantly)': False, 'contains(dad)': False, 'contains(genre)': False, 'contains(usual)': False, 'contains(ultimate)': False, 'contains(weak)': False, 'contains(sympathetic)': False, 'contains(goofy)': False, 'contains(miller)': False, 'contains(woody)': False, 'contains(;)': False, 'contains(him)': True, 'contains(talking)': False, 'contains(public)': False, 'contains(twice)': False, 'contains(disney)': False, 'contains(onto)': False, 'contains(middle)': False, 'contains(been)': False, 'contains(dying)': False, 'contains(brown)': False, 'contains(strike)': False, 'contains(ghost)': False, 'contains(rescue)': False, 'contains(de)': False, 'contains(flynt)': False, 'contains(court)': False, 'contains(critic)': False, 'contains(side)': False, 'contains(shots)': False, 'contains(notice)': False, 'contains(virtually)': False, 'contains(lucky)': False, 'contains(element)': False, 'contains(images)': False, 'contains(jail)': True, 'contains(superb)': False, 'contains(could)': False, 'contains(end)': False, 'contains(performance)': False, 'contains(cameo)': False, 'contains(us)': True, 'contains(made)': False, 'contains(virus)': False, 'contains(featuring)': False, 'contains(group)': False, 'contains(understand)': False, 'contains(fail)': False, 'contains(wayne)': False, 'contains(content)': False, 'contains(comes)': False, 'contains(were)': False, 'contains(t)': False, 'contains(process)': False, 'contains(deserves)': False, 
'contains(price)': False, 'contains(her)': False, 'contains(english)': False, 'contains(jane)': False, 'contains(change)': False, 'contains(critics)': False, 'contains(j)': False, 'contains(intelligence)': False, 'contains(people)': False, 'contains(news)': False, 'contains(camera)': False, 'contains(get)': True, 'contains(actually)': True, 'contains(acted)': False, 'contains(emotionally)': False, 'contains(jeff)': False, 'contains(changed)': False, 'contains(horrible)': False, 'contains(scream)': True, 'contains(survive)': False, 'contains(remember)': False, 'contains(head)': False, 'contains(teacher)': False, 'contains(urban)': False, 'contains(certainly)': False, 'contains(mention)': False, 'contains(=)': False, 'contains(ted)': False, 'contains(bus)': False, 'contains(they)': True, 'contains(dumb)': False, 'contains(marriage)': False, 'contains(terms)': False, 'contains(re)': True, 'contains(hair)': False, 'contains(shooting)': False, 'contains(keeping)': False, 'contains(lacks)': False, 'contains(self)': False, 'contains(physical)': True, 'contains(direct)': False, 'contains(okay)': False, 'contains(gets)': True, 'contains(if)': True, 'contains(driving)': False, 'contains(wrote)': False, 'contains(odd)': False, 'contains(tarzan)': False, 'contains(led)': False, 'contains(everything)': True, 'contains(stage)': False, 'contains(world)': True, 'contains(three)': False, 'contains(sea)': False, 'contains(position)': False, 'contains(crash)': False, 'contains(test)': False, 'contains(attention)': False, 'contains(development)': False, 'contains(determined)': False, 'contains(considering)': False, 'contains(have)': True, 'contains(laugh)': True, 'contains(likes)': False, 'contains(substance)': False, 'contains(an)': True, 'contains(born)': False, 'contains(sister)': False, 'contains(material)': False, 'contains(die)': False, 'contains(1)': False, 'contains(points)': False, 'contains(needs)': False, 'contains(date)': False, 'contains(how)': True, 'contains(couple)': 
False, 'contains(high)': False, 'contains(jedi)': False, 'contains(scorsese)': False, 'contains(actor)': False, 'contains(owner)': False, 'contains(devil)': False, 'contains(ultimately)': False, 'contains(christopher)': False, 'contains(20)': False, 'contains(feature)': False, 'contains(tension)': False, 'contains(boyfriend)': False, 'contains(lose)': False, 'contains(billy)': False, 'contains(successful)': False, 'contains(x)': False, 'contains(falling)': False, 'contains(mediocre)': False, 'contains(moment)': False, 'contains(information)': False, 'contains(right)': False, "contains(')": True, 'contains(sight)': False, 'contains(supporting)': False, 'contains(aren)': False, 'contains(frank)': False, 'contains(none)': False, 'contains(five)': False, 'contains(wit)': False, 'contains(early)': True, 'contains(d)': False, 'contains(followed)': False, 'contains(become)': False, 'contains(alien)': False, 'contains(telling)': False, 'contains(want)': False, 'contains(issues)': False, 'contains(1998)': False, 'contains(girls)': False, 'contains(king)': False, 'contains(treat)': False, 'contains(said)': False, 'contains(focus)': False, 'contains(famous)': False, 'contains(worked)': False, 'contains(secret)': False, 'contains(presented)': False, 'contains(soul)': False, 'contains(fighting)': False, 'contains(disaster)': False, 'contains(further)': False, 'contains(model)': False, 'contains(tries)': True, 'contains(ed)': False, 'contains(fat)': False, 'contains(touch)': False, 'contains(earth)': False, 'contains(follows)': False, 'contains(post)': False, 'contains(hate)': False, 'contains(remarkable)': False, 'contains(within)': False, 'contains(she)': True, 'contains(dies)': False, 'contains(towards)': False, 'contains(pointless)': False, 'contains(emotion)': False, 'contains(deal)': False, 'contains(seeing)': True, 'contains(keep)': False, 'contains(keeps)': True, 'contains(theme)': False, 'contains(woods)': False, 'contains(sound)': False, 'contains(little)': True, 
'contains(sorry)': False, 'contains(came)': False, 'contains(detail)': False, 'contains(buy)': False, 'contains(bit)': False, 'contains(studio)': False, 'contains(doesn)': False, 'contains(often)': True, 'contains(indeed)': False, 'contains(beginning)': False, 'contains(struggle)': False, 'contains(000)': False, 'contains(pieces)': False, 'contains(bunch)': False, 'contains(saying)': False, 'contains(concept)': False, 'contains(soldiers)': False, 'contains(humorous)': False, 'contains(comedic)': False, 'contains(peter)': False, 'contains(includes)': False, 'contains(personality)': False, 'contains(question)': False, 'contains(joe)': False, 'contains(l)': False, 'contains(home)': False, 'contains(ll)': False, 'contains(constant)': False, 'contains(year)': False, 'contains(except)': False, 'contains(trouble)': True, 'contains(guess)': False, 'contains(award)': False, 'contains(poor)': False, 'contains(impressive)': False, 'contains(cast)': False, 'contains(teen)': False, 'contains(filmmaking)': False, 'contains(catch)': False, 'contains(basically)': False, 'contains(whole)': False, 'contains(u)': False, 'contains(sadly)': False, 'contains(decade)': False, 'contains(hidden)': False, 'contains(folks)': False, 'contains(however)': True, 'contains(animal)': False, 'contains(style)': False, 'contains(surprised)': False, 'contains(admit)': False, 'contains(project)': False, 'contains(setting)': False, 'contains(dreams)': False, 'contains(recently)': False, 'contains(eccentric)': False, 'contains(cheap)': False, 'contains(graphic)': False, 'contains(had)': False, 'contains(actress)': False, 'contains(steve)': False, 'contains(shallow)': False, 'contains(jackie)': True, 'contains(girlfriend)': True, 'contains(ended)': False, 'contains(drawn)': False, 'contains(perfect)': False, 'contains(water)': False, 'contains(princess)': False, 'contains(started)': False, 'contains(accident)': False, 'contains(ups)': False, 'contains(comic)': False, 'contains(lies)': False, 
'contains(adds)': False, 'contains(sense)': False, 'contains(about)': True, 'contains(count)': False, 'contains(killing)': False, 'contains(contains)': False, 'contains(wasn)': False, 'contains(island)': False, 'contains(kills)': False, 'contains(younger)': False, 'contains(hong)': True, 'contains(then)': True, 'contains(second)': False, 'contains(directly)': False, 'contains(store)': False, 'contains(using)': False, 'contains(times)': False, 'contains(drugs)': False, 'contains(interesting)': False, 'contains(silent)': False, 'contains(contact)': False, 'contains(your)': False, 'contains(black)': False, 'contains(watched)': False, 'contains(robert)': False, 'contains(living)': False, 'contains(promise)': False, 'contains(instead)': False, 'contains(dude)': False, 'contains(adaptation)': False, 'contains(third)': True, 'contains(likable)': False, 'contains(confusing)': False, 'contains(limited)': False, 'contains(are)': True, 'contains(example)': False, 'contains(was)': False, 'contains(doctor)': False, 'contains(song)': False, 'contains(hot)': False, 'contains(device)': False, 'contains(social)': False, 'contains(originally)': False, 'contains(thin)': False, 'contains(rocky)': False, 'contains(death)': False, 'contains(identity)': False, 'contains(totally)': False, 'contains(mulan)': False, 'contains(total)': False, 'contains(exist)': False, 'contains(hard)': False, 'contains(see)': False, 'contains(perfectly)': False, 'contains(lines)': False, 'contains(carter)': False, 'contains(subject)': False, 'contains(audiences)': False, 'contains(player)': False, 'contains(rarely)': False, 'contains(incredible)': False, 'contains(plain)': False, 'contains(edward)': False, 'contains(popular)': False, 'contains(vampires)': False, 'contains(community)': False, 'contains(")': True, 'contains(natural)': False, 'contains(cute)': False, 'contains(art)': False, 'contains(-)': True, 'contains(walk)': False, 'contains(spent)': False, 'contains(street)': False, 'contains(meaning)': 
False, 'contains(wide)': False, 'contains(rising)': False, 'contains(etc)': False, 'contains(merely)': False, 'contains(opens)': False, 'contains(soon)': False, 'contains(always)': False, 'contains(seven)': False, 'contains(g)': False, 'contains(officer)': False, 'contains(standard)': False, 'contains(actors)': False, 'contains(accent)': False, 'contains(present)': False, 'contains(themselves)': False, 'contains(happen)': False, 'contains(two)': True, 'contains(martial)': False, 'contains(male)': False, 'contains(introduced)': False, 'contains(appreciate)': False, 'contains(williamson)': False, 'contains(starring)': False, 'contains(when)': True, 'contains(driver)': False, 'contains(rating)': False, 'contains(baby)': False, 'contains(during)': False, 'contains(sean)': False, 'contains(of)': True, 'contains(general)': False, 'contains(leaving)': False, 'contains(scene)': True, 'contains(deliver)': False, 'contains(r)': False, 'contains(makes)': False, 'contains(tough)': False, 'contains(hold)': False, 'contains(individual)': False, 'contains(liked)': False, 'contains(remake)': False, 'contains(bill)': False, 'contains(good)': False, 'contains(order)': False, 'contains(dealing)': False, 'contains(ones)': False, 'contains(include)': False, 'contains(effect)': False, 'contains(meant)': False, 'contains(writer)': False, 'contains(succeeds)': False, 'contains(helps)': False, 'contains(matter)': False, 'contains(typical)': False, 'contains(radio)': False, 'contains(villains)': False, 'contains(sent)': False, 'contains(sexy)': False, 'contains(happens)': False, 'contains(ago)': False, 'contains(roles)': False, 'contains(bloody)': False, 'contains(body)': False, 'contains(later)': False, 'contains(war)': False, 'contains(?)': False, 'contains(clich)': False, 'contains(game)': False, 'contains(and)': True, 'contains(soundtrack)': False, 'contains(moving)': True, 'contains(human)': False, 'contains(becomes)': False, 'contains(spend)': False, 'contains(speak)': False, 
'contains(town)': False, 'contains(carrey)': False, 'contains(allows)': False, 'contains(pair)': False, 'contains(attitude)': False, 'contains(crew)': False, 'contains(created)': False, 'contains(definitely)': False, 'contains(wonder)': False, 'contains(ends)': False, 'contains(likely)': False, 'contains(difficult)': True, 'contains(thriller)': False, 'contains(father)': False, 'contains(we)': False, 'contains(rules)': False, 'contains(office)': False, 'contains(easy)': False, 'contains(women)': False, 'contains(impression)': False, 'contains(elizabeth)': False, 'contains(haven)': False, 'contains(bizarre)': False, 'contains(room)': False, 'contains(superior)': False, 'contains(simple)': False, 'contains(pg)': False, 'contains(maybe)': False, 'contains(america)': False, 'contains(able)': False, 'contains(please)': False, 'contains(that)': True, 'contains(couldn)': False, 'contains(though)': False, 'contains(parents)': False, 'contains(lover)': False, 'contains(allen)': False, 'contains(bond)': False, 'contains(mystery)': False, 'contains(last)': False, 'contains(anti)': False, 'contains(believes)': False, 'contains(4)': False, 'contains(no)': False, 'contains(fails)': False, 'contains(here)': True, 'contains(dangerous)': True, 'contains(boat)': False, 'contains(wasted)': False, 'contains(start)': True, 'contains(dance)': False, 'contains(imagination)': False, 'contains(heart)': False, 'contains(hanks)': False, 'contains(convincing)': False, 'contains(i)': False, 'contains(huge)': False, 'contains(fate)': False, 'contains(generation)': False, 'contains(dialogue)': False, 'contains(young)': False, 'contains(effective)': False, 'contains(gangster)': True, 'contains(biggest)': False, 'contains(waste)': False, 'contains(jerry)': False, 'contains(cash)': False, 'contains(van)': False, 'contains(need)': False, 'contains(sam)': False, 'contains(animation)': False, 'contains(gags)': False, 'contains(involved)': False, 'contains(begins)': False, 'contains(effort)': False, 
'contains(wouldn)': False, 'contains(theater)': False, 'contains(enjoyed)': False, 'contains(release)': False, 'contains(austin)': False, 'contains(1997)': False, 'contains(nick)': False, 'contains(mars)': False, 'contains(stands)': False, 'contains(step)': False, 'contains(bar)': False, 'contains(still)': False, 'contains(success)': False, 'contains(unlike)': False, 'contains(job)': False, 'contains(plan)': False, 'contains(minor)': False, 'contains(seemed)': False, 'contains(reasons)': False, 'contains(season)': False, 'contains(guns)': False, 'contains(almost)': False, 'contains(cage)': False, 'contains(series)': True, 'contains(many)': True, 'contains(member)': False, 'contains(depth)': False, 'contains(straight)': False, 'contains(strength)': False, 'contains(or)': False, 'contains(offer)': False, 'contains(stone)': False, 'contains(nor)': False, 'contains(atmosphere)': False, 'contains(anyway)': False, 'contains(pure)': False, 'contains(intelligent)': False, 'contains(footage)': False, 'contains(myers)': False, 'contains(dead)': False, 'contains(men)': False, 'contains(gang)': False, 'contains(surprisingly)': False, 'contains(surprise)': False, 'contains(going)': False, 'contains(share)': False, 'contains(unique)': True, 'contains(view)': False, 'contains(fly)': False, 'contains(edge)': False, 'contains(government)': False, 'contains(anything)': False, 'contains(humans)': False, 'contains(female)': False, 'contains(known)': True, 'contains(for)': True, 'contains(oscar)': False, 'contains(viewing)': False, 'contains(things)': True, 'contains(generated)': False, 'contains(wild)': False, 'contains(trip)': False, 'contains(adventure)': False, 'contains(horror)': False, 'contains(felt)': False, 'contains(added)': False, 'contains(bring)': False, 'contains(night)': False, 'contains(names)': False, 'contains(woo)': False, 'contains(purpose)': False, 'contains(personal)': False, 'contains(like)': True, 'contains(together)': False, 'contains(])': False, 
'contains(hear)': False, 'contains(ill)': False, 'contains(beautiful)': False, 'contains(adults)': False, 'contains(movie)': True, 'contains(ground)': False, 'contains(!)': True, 'contains(30)': False, 'contains(bored)': False, 'contains(great)': True, 'contains(poorly)': False, 'contains(themes)': False, 'contains(thanks)': False, 'contains(menace)': False, 'contains(difference)': False, 'contains(catherine)': False, 'contains(decision)': False, 'contains(club)': False, 'contains(pull)': False, 'contains(saving)': False, 'contains(fully)': False, 'contains(meets)': False, 'contains(available)': False, 'contains(questions)': False, 'contains(far)': False, 'contains(thomas)': False, 'contains(truman)': False, 'contains(rent)': False, 'contains(director)': False, 'contains(surprising)': False, 'contains(heard)': False, 'contains(fascinating)': False, 'contains(naked)': False, 'contains(career)': False, 'contains(somehow)': False, 'contains(skills)': False, 'contains(90)': False, 'contains(mrs)': False, 'contains(may)': False, 'contains(british)': False, 'contains(until)': False, 'contains(ideas)': False, 'contains(man)': False, 'contains(mel)': False, 'contains(desperate)': False, 'contains(bland)': False, 'contains(house)': False, 'contains(used)': True, 'contains(unless)': False, 'contains(goal)': False, 'contains(itself)': False, 'contains(period)': False, 'contains(protagonist)': False, 'contains(incredibly)': False, 'contains(trailer)': False, 'contains(machine)': False, 'contains(use)': False, 'contains(favorite)': False, 'contains(seat)': False, 'contains(lead)': True, 'contains(imagine)': False, 'contains(9)': False, 'contains(those)': False, 'contains(acting)': False, 'contains(united)': False, 'contains(cliches)': False, 'contains(look)': True, 'contains(clever)': False, 'contains(fine)': False, 'contains(parts)': False, 'contains(their)': False, 'contains(hardly)': False, 'contains(rare)': False, 'contains(big)': False, 'contains(proves)': False, 
'contains(tom)': False, 'contains(amusing)': False, 'contains(realizes)': False, 'contains(let)': False, 'contains(cruise)': False, 'contains(boy)': False, 'contains(join)': False, 'contains(subtle)': False, 'contains(wise)': False, 'contains(coming)': False, 'contains(blair)': False, 'contains(well)': True, 'contains(brothers)': False, 'contains(players)': False, 'contains(twenty)': False, 'contains(similar)': False, 'contains(brother)': False, 'contains(quality)': False, 'contains(cops)': False, 'contains(crystal)': False, 'contains(directed)': False, 'contains(frame)': False, 'contains(mission)': False, 'contains(language)': False, 'contains(short)': False, 'contains(jennifer)': False, 'contains(box)': False, 'contains(missed)': False, 'contains(myself)': False, 'contains(inside)': False, 'contains(kids)': False, 'contains(batman)': False, 'contains(doubt)': False, 'contains(thinking)': False, 'contains(done)': False, 'contains(eventually)': False, 'contains(songs)': False, 'contains(to)': True, 'contains(talented)': False, 'contains(seriously)': False, 'contains(broken)': False, 'contains(happened)': False, 'contains(built)': False, 'contains(might)': False, 'contains(go)': False, 'contains(frightening)': False, 'contains(appearance)': False, 'contains(damon)': False, 'contains(french)': False, 'contains(around)': False, 'contains(all)': True, 'contains(place)': True, 'contains(leaves)': False, 'contains(nowhere)': False, 'contains(choice)': False, 'contains(directors)': False, 'contains(monster)': False, 'contains(student)': False, 'contains(bug)': False, 'contains(mysterious)': False, 'contains(realistic)': False, 'contains(special)': False, 'contains(fan)': False, 'contains(attempt)': False, 'contains(elements)': False, 'contains(production)': False, 'contains(doing)': False, 'contains(creature)': False, 'contains(seemingly)': False, 'contains(although)': False, 'contains(gave)': False, 'contains(fame)': False, 'contains(opinion)': False, 'contains(rate)': 
False, 'contains(bad)': False, 'contains(troopers)': False, 'contains(score)': False, 'contains(eyes)': False, 'contains(beast)': False, 'contains(charles)': False, 'contains(wanted)': False, 'contains(helen)': False, 'contains(terrible)': False, 'contains(harry)': False, 'contains(turns)': False, 'contains(uses)': False, 'contains(brilliant)': False, 'contains(others)': True, 'contains(damme)': False, 'contains(douglas)': False, 'contains(amazing)': False, 'contains(reach)': False, 'contains(history)': False, 'contains(top)': True, 'contains(wonderful)': False, 'contains(witch)': False, 'contains(son)': False, 'contains(quite)': False, 'contains(thrown)': False, 'contains(direction)': False, 'contains(free)': False, 'contains(true)': False, 'contains(forces)': False, 'contains(jr)': False, 'contains(jim)': False, 'contains(upon)': False, 'contains(screenwriter)': False, 'contains(train)': True, 'contains(cover)': False, 'contains(crap)': False, 'contains(along)': True, 'contains(files)': False, 'contains(guy)': True, 'contains(guilty)': False, 'contains(enough)': False, 'contains(detective)': False, 'contains(background)': False, 'contains(race)': False, 'contains(matthew)': False, 'contains(patch)': False, 'contains(puts)': False, 'contains(deep)': True, 'contains(double)': False, 'contains(sex)': False, 'contains(loving)': False, 'contains(numerous)': False, 'contains(scale)': False, 'contains(would)': False, 'contains(fair)': False, 'contains(willis)': False, 'contains(pictures)': False, 'contains(+)': False, 'contains(apparent)': False, 'contains(utterly)': False, 'contains(wall)': False, 'contains(shock)': False, 'contains(discovers)': False, 'contains(watching)': False, 'contains(sequence)': False, 'contains(appropriate)': False, 'contains(day)': True, 'contains(wrong)': True, 'contains(power)': False, 'contains(american)': False, 'contains(behind)': False, 'contains(realized)': False, 'contains(states)': False, 'contains(than)': False, 'contains(center)': 
False, 'contains(aspect)': False, 'contains(ice)': False, 'contains(john)': False, 'contains(laughable)': False, 'contains(victim)': False, 'contains(low)': False, 'contains(mostly)': False, 'contains(never)': True, 'contains(provided)': False, 'contains(`)': False, 'contains(major)': False, 'contains(seconds)': False, 'contains(joke)': False, 'contains(takes)': False, 'contains(problems)': True, 'contains(episode)': False, 'contains(shoot)': False, 'contains(scenes)': False, 'contains(our)': False, 'contains(baldwin)': False, 'contains(legend)': False, 'contains(large)': False, 'contains(drive)': False, 'contains(liners)': False, 'contains(ahead)': False, 'contains(rob)': False, 'contains(voice)': False, 'contains(eight)': True, 'contains(tells)': False, 'contains(singer)': False, 'contains(actual)': False, 'contains(titanic)': False, 'contains(subplot)': False, 'contains(narrative)': False, 'contains(beyond)': False, 'contains(president)': False, 'contains(e)': False, 'contains(the)': True, 'contains(getting)': True, 'contains(performances)': False, 'contains(murder)': False, 'contains(produced)': False, 'contains(falls)': False, 'contains(campbell)': False, 'contains(something)': False, 'contains(alive)': False, 'contains(s)': True, 'contains(matt)': False, 'contains(jack)': False, 'contains(target)': False, 'contains(know)': False, 'contains(fbi)': False, 'contains(nights)': False, 'contains(faces)': False, 'contains(across)': False, 'contains(every)': False, 'contains(forever)': False, 'contains(sci)': False, 'contains(into)': True, 'contains(horse)': False, 'contains(more)': False, 'contains(off)': False, 'contains(jimmy)': False, 'contains(ever)': True, 'contains(supposed)': False, 'contains(goes)': False, 'contains(brings)': False, 'contains(forced)': False, 'contains(by)': True, 'contains(sounds)': False, 'contains(pop)': False, 'contains(satire)': False, 'contains(mood)': False, 'contains(interest)': False, 'contains(equally)': False, 'contains(decided)': 
False, 'contains(boss)': True, 'contains(winning)': False, 'contains(grant)': False, 'contains(audience)': False, 'contains(vegas)': False, 'contains(winner)': False, 'contains(knows)': False, 'contains(works)': False, 'contains(television)': False, 'contains(thing)': False, 'contains(travolta)': False, 'contains(tale)': False, 'contains(took)': False, 'contains(also)': True, 'contains(note)': False, 'contains(different)': False, 'contains(caught)': False, 'contains(asks)': False, 'contains(ben)': False, 'contains(any)': False, 'contains(much)': False, 'contains(involving)': False, 'contains(acts)': False, 'contains(stock)': False, 'contains(compared)': False, 'contains(control)': False, 'contains(needed)': False, 'contains(obvious)': False, 'contains(()': True, 'contains(talent)': False, 'contains(least)': True, 'contains(reeves)': False, 'contains(forward)': False, 'contains(completely)': False, 'contains(because)': False, 'contains(land)': False, 'contains(features)': True, 'contains(yes)': False, 'contains(violence)': False, 'contains(tv)': False, 'contains(mary)': False, 'contains(fox)': False, 'contains(developed)': False, 'contains(lynch)': False, 'contains(respect)': False, 'contains(hunting)': False, 'contains(lack)': False, 'contains(result)': False, 'contains(woman)': True, 'contains(red)': False, 'contains(guys)': False, 'contains(13)': False, 'contains(hell)': False, 'contains(field)': True, 'contains(max)': False, 'contains(ex)': False, 'contains(ugly)': False, 'contains(fight)': True, 'contains(role)': False, 'contains(o)': False, 'contains(explained)': False, 'contains(waiting)': False, 'contains(inspired)': False, 'contains(ridiculous)': False, 'contains(co)': False, 'contains(create)': False, 'contains(revenge)': False, 'contains(reality)': False, 'contains(nicely)': False, 'contains(entire)': False, 'contains(details)': False, 'contains(stars)': False, 'contains(lost)': False, 'contains(found)': False, 'contains(&)': False, 'contains(grows)': 
False, 'contains(twist)': False, 'contains(line)': False, 'contains(dr)': False, 'contains(rush)': False, 'contains(york)': False, 'contains(fast)': True, 'contains(brain)': False, 'contains(fear)': False, 'contains(surprises)': False, 'contains(heavy)': False, 'contains(moral)': False, 'contains(starts)': False, 'contains(outstanding)': False, 'contains(larry)': False, 'contains(seagal)': False, 'contains(hope)': False, 'contains(tone)': False, 'contains(nearly)': False, 'contains(time)': False, 'contains(films)': False, 'contains(lee)': False, 'contains(kind)': True, 'contains(humanity)': False, 'contains(toward)': False, 'contains(list)': False, 'contains(kelly)': False, 'contains(including)': False, 'contains(prison)': False, 'contains(money)': False, 'contains(chance)': False, 'contains(boring)': False, 'contains(potential)': False, 'contains(agent)': False, 'contains(must)': False, 'contains(shot)': False, 'contains(already)': False, 'contains(longer)': False, 'contains(godzilla)': False, 'contains(hours)': False, 'contains(single)': False, 'contains(herself)': False, 'contains(but)': True, 'contains(consider)': False, 'contains(case)': False, 'contains(try)': False, 'contains(while)': True, 'contains(since)': False, 'contains(fi)': False, 'contains(laughing)': False, 'contains(support)': False, 'contains(7)': False, 'contains(either)': False, 'contains(quiet)': False, 'contains(throw)': False, 'contains(suppose)': False, 'contains(russell)': False, 'contains(ability)': False, 'contains(:)': True, 'contains(loved)': False, 'contains(lacking)': False, 'contains(mentioned)': False, 'contains(near)': False, 'contains(phone)': False, 'contains(event)': False, 'contains(suspense)': False, 'contains(worthy)': False, 'contains(finally)': False, 'contains(/)': False, 'contains(future)': False, 'contains(sad)': False, 'contains(heads)': False, 'contains(walking)': False, 'contains(several)': False, 'contains(masterpiece)': False, 'contains(message)': False, 
'contains(pick)': False, 'contains(members)': False, 'contains(final)': False, 'contains(attack)': False, 'contains(entertaining)': False, 'contains(probably)': False, 'contains(filmmakers)': False, 'contains(killed)': False, 'contains(nasty)': False, 'contains(where)': True, 'contains(tell)': False, 'contains(speaking)': False, 'contains(opposite)': False, 'contains(now)': False, 'contains(novel)': False, 'contains(roger)': False, 'contains(danny)': False, 'contains(hopkins)': False, 'contains(comparison)': False, 'contains(really)': False, 'contains(ending)': False, 'contains(only)': True, 'contains(anthony)': False, 'contains(cares)': False, 'contains(again)': False, 'contains(stuck)': False, 'contains(above)': False, 'contains(work)': False, 'contains(alan)': False, 'contains(c)': False, 'contains(haunting)': False, 'contains(steven)': False, 'contains(cannot)': False, 'contains(extremely)': False, 'contains(unfunny)': False, 'contains(fake)': False, 'contains(write)': False, 'contains(results)': False, 'contains(wars)': False, 'contains(has)': True, 'contains(musical)': False, 'contains(thoroughly)': False, 'contains(working)': False, 'contains(truly)': False, 'contains(character)': False, 'contains(story)': False, 'contains(bright)': False, 'contains(friend)': False, 'contains(plot)': True, 'contains(answer)': False, 'contains(latest)': False, 'contains(enjoy)': False, 'contains(say)': False, 'contains(as)': True, 'contains(loud)': False, 'contains(meet)': False, 'contains(learns)': False, 'contains(toy)': False, 'contains(la)': False, 'contains(rock)': False, 'contains(intriguing)': False, 'contains(cinematic)': False, 'contains(believe)': False, 'contains(add)': False, 'contains(2)': False, 'contains(watch)': True, 'contains(due)': False, 'contains(arnold)': True, 'contains(jay)': False, 'contains(image)': False, 'contains(planet)': False, 'contains(cause)': False, 'contains(appeal)': False, 'contains(out)': True, 'contains(party)': False, 'contains(lame)': 
False, 'contains(considered)': False, 'contains(bruce)': False, 'contains(complex)': False, 'contains(producer)': False, 'contains(feel)': False, 'contains(think)': False, 'contains(board)': False, 'contains(chan)': True, 'contains(excellent)': False, 'contains(live)': False, 'contains(humor)': False, 'contains(shakespeare)': False, 'contains(barely)': False, 'contains(very)': True, 'contains(fantastic)': False, 'contains(road)': False, 'contains(hospital)': False, 'contains(amount)': False, 'contains(military)': False, 'contains(children)': False, 'contains(moves)': False, 'contains(ve)': False, 'contains(ryan)': False, 'contains(form)': False, 'contains(missing)': False, 'contains(asked)': False, 'contains(name)': False, 'contains(park)': False, 'contains(ms)': False, 'contains(driven)': False, 'contains(child)': False, 'contains(confused)': False, 'contains(suspect)': False, 'contains(wedding)': False, 'contains(romantic)': False, 'contains(moments)': False, 'contains(hands)': False, 'contains(cool)': False, 'contains(latter)': False, 'contains(average)': False, 'contains(costumes)': False, 'contains(help)': True, 'contains(following)': False, 'contains(am)': False, 'contains(wonderfully)': False, 'contains(white)': False, 'contains(memorable)': False, 'contains(richard)': False, 'contains(era)': False, 'contains(another)': False, 'contains(seems)': False, 'contains(annie)': False, 'contains(managed)': False, 'contains(knew)': False, 'contains(should)': False, 'contains(worst)': False, 'contains(break)': False, 'contains(genuine)': False, 'contains(wish)': False, 'contains(once)': True, 'contains(figures)': False, 'contains(cinematography)': False, 'contains(being)': False, 'contains(joan)': False, 'contains(relationships)': False, 'contains(producers)': False, 'contains(says)': False, 'contains(michael)': False, 'contains(floor)': False, 'contains(provides)': False, 'contains(--)': True, 'contains(beauty)': False, 'contains(mr)': False, 'contains(today)': True, 
'contains(remains)': False, 'contains(these)': False, 'contains(wait)': False, 'contains(carpenter)': False, 'contains(school)': False, 'contains(thus)': False, 'contains(talents)': False, 'contains(adams)': False, 'contains(paced)': False, 'contains(kevin)': True, 'contains(face)': False, 'contains(.)': True, 'contains(expected)': False, 'contains(evil)': False, 'contains(technical)': False, 'contains(speech)': False, 'contains(gold)': False, 'contains(began)': False, 'contains(in)': True, 'contains(hasn)': False, 'contains(energy)': False, 'contains(8)': False, 'contains(mob)': False, 'contains(finds)': False, 'contains(on)': True, 'contains(real)': False, 'contains(1999)': False, 'contains(trying)': False, 'contains(action)': True, 'contains(at)': False, 'contains(sees)': True, 'contains(usually)': False, 'contains(come)': False, 'contains(building)': False, 'contains(months)': False, 'contains(faith)': False, 'contains(finding)': False, 'contains(snake)': False, 'contains(century)': False, 'contains(aliens)': False, 'contains(climax)': True, 'contains(realize)': False, 'contains(ass)': True, 'contains(past)': False, 'contains(word)': False, 'contains(feels)': False, 'contains(impossible)': False, 'contains(movies)': True, 'contains(eddie)': False, 'contains(fantasy)': False, 'contains(sheer)': False, 'contains(car)': False, 'contains(thinks)': False, 'contains(julia)': False, 'contains(miss)': True, 'contains(team)': False, 'contains(obviously)': False, 'contains(technology)': False, 'contains(decent)': False, 'contains(yeah)': False, 'contains(both)': False, 'contains(chosen)': False, 'contains(told)': False, 'contains(save)': False, 'contains(washington)': False, 'contains(professor)': False, 'contains(god)': False, 'contains(scientist)': False, 'contains(motion)': False, 'contains(cut)': False, 'contains(shown)': False, 'contains(species)': False, 'contains(quickly)': False, 'contains(this)': True, 'contains(you)': True, 'contains(nuclear)': False, 
'contains(won)': False, 'contains(brought)': False, 'contains(witty)': False, 'contains(match)': False, 'contains(fans)': False, 'contains(offers)': False, 'contains(despite)': False, 'contains(put)': False, 'contains(make)': True, 'contains(presence)': False, 'contains(played)': True, 'contains(gibson)': False, 'contains(gone)': False, 'contains(local)': False, 'contains(leader)': False, 'contains(saw)': False, 'contains(kept)': False, 'contains(making)': True, 'contains(sweet)': False, 'contains(presents)': False, 'contains(with)': True, 'contains(basic)': False, 'contains(eye)': False, 'contains(george)': False, 'contains(system)': False, 'contains(crime)': False, 'contains(premise)': False, 'contains(my)': False, 'contains(hour)': False, 'contains(portrayed)': False, 'contains(he)': True, 'contains(manage)': False, 'contains(smile)': False, 'contains(safe)': False, 'contains(henry)': False, 'contains(silly)': False, 'contains(back)': False, 'contains(security)': True, 'contains(ii)': False, 'contains(villain)': False, 'contains(*)': True, 'contains(amy)': False, 'contains(figure)': False, 'contains(entirely)': False, 'contains(west)': False, 'contains(places)': False, 'contains(outside)': False, 'contains(station)': False, 'contains(them)': True, 'contains(plans)': False, 'contains(mean)': False, 'contains(aside)': False, 'contains(running)': False, 'contains(slasher)': False, 'contains(give)': False, 'contains(changes)': False, 'contains(generally)': False, 'contains(blue)': False, 'contains(sure)': False, 'contains(pulp)': False, 'contains(kate)': False, 'contains(some)': False, 'contains(previous)': False, 'contains(powerful)': False, 'contains(window)': False, 'contains(win)': False, 'contains(move)': False, 'contains(will)': True, 'contains(parody)': False, 'contains(brian)': False, 'contains(intense)': False, 'contains(jason)': False, 'contains(nudity)': False, 'contains(girl)': False, 'contains(runs)': False, 'contains(overly)': False, 'contains(next)': 
False, 'contains(stuff)': False, 'contains(tried)': False, 'contains(sort)': False, 'contains(badly)': False, 'contains(throughout)': False, 'contains(animated)': False, 'contains(summer)': False, 'contains(professional)': False, 'contains(julie)': False, 'contains(strong)': False, 'contains(rival)': False, 'contains(care)': False, 'contains(therefore)': False, 'contains(stand)': False, 'contains(genius)': False, 'contains(meanwhile)': False, 'contains(excuse)': False, 'contains(bottom)': False, 'contains(clearly)': False, 'contains(compelling)': False, 'contains(situation)': False, 'contains(spielberg)': False, 'contains(length)': False, 'contains(himself)': False, 'contains(do)': True, 'contains(stop)': True, 'contains(easily)': False, 'contains(better)': False, 'contains(ask)': False, 'contains(read)': False, 'contains(cinema)': False, 'contains(supposedly)': False, 'contains(singing)': False, 'contains(cop)': False, 'contains(believable)': False, 'contains(lady)': False, 'contains(flat)': False, 'contains(interested)': False, 'contains(greatest)': False, 'contains(attempts)': False, 'contains(task)': False, 'contains(chase)': True, 'contains(grand)': False, 'contains(scott)': False, 'contains(charming)': False, 'contains(destroy)': False, 'contains(matrix)': False, 'contains(pace)': False, 'contains(hunt)': False, 'contains(epic)': False, 'contains(tommy)': False, 'contains(hurt)': False, 'contains(placed)': False, 'contains(gay)': False, 'contains(events)': False, 'contains(rated)': False, 'contains(business)': False, 'contains(manages)': False, 'contains(one)': True, 'contains(screenplay)': False, 'contains(bob)': False, 'contains(appears)': False, 'contains(japanese)': False, 'contains(idea)': True, 'contains(avoid)': False, 'contains(crazy)': False, 'contains(color)': False, 'contains(words)': False, 'contains(surface)': False, 'contains(failed)': False, 'contains(loses)': False, 'contains(formula)': False, 'contains(version)': False, 'contains(yet)': 
False, 'contains(possibly)': False, 'contains(minute)': False, 'contains(finale)': False, 'contains(gun)': True, 'contains(blood)': False, 'contains(allow)': False, 'contains(teenagers)': False, 'contains(williams)': False, 'contains(shouldn)': False, 'contains(somewhat)': False, 'contains(lot)': False, 'contains(few)': False, 'contains(problem)': False, 'contains(stuart)': False, 'contains(modern)': False, 'contains(too)': False, 'contains(apparently)': False, 'contains(slowly)': False, 'contains(sick)': False, 'contains(film)': False, 'contains(filled)': False, 'contains(dream)': False, 'contains(each)': True, 'contains(innocent)': False, 'contains(current)': False, 'contains(wants)': False, 'contains(5)': False, 'contains(million)': True, 'contains(heroes)': False, 'contains(slapstick)': False, 'contains(worse)': False, 'contains(chemistry)': True, 'contains(mother)': False, 'contains(force)': False, 'contains(opportunity)': False, 'contains(reveal)': False, 'contains(viewer)': False, 'contains(from)': True, 'contains(serve)': False, 'contains(heaven)': False, 'contains(bringing)': False, 'contains(its)': False, 'contains(delivers)': False, 'contains(failure)': False, 'contains(class)': False, 'contains(rest)': False, 'contains(less)': False, 'contains(returns)': False, 'contains(beat)': False, 'contains(grace)': False, 'contains(ford)': False, 'contains(over)': False, 'contains(brooks)': False, 'contains(expectations)': False, 'contains(porn)': False, 'contains(tim)': False, 'contains(slightly)': False, 'contains(overall)': False, 'contains(up)': False, 'contains(south)': False, 'contains(take)': False, 'contains(hand)': False, 'contains(written)': False, 'contains(tired)': False, 'contains(ship)': False, 'contains(taste)': False, 'contains(late)': False, 'contains(send)': False, 'contains(wife)': True, 'contains(video)': False, 'contains(having)': False, 'contains(based)': False, 'contains(hollywood)': False, 'contains(solid)': False, 'contains(show)': False, 
'contains(ride)': False, 'contains(somewhere)': False, 'contains(point)': False, 'contains(daughter)': False, 'contains(turned)': False, 'contains(hey)': False, 'contains(fun)': True, 'contains(exactly)': False, 'contains(shame)': False, 'contains(build)': False, 'contains(nothing)': False, 'contains(begin)': False, 'contains(6)': False, 'contains(level)': False, 'contains(approach)': False, 'contains(necessary)': False, 'contains(paul)': False, 'contains(sarah)': False, 'contains(pay)': False, 'contains(down)': False, 'contains(beach)': False, 'contains(portrayal)': False, 'contains(can)': False, 'contains(jones)': False, 'contains(didn)': False, 'contains(is)': True, 'contains(apart)': False, 'contains(normal)': False, 'contains(fiction)': False, 'contains(space)': False, 'contains(books)': False, 'contains(becoming)': False, 'contains(feet)': False, 'contains(phantom)': False, 'contains(star)': False, 'contains(rich)': False, 'contains(fare)': False, 'contains(it)': True, 'contains(seem)': False, 'contains(murphy)': False, 'contains(spectacular)': False, 'contains(part)': False, 'contains(mess)': False, 'contains(holds)': False, 'contains(kid)': False, 'contains(creates)': False, 'contains(,)': True, 'contains(army)': False, 'contains(don)': False, 'contains(cameron)': False, 'contains(occasionally)': False, 'contains(best)': True, 'contains(robin)': False, 'contains(laughs)': False, 'contains(yourself)': False, 'contains(sign)': False, 'contains(fault)': False, 'contains(stunning)': False, 'contains(plenty)': False, 'contains(close)': False, 'contains(dimensional)': False, 'contains(pathetic)': False, 'contains(giving)': False, 'contains(conflict)': False, 'contains(journey)': False, 'contains(first)': False, 'contains(simon)': False, 'contains(slow)': True, 'contains(powers)': False, 'contains(most)': True, 'contains(teenage)': False, 'contains(complete)': False, 'contains(characterization)': False, 'contains(sequel)': False, 'contains(theaters)': False, 
'contains(sit)': False, 'contains(away)': False, 'contains(leave)': False, 'contains(me)': True, 'contains(escape)': False, 'contains(former)': False, 'contains(willing)': False, 'contains(key)': True, 'contains(alone)': False, 'contains(original)': False, 'contains(crowd)': False, 'contains(editing)': False, 'contains(fresh)': False, 'contains(situations)': False, 'contains(relief)': False, 'contains(viewers)': False, 'contains(b)': False, 'contains(explain)': False, 'contains(looks)': True, 'contains(aspects)': False, 'contains(seen)': False, 'contains(means)': True, 'contains(four)': False, 'contains(anderson)': False, 'contains(romance)': False, 'contains(fit)': False, 'contains(10)': False, 'contains(opening)': False, 'contains(playing)': True, 'contains(design)': False, 'contains(non)': True, 'contains(nice)': False, 'contains(love)': False, 'contains(leading)': False, 'contains(stay)': False, 'contains(number)': False, 'contains(patrick)': False, 'contains(master)': False, 'contains(six)': False, 'contains(years)': False, 'contains(fashion)': False, 'contains(offensive)': False, 'contains(jean)': False, 'contains(week)': False, 'contains(boys)': False, 'contains(zero)': False, 'contains(friends)': True, 'contains(political)': False, 'contains(serial)': False, 'contains(martin)': False, 'contains(got)': True, 'contains(decides)': False, 'contains(budget)': False, 'contains(casting)': False, 'contains(between)': True, 'contains(quick)': True, 'contains(isn)': False, 'contains(strange)': False, 'contains(media)': False, 'contains(which)': True, 'contains(violent)': False, 'contains(whatever)': False, 'contains(not)': True, 'contains(christmas)': False, 'contains(decide)': False, 'contains(conclusion)': False, 'contains(absolutely)': False, 'contains(company)': False, 'contains(al)': False, 'contains(criminal)': False, 'contains(deals)': False, 'contains(days)': False, 'contains(leads)': False, 'contains(mark)': False, 'contains(kiss)': False, 'contains(dark)': 
False, 'contains(directing)': False, 'contains(central)': False, 'contains(travel)': False, 'contains(visuals)': False, 'contains(someone)': False, 'contains(fact)': False, 'contains(light)': False, 'contains(battle)': False, 'contains(dog)': False, 'contains(arts)': False, 'contains(became)': False, 'contains(storyline)': False, 'contains(burton)': False, 'contains(given)': False, 'contains(enjoyable)': True, 'contains(such)': False, 'contains(find)': False, 'contains(otherwise)': False, 'contains(kill)': False, 'contains(intended)': False, 'contains(credit)': False, 'contains(cold)': False, 'contains(named)': False, 'contains(flick)': False, 'contains(whether)': False, 'contains(camp)': False, 'contains(particularly)': False, 'contains(matters)': False, 'contains(charm)': False, 'contains(angels)': False, 'contains(anyone)': False, 'contains(honest)': False, 'contains(familiar)': False, 'contains(simply)': False, 'contains(piece)': False, 'contains(instance)': False, 'contains(showing)': False, 'contains(comedies)': False, 'contains(turn)': False, 'contains(flying)': False, 'contains(happening)': False, 'contains(food)': True, 'contains(kong)': True, 'contains(industry)': False, 'contains(taylor)': False, 'contains(dramatic)': False, 'contains(ok)': False, 'contains(life)': False, 'contains(mike)': False, 'contains(chinese)': True, 'contains(carry)': False, 'contains(touching)': False, 'contains(green)': False, 'contains(expecting)': False, 'contains(fall)': False, 'contains(filmed)': False, 'contains(ready)': False, 'contains(debut)': False, 'contains(looked)': False, 'contains(pass)': False, 'contains(drama)': False, 'contains(disturbing)': False, 'contains(old)': False, 'contains(characters)': False, 'contains(lawyer)': False, 'contains(captain)': False, 'contains(a)': True, 'contains([)': False, 'contains(his)': True, 'contains(streets)': False, 'contains(funny)': True, 'contains(same)': True, 'contains(thankfully)': False, 'contains(charlie)': False, 
'contains(hall)': False, 'contains(dollars)': False, 'contains(vampire)': False, 'contains(particular)': False, 'contains(spirit)': False, 'contains(against)': False, 'contains(writers)': False, 'contains(ways)': False, 'contains(hotel)': False, 'contains(football)': False, 'contains(prove)': False, 'contains(3)': False, 'contains(emotions)': False, 'contains(recent)': False, 'contains(society)': False, 'contains(reading)': False, 'contains(through)': False, 'contains(left)': False, 'contains(moore)': False, 'contains(twists)': False, 'contains(learn)': False, 'contains(feelings)': False, 'contains(possible)': False, 'contains(highly)': False, 'contains(even)': False, 'contains(awful)': False, 'contains(welcome)': False, 'contains(johnny)': False, 'contains(continues)': False, 'contains(everyone)': False, 'contains(under)': False, 'contains(forgotten)': False, 'contains(does)': False, 'contains(went)': False, 'contains(apartment)': False, 'contains(lots)': False, 'contains(adult)': False, 'contains(else)': False, 'contains(artist)': False, 'contains(damn)': False, 'contains(standing)': True, 'contains(new)': False, 'contains(ten)': False, 'contains(discover)': False, 'contains(queen)': False, 'contains(cult)': False, 'contains(happy)': False, 'contains(own)': True, 'contains(filmmaker)': False}
In [22]:
# Pair the feature dict of every review with its category label.
featuresets = [(document_features(doc), label) for doc, label in documents]
# The first 100 examples are held out for testing; the rest are used for training.
test_set = featuresets[:100]
train_set = featuresets[100:]
classifier = nltk.NaiveBayesClassifier.train(train_set)
In [24]:
# Score the trained model on the held-out set, then list its strongest features.
accuracy = nltk.classify.accuracy(classifier, test_set)
print("Naive Bayes accuracy with 2000 bag of words features is %s" % accuracy)
classifier.show_most_informative_features(n=5)
Naive Bayes accuracy with 2000 bag of words features is 0.83
Most Informative Features
   contains(outstanding) = True              pos : neg    =     11.3 : 1.0
         contains(mulan) = True              pos : neg    =      9.1 : 1.0
        contains(seagal) = True              neg : pos    =      8.1 : 1.0
   contains(wonderfully) = True              pos : neg    =      6.4 : 1.0
         contains(damon) = True              pos : neg    =      6.1 : 1.0
In [25]:
# Same ranking as above, extended to the top 20 features.
classifier.show_most_informative_features(n=20)
Most Informative Features
   contains(outstanding) = True              pos : neg    =     11.3 : 1.0
         contains(mulan) = True              pos : neg    =      9.1 : 1.0
        contains(seagal) = True              neg : pos    =      8.1 : 1.0
   contains(wonderfully) = True              pos : neg    =      6.4 : 1.0
         contains(damon) = True              pos : neg    =      6.1 : 1.0
         contains(flynt) = True              pos : neg    =      5.7 : 1.0
        contains(wasted) = True              neg : pos    =      5.6 : 1.0
         contains(awful) = True              neg : pos    =      5.3 : 1.0
        contains(poorly) = True              neg : pos    =      5.3 : 1.0
          contains(lame) = True              neg : pos    =      5.2 : 1.0
    contains(ridiculous) = True              neg : pos    =      4.9 : 1.0
         contains(waste) = True              neg : pos    =      4.8 : 1.0
           contains(era) = True              pos : neg    =      4.6 : 1.0
        contains(allows) = True              pos : neg    =      4.4 : 1.0
         contains(worst) = True              neg : pos    =      4.4 : 1.0
         contains(bland) = True              neg : pos    =      4.3 : 1.0
     contains(laughable) = True              neg : pos    =      4.1 : 1.0
          contains(mess) = True              neg : pos    =      4.0 : 1.0
     contains(fantastic) = True              pos : neg    =      4.0 : 1.0
          contains(jedi) = True              pos : neg    =      3.9 : 1.0

Back to Part of Speech Tagging

We can consider the task of POS tagging as a classification task and use the classifier methodology described here. Let us revisit the POS tagging task discussed in the first lecture using the new tools we have developed.

In [26]:
from nltk.corpus import brown

# Tally the final 1, 2 and 3 characters of every lower-cased Brown token.
# Note: for tokens shorter than 3 characters the slices coincide, so the same
# key is incremented more than once for that token.
suffix_fdist = nltk.FreqDist()
for token in brown.words():
    lowered = token.lower()
    for length in (1, 2, 3):
        suffix_fdist[lowered[-length:]] += 1
In [27]:
# Keep just the suffix strings of the 100 most frequent endings (counts dropped).
common_suffixes = [entry[0] for entry in suffix_fdist.most_common(100)]
print(common_suffixes)
['e', ',', '.', 's', 'd', 't', 'he', 'n', 'a', 'of', 'the', 'y', 'r', 'to', 'in', 'f', 'o', 'ed', 'nd', 'is', 'on', 'l', 'g', 'and', 'ng', 'er', 'as', 'ing', 'h', 'at', 'es', 'or', 're', 'it', '``', 'an', "''", 'm', ';', 'i', 'ly', 'ion', 'en', 'al', '?', 'nt', 'be', 'hat', 'st', 'his', 'th', 'll', 'le', 'ce', 'by', 'ts', 'me', 've', "'", 'se', 'ut', 'was', 'for', 'ent', 'ch', 'k', 'w', 'ld', '`', 'rs', 'ted', 'ere', 'her', 'ne', 'ns', 'ith', 'ad', 'ry', ')', '(', 'te', '--', 'ay', 'ty', 'ot', 'p', 'nce', "'s", 'ter', 'om', 'ss', ':', 'we', 'are', 'c', 'ers', 'uld', 'had', 'so', 'ey']
In [28]:
def pos_features(word, suffixes=None):
    """Build suffix-indicator features for a single word.

    Args:
        word: The token to describe. It is lower-cased before matching.
        suffixes: Optional iterable of suffix strings to test. When omitted,
            the notebook-level ``common_suffixes`` list is used, which keeps
            existing ``pos_features(word)`` calls working unchanged.

    Returns:
        A dict mapping ``'endswith(<suffix>)'`` to ``True``/``False`` for each
        suffix, in the order the suffixes were given.
    """
    if suffixes is None:
        suffixes = common_suffixes
    # Lower-case once instead of once per suffix (loop-invariant work).
    lowered = word.lower()
    return {'endswith(%s)' % suffix: lowered.endswith(suffix)
            for suffix in suffixes}
In [29]:
# (word, universal POS tag) pairs from the 'news' category of the Brown corpus.
tagged_words = brown.tagged_words(categories='news', tagset='universal')
# Each word becomes a suffix feature dict; its tag is the label to predict.
featuresets = [(pos_features(token), tag) for token, tag in tagged_words]
In [30]:
# Reserve the first 10% of the examples for testing and train on the rest.
size = int(len(featuresets) * 0.1)
test_set = featuresets[:size]
train_set = featuresets[size:]
classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)
Out[30]:
0.7011437095972153
In [31]:
# Extract the suffix features first, then ask the tagger for a label.
cats_features = pos_features('cats')
classifier.classify(cats_features)
Out[31]:
'NOUN'
In [32]:
# Extract the suffix features first, then ask the tagger for a label.
books_features = pos_features('books')
classifier.classify(books_features)
Out[32]:
'NOUN'
In [81]:
# Rank the 20 suffix features the POS classifier relies on most.
classifier.show_most_informative_features(n=20)
Most Informative Features
           endswith(the) = True              DET : NOUN   =   3416.9 : 1.0
             endswith(.) = True                . : ADP    =   2481.6 : 1.0
            endswith(to) = True              PRT : ADJ    =   2138.0 : 1.0
             endswith(f) = True              ADP : VERB   =   2050.5 : 1.0
            endswith(he) = True              DET : NOUN   =   1808.9 : 1.0
           endswith(and) = True             CONJ : ADV    =   1642.0 : 1.0
             endswith(a) = True              DET : VERB   =   1597.1 : 1.0
            endswith(of) = True              ADP : NOUN   =   1406.9 : 1.0
           endswith(his) = True              DET : NOUN   =    728.0 : 1.0
            endswith(ut) = True             CONJ : DET    =    694.7 : 1.0
            endswith(nd) = True             CONJ : NUM    =    636.1 : 1.0
           endswith(hat) = True             PRON : NOUN   =    570.6 : 1.0
            endswith(ey) = True             PRON : VERB   =    549.0 : 1.0
             endswith(i) = True             PRON : ADP    =    547.2 : 1.0
             endswith(') = True                . : VERB   =    503.7 : 1.0
             endswith(o) = True              PRT : ADJ    =    493.4 : 1.0
            endswith(es) = True             NOUN : ADP    =    427.0 : 1.0
           endswith(uld) = True             VERB : NOUN   =    422.5 : 1.0
            endswith(we) = True             PRON : NOUN   =    353.5 : 1.0
           endswith(ted) = True             VERB : NOUN   =    337.9 : 1.0

Testing Different Classifiers

NLTK provides a common interface to different classifier algorithms. This is illustrated in the following examples.

In [33]:
import nltk
# Toy training set: nine (feature dict, label) pairs over three binary features.
train = [
    ({'a': 1, 'b': 1, 'c': 1}, 'y'),
    ({'a': 1, 'b': 1, 'c': 1}, 'x'),
    ({'a': 1, 'b': 1, 'c': 0}, 'y'),
    ({'a': 0, 'b': 1, 'c': 1}, 'x'),
    ({'a': 0, 'b': 1, 'c': 1}, 'y'),
    ({'a': 0, 'b': 0, 'c': 1}, 'y'),
    ({'a': 0, 'b': 1, 'c': 0}, 'x'),
    ({'a': 0, 'b': 0, 'c': 0}, 'x'),
    ({'a': 0, 'b': 1, 'c': 1}, 'y'),
]
# Unlabelled feature dicts to classify.
test = [
    {'a': 1, 'b': 0, 'c': 1},  # unseen
    {'a': 1, 'b': 0, 'c': 0},  # unseen
    {'a': 0, 'b': 1, 'c': 1},  # seen 3 times, labels=y,y,x
    {'a': 0, 'b': 1, 'c': 0},  # seen 1 time, label=x
]

Naive Bayes Classifier

In [34]:
# Fit Naive Bayes on the toy data and list the labels it learned, in sorted order.
nb_class = nltk.classify.NaiveBayesClassifier
classifier = nb_class.train(train)
sorted(classifier.labels())
Out[34]:
['x', 'y']
In [35]:
# Predict one label per feature dict in `test`; the returned list is the cell output.
classifier.classify_many(test)
Out[35]:
['y', 'x', 'y', 'x']
In [36]:
# Print P(x) and P(y) for every test item, one line per item.
for dist in classifier.prob_classify_many(test):
    p_x, p_y = dist.prob('x'), dist.prob('y')
    print('%.4f %.4f' % (p_x, p_y))
0.3203 0.6797
0.5857 0.4143
0.3792 0.6208
0.6470 0.3530
In [37]:
# Print the model's most informative (feature, value) pairs with their label likelihood ratios.
classifier.show_most_informative_features()
Most Informative Features
                       c = 0                   x : y      =      2.0 : 1.0
                       c = 1                   y : x      =      1.5 : 1.0
                       a = 1                   y : x      =      1.4 : 1.0
                       b = 0                   x : y      =      1.2 : 1.0
                       a = 0                   x : y      =      1.2 : 1.0
                       b = 1                   y : x      =      1.1 : 1.0

Decision Tree Classifier

In [38]:
# Fit a decision tree on the same toy data, with both cutoffs set to 0,
# then list the labels it learned in sorted order.
classifier = nltk.classify.DecisionTreeClassifier.train(
    train,
    entropy_cutoff=0,
    support_cutoff=0,
)
sorted(classifier.labels())
Out[38]:
['x', 'y']
In [39]:
# Printing the classifier renders the learned tree as indented text.
print(classifier)
c=0? .................................................. x
  a=0? ................................................ x
  a=1? ................................................ y
c=1? .................................................. y

In [40]:
# Label the same `test` items with the decision tree, for comparison with Naive Bayes.
classifier.classify_many(test)
Out[40]:
['y', 'y', 'y', 'x']

Decision tree classifiers do not support prob_classify() (so there is no probability distribution whose prob() method we could query), as they do not provide a probability interpretation.

Scikit-Learn Classifiers

NLTK provides an interface to the Scikit-learn (sklearn) classifiers - including maximum entropy and SVM.

In [41]:
from nltk.classify import SklearnClassifier
# Labelled examples: (feature counts, label) pairs.
train_data = [
    (dict(a=4, b=1, c=0), "ham"),
    (dict(a=5, b=2, c=1), "ham"),
    (dict(a=0, b=3, c=4), "spam"),
    (dict(a=5, b=1, c=1), "ham"),
    (dict(a=1, b=4, c=3), "spam"),
]
# Unlabelled feature dicts to classify.
test_data = [
    dict(a=3, b=2, c=1),
    dict(a=0, b=3, c=7),
]
In [42]:
from sklearn.naive_bayes import BernoulliNB

# Wrap the sklearn estimator in NLTK's adapter, train it, then label the test items.
bernoulli_wrapper = SklearnClassifier(BernoulliNB())
classif = bernoulli_wrapper.train(train_data)
classif.classify_many(test_data)
Out[42]:
['ham', 'spam']
In [43]:
from sklearn.svm import SVC

# Same procedure with an SVM using default settings; the adapter is given sparse=False.
svc_wrapper = SklearnClassifier(SVC(), sparse=False)
classif = svc_wrapper.train(train_data)
classif.classify_many(test_data)
Out[43]:
['ham', 'spam']
In [44]:
# Using the sklearn classifier directly (no NLTK wrapper):
# four one-dimensional samples, each labelled with its own value.
X = [[value] for value in range(4)]
Y = list(range(4))
clf = SVC(kernel='linear', C=1.0)
clf.fit(X, Y)
Out[44]:
SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
In [45]:
# RBF-kernel SVM wrapped for use with NLTK feature dicts.
rbf_svc = SVC(kernel='rbf', C=1.0)
classifr = SklearnClassifier(rbf_svc, sparse=False).train(train_data)
In [46]:
# Label the two held-out feature dicts with the RBF-kernel SVM.
classifr.classify_many(test_data)
Out[46]:
['ham', 'spam']
In [47]:
from sklearn.svm import LinearSVC

# Linear SVM wrapped for use with NLTK feature dicts.
linear_svc = LinearSVC(C=1.0)
classif_ova = SklearnClassifier(linear_svc, sparse=False).train(train_data)
In [48]:
# Label the two held-out feature dicts with the LinearSVC model.
classif_ova.classify_many(test_data)
Out[48]:
['ham', 'spam']

The key parameter to optimize for a given SVM kernel is the C parameter. Here is example code from sklearn that shows how to optimize C using cross-validation.

In [49]:
%matplotlib inline

import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets, svm

# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed in
# 0.20; `sklearn.model_selection` is its replacement. Fall back to the old
# module so this cell still runs on installations that predate 0.18.
try:
    from sklearn.model_selection import cross_val_score
except ImportError:
    from sklearn import cross_validation
    cross_val_score = cross_validation.cross_val_score

digits = datasets.load_digits()
X = digits.data
y = digits.target

svc = svm.SVC(kernel='linear')
# Ten candidate values of C, log-spaced between 1e-10 and 1.
C_s = np.logspace(-10, 0, 10)

# Mean and standard deviation of the cross-validation scores for each C.
# NOTE(review): the number of folds is left at cross_val_score's default,
# which depends on the installed scikit-learn version.
scores = list()
scores_std = list()
for C in C_s:
    svc.C = C
    this_scores = cross_val_score(svc, X, y, n_jobs=1)
    scores.append(np.mean(this_scores))
    scores_std.append(np.std(this_scores))

# Do the plotting: mean score (solid) with +/- one standard deviation (dashed).
plt.figure(1, figsize=(4, 3))
plt.clf()
plt.semilogx(C_s, scores)
plt.semilogx(C_s, np.array(scores) + np.array(scores_std), 'b--')
plt.semilogx(C_s, np.array(scores) - np.array(scores_std), 'b--')
locs, labels = plt.yticks()
plt.yticks(locs, ["%g" % loc for loc in locs])
plt.ylabel('CV score')
plt.xlabel('Parameter C')
plt.ylim(0, 1.1)
plt.show()
In [ ]: