We apply classification tools to solve NLP tasks.
We start with a very simple task that looks at words in isolation and tries to classify them into 2 labels: gender identification. The task consists of guessing whether a name is masculine or feminine.
The classification method consists of taking as input an observation, turning this observation into a feature vector, then predicting the label of this feature vector by applying a trained classifier model.
To prepare for this procedure, we must train a classifier. In supervised learning, a classifier is learned by generalizing a set of observed pairs (observationi, labeli) where [i = 1..N].
%matplotlib inline
def gender_features(word):
return {'last_letter': word[-1]}
gender_features('Shrek')
{'last_letter': 'k'}
from nltk.corpus import names
labeled_names = ([(name, 'male') for name in names.words('male.txt')] + [(name, 'female') for name in names.words('female.txt')])
import random
random.shuffle(labeled_names)
print("There are %s samples in the dataset." % (len(labeled_names)))
There are 7944 samples in the dataset.
import nltk
featuresets = [(gender_features(n), gender) for (n, gender) in labeled_names]
train_set, test_set = featuresets[500:], featuresets[:500]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print("Neo is classified as %s" % (classifier.classify(gender_features('Neo'))))
print("Trinity is classified as %s" % (classifier.classify(gender_features('Trinity'))))
Neo is classified as male Trinity is classified as female
print(nltk.classify.accuracy(classifier, test_set))
0.788
classifier.show_most_informative_features(5)
Most Informative Features last_letter = 'a' female : male = 34.5 : 1.0 last_letter = 'k' male : female = 31.7 : 1.0 last_letter = 'f' male : female = 15.9 : 1.0 last_letter = 'p' male : female = 11.9 : 1.0 last_letter = 'v' male : female = 9.8 : 1.0
def gender_features2(name):
features = {}
features["first_letter"] = name[0].lower()
features["last_letter"] = name[-1].lower()
for letter in 'abcdefghijklmnopqrstuvwxyz':
features["count(%s)" % letter] = name.lower().count(letter)
features["has(%s)" % letter] = (letter in name.lower())
return features
gender_features2('John')
{'first_letter': 'j', 'last_letter': 'n', 'count(a)': 0, 'has(a)': False, 'count(b)': 0, 'has(b)': False, 'count(c)': 0, 'has(c)': False, 'count(d)': 0, 'has(d)': False, 'count(e)': 0, 'has(e)': False, 'count(f)': 0, 'has(f)': False, 'count(g)': 0, 'has(g)': False, 'count(h)': 1, 'has(h)': True, 'count(i)': 0, 'has(i)': False, 'count(j)': 1, 'has(j)': True, 'count(k)': 0, 'has(k)': False, 'count(l)': 0, 'has(l)': False, 'count(m)': 0, 'has(m)': False, 'count(n)': 1, 'has(n)': True, 'count(o)': 1, 'has(o)': True, 'count(p)': 0, 'has(p)': False, 'count(q)': 0, 'has(q)': False, 'count(r)': 0, 'has(r)': False, 'count(s)': 0, 'has(s)': False, 'count(t)': 0, 'has(t)': False, 'count(u)': 0, 'has(u)': False, 'count(v)': 0, 'has(v)': False, 'count(w)': 0, 'has(w)': False, 'count(x)': 0, 'has(x)': False, 'count(y)': 0, 'has(y)': False, 'count(z)': 0, 'has(z)': False}
featuresets = [(gender_features2(n), gender) for (n, gender) in labeled_names]
train_set, test_set = featuresets[500:], featuresets[:500]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))
0.784
train_names = labeled_names[1500:]
devtest_names = labeled_names[500:1500]
test_names = labeled_names[:500]
train_set = [(gender_features(n), gender) for (n, gender) in train_names]
devtest_set = [(gender_features(n), gender) for (n, gender) in devtest_names]
test_set = [(gender_features(n), gender) for (n, gender) in test_names]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, devtest_set))
0.747
errors = []
for (name, tag) in devtest_names:
guess = classifier.classify(gender_features(name))
if guess != tag:
errors.append( (tag, guess, name) )
for (tag, guess, name) in sorted(errors):
print('correct=%-8s guess=%-8s name=%-30s' % (tag, guess, name))
correct=female guess=male name=Abagail correct=female guess=male name=Abigael correct=female guess=male name=Adriaens correct=female guess=male name=Annabal correct=female guess=male name=Annabel correct=female guess=male name=April correct=female guess=male name=Arden correct=female guess=male name=Arleen correct=female guess=male name=Arlyn correct=female guess=male name=Astrid correct=female guess=male name=Avril correct=female guess=male name=Beatriz correct=female guess=male name=Bell correct=female guess=male name=Brit correct=female guess=male name=Caitrin correct=female guess=male name=Cam correct=female guess=male name=Candis correct=female guess=male name=Carlyn correct=female guess=male name=Carmel correct=female guess=male name=Carol-Jean correct=female guess=male name=Carolynn correct=female guess=male name=Caryn correct=female guess=male name=Cat correct=female guess=male name=Chantal correct=female guess=male name=Charis correct=female guess=male name=Charleen correct=female guess=male name=Charmian correct=female guess=male name=Cher correct=female guess=male name=Chriss correct=female guess=male name=Christan correct=female guess=male name=Cloris correct=female guess=male name=Coleen correct=female guess=male name=Crystal correct=female guess=male name=Cybal correct=female guess=male name=Cybil correct=female guess=male name=Dael correct=female guess=male name=Darb correct=female guess=male name=Daryn correct=female guess=male name=Demeter correct=female guess=male name=Dot correct=female guess=male name=Drew correct=female guess=male name=Emmalyn correct=female guess=male name=Erinn correct=female guess=male name=Esther correct=female guess=male name=Farand correct=female guess=male name=Farrand correct=female guess=male name=Francis correct=female guess=male name=Glen correct=female guess=male name=Glynis correct=female guess=male name=Guendolen correct=female guess=male name=Gwendolin correct=female guess=male name=Harriot correct=female guess=male name=Helyn correct=female guess=male name=Hildegaard correct=female guess=male name=Inger correct=female guess=male name=Ingrid correct=female guess=male name=Iris correct=female guess=male name=Iseabal correct=female guess=male name=Isobel correct=female guess=male name=Jasmin correct=female guess=male name=Jen correct=female guess=male name=Jenn correct=female guess=male name=Jessalyn correct=female guess=male name=Jessamyn correct=female guess=male name=Joell correct=female guess=male name=Jolynn correct=female guess=male name=Jonis correct=female guess=male name=Juliet correct=female guess=male name=Karalynn correct=female guess=male name=Kristal correct=female guess=male name=Kristen correct=female guess=male name=Laural correct=female guess=male name=Leanor correct=female guess=male name=Lian correct=female guess=male name=Lilias correct=female guess=male name=Lillian correct=female guess=male name=Lind correct=female guess=male name=Lorrin correct=female guess=male name=Lurleen correct=female guess=male name=Lust correct=female guess=male name=Mab correct=female guess=male name=Mabel correct=female guess=male name=Madalyn correct=female guess=male name=Magdalen correct=female guess=male name=Manon correct=female guess=male name=Marie-Ann correct=female guess=male name=Mariel correct=female guess=male name=Marigold correct=female guess=male name=Marion correct=female guess=male name=Maryl correct=female guess=male name=Marylin correct=female guess=male name=Marylou correct=female guess=male name=Mercedes correct=female guess=male name=Merilyn correct=female guess=male name=Mildred correct=female guess=male name=Millicent correct=female guess=male name=Morgan correct=female guess=male name=Morgen correct=female guess=male name=Phil correct=female guess=male name=Philis correct=female guess=male name=Phylis correct=female guess=male name=Rahel correct=female guess=male name=Raven correct=female guess=male name=Rhiamon correct=female guess=male name=Robbyn correct=female guess=male name=Rosaleen correct=female guess=male name=Saraann correct=female guess=male name=Shaun correct=female guess=male name=Shell correct=female guess=male name=Sherilyn correct=female guess=male name=Shirleen correct=female guess=male name=Sibel correct=female guess=male name=Sigrid correct=female guess=male name=Sydel correct=female guess=male name=Teryl correct=female guess=male name=Theo correct=female guess=male name=Viv correct=female guess=male name=Wileen correct=female guess=male name=Yoshiko correct=male guess=female name=Aditya correct=male guess=female name=Adolphe correct=male guess=female name=Alex correct=male guess=female name=Alexei correct=male guess=female name=Alfie correct=male guess=female name=Artie correct=male guess=female name=Ashley correct=male guess=female name=Ave correct=male guess=female name=Bailey correct=male guess=female name=Barri correct=male guess=female name=Bary correct=male guess=female name=Berkley correct=male guess=female name=Billie correct=male guess=female name=Billy correct=male guess=female name=Blaine correct=male guess=female name=Brady correct=male guess=female name=Cammy correct=male guess=female name=Cary correct=male guess=female name=Chaddie correct=male guess=female name=Chancey correct=male guess=female name=Chase correct=male guess=female name=Clarke correct=male guess=female name=Clay correct=male guess=female name=Cobbie correct=male guess=female name=Cody correct=male guess=female name=Connolly correct=male guess=female name=Corby correct=male guess=female name=Corky correct=male guess=female name=Davidde correct=male guess=female name=Deane correct=male guess=female name=Dennie correct=male guess=female name=Duane correct=male guess=female name=Durante correct=male guess=female name=Dwaine correct=male guess=female name=Dwayne correct=male guess=female name=Elisha correct=male guess=female name=Elmore correct=male guess=female name=Emmy correct=male guess=female name=Fonsie correct=male guess=female name=Fonzie correct=male guess=female name=Frederich correct=male guess=female name=Gabriele correct=male guess=female name=Garcia correct=male guess=female name=Garey correct=male guess=female name=Garry correct=male guess=female name=Gary correct=male guess=female name=George correct=male guess=female name=Germaine correct=male guess=female name=Gerry correct=male guess=female name=Giovanni correct=male guess=female name=Goose correct=male guess=female name=Hadleigh correct=male guess=female name=Hartley correct=male guess=female name=Heath correct=male guess=female name=Hermy correct=male guess=female name=Hersch correct=male guess=female name=Hewie correct=male guess=female name=Hilary correct=male guess=female name=Hodge correct=male guess=female name=Horace correct=male guess=female name=Howie correct=male guess=female name=Hugh correct=male guess=female name=Jeffie correct=male guess=female name=Jeremie correct=male guess=female name=Jerrome correct=male guess=female name=Jose correct=male guess=female name=Judah correct=male guess=female name=Judith correct=male guess=female name=Kelly correct=male guess=female name=Kyle correct=male guess=female name=Lane correct=male guess=female name=Laurie correct=male guess=female name=Lee correct=male guess=female name=Leroy correct=male guess=female name=Locke correct=male guess=female name=Lonnie correct=male guess=female name=Luce correct=male guess=female name=Martie correct=male guess=female name=Maurie correct=male guess=female name=Maurise correct=male guess=female name=Maxie correct=male guess=female name=Meade correct=male guess=female name=Mike correct=male guess=female name=Mischa correct=male guess=female name=Mitch correct=male guess=female name=Moe correct=male guess=female name=Montague correct=male guess=female name=Monte correct=male guess=female name=Morly correct=male guess=female name=Moshe correct=male guess=female name=Munroe correct=male guess=female name=Mustafa correct=male guess=female name=Noe correct=male guess=female name=Orville correct=male guess=female name=Paige correct=male guess=female name=Parke correct=male guess=female name=Rabbi correct=male guess=female name=Ramsay correct=male guess=female name=Ramsey correct=male guess=female name=Richy correct=male guess=female name=Rickie correct=male guess=female name=Rocky correct=male guess=female name=Rory correct=male guess=female name=Rube correct=male guess=female name=Rutledge correct=male guess=female name=Samuele correct=male guess=female name=Sandy correct=male guess=female name=Sasha correct=male guess=female name=Scarface correct=male guess=female name=Serge correct=male guess=female name=Sheffie correct=male guess=female name=Siddhartha correct=male guess=female name=Skye correct=male guess=female name=Sonny correct=male guess=female name=Spike correct=male guess=female name=Taite correct=male guess=female name=Teddie correct=male guess=female name=Terence correct=male guess=female name=Terry correct=male guess=female name=Thaine correct=male guess=female name=Thorndike correct=male guess=female name=Thornie correct=male guess=female name=Tonnie correct=male guess=female name=Tyrone correct=male guess=female name=Verne correct=male guess=female name=Virgie correct=male guess=female name=Walsh correct=male guess=female name=Ware correct=male guess=female name=Way correct=male guess=female name=Wesley correct=male guess=female name=Yancy correct=male guess=female name=Yehudi correct=male guess=female name=Zane correct=male guess=female name=Zolly
def gender_features(word):
return {'suffix1': word[-1:],
'suffix2': word[-2:]}
train_set = [(gender_features(n), gender) for (n, gender) in train_names]
devtest_set = [(gender_features(n), gender) for (n, gender) in devtest_names]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, devtest_set))
0.774
We now turn our attention to classifying full documents as opposed to single words in isolation.
The task seems more challenging, but simple methods can achieve surprisingly good results when the task is well defined. Consider the task of predicting whether a movie review is positive or negative. This is a task called sentiment analysis and is a hot practical task in the era of user-generated content (UGC) on the Web.
A good dataset is available in NLTK to experiment with this task.
from nltk.corpus import movie_reviews
documents = [(list(movie_reviews.words(fileid)), category)
for category in movie_reviews.categories()
for fileid in movie_reviews.fileids(category)]
random.shuffle(documents)
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
word_features = [w for (w, c) in all_words.most_common(2000)]
def document_features(document):
document_words = set(document)
features = {}
for word in word_features:
features['contains(%s)' % word] = (word in document_words)
return features
print("There are %d documents in the movie reviews dataset." % (len(documents)))
print("There are %d distinct words in the dataset." % (all_words.B()))
print("There are %d tokens in the dataset." % (all_words.N()))
all_words.tabulate(20)
all_words.plot(100)
There are 2000 documents in the movie reviews dataset. There are 39768 distinct words in the dataset. There are 1583820 tokens in the dataset. , the . a and of to ' is in s " it that - ) ( as with for 77717 76529 65876 38106 35576 34123 31937 30585 25195 21822 18513 17612 16107 15924 15595 11781 11664 11378 10792 9961
<matplotlib.axes._subplots.AxesSubplot at 0x1ee6fd3ea20>
print(document_features(movie_reviews.words('pos/cv957_8737.txt')))
{'contains(,)': True, 'contains(the)': True, 'contains(.)': True, 'contains(a)': True, 'contains(and)': True, 'contains(of)': True, 'contains(to)': True, "contains(')": True, 'contains(is)': True, 'contains(in)': True, 'contains(s)': True, 'contains(")': True, 'contains(it)': True, 'contains(that)': True, 'contains(-)': True, 'contains())': True, 'contains(()': True, 'contains(as)': True, 'contains(with)': True, 'contains(for)': True, 'contains(his)': True, 'contains(this)': True, 'contains(film)': False, 'contains(i)': False, 'contains(he)': True, 'contains(but)': True, 'contains(on)': True, 'contains(are)': True, 'contains(t)': False, 'contains(by)': True, 'contains(be)': True, 'contains(one)': True, 'contains(movie)': True, 'contains(an)': True, 'contains(who)': True, 'contains(not)': True, 'contains(you)': True, 'contains(from)': True, 'contains(at)': False, 'contains(was)': False, 'contains(have)': True, 'contains(they)': True, 'contains(has)': True, 'contains(her)': False, 'contains(all)': True, 'contains(?)': False, 'contains(there)': True, 'contains(like)': True, 'contains(so)': False, 'contains(out)': True, 'contains(about)': True, 'contains(up)': False, 'contains(more)': False, 'contains(what)': True, 'contains(when)': True, 'contains(which)': True, 'contains(or)': False, 'contains(she)': True, 'contains(their)': False, 'contains(:)': True, 'contains(some)': False, 'contains(just)': True, 'contains(can)': False, 'contains(if)': True, 'contains(we)': False, 'contains(him)': True, 'contains(into)': True, 'contains(even)': False, 'contains(only)': True, 'contains(than)': False, 'contains(no)': False, 'contains(good)': False, 'contains(time)': False, 'contains(most)': True, 'contains(its)': False, 'contains(will)': True, 'contains(story)': False, 'contains(would)': False, 'contains(been)': False, 'contains(much)': False, 'contains(character)': False, 'contains(also)': True, 'contains(get)': True, 'contains(other)': True, 'contains(do)': True, 'contains(two)': True, 'contains(well)': True, 'contains(them)': True, 'contains(very)': True, 'contains(characters)': False, 'contains(;)': False, 'contains(first)': False, 'contains(--)': True, 'contains(after)': False, 'contains(see)': False, 'contains(!)': True, 'contains(way)': True, 'contains(because)': False, 'contains(make)': True, 'contains(life)': False, 'contains(off)': False, 'contains(too)': False, 'contains(any)': False, 'contains(does)': False, 'contains(really)': False, 'contains(had)': False, 'contains(while)': True, 'contains(films)': False, 'contains(how)': True, 'contains(plot)': True, 'contains(little)': True, 'contains(where)': True, 'contains(people)': False, 'contains(over)': False, 'contains(could)': False, 'contains(then)': True, 'contains(me)': True, 'contains(scene)': True, 'contains(man)': False, 'contains(bad)': False, 'contains(my)': False, 'contains(never)': True, 'contains(being)': False, 'contains(best)': True, 'contains(these)': False, 'contains(don)': False, 'contains(new)': False, 'contains(doesn)': False, 'contains(scenes)': False, 'contains(many)': True, 'contains(director)': False, 'contains(such)': False, 'contains(know)': False, 'contains(were)': False, 'contains(movies)': True, 'contains(through)': False, 'contains(here)': True, 'contains(action)': True, 'contains(great)': True, 'contains(re)': True, 'contains(another)': False, 'contains(love)': False, 'contains(go)': False, 'contains(made)': False, 'contains(us)': True, 'contains(big)': False, 'contains(end)': False, 'contains(something)': False, 'contains(back)': False, 'contains(*)': True, 'contains(still)': False, 'contains(world)': True, 'contains(seems)': False, 'contains(work)': False, 'contains(those)': False, 'contains(makes)': False, 'contains(now)': False, 'contains(before)': False, 'contains(however)': True, 'contains(between)': True, 'contains(few)': False, 'contains(/)': False, 'contains(down)': False, 'contains(every)': False, 'contains(though)': False, 'contains(better)': False, 'contains(real)': False, 'contains(audience)': False, 'contains(enough)': False, 'contains(seen)': False, 'contains(take)': False, 'contains(around)': False, 'contains(both)': False, 'contains(going)': False, 'contains(year)': False, 'contains(performance)': False, 'contains(why)': False, 'contains(should)': False, 'contains(role)': False, 'contains(isn)': False, 'contains(same)': True, 'contains(old)': False, 'contains(gets)': True, 'contains(your)': False, 'contains(may)': False, 'contains(things)': True, 'contains(think)': False, 'contains(years)': False, 'contains(last)': False, 'contains(comedy)': True, 'contains(funny)': True, 'contains(actually)': True, 'contains(ve)': False, 'contains(long)': False, 'contains(look)': True, 'contains(almost)': False, 'contains(own)': True, 'contains(thing)': False, 'contains(fact)': False, 'contains(nothing)': False, 'contains(say)': False, 'contains(right)': False, 'contains(john)': False, 'contains(although)': False, 'contains(played)': True, 'contains(find)': False, 'contains(script)': False, 'contains(come)': False, 'contains(ever)': True, 'contains(cast)': False, 'contains(since)': False, 'contains(did)': False, 'contains(star)': False, 'contains(plays)': False, 'contains(young)': False, 'contains(show)': False, 'contains(comes)': False, 'contains(m)': False, 'contains(part)': False, 'contains(original)': False, 'contains(actors)': False, 'contains(screen)': True, 'contains(without)': False, 'contains(again)': False, 'contains(acting)': False, 'contains(three)': False, 'contains(day)': True, 'contains(each)': True, 'contains(point)': False, 'contains(lot)': False, 'contains(least)': True, 'contains(takes)': False, 'contains(guy)': True, 'contains(quite)': False, 'contains(himself)': False, 'contains(away)': False, 'contains(during)': False, 'contains(family)': False, 'contains(effects)': False, 'contains(course)': True, 'contains(goes)': False, 'contains(minutes)': False, 'contains(interesting)': False, 'contains(might)': False, 'contains(far)': False, 'contains(high)': False, 'contains(rather)': False, 'contains(once)': True, 'contains(must)': False, 'contains(anything)': False, 'contains(place)': True, 'contains(set)': False, 'contains(yet)': False, 'contains(watch)': True, 'contains(d)': False, 'contains(making)': True, 'contains(our)': False, 'contains(wife)': True, 'contains(hard)': False, 'contains(always)': False, 'contains(fun)': True, 'contains(didn)': False, 'contains(ll)': False, 'contains(seem)': False, 'contains(special)': False, 'contains(bit)': False, 'contains(times)': False, 'contains(trying)': False, 'contains(hollywood)': False, 'contains(instead)': False, 'contains(give)': False, 'contains(want)': False, 'contains(picture)': False, 'contains(kind)': True, 'contains(american)': False, 'contains(job)': False, 'contains(sense)': False, 'contains(woman)': True, 'contains(home)': False, 'contains(having)': False, 'contains(series)': True, 'contains(actor)': False, 'contains(probably)': False, 'contains(help)': True, 'contains(half)': False, 'contains(along)': True, 'contains(men)': False, 'contains(everything)': True, 'contains(pretty)': False, 'contains(becomes)': False, 'contains(sure)': False, 'contains(black)': False, 'contains(together)': False, 'contains(dialogue)': False, 'contains(money)': False, 'contains(become)': False, 'contains(gives)': False, 'contains(given)': False, 'contains(looking)': False, 'contains(whole)': False, 'contains(watching)': False, 'contains(father)': False, 'contains(`)': False, 'contains(feel)': False, 'contains(everyone)': False, 'contains(music)': False, 'contains(wants)': False, 'contains(sex)': False, 'contains(less)': False, 'contains(done)': False, 'contains(horror)': False, 'contains(got)': True, 'contains(death)': False, 'contains(perhaps)': False, 'contains(city)': False, 'contains(next)': False, 'contains(especially)': True, 'contains(play)': False, 'contains(girl)': False, 'contains(mind)': False, 'contains(10)': False, 'contains(moments)': False, 'contains(looks)': True, 'contains(completely)': False, 'contains(2)': False, 'contains(reason)': False, 'contains(mother)': False, 'contains(whose)': False, 'contains(line)': False, 'contains(night)': False, 'contains(human)': False, 'contains(until)': False, 'contains(rest)': False, 'contains(performances)': False, 'contains(different)': False, 'contains(evil)': False, 'contains(small)': False, 'contains(james)': False, 'contains(simply)': False, 'contains(couple)': False, 'contains(put)': False, 'contains(let)': False, 'contains(anyone)': False, 'contains(ending)': False, 'contains(case)': False, 'contains(several)': False, 'contains(dead)': False, 'contains(michael)': False, 'contains(left)': False, 'contains(thought)': False, 'contains(school)': False, 'contains(shows)': False, 'contains(humor)': False, 'contains(true)': False, 'contains(lost)': False, 'contains(written)': False, 'contains(itself)': False, 'contains(friend)': False, 'contains(entire)': False, 'contains(getting)': True, 'contains(town)': False, 'contains(turns)': False, 'contains(soon)': False, 'contains(someone)': False, 'contains(second)': False, 'contains(main)': False, 'contains(stars)': False, 'contains(found)': False, 'contains(use)': False, 'contains(problem)': False, 'contains(friends)': True, 'contains(tv)': False, 'contains(top)': True, 'contains(name)': False, 'contains(begins)': False, 'contains(called)': False, 'contains(based)': False, 'contains(comic)': False, 'contains(david)': False, 'contains(head)': False, 'contains(else)': False, 'contains(idea)': True, 'contains(either)': False, 'contains(wrong)': True, 'contains(unfortunately)': False, 'contains(later)': False, 'contains(final)': False, 'contains(hand)': False, 'contains(alien)': False, 'contains(house)': False, 'contains(group)': False, 'contains(full)': False, 'contains(used)': True, 'contains(tries)': True, 'contains(often)': True, 'contains(against)': False, 'contains(war)': False, 'contains(sequence)': False, 'contains(keep)': False, 'contains(turn)': False, 'contains(playing)': True, 'contains(boy)': False, 'contains(behind)': False, 'contains(named)': False, 'contains(certainly)': False, 'contains(live)': False, 'contains(believe)': False, 'contains(under)': False, 'contains(works)': False, 'contains(relationship)': False, 'contains(face)': False, 'contains(hour)': False, 'contains(run)': False, 'contains(style)': False, 'contains(said)': False, 'contains(despite)': False, 'contains(person)': False, 'contains(finally)': False, 'contains(shot)': False, 'contains(book)': False, 'contains(doing)': False, 'contains(tell)': False, 'contains(maybe)': False, 'contains(nice)': False, 'contains(son)': False, 'contains(perfect)': False, 'contains(side)': False, 'contains(seeing)': True, 'contains(able)': False, 'contains(finds)': False, 'contains(children)': False, 'contains(days)': False, 'contains(past)': False, 'contains(summer)': False, 'contains(camera)': False, 'contains(won)': False, 'contains(including)': False, 'contains(mr)': False, 'contains(kids)': False, 'contains(lives)': False, 'contains(directed)': False, 'contains(moment)': False, 'contains(game)': False, 'contains(running)': False, 'contains(fight)': True, 'contains(supposed)': False, 'contains(video)': False, 'contains(car)': False, 'contains(matter)': False, 'contains(kevin)': True, 'contains(joe)': False, 'contains(lines)': False, 'contains(worth)': True, 'contains(=)': False, 'contains(daughter)': False, 'contains(earth)': False, 'contains(starts)': False, 'contains(need)': False, 'contains(entertaining)': False, 'contains(white)': False, 'contains(start)': True, 'contains(writer)': False, 'contains(dark)': False, 'contains(short)': False, 'contains(self)': False, 'contains(worst)': False, 'contains(nearly)': False, 'contains(opening)': False, 'contains(try)': False, 'contains(upon)': False, 'contains(care)': False, 'contains(early)': True, 'contains(violence)': False, 'contains(throughout)': False, 'contains(team)': False, 'contains(production)': False, 'contains(example)': False, 'contains(beautiful)': False, 'contains(title)': False, 'contains(exactly)': False, 'contains(jack)': False, 'contains(review)': False, 'contains(major)': False, 'contains(drama)': False, 'contains(&)': False, 'contains(problems)': True, 'contains(sequences)': False, 'contains(obvious)': False, 'contains(version)': False, 'contains(screenplay)': False, 'contains(known)': True, 'contains(killer)': False, 'contains(wasn)': False, 'contains(robert)': False, 'contains(disney)': False, 'contains(already)': False, 'contains(close)': False, 'contains(classic)': False, 'contains(others)': True, 'contains(hit)': False, 'contains(kill)': False, 'contains(deep)': True, 'contains(five)': False, 'contains(order)': False, 'contains(act)': False, 'contains(simple)': False, 'contains(fine)': False, 'contains(themselves)': False, 'contains(heart)': False, 'contains(roles)': False, 'contains(jackie)': True, 'contains(direction)': False, 'contains(eyes)': False, 'contains(four)': False, 'contains(question)': False, 'contains(sort)': False, 'contains(sometimes)': False, 'contains(knows)': False, 'contains(supporting)': False, 'contains(coming)': False, 'contains(voice)': False, 'contains(women)': False, 'contains(truly)': False, 'contains(save)': False, 'contains(jokes)': False, 'contains(computer)': False, 'contains(child)': False, 'contains(o)': False, 'contains(boring)': False, 'contains(tom)': False, 'contains(level)': False, 'contains(1)': False, 'contains(body)': False, 'contains(guys)': False, 'contains(genre)': False, 'contains(brother)': False, 'contains(strong)': False, 'contains(stop)': True, 'contains(room)': False, 'contains(space)': False, 'contains(lee)': False, 'contains(ends)': False, 'contains(beginning)': False, 'contains(ship)': False, 'contains(york)': False, 'contains(attempt)': False, 'contains(thriller)': False, 'contains(scream)': True, 'contains(peter)': False, 'contains(aren)': False, 'contains(husband)': False, 'contains(fiction)': False, 'contains(happens)': False, 'contains(hero)': False, 'contains(novel)': False, 'contains(note)': False, 'contains(hope)': False, 'contains(king)': False, 'contains(yes)': False, 'contains(says)': False, 'contains(tells)': False, 'contains(quickly)': False, 'contains(romantic)': False, 'contains(dog)': False, 'contains(oscar)': False, 'contains(stupid)': False, 'contains(possible)': False, 'contains(saw)': False, 'contains(lead)': True, 'contains(career)': False, 'contains(murder)': False, 'contains(extremely)': False, 'contains(manages)': False, 'contains(god)': False, 'contains(mostly)': False, 'contains(wonder)': False, 'contains(particularly)': False, 'contains(future)': False, 'contains(fans)': False, 'contains(sound)': False, 'contains(worse)': False, 'contains(piece)': False, 'contains(involving)': False, 'contains(de)': False, 'contains(appears)': False, 'contains(planet)': False, 'contains(paul)': False, 'contains(involved)': False, 'contains(mean)': False, 'contains(none)': False, 'contains(taking)': False, 'contains(hours)': False, 'contains(laugh)': True, 'contains(police)': False, 'contains(sets)': False, 'contains(attention)': False, 'contains(co)': False, 'contains(hell)': False, 'contains(eventually)': False, 'contains(single)': False, 'contains(fall)': False, 'contains(falls)': False, 'contains(material)': False, 'contains(emotional)': False, 'contains(power)': False, 'contains(late)': False, 'contains(lack)': False, 'contains(dr)': False, 'contains(van)': False, 'contains(result)': False, 'contains(elements)': False, 'contains(meet)': False, 'contains(smith)': False, 'contains(science)': False, 'contains(experience)': False, 'contains(bring)': False, 'contains(wild)': False, 'contains(living)': False, 'contains(theater)': False, 'contains(interest)': False, 'contains(leads)': False, 'contains(word)': False, 'contains(feature)': False, 'contains(battle)': False, 'contains(girls)': False, 'contains(alone)': False, 'contains(obviously)': False, 'contains(george)': False, 'contains(within)': False, 'contains(usually)': False, 'contains(enjoy)': False, 'contains(guess)': False, 'contains(among)': True, 'contains(taken)': False, 'contains(feeling)': False, 'contains(laughs)': False, 'contains(aliens)': False, 'contains(talk)': True, 'contains(chance)': False, 'contains(talent)': False, 'contains(3)': False, 'contains(middle)': False, 'contains(number)': False, 'contains(easy)': False, 'contains(across)': False, 'contains(needs)': False, 'contains(attempts)': False, 'contains(happen)': False, 'contains(television)': False, 'contains(chris)': False, 'contains(deal)': False, 'contains(poor)': False, 'contains(form)': False, 'contains(girlfriend)': True, 'contains(viewer)': False, 'contains(release)': False, 'contains(killed)': False, 'contains(forced)': False, 'contains(whether)': False, 'contains(wonderful)': False, 'contains(feels)': False, 'contains(oh)': False, 'contains(tale)': False, 'contains(serious)': False, 'contains(expect)': False, 'contains(except)': False, 'contains(light)': False, 'contains(success)': False, 'contains(features)': True, 'contains(premise)': False, 'contains(happy)': False, 'contains(words)': False, 'contains(leave)': False, 'contains(important)': False, 'contains(meets)': False, 'contains(history)': False, 'contains(giving)': False, 'contains(crew)': False, 'contains(type)': False, 'contains(call)': False, 'contains(turned)': False, 'contains(released)': False, 'contains(parents)': False, 'contains(art)': False, 'contains(impressive)': False, 'contains(mission)': False, 'contains(working)': False, 'contains(seemed)': False, 'contains(score)': False, 'contains(told)': False, 'contains(recent)': False, 'contains(robin)': False, 'contains(basically)': False, 'contains(entertainment)': False, 'contains(america)': False, 'contains($)': False, 'contains(surprise)': False, 'contains(apparently)': False, 'contains(easily)': False, 'contains(ryan)': False, 'contains(cool)': False, 'contains(stuff)': False, 'contains(cop)': False, 'contains(change)': False, 'contains(williams)': False, 'contains(crime)': False, 'contains(office)': False, 'contains(parts)': False, 'contains(somehow)': False, 'contains(sequel)': False, 'contains(william)': False, 'contains(cut)': False, 'contains(die)': False, 'contains(jones)': False, 'contains(credits)': False, 'contains(batman)': False, 'contains(suspense)': False, 'contains(brings)': False, 'contains(events)': False, 'contains(reality)': False, 'contains(whom)': False, 'contains(local)': False, 'contains(talking)': False, 'contains(difficult)': True, 'contains(using)': False, 'contains(went)': False, 'contains(writing)': False, 'contains(remember)': False, 'contains(near)': False, 'contains(straight)': False, 'contains(hilarious)': True, 'contains(ago)': False, 'contains(certain)': False, 'contains(ben)': False, 'contains(kid)': False, 'contains(wouldn)': False, 'contains(slow)': True, 'contains(blood)': False, 'contains(mystery)': False, 'contains(complete)': False, 'contains(red)': False, 'contains(popular)': False, 'contains(effective)': False, 'contains(am)': False, 'contains(fast)': True, 'contains(flick)': False, 'contains(due)': False, 'contains(runs)': False, 'contains(gone)': False, 'contains(return)': False, 'contains(presence)': False, 'contains(quality)': False, 'contains(dramatic)': False, 'contains(filmmakers)': False, 'contains(age)': False, 'contains(brothers)': False, 'contains(business)': False, 'contains(general)': False, 'contains(rock)': False, 'contains(sexual)': False, 'contains(present)': False, 'contains(surprisingly)': False, 'contains(anyway)': False, 'contains(uses)': False, 'contains(4)': False, 'contains(personal)': False, 'contains(figure)': False, 'contains(smart)': False, 'contains(ways)': False, 'contains(decides)': False, 'contains(annoying)': False, 'contains(begin)': False, 'contains(couldn)': False, 'contains(somewhat)': False, 'contains(shots)': False, 'contains(rich)': False, 'contains(minute)': False, 'contains(law)': False, 'contains(previous)': False, 'contains(jim)': False, 'contains(successful)': False, 'contains(harry)': False, 'contains(water)': False, 'contains(similar)': False, 'contains(absolutely)': False, 'contains(motion)': False, 'contains(former)': False, 'contains(strange)': False, 'contains(came)': False, 'contains(follow)': False, 'contains(read)': False, 'contains(project)': False, 'contains(million)': True, 'contains(secret)': False, 'contains(starring)': False, 'contains(clear)': False, 'contains(familiar)': False, 'contains(romance)': False, 'contains(intelligent)': False, 'contains(third)': True, 'contains(excellent)': False, 'contains(amazing)': False, 'contains(party)': False, 'contains(budget)': False, 'contains(eye)': False, 'contains(actress)': False, 'contains(prison)': False, 'contains(latest)': False, 'contains(means)': True, 'contains(company)': False, 'contains(towards)': False, 'contains(predictable)': False, 'contains(powerful)': False, 'contains(nor)': False, 'contains(bob)': False, 'contains(beyond)': False, 'contains(visual)': False, 'contains(leaves)': False, 'contains(r)': False, 'contains(nature)': False, 'contains(following)': False, 'contains(villain)': False, 'contains(leaving)': False, 'contains(animated)': False, 'contains(low)': False, 'contains(myself)': False, 'contains(b)': False, 'contains(bill)': False, 'contains(sam)': False, 'contains(filled)': False, 'contains(wars)': False, 'contains(questions)': False, 'contains(cinema)': False, 'contains(message)': False, 'contains(box)': False, 'contains(moving)': True, 'contains(herself)': False, 'contains(country)': False, 'contains(usual)': False, 'contains(martin)': False, 'contains(definitely)': False, 'contains(add)': False, 'contains(large)': False, 'contains(clever)': False, 'contains(create)': False, 'contains(felt)': False, 'contains(stories)': False, 'contains(brilliant)': False, 'contains(ones)': False, 'contains(giant)': False, 'contains(situation)': False, 'contains(murphy)': False, 'contains(break)': False, 'contains(opens)': False, 'contains(scary)': False, 'contains(doubt)': False, 'contains(drug)': True, 'contains(bunch)': False, 'contains(thinking)': False, 'contains(solid)': False, 'contains(effect)': False, 'contains(learn)': False, 'contains(move)': False, 'contains(force)': False, 'contains(potential)': False, 'contains(seriously)': False, 'contains(follows)': False, 'contains(above)': False, 'contains(saying)': False, 'contains(huge)': False, 'contains(class)': False, 'contains(plan)': False, 'contains(agent)': False, 'contains(created)': False, 'contains(unlike)': False, 'contains(pay)': False, 'contains(non)': True, 'contains(married)': False, 'contains(mark)': False, 'contains(sweet)': False, 'contains(perfectly)': False, 'contains(ex)': False, 'contains(realize)': False, 'contains(audiences)': False, 'contains(took)': False, 'contains(decent)': False, 'contains(likely)': False, 'contains(dream)': False, 'contains(view)': False, 'contains(scott)': False, 'contains(subject)': False, 'contains(understand)': False, 'contains(happened)': False, 'contains(enjoyable)': True, 'contains(studio)': False, 'contains(immediately)': False, 'contains(open)': False, 'contains(e)': False, 'contains(points)': False, 'contains(heard)': False, 'contains(viewers)': False, 'contains(cameron)': False, 'contains(truman)': False, 'contains(bruce)': False, 'contains(frank)': False, 'contains(private)': False, 'contains(stay)': False, 'contains(fails)': False, 'contains(impossible)': False, 'contains(cold)': False, 'contains(richard)': False, 'contains(overall)': False, 'contains(merely)': False, 'contains(exciting)': False, 'contains(mess)': False, 'contains(chase)': True, 'contains(free)': False, 'contains(ten)': False, 'contains(neither)': False, 'contains(wanted)': False, 'contains(gun)': True, 'contains(appear)': False, 'contains(carter)': False, 'contains(escape)': False, 'contains(ultimately)': False, 'contains(+)': False, 'contains(fan)': False, 'contains(inside)': False, 'contains(favorite)': False, 'contains(haven)': False, 'contains(modern)': False, 'contains(l)': False, 'contains(wedding)': False, 'contains(stone)': False, 'contains(trek)': False, 'contains(brought)': False, 'contains(trouble)': True, 'contains(otherwise)': False, 'contains(tim)': False, 'contains(5)': False, 'contains(allen)': False, 'contains(bond)': False, 'contains(society)': False, 'contains(liked)': False, 'contains(dumb)': False, 'contains(musical)': False, 'contains(stand)': False, 'contains(political)': False, 'contains(various)': False, 'contains(talented)': False, 'contains(particular)': False, 'contains(west)': False, 'contains(state)': False, 'contains(keeps)': True, 'contains(english)': False, 'contains(silly)': False, 'contains(u)': False, 'contains(situations)': False, 'contains(park)': False, 'contains(teen)': False, 'contains(rating)': False, 'contains(slightly)': False, 'contains(steve)': False, 'contains(truth)': False, 'contains(air)': False, 'contains(element)': False, 'contains(joke)': False, 'contains(spend)': False, 'contains(key)': True, 'contains(biggest)': False, 'contains(members)': False, 'contains(effort)': False, 'contains(government)': False, 'contains(focus)': False, 'contains(eddie)': False, 'contains(soundtrack)': False, 'contains(hands)': False, 'contains(earlier)': False, 'contains(chan)': True, 'contains(purpose)': False, 'contains(today)': True, 'contains(showing)': False, 'contains(memorable)': False, 'contains(six)': False, 'contains(cannot)': False, 'contains(max)': False, 'contains(offers)': False, 'contains(rated)': False, 'contains(mars)': False, 'contains(heavy)': False, 'contains(totally)': False, 'contains(control)': False, 'contains(credit)': False, 'contains(fi)': False, 'contains(woody)': False, 'contains(ideas)': False, 'contains(sci)': False, 'contains(wait)': False, 'contains(sit)': False, 'contains(female)': False, 'contains(ask)': False, 'contains(waste)': False, 'contains(terrible)': False, 'contains(depth)': False, 'contains(simon)': False, 'contains(aspect)': False, 'contains(list)': False, 'contains(mary)': False, 'contains(sister)': False, 'contains(animation)': False, 'contains(entirely)': False, 'contains(fear)': False, 'contains(steven)': False, 'contains(moves)': False, 'contains(actual)': False, 'contains(army)': False, 'contains(british)': False, 'contains(constantly)': False, 'contains(fire)': False, 'contains(convincing)': False, 'contains(setting)': False, 'contains(gave)': False, 'contains(tension)': False, 'contains(street)': False, 'contains(8)': False, 'contains(brief)': True, 'contains(ridiculous)': False, 'contains(cinematography)': False, 'contains(typical)': False, 'contains(nick)': False, 'contains(screenwriter)': False, 'contains(ability)': False, 'contains(spent)': False, 'contains(quick)': True, 'contains(violent)': False, 'contains(atmosphere)': False, 'contains(subtle)': False, 'contains(expected)': False, 'contains(fairly)': True, 'contains(seven)': False, 'contains(killing)': False, 'contains(tone)': False, 'contains(master)': False, 'contains(disaster)': False, 'contains(lots)': False, 'contains(thinks)': False, 'contains(song)': False, 'contains(cheap)': False, 'contains(suddenly)': False, 'contains(background)': False, 'contains(club)': False, 'contains(willis)': False, 'contains(whatever)': False, 'contains(highly)': False, 'contains(sees)': True, 'contains(complex)': False, 'contains(greatest)': False, 'contains(impact)': False, 'contains(beauty)': False, 'contains(front)': False, 'contains(humans)': False, 'contains(indeed)': False, 'contains(flat)': False, 'contains(grace)': False, 'contains(wrote)': False, 'contains(amusing)': False, 'contains(ii)': False, 'contains(mike)': False, 'contains(further)': False, 'contains(cute)': False, 'contains(dull)': False, 'contains(minor)': False, 'contains(recently)': False, 'contains(hate)': False, 'contains(outside)': False, 'contains(plenty)': False, 'contains(wish)': False, 'contains(godzilla)': False, 'contains(college)': False, 'contains(titanic)': False, 'contains(sounds)': False, 'contains(telling)': False, 'contains(sight)': False, 'contains(double)': False, 'contains(cinematic)': False, 'contains(queen)': False, 'contains(hold)': False, 'contains(meanwhile)': False, 'contains(awful)': False, 'contains(clearly)': False, 'contains(theme)': False, 'contains(hear)': False, 'contains(x)': False, 'contains(amount)': False, 'contains(baby)': False, 'contains(approach)': False, 'contains(dreams)': False, 'contains(shown)': False, 'contains(island)': False, 'contains(reasons)': False, 'contains(charm)': False, 'contains(miss)': True, 'contains(longer)': False, 'contains(common)': False, 'contains(sean)': False, 'contains(carry)': False, 'contains(believable)': False, 'contains(realistic)': False, 'contains(chemistry)': True, 'contains(possibly)': False, 'contains(casting)': False, 'contains(carrey)': False, 'contains(french)': False, 'contains(trailer)': False, 'contains(tough)': False, 'contains(produced)': False, 'contains(imagine)': False, 'contains(choice)': False, 'contains(ride)': False, 'contains(somewhere)': False, 'contains(hot)': False, 'contains(race)': False, 'contains(road)': False, 'contains(leader)': False, 'contains(thin)': False, 'contains(jerry)': False, 'contains(slowly)': False, 'contains(delivers)': False, 'contains(detective)': False, 'contains(brown)': False, 'contains(jackson)': False, 'contains(member)': False, 'contains(provide)': False, 'contains(president)': False, 'contains(puts)': False, 'contains(asks)': False, 'contains(critics)': False, 'contains(appearance)': False, 'contains(famous)': False, 'contains(okay)': False, 'contains(intelligence)': False, 'contains(energy)': False, 'contains(sent)': False, 'contains(spielberg)': False, 'contains(development)': False, 'contains(etc)': False, 'contains(language)': False, 'contains(blue)': False, 'contains(proves)': False, 'contains(vampire)': False, 'contains(seemingly)': False, 'contains(basic)': False, 'contains(caught)': False, 'contains(decide)': False, 'contains(opportunity)': False, 'contains(incredibly)': False, 'contains(images)': False, 'contains(band)': False, 'contains(j)': False, 'contains(writers)': False, 'contains(knew)': False, 'contains(interested)': False, 'contains(considering)': False, 'contains(boys)': False, 'contains(thanks)': False, 'contains(remains)': False, 'contains(climax)': True, 'contains(event)': False, 'contains(directing)': False, 'contains(conclusion)': False, 'contains(leading)': False, 'contains(ground)': False, 'contains(lies)': False, 'contains(forget)': False, 'contains(alive)': False, 'contains(tarzan)': False, 'contains(century)': False, 'contains(provides)': False, 'contains(trip)': False, 'contains(partner)': False, 'contains(central)': False, 'contains(tarantino)': False, 'contains(period)': False, 'contains(pace)': False, 'contains(yourself)': False, 'contains(worked)': False, 'contains(ready)': False, 'contains(date)': False, 'contains(thus)': False, 'contains(1998)': False, 'contains(terrific)': False, 'contains(write)': False, 'contains(average)': False, 'contains(onto)': False, 'contains(songs)': False, 'contains(occasionally)': False, 'contains(doctor)': False, 'contains(stands)': False, 'contains(hardly)': False, 'contains(monster)': False, 'contains(led)': False, 'contains(mysterious)': False, 'contains(details)': False, 'contains(wasted)': False, 'contains(apart)': False, 'contains(aside)': False, 'contains(store)': False, 'contains(billy)': False, 'contains(boss)': True, 'contains(travolta)': False, 'contains(producer)': False, 'contains(pull)': False, 'contains(consider)': False, 'contains(pictures)': False, 'contains(becoming)': False, 'contains(cage)': False, 'contains(loud)': False, 'contains(looked)': False, 'contains(officer)': False, 'contains(twenty)': False, 'contains(system)': False, 'contains(contains)': False, 'contains(julia)': False, 'contains(subplot)': False, 'contains(missing)': False, 'contains(personality)': False, 'contains(building)': False, 'contains(learns)': False, 'contains(hong)': True, 'contains(la)': False, 'contains(apartment)': False, 'contains(7)': False, 'contains(bizarre)': False, 'contains(powers)': False, 'contains(flaws)': False, 'contains(catch)': False, 'contains(lawyer)': False, 'contains(shoot)': False, 'contains(student)': False, 'contains(unique)': True, 'contains(000)': False, 'contains(admit)': False, 'contains(concept)': False, 'contains(needed)': False, 'contains(thrown)': False, 'contains(christopher)': False, 'contains(laughing)': False, 'contains(green)': False, 'contains(twists)': False, 'contains(matthew)': False, 'contains(touch)': False, 'contains(waiting)': False, 'contains(victim)': False, 'contains(cover)': False, 'contains(machine)': False, 'contains(danny)': False, 'contains(mention)': False, 'contains(search)': False, 'contains(1997)': False, 'contains(win)': False, 'contains(door)': False, 'contains(manner)': False, 'contains(train)': True, 'contains(saving)': False, 'contains(share)': False, 'contains(image)': False, 'contains(discovers)': False, 'contains(normal)': False, 'contains(cross)': False, 'contains(fox)': False, 'contains(returns)': False, 'contains(adult)': False, 'contains(adds)': False, 'contains(answer)': False, 'contains(adventure)': False, 'contains(lame)': False, 'contains(male)': False, 'contains(odd)': False, 'contains(singer)': False, 'contains(deserves)': False, 'contains(gore)': False, 'contains(states)': False, 'contains(include)': False, 'contains(equally)': False, 'contains(months)': False, 'contains(barely)': False, 'contains(directors)': False, 'contains(introduced)': False, 'contains(fashion)': False, 'contains(social)': False, 'contains(1999)': False, 'contains(news)': False, 'contains(hair)': False, 'contains(dance)': False, 'contains(innocent)': False, 'contains(camp)': False, 'contains(teacher)': False, 'contains(became)': False, 'contains(sad)': False, 'contains(witch)': False, 'contains(includes)': False, 'contains(nights)': False, 'contains(jason)': False, 'contains(julie)': False, 'contains(latter)': False, 'contains(food)': True, 'contains(jennifer)': False, 'contains(land)': False, 'contains(menace)': False, 'contains(rate)': False, 'contains(storyline)': False, 'contains(contact)': False, 'contains(jean)': False, 'contains(elizabeth)': False, 'contains(fellow)': False, 'contains(changes)': False, 'contains(henry)': False, 'contains(hill)': False, 'contains(pulp)': False, 'contains(gay)': False, 'contains(tried)': False, 'contains(surprised)': False, 'contains(literally)': False, 'contains(walk)': False, 'contains(standard)': False, 'contains(90)': False, 'contains(forward)': False, 'contains(wise)': False, 'contains(enjoyed)': False, 'contains(discover)': False, 'contains(pop)': False, 'contains(anderson)': False, 'contains(offer)': False, 'contains(recommend)': False, 'contains(public)': False, 'contains(drive)': False, 'contains(c)': False, 'contains(toy)': False, 'contains(charming)': False, 'contains(fair)': False, 'contains(chinese)': True, 'contains(rescue)': False, 'contains(terms)': False, 'contains(mouth)': False, 'contains(lucas)': False, 'contains(accident)': False, 'contains(dies)': False, 'contains(decided)': False, 'contains(edge)': False, 'contains(footage)': False, 'contains(culture)': False, 'contains(weak)': False, 'contains(presented)': False, 'contains(blade)': False, 'contains(younger)': False, 'contains(douglas)': False, 'contains(natural)': False, 'contains(born)': False, 'contains(generally)': False, 'contains(teenage)': False, 'contains(older)': False, 'contains(horrible)': False, 'contains(addition)': False, 'contains(sadly)': False, 'contains(creates)': False, 'contains(disturbing)': False, 'contains(roger)': False, 'contains(detail)': False, 'contains(devil)': False, 'contains(debut)': False, 'contains(track)': False, 'contains(developed)': False, 'contains(week)': False, 'contains(russell)': False, 'contains(attack)': False, 'contains(explain)': False, 'contains(rarely)': False, 'contains(fully)': False, 'contains(prove)': False, 'contains(exception)': False, 'contains(jeff)': False, 'contains(twist)': False, 'contains(gang)': False, 'contains(winning)': False, 'contains(jr)': False, 'contains(species)': False, 'contains(issues)': False, 'contains(fresh)': False, 'contains(rules)': False, 'contains(meaning)': False, 'contains(inspired)': False, 'contains(heroes)': False, 'contains(desperate)': False, 'contains(fighting)': False, 'contains(filmed)': False, 'contains(faces)': False, 'contains(alan)': False, 'contains(bright)': False, 'contains(ass)': True, 'contains(flying)': False, 'contains(kong)': True, 'contains(rush)': False, 'contains(forces)': False, 'contains(charles)': False, 'contains(numerous)': False, 'contains(emotions)': False, 'contains(involves)': True, 'contains(patrick)': False, 'contains(weird)': False, 'contains(apparent)': False, 'contains(information)': False, 'contains(revenge)': False, 'contains(jay)': False, 'contains(toward)': False, 'contains(surprising)': False, 'contains(twice)': False, 'contains(editing)': False, 'contains(calls)': False, 'contains(lose)': False, 'contains(vegas)': False, 'contains(stage)': False, 'contains(intended)': False, 'contains(gags)': False, 'contains(opinion)': False, 'contains(likes)': False, 'contains(crazy)': False, 'contains(owner)': False, 'contains(places)': False, 'contains(pair)': False, 'contains(genuine)': False, 'contains(epic)': False, 'contains(speak)': False, 'contains(throw)': False, 'contains(appeal)': False, 'contains(gibson)': False, 'contains(captain)': False, 'contains(military)': False, 'contains(20)': False, 'contains(blair)': False, 'contains(nowhere)': False, 'contains(length)': False, 'contains(nicely)': False, 'contains(cause)': False, 'contains(pass)': False, 'contains(episode)': False, 'contains(kiss)': False, 'contains(arnold)': True, 'contains(please)': False, 'contains(hasn)': False, 'contains(phone)': False, 'contains(filmmaking)': False, 'contains(formula)': False, 'contains(boyfriend)': False, 'contains(talents)': False, 'contains(creating)': False, 'contains(kelly)': False, 'contains(buy)': False, 'contains(wide)': False, 'contains(fantasy)': False, 'contains(mood)': False, 'contains(heads)': False, 'contains(pathetic)': False, 'contains(lacks)': False, 'contains(loved)': False, 'contains(asked)': False, 'contains(mrs)': False, 'contains(witty)': False, 'contains(shakespeare)': False, 'contains(mulan)': False, 'contains(generation)': False, 'contains(affair)': False, 'contains(pieces)': False, 'contains(task)': False, 'contains(rare)': False, 'contains(kept)': False, 'contains(cameo)': False, 'contains(fascinating)': False, 'contains(ed)': False, 'contains(fbi)': False, 'contains(burton)': False, 'contains(incredible)': False, 'contains(accent)': False, 'contains(artist)': False, 'contains(superior)': False, 'contains(academy)': False, 'contains(thomas)': False, 'contains(spirit)': False, 'contains(technical)': False, 'contains(confusing)': False, 'contains(poorly)': False, 'contains(target)': False, 'contains(lover)': False, 'contains(woo)': False, 'contains(mentioned)': False, 'contains(theaters)': False, 'contains(plane)': False, 'contains(confused)': False, 'contains(dennis)': False, 'contains(rob)': False, 'contains(appropriate)': False, 'contains(christmas)': False, 'contains(considered)': False, 'contains(legend)': False, 'contains(shame)': False, 'contains(soul)': False, 'contains(matt)': False, 'contains(campbell)': False, 'contains(process)': False, 'contains(bottom)': False, 'contains(sitting)': False, 'contains(brain)': False, 'contains(creepy)': False, 'contains(13)': False, 'contains(forever)': False, 'contains(dude)': False, 'contains(crap)': False, 'contains(superb)': False, 'contains(speech)': False, 'contains(ice)': False, 'contains(journey)': False, 'contains(masterpiece)': False, 'contains(intriguing)': False, 'contains(names)': False, 'contains(pick)': False, 'contains(speaking)': False, 'contains(virtually)': False, 'contains(award)': False, 'contains(worthy)': False, 'contains(marriage)': False, 'contains(deliver)': False, 'contains(cash)': False, 'contains(magic)': False, 'contains(respect)': False, 'contains(product)': False, 'contains(necessary)': False, 'contains(suppose)': False, 'contains(silent)': False, 'contains(pointless)': False, 'contains(station)': False, 'contains(affleck)': False, 'contains(dimensional)': False, 'contains(charlie)': False, 'contains(allows)': False, 'contains(avoid)': False, 'contains(meant)': False, 'contains(cops)': False, 'contains(attitude)': False, 'contains(relationships)': False, 'contains(hits)': False, 'contains(stephen)': False, 'contains(spends)': False, 'contains(relief)': False, 'contains(physical)': True, 'contains(count)': False, 'contains(reviews)': False, 'contains(appreciate)': False, 'contains(cliches)': False, 'contains(holds)': False, 'contains(pure)': False, 'contains(plans)': False, 'contains(limited)': False, 'contains(failed)': False, 'contains(pain)': False, 'contains(impression)': False, 'contains(unless)': False, 'contains(sub)': False, 'contains([)': False, 'contains(total)': False, 'contains(creature)': False, 'contains(viewing)': False, 'contains(loves)': False, 'contains(princess)': False, 'contains(kate)': False, 'contains(rising)': False, 'contains(woods)': False, 'contains(baldwin)': False, 'contains(angry)': False, 'contains(drawn)': False, 'contains(step)': False, 'contains(matrix)': False, 'contains(themes)': False, 'contains(satire)': False, 'contains(arts)': False, 'contains(])': False, 'contains(remake)': False, 'contains(wall)': False, 'contains(moral)': False, 'contains(color)': False, 'contains(ray)': False, 'contains(stuck)': False, 'contains(touching)': False, 'contains(wit)': False, 'contains(tony)': False, 'contains(hanks)': False, 'contains(continues)': False, 'contains(damn)': False, 'contains(nobody)': False, 'contains(cartoon)': False, 'contains(keeping)': False, 'contains(realized)': False, 'contains(criminal)': False, 'contains(unfunny)': False, 'contains(comedic)': False, 'contains(martial)': False, 'contains(disappointing)': False, 'contains(anti)': False, 'contains(graphic)': False, 'contains(stunning)': False, 'contains(actions)': False, 'contains(floor)': False, 'contains(emotion)': False, 'contains(soldiers)': False, 'contains(edward)': False, 'contains(comedies)': False, 'contains(driver)': False, 'contains(expectations)': False, 'contains(added)': False, 'contains(mad)': False, 'contains(angels)': False, 'contains(shallow)': False, 'contains(suspect)': False, 'contains(humorous)': False, 'contains(phantom)': False, 'contains(appealing)': False, 'contains(device)': False, 'contains(design)': False, 'contains(industry)': False, 'contains(reach)': False, 'contains(fat)': False, 'contains(blame)': False, 'contains(united)': False, 'contains(sign)': False, 'contains(portrayal)': False, 'contains(rocky)': False, 'contains(finale)': False, 'contains(grand)': False, 'contains(opposite)': False, 'contains(hotel)': False, 'contains(match)': False, 'contains(damme)': False, 'contains(speed)': False, 'contains(ok)': False, 'contains(loving)': False, 'contains(field)': True, 'contains(larry)': False, 'contains(urban)': False, 'contains(troopers)': False, 'contains(compared)': False, 'contains(apes)': False, 'contains(rose)': False, 'contains(falling)': False, 'contains(era)': False, 'contains(loses)': False, 'contains(adults)': False, 'contains(managed)': False, 'contains(dad)': False, 'contains(therefore)': False, 'contains(pg)': False, 'contains(results)': False, 'contains(guns)': False, 'contains(radio)': False, 'contains(lady)': False, 'contains(manage)': False, 'contains(spice)': False, 'contains(naked)': False, 'contains(started)': False, 'contains(intense)': False, 'contains(humanity)': False, 'contains(wonderfully)': False, 'contains(slasher)': False, 'contains(bland)': False, 'contains(imagination)': False, 'contains(walking)': False, 'contains(willing)': False, 'contains(horse)': False, 'contains(rent)': False, 'contains(mix)': False, 'contains(generated)': False, 'contains(g)': False, 'contains(utterly)': False, 'contains(scientist)': False, 'contains(washington)': False, 'contains(notice)': False, 'contains(players)': False, 'contains(teenagers)': False, 'contains(moore)': False, 'contains(board)': False, 'contains(price)': False, 'contains(frightening)': False, 'contains(tommy)': False, 'contains(spectacular)': False, 'contains(bored)': False, 'contains(jane)': False, 'contains(join)': False, 'contains(producers)': False, 'contains(johnny)': False, 'contains(zero)': False, 'contains(vampires)': False, 'contains(adaptation)': False, 'contains(dollars)': False, 'contains(parody)': False, 'contains(documentary)': False, 'contains(dvd)': False, 'contains(wayne)': False, 'contains(post)': False, 'contains(exist)': False, 'contains(matters)': False, 'contains(chosen)': False, 'contains(mel)': False, 'contains(attractive)': True, 'contains(plain)': False, 'contains(trust)': False, 'contains(safe)': False, 'contains(reading)': False, 'contains(hoping)': False, 'contains(protagonist)': False, 'contains(feelings)': False, 'contains(fate)': False, 'contains(finding)': False, 'contains(feet)': False, 'contains(visuals)': False, 'contains(spawn)': False, 'contains(compelling)': False, 'contains(hall)': False, 'contains(sympathetic)': False, 'contains(featuring)': False, 'contains(difference)': False, 'contains(professional)': False, 'contains(drugs)': False, 'contains(ford)': False, 'contains(shooting)': False, 'contains(gold)': False, 'contains(patch)': False, 'contains(build)': False, 'contains(boat)': False, 'contains(cruise)': False, 'contains(honest)': False, 'contains(media)': False, 'contains(flicks)': False, 'contains(bug)': False, 'contains(bringing)': False, 'contains(dangerous)': True, 'contains(watched)': False, 'contains(grant)': False, 'contains(smile)': False, 'contains(plus)': False, 'contains(shouldn)': False, 'contains(decision)': False, 'contains(visually)': False, 'contains(allow)': False, 'contains(starship)': False, 'contains(roberts)': False, 'contains(dying)': False, 'contains(portrayed)': False, 'contains(turning)': False, 'contains(believes)': False, 'contains(changed)': False, 'contains(shock)': False, 'contains(destroy)': False, 'contains(30)': False, 'contains(crowd)': False, 'contains(broken)': False, 'contains(tired)': False, 'contains(fail)': False, 'contains(south)': False, 'contains(died)': False, 'contains(cult)': False, 'contains(fake)': False, 'contains(vincent)': False, 'contains(identity)': False, 'contains(sexy)': False, 'contains(hunt)': False, 'contains(jedi)': False, 'contains(flynt)': False, 'contains(alex)': False, 'contains(engaging)': False, 'contains(serve)': False, 'contains(snake)': False, 'contains(yeah)': False, 'contains(expecting)': False, 'contains(100)': False, 'contains(decade)': False, 'contains(ups)': False, 'contains(constant)': False, 'contains(current)': False, 'contains(survive)': False, 'contains(jimmy)': False, 'contains(buddy)': False, 'contains(send)': False, 'contains(brooks)': False, 'contains(goofy)': False, 'contains(likable)': False, 'contains(humour)': False, 'contains(technology)': False, 'contains(files)': False, 'contains(babe)': False, 'contains(aspects)': False, 'contains(presents)': False, 'contains(kills)': False, 'contains(supposedly)': False, 'contains(eight)': True, 'contains(sandler)': False, 'contains(hospital)': False, 'contains(test)': False, 'contains(hidden)': False, 'contains(brian)': False, 'contains(books)': False, 'contains(promise)': False, 'contains(determined)': False, 'contains(professor)': False, 'contains(welcome)': False, 'contains(pleasure)': False, 'contains(succeeds)': False, 'contains(individual)': False, 'contains(annie)': False, 'contains(mob)': False, 'contains(ted)': False, 'contains(virus)': False, 'contains(content)': False, 'contains(gary)': False, 'contains(direct)': False, 'contains(contrived)': False, 'contains(carpenter)': False, 'contains(scale)': False, 'contains(sick)': False, 'contains(nasty)': False, 'contains(conflict)': False, 'contains(haunting)': False, 'contains(ghost)': False, 'contains(filmmaker)': False, 'contains(japanese)': False, 'contains(helps)': False, 'contains(fare)': False, 'contains(lucky)': False, 'contains(ultimate)': False, 'contains(window)': False, 'contains(support)': False, 'contains(goal)': False, 'contains(provided)': False, 'contains(genius)': False, 'contains(winner)': False, 'contains(taylor)': False, 'contains(fantastic)': False, 'contains(faith)': False, 'contains(lynch)': False, 'contains(fit)': False, 'contains(catherine)': False, 'contains(ms)': False, 'contains(paced)': False, 'contains(breaks)': False, 'contains(al)': False, 'contains(frame)': False, 'contains(travel)': False, 'contains(badly)': False, 'contains(available)': False, 'contains(cares)': False, 'contains(reeves)': False, 'contains(crash)': False, 'contains(driving)': False, 'contains(press)': False, 'contains(seagal)': False, 'contains(amy)': False, 'contains(9)': False, 'contains(headed)': False, 'contains(instance)': False, 'contains(excuse)': False, 'contains(offensive)': False, 'contains(narrative)': False, 'contains(fault)': False, 'contains(bus)': False, 'contains(f)': False, 'contains(extreme)': False, 'contains(miller)': False, 'contains(guilty)': False, 'contains(grows)': False, 'contains(overly)': False, 'contains(liners)': False, 'contains(forgotten)': False, 'contains(ahead)': False, 'contains(accept)': False, 'contains(porn)': False, 'contains(directly)': False, 'contains(helen)': False, 'contains(began)': False, 'contains(lord)': False, 'contains(folks)': False, 'contains(mediocre)': False, 'contains(bar)': False, 'contains(surface)': False, 'contains(super)': False, 'contains(failure)': False, 'contains(6)': False, 'contains(acted)': False, 'contains(quiet)': False, 'contains(laughable)': False, 'contains(sheer)': False, 'contains(security)': True, 'contains(emotionally)': False, 'contains(season)': False, 'contains(stuart)': False, 'contains(jail)': True, 'contains(deals)': False, 'contains(cheesy)': False, 'contains(court)': False, 'contains(beach)': False, 'contains(austin)': False, 'contains(model)': False, 'contains(outstanding)': False, 'contains(substance)': False, 'contains(nudity)': False, 'contains(slapstick)': False, 'contains(joan)': False, 'contains(reveal)': False, 'contains(placed)': False, 'contains(check)': False, 'contains(beast)': False, 'contains(hurt)': False, 'contains(bloody)': False, 'contains(acts)': False, 'contains(fame)': False, 'contains(meeting)': False, 'contains(nuclear)': False, 'contains(1996)': False, 'contains(strength)': False, 'contains(center)': False, 'contains(funniest)': False, 'contains(standing)': True, 'contains(damon)': False, 'contains(clich)': False, 'contains(position)': False, 'contains(desire)': False, 'contains(driven)': False, 'contains(seat)': False, 'contains(stock)': False, 'contains(wondering)': True, 'contains(realizes)': False, 'contains(dealing)': False, 'contains(taste)': False, 'contains(routine)': False, 'contains(comparison)': False, 'contains(cinematographer)': False, 'contains(seconds)': False, 'contains(singing)': False, 'contains(gangster)': True, 'contains(responsible)': False, 'contains(football)': False, 'contains(remarkable)': False, 'contains(hunting)': False, 'contains(adams)': False, 'contains(fly)': False, 'contains(suspects)': False, 'contains(treat)': False, 'contains(hopes)': False, 'contains(heaven)': False, 'contains(myers)': False, 'contains(struggle)': False, 'contains(costumes)': False, 'contains(beat)': False, 'contains(happening)': False, 'contains(skills)': False, 'contains(ugly)': False, 'contains(figures)': False, 'contains(thoroughly)': False, 'contains(ill)': False, 'contains(surprises)': False, 'contains(player)': False, 'contains(rival)': False, 'contains(guard)': True, 'contains(anthony)': False, 'contains(strike)': False, 'contains(community)': False, 'contains(streets)': False, 'contains(hopkins)': False, 'contains(ended)': False, 'contains(originally)': False, 'contains(sarah)': False, 'contains(creative)': False, 'contains(characterization)': False, 'contains(thankfully)': False, 'contains(growing)': False, 'contains(sharp)': False, 'contains(williamson)': False, 'contains(eccentric)': False, 'contains(explained)': False, 'contains(hey)': False, 'contains(claire)': False, 'contains(steal)': False, 'contains(inevitable)': False, 'contains(joel)': False, 'contains(core)': False, 'contains(weren)': False, 'contains(sorry)': False, 'contains(built)': False, 'contains(anne)': False, 'contains(breaking)': False, 'contains(villains)': False, 'contains(critic)': False, 'contains(lets)': False, 'contains(visit)': False, 'contains(followed)': False}
featuresets = [(document_features(d), c) for (d,c) in documents]
train_set, test_set = featuresets[100:], featuresets[:100]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print("Naive Bayes accuracy with 2000 bag of words features is %s" % (nltk.classify.accuracy(classifier, test_set)))
classifier.show_most_informative_features(5)
Naive Bayes accuracy with 2000 bag of words features is 0.81 Most Informative Features contains(outstanding) = True pos : neg = 10.8 : 1.0 contains(mulan) = True pos : neg = 9.0 : 1.0 contains(seagal) = True neg : pos = 8.2 : 1.0 contains(wonderfully) = True pos : neg = 7.1 : 1.0 contains(lame) = True neg : pos = 6.1 : 1.0
classifier.show_most_informative_features(20)
Most Informative Features contains(outstanding) = True pos : neg = 10.8 : 1.0 contains(mulan) = True pos : neg = 9.0 : 1.0 contains(seagal) = True neg : pos = 8.2 : 1.0 contains(wonderfully) = True pos : neg = 7.1 : 1.0 contains(lame) = True neg : pos = 6.1 : 1.0 contains(damon) = True pos : neg = 6.1 : 1.0 contains(flynt) = True pos : neg = 5.6 : 1.0 contains(awful) = True neg : pos = 5.3 : 1.0 contains(poorly) = True neg : pos = 5.3 : 1.0 contains(wasted) = True neg : pos = 5.3 : 1.0 contains(ridiculous) = True neg : pos = 5.1 : 1.0 contains(waste) = True neg : pos = 4.8 : 1.0 contains(era) = True pos : neg = 4.8 : 1.0 contains(damme) = True neg : pos = 4.7 : 1.0 contains(worst) = True neg : pos = 4.3 : 1.0 contains(unfunny) = True neg : pos = 4.3 : 1.0 contains(bland) = True neg : pos = 4.2 : 1.0 contains(allows) = True pos : neg = 4.1 : 1.0 contains(stupid) = True neg : pos = 3.9 : 1.0 contains(portrayal) = True pos : neg = 3.9 : 1.0
We can consider the task of POS tagging as a classification task and use the classifier methodology described here. Let us revisit the POS tagging task discussed in the first lecture using the new tools we have developed.
from nltk.corpus import brown
suffix_fdist = nltk.FreqDist()
for word in brown.words():
word = word.lower()
suffix_fdist[word[-1:]] += 1
suffix_fdist[word[-2:]] += 1
suffix_fdist[word[-3:]] += 1
common_suffixes = [suffix for (suffix, count) in suffix_fdist.most_common(100)]
print(common_suffixes)
['e', ',', '.', 's', 'd', 't', 'he', 'n', 'a', 'of', 'the', 'y', 'r', 'to', 'in', 'f', 'o', 'ed', 'nd', 'is', 'on', 'l', 'g', 'and', 'ng', 'er', 'as', 'ing', 'h', 'at', 'es', 'or', 're', 'it', '``', 'an', "''", 'm', ';', 'i', 'ly', 'ion', 'en', 'al', '?', 'nt', 'be', 'hat', 'st', 'his', 'th', 'll', 'le', 'ce', 'by', 'ts', 'me', 've', "'", 'se', 'ut', 'was', 'for', 'ent', 'ch', 'k', 'w', 'ld', '`', 'rs', 'ted', 'ere', 'her', 'ne', 'ns', 'ith', 'ad', 'ry', ')', '(', 'te', '--', 'ay', 'ty', 'ot', 'p', 'nce', "'s", 'ter', 'om', 'ss', ':', 'we', 'are', 'c', 'ers', 'uld', 'had', 'so', 'ey']
def pos_features(word):
features = {}
for suffix in common_suffixes:
features['endswith(%s)' % suffix] = word.lower().endswith(suffix)
return features
tagged_words = brown.tagged_words(categories='news', tagset='universal')
featuresets = [(pos_features(n), g) for (n,g) in tagged_words]
size = int(len(featuresets) * 0.1)
train_set, test_set = featuresets[size:], featuresets[:size]
classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)
0.7011437095972153
classifier.classify(pos_features('cats'))
'NOUN'
classifier.classify(pos_features('books'))
'NOUN'
classifier.show_most_informative_features(20)
Most Informative Features endswith(the) = True DET : NOUN = 3416.9 : 1.0 endswith(.) = True . : ADP = 2481.6 : 1.0 endswith(to) = True PRT : ADJ = 2138.0 : 1.0 endswith(f) = True ADP : VERB = 2050.5 : 1.0 endswith(he) = True DET : NOUN = 1808.9 : 1.0 endswith(and) = True CONJ : ADV = 1642.0 : 1.0 endswith(a) = True DET : VERB = 1597.1 : 1.0 endswith(of) = True ADP : NOUN = 1406.9 : 1.0 endswith(his) = True DET : NOUN = 728.0 : 1.0 endswith(ut) = True CONJ : DET = 694.7 : 1.0 endswith(nd) = True CONJ : NUM = 636.1 : 1.0 endswith(hat) = True PRON : NOUN = 570.6 : 1.0 endswith(ey) = True PRON : VERB = 549.0 : 1.0 endswith(i) = True PRON : ADP = 547.2 : 1.0 endswith(') = True . : VERB = 503.7 : 1.0 endswith(o) = True PRT : ADJ = 493.4 : 1.0 endswith(es) = True NOUN : ADP = 427.0 : 1.0 endswith(uld) = True VERB : NOUN = 422.5 : 1.0 endswith(we) = True PRON : NOUN = 353.5 : 1.0 endswith(ted) = True VERB : NOUN = 337.9 : 1.0
NLTK provides a common interface to different classifier algorithms. This is illustrated in the following examples.
import nltk
train = [
(dict(a=1,b=1,c=1), 'y'),
(dict(a=1,b=1,c=1), 'x'),
(dict(a=1,b=1,c=0), 'y'),
(dict(a=0,b=1,c=1), 'x'),
(dict(a=0,b=1,c=1), 'y'),
(dict(a=0,b=0,c=1), 'y'),
(dict(a=0,b=1,c=0), 'x'),
(dict(a=0,b=0,c=0), 'x'),
(dict(a=0,b=1,c=1), 'y'),
]
test = [
(dict(a=1,b=0,c=1)), # unseen
(dict(a=1,b=0,c=0)), # unseen
(dict(a=0,b=1,c=1)), # seen 3 times, labels=y,y,x
(dict(a=0,b=1,c=0)), # seen 1 time, label=x
]
classifier = nltk.classify.NaiveBayesClassifier.train(train)
sorted(classifier.labels())
['x', 'y']
classifier.classify_many(test)
['y', 'x', 'y', 'x']
for pdist in classifier.prob_classify_many(test):
print('%.4f %.4f' % (pdist.prob('x'), pdist.prob('y')))
0.3203 0.6797 0.5857 0.4143 0.3792 0.6208 0.6470 0.3530
classifier.show_most_informative_features()
Most Informative Features c = 0 x : y = 2.0 : 1.0 c = 1 y : x = 1.5 : 1.0 a = 1 y : x = 1.4 : 1.0 b = 0 x : y = 1.2 : 1.0 a = 0 x : y = 1.2 : 1.0 b = 1 y : x = 1.1 : 1.0
classifier = nltk.classify.DecisionTreeClassifier.train(
train, entropy_cutoff=0, support_cutoff=0)
sorted(classifier.labels())
['x', 'y']
print(classifier)
c=0? .................................................. x a=0? ................................................ x a=1? ................................................ y c=1? .................................................. y
classifier.classify_many(test)
['y', 'y', 'y', 'x']
There is no prob() method for decision tree classifiers, as they do not provide a probability interpretation.
NLTK provides an interface to the Scikit-learn (sklearn) classifiers - including maximum entropy and SVM.
from nltk.classify import SklearnClassifier
train_data = [({"a": 4, "b": 1, "c": 0}, "ham"),
({"a": 5, "b": 2, "c": 1}, "ham"),
({"a": 0, "b": 3, "c": 4}, "spam"),
({"a": 5, "b": 1, "c": 1}, "ham"),
({"a": 1, "b": 4, "c": 3}, "spam")]
test_data = [{"a": 3, "b": 2, "c": 1},
{"a": 0, "b": 3, "c": 7}]
from sklearn.naive_bayes import BernoulliNB
classif = SklearnClassifier(BernoulliNB()).train(train_data)
classif.classify_many(test_data)
['ham', 'spam']
from sklearn.svm import SVC
classif = SklearnClassifier(SVC(gamma='scale'), sparse=False).train(train_data)
classif.classify_many(test_data)
['ham', 'spam']
# Using the sklearn classifier:
X = [[0], [1], [2], [3]]
Y = [0, 1, 2, 3]
clf = SVC(kernel='linear', C=1.0)
clf.fit(X, Y)
SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, decision_function_shape='ovr', degree=3, gamma='auto_deprecated', kernel='linear', max_iter=-1, probability=False, random_state=None, shrinking=True, tol=0.001, verbose=False)
classifr = SklearnClassifier(SVC(kernel='rbf', C=1.0, gamma='scale'), sparse=False).train(train_data)
classifr.classify_many(test_data)
['ham', 'spam']
from sklearn.svm import LinearSVC
classif_ova = SklearnClassifier(LinearSVC(C=1.0), sparse=False).train(train_data)
classif_ova.classify_many(test_data)
['ham', 'spam']
The key parameter to optimize for a given SVM kernel is the C parameter. Here is example code from sklearn that shows how to optimize C on a development set.
%matplotlib inline
import numpy as np
from sklearn import model_selection, datasets, svm
digits = datasets.load_digits()
X = digits.data
y = digits.target
svc = svm.SVC(kernel='linear')
C_s = np.logspace(-10, 0, 10)
scores = list()
scores_std = list()
for C in C_s:
svc.C = C
this_scores = model_selection.cross_val_score(svc, X, y, n_jobs=1, cv=5)
scores.append(np.mean(this_scores))
scores_std.append(np.std(this_scores))
# Do the plotting
import matplotlib.pyplot as plt
plt.figure(1, figsize=(4, 3))
plt.clf()
plt.semilogx(C_s, scores)
plt.semilogx(C_s, np.array(scores) + np.array(scores_std), 'b--')
plt.semilogx(C_s, np.array(scores) - np.array(scores_std), 'b--')
locs, labels = plt.yticks()
plt.yticks(locs, list(map(lambda x: "%g" % x, locs)))
plt.ylabel('CV score')
plt.xlabel('Parameter C')
plt.ylim(0, 1.1)
plt.show()