In this notebook, we demonstrate how to apply classification tools to solve NLP tasks.
We start with a very simple task that looks at words in isolation and assigns each of them one of two labels: gender identification. The task consists of guessing whether a name is masculine or feminine.
The classification method consists of taking as input an observation, turning this observation into a feature vector, then predicting the label of this feature vector by applying a trained classifier model.
To prepare for this procedure, we must train a classifier. In supervised learning, a classifier is learned by generalizing from a set of observed pairs $(\text{observation}_i, \text{label}_i)$, where $i = 1, \dots, N$.
%matplotlib inline
def gender_features(word):
    """Return the single feature used by the baseline gender classifier.

    Args:
        word: The name to describe.

    Returns:
        A dict with one key, 'last_letter', holding the final character of
        ``word`` (case preserved), or '' when ``word`` is empty.
    """
    # Slice instead of index: word[-1] raises IndexError on '', whereas
    # word[-1:] yields '' and gives the same result for non-empty input.
    return {'last_letter': word[-1:]}
# Sanity check: show the feature dict extracted for one sample name.
gender_features('Shrek')
{'last_letter': 'k'}
from nltk.corpus import names
# Pair every name with the label of the corpus file it was read from
# ('male.txt' -> 'male', 'female.txt' -> 'female') and concatenate both lists.
labeled_names = ([(name, 'male') for name in names.words('male.txt')] + [(name, 'female') for name in names.words('female.txt')])
import random
# Shuffle in place so that the slices taken further down are random samples.
# No seed is set, so the order (and the reported scores) vary between runs.
random.shuffle(labeled_names)
print("There are %s samples in the dataset." % (len(labeled_names)))
There are 7944 samples in the dataset.
import nltk
# Turn each (name, gender) pair into a (feature dict, gender) pair.
featuresets = [(gender_features(n), gender) for (n, gender) in labeled_names]
# Hold out the first 500 examples for testing; train on everything after them.
train_set, test_set = featuresets[500:], featuresets[:500]
classifier = nltk.NaiveBayesClassifier.train(train_set)
# Try the trained classifier on two sample names.
print("Neo is classified as %s" % (classifier.classify(gender_features('Neo'))))
print("Trinity is classified as %s" % (classifier.classify(gender_features('Trinity'))))
Neo is classified as male Trinity is classified as female
# Accuracy of the classifier on the 500 held-out test examples.
print(nltk.classify.accuracy(classifier, test_set))
0.77
# List the 5 features the trained classifier considers most informative.
classifier.show_most_informative_features(5)
Most Informative Features last_letter = 'a' female : male = 34.2 : 1.0 last_letter = 'k' male : female = 32.7 : 1.0 last_letter = 'f' male : female = 15.9 : 1.0 last_letter = 'p' male : female = 12.6 : 1.0 last_letter = 'v' male : female = 11.2 : 1.0
def gender_features2(name):
    """Return a richer, case-insensitive feature dict for a name.

    Args:
        name: The name to describe.

    Returns:
        A dict containing 'first_letter' and 'last_letter' (lowercased; ''
        when ``name`` is empty) and, for every letter a-z, the entries
        'count(<letter>)' (occurrences in the lowercased name) and
        'has(<letter>)' (True when the letter occurs at least once).
    """
    # Lowercase once up front rather than twice per letter inside the loop.
    lowered = name.lower()
    features = {}
    # Slices rather than indexes so that an empty name yields '' instead of
    # raising IndexError; for non-empty names the values are unchanged.
    features["first_letter"] = name[:1].lower()
    features["last_letter"] = name[-1:].lower()
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        occurrences = lowered.count(letter)
        features["count(%s)" % letter] = occurrences
        features["has(%s)" % letter] = occurrences > 0
    return features
# Sanity check: show the full feature dict produced for one sample name.
gender_features2('John')
{'count(a)': 0, 'count(b)': 0, 'count(c)': 0, 'count(d)': 0, 'count(e)': 0, 'count(f)': 0, 'count(g)': 0, 'count(h)': 1, 'count(i)': 0, 'count(j)': 1, 'count(k)': 0, 'count(l)': 0, 'count(m)': 0, 'count(n)': 1, 'count(o)': 1, 'count(p)': 0, 'count(q)': 0, 'count(r)': 0, 'count(s)': 0, 'count(t)': 0, 'count(u)': 0, 'count(v)': 0, 'count(w)': 0, 'count(x)': 0, 'count(y)': 0, 'count(z)': 0, 'first_letter': 'j', 'has(a)': False, 'has(b)': False, 'has(c)': False, 'has(d)': False, 'has(e)': False, 'has(f)': False, 'has(g)': False, 'has(h)': True, 'has(i)': False, 'has(j)': True, 'has(k)': False, 'has(l)': False, 'has(m)': False, 'has(n)': True, 'has(o)': True, 'has(p)': False, 'has(q)': False, 'has(r)': False, 'has(s)': False, 'has(t)': False, 'has(u)': False, 'has(v)': False, 'has(w)': False, 'has(x)': False, 'has(y)': False, 'has(z)': False, 'last_letter': 'n'}
# Repeat the experiment with the richer gender_features2 extractor.
# labeled_names has not been reshuffled, so the 500-example hold-out split
# contains the same names as before.
featuresets = [(gender_features2(n), gender) for (n, gender) in labeled_names]
train_set, test_set = featuresets[500:], featuresets[:500]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))
0.768
# Three-way split of the shuffled names: the first 500 are the test set, the
# next 1000 the dev-test set, and everything from index 1500 on is training data.
train_names = labeled_names[1500:]
devtest_names = labeled_names[500:1500]
test_names = labeled_names[:500]
# Features here come from gender_features, not gender_features2.
train_set = [(gender_features(n), gender) for (n, gender) in train_names]
devtest_set = [(gender_features(n), gender) for (n, gender) in devtest_names]
test_set = [(gender_features(n), gender) for (n, gender) in test_names]
classifier = nltk.NaiveBayesClassifier.train(train_set)
# Score on the dev-test set; test_set is built but not used in this cell.
print(nltk.classify.accuracy(classifier, devtest_set))
0.761
# Error analysis: record every dev-test name whose predicted label differs
# from its gold label, then print the mistakes in sorted order
# (tuples sort by gold label, then guess, then name).
errors = []
for (name, tag) in devtest_names:
    guess = classifier.classify(gender_features(name))
    if guess == tag:
        continue  # classified correctly; nothing to record
    errors.append((tag, guess, name))

for (tag, guess, name) in sorted(errors):
    print('correct=%-8s guess=%-8s name=%-30s' % (tag, guess, name))
correct=female guess=male name=Allsun correct=female guess=male name=Allys correct=female guess=male name=Alyson correct=female guess=male name=Arleen correct=female guess=male name=Aryn correct=female guess=male name=Bette-Ann correct=female guess=male name=Bird correct=female guess=male name=Birgit correct=female guess=male name=Brigid correct=female guess=male name=Brynn correct=female guess=male name=Carolin correct=female guess=male name=Carrol correct=female guess=male name=Charis correct=female guess=male name=Charyl correct=female guess=male name=Cherilyn correct=female guess=male name=Christel correct=female guess=male name=Chrysler correct=female guess=male name=Coreen correct=female guess=male name=Cristabel correct=female guess=male name=Cristen correct=female guess=male name=Crystal correct=female guess=male name=Cybill correct=female guess=male name=Daloris correct=female guess=male name=Damaris correct=female guess=male name=Daniel correct=female guess=male name=Darb correct=female guess=male name=Demeter correct=female guess=male name=Devon correct=female guess=male name=Dido correct=female guess=male name=Doralin correct=female guess=male name=Easter correct=female guess=male name=Eden correct=female guess=male name=Eleanor correct=female guess=male name=Elinor correct=female guess=male name=Em correct=female guess=male name=Estel correct=female guess=male name=Evangelin correct=female guess=male name=Fawn correct=female guess=male name=Flo correct=female guess=male name=Flor correct=female guess=male name=Floris correct=female guess=male name=Frances correct=female guess=male name=Gayleen correct=female guess=male name=Gen correct=female guess=male name=Gillian correct=female guess=male name=Gilligan correct=female guess=male name=Glynnis correct=female guess=male name=Gwen correct=female guess=male name=Gwyn correct=female guess=male name=Idell correct=female guess=male name=Ingeborg correct=female guess=male name=Ivett correct=female guess=male 
name=Jasmin correct=female guess=male name=Jerrilyn correct=female guess=male name=Jewell correct=female guess=male name=Jilleen correct=female guess=male name=Joellyn correct=female guess=male name=Jonell correct=female guess=male name=Jordan correct=female guess=male name=Josselyn correct=female guess=male name=Joyan correct=female guess=male name=Kaster correct=female guess=male name=Katheleen correct=female guess=male name=Kaylyn correct=female guess=male name=Kellen correct=female guess=male name=Keren correct=female guess=male name=Kip correct=female guess=male name=Kirstin correct=female guess=male name=Kristal correct=female guess=male name=Kym correct=female guess=male name=Leann correct=female guess=male name=Lillian correct=female guess=male name=Linet correct=female guess=male name=Linn correct=female guess=male name=Loreen correct=female guess=male name=Lurleen correct=female guess=male name=Lyn correct=female guess=male name=Magdalen correct=female guess=male name=Margalit correct=female guess=male name=Marget correct=female guess=male name=Marion correct=female guess=male name=Marit correct=female guess=male name=Marylin correct=female guess=male name=Mavis correct=female guess=male name=Melisent correct=female guess=male name=Meridel correct=female guess=male name=Mureil correct=female guess=male name=Nadeen correct=female guess=male name=Nariko correct=female guess=male name=Nell correct=female guess=male name=Nichol correct=female guess=male name=Noel correct=female guess=male name=Noell correct=female guess=male name=Noelyn correct=female guess=male name=Norean correct=female guess=male name=Noreen correct=female guess=male name=Persis correct=female guess=male name=Phil correct=female guess=male name=Phyllis correct=female guess=male name=Phyllys correct=female guess=male name=Pris correct=female guess=male name=Rahal correct=female guess=male name=Rakel correct=female guess=male name=Rayshell correct=female guess=male name=Rhianon 
correct=female guess=male name=Rosabel correct=female guess=male name=Roz correct=female guess=male name=Rozalin correct=female guess=male name=Sam correct=female guess=male name=Sara-Ann correct=female guess=male name=Sharon correct=female guess=male name=Shirleen correct=female guess=male name=Stoddard correct=female guess=male name=Storm correct=female guess=male name=Sydel correct=female guess=male name=Teriann correct=female guess=male name=Viviyan correct=female guess=male name=Wren correct=male guess=female name=Abby correct=male guess=female name=Alix correct=male guess=female name=Amory correct=male guess=female name=Andie correct=male guess=female name=Baillie correct=male guess=female name=Baily correct=male guess=female name=Bela correct=male guess=female name=Boniface correct=male guess=female name=Brady correct=male guess=female name=Brice correct=male guess=female name=Cary correct=male guess=female name=Case correct=male guess=female name=Chane correct=male guess=female name=Chase correct=male guess=female name=Chauncey correct=male guess=female name=Chrisy correct=male guess=female name=Cobbie correct=male guess=female name=Connie correct=male guess=female name=Corey correct=male guess=female name=Danie correct=male guess=female name=Dannie correct=male guess=female name=Darcy correct=male guess=female name=Darth correct=male guess=female name=Davie correct=male guess=female name=Dietrich correct=male guess=female name=Dominique correct=male guess=female name=Dory correct=male guess=female name=Durante correct=male guess=female name=Eugene correct=male guess=female name=Fonzie correct=male guess=female name=Gay correct=male guess=female name=Geoffrey correct=male guess=female name=Gerri correct=male guess=female name=Giovanni correct=male guess=female name=Giuseppe correct=male guess=female name=Godfree correct=male guess=female name=Guy correct=male guess=female name=Hadleigh correct=male guess=female name=Haley correct=male guess=female 
name=Herby correct=male guess=female name=Herculie correct=male guess=female name=Hezekiah correct=male guess=female name=Hillery correct=male guess=female name=Hirsch correct=male guess=female name=Hurley correct=male guess=female name=Hy correct=male guess=female name=Hymie correct=male guess=female name=Isa correct=male guess=female name=Isidore correct=male guess=female name=Jean-Christophe correct=male guess=female name=Jefferey correct=male guess=female name=Jeffie correct=male guess=female name=Jermaine correct=male guess=female name=Jessey correct=male guess=female name=Jimmy correct=male guess=female name=Johny correct=male guess=female name=Jonah correct=male guess=female name=Joseph correct=male guess=female name=Kennedy correct=male guess=female name=Lanny correct=male guess=female name=Larry correct=male guess=female name=Lorne correct=male guess=female name=Lorrie correct=male guess=female name=Maurice correct=male guess=female name=Maurise correct=male guess=female name=Mischa correct=male guess=female name=Monty correct=male guess=female name=Moore correct=male guess=female name=Mordecai correct=male guess=female name=Morlee correct=male guess=female name=Morley correct=male guess=female name=Nichole correct=male guess=female name=Nicky correct=male guess=female name=Niki correct=male guess=female name=Nikita correct=male guess=female name=Noble correct=male guess=female name=Pace correct=male guess=female name=Pearce correct=male guess=female name=Pepe correct=male guess=female name=Petey correct=male guess=female name=Pierce correct=male guess=female name=Prentice correct=male guess=female name=Price correct=male guess=female name=Pryce correct=male guess=female name=Reece correct=male guess=female name=Reza correct=male guess=female name=Rickey correct=male guess=female name=Riley correct=male guess=female name=Ritch correct=male guess=female name=Rory correct=male guess=female name=Ruddy correct=male guess=female name=Rudolph correct=male 
guess=female name=Sayre correct=male guess=female name=Scotti correct=male guess=female name=Sheffie correct=male guess=female name=Shelby correct=male guess=female name=Sloane correct=male guess=female name=Solly correct=male guess=female name=Spense correct=male guess=female name=Stacy correct=male guess=female name=Tabbie correct=male guess=female name=Tally correct=male guess=female name=Tedie correct=male guess=female name=Terrance correct=male guess=female name=Terri correct=male guess=female name=Tony correct=male guess=female name=Torrance correct=male guess=female name=Uri correct=male guess=female name=Virge correct=male guess=female name=Vite correct=male guess=female name=Walsh correct=male guess=female name=Welbie correct=male guess=female name=Westley correct=male guess=female name=Winny correct=male guess=female name=Worth correct=male guess=female name=Yancy correct=male guess=female name=Zacherie correct=male guess=female name=Zebadiah correct=male guess=female name=Zechariah correct=male guess=female name=Zedekiah correct=male guess=female name=Zollie
def gender_features(word):
    """Describe a name by its one- and two-character endings.

    Args:
        word: The name to describe.

    Returns:
        A dict with 'suffix1' (the last character of ``word``) and 'suffix2'
        (its last two characters). Both are taken by slicing, so a name
        shorter than the suffix length gives a shorter (possibly empty) value.
    """
    features = {}
    features['suffix1'] = word[-1:]
    features['suffix2'] = word[-2:]
    return features
# Rebuild the training and dev-test sets with the redefined gender_features
# (suffix1 / suffix2) and retrain. test_set is not rebuilt in this cell.
train_set = [(gender_features(n), gender) for (n, gender) in train_names]
devtest_set = [(gender_features(n), gender) for (n, gender) in devtest_names]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, devtest_set))
0.773
We now turn our attention to classifying full documents as opposed to single words in isolation.
The task seems more challenging, but simple methods can achieve surprisingly good results when the task is well defined. Consider the task of predicting whether a movie review is positive or negative. This is a task called sentiment analysis and is a hot practical task in the era of user-generated content (UGC) on the Web.
A good dataset is available in NLTK to experiment with this task.
from nltk.corpus import movie_reviews
# One (word list, category) pair per review file, for every category in the corpus.
documents = [(list(movie_reviews.words(fileid)), category)
for category in movie_reviews.categories()
for fileid in movie_reviews.fileids(category)]
# Shuffle in place; no seed is set, so the order differs between runs.
random.shuffle(documents)
# Frequency distribution over the lowercased words of the whole corpus.
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
# The 2000 most frequent words form the feature vocabulary read by document_features.
word_features = [w for (w, c) in all_words.most_common(2000)]
def document_features(document):
    """Map a document to boolean word-presence features.

    Args:
        document: An iterable of (hashable) word tokens.

    Returns:
        A dict with one 'contains(<word>)' entry for each word in the
        module-level ``word_features`` list; the value is True when that
        word occurs in ``document`` and False otherwise.
    """
    # Build the set once so every membership test below is a hash lookup.
    present = set(document)
    return {'contains(%s)' % word: (word in present) for word in word_features}
# Corpus statistics: number of documents, distinct words (B) and total tokens (N).
print("There are %d documents in the movie reviews dataset." % (len(documents)))
print("There are %d distinct words in the dataset." % (all_words.B()))
print("There are %d tokens in the dataset." % (all_words.N()))
# Tabulate the counts of 20 words and plot 100 of them
# (presumably the most frequent ones; confirm against the FreqDist docs).
all_words.tabulate(20)
all_words.plot(100)
There are 2000 documents in the movie reviews dataset. There are 39768 distinct words in the dataset. There are 1583820 tokens in the dataset. , the . a and of to ' is in s " it that - ) ( as with for 77717 76529 65876 38106 35576 34123 31937 30585 25195 21822 18513 17612 16107 15924 15595 11781 11664 11378 10792 9961
# Show the feature dict extracted from a single review file.
print(document_features(movie_reviews.words('pos/cv957_8737.txt')))
{'contains(hill)': False, 'contains(reason)': False, 'contains(music)': False, 'contains(impact)': False, 'contains(calls)': False, 'contains(roberts)': False, 'contains(lucas)': False, 'contains(taken)': False, 'contains(mad)': False, 'contains(jackson)': False, 'contains(m)': False, 'contains(band)': False, 'contains(what)': True, 'contains(addition)': False, 'contains(various)': False, 'contains(vincent)': False, 'contains(press)': False, 'contains(perhaps)': False, 'contains(return)': False, 'contains(appear)': False, 'contains(among)': True, 'contains(why)': False, 'contains(recommend)': False, 'contains(sequences)': False, 'contains(william)': False, 'contains(creating)': False, 'contains(died)': False, 'contains(state)': False, 'contains(literally)': False, 'contains(common)': False, 'contains(suddenly)': False, 'contains(predictable)': False, 'contains(continue)': False, 'contains(called)': False, 'contains(fellow)': False, 'contains(jokes)': False, 'contains(open)': False, 'contains(hits)': False, 'contains(killer)': False, 'contains(pretty)': False, 'contains(lord)': False, 'contains(serious)': False, 'contains(accept)': False, 'contains(follow)': False, 'contains(police)': False, 'contains(giant)': False, 'contains(husband)': False, 'contains(wondering)': True, 'contains(act)': False, 'contains(spawn)': False, 'contains(other)': True, 'contains(nobody)': False, 'contains(city)': False, 'contains(gary)': False, 'contains(provide)': False, 'contains(talk)': True, 'contains(blade)': False, 'contains(hero)': False, 'contains(mouth)': False, 'contains(affleck)': False, 'contains(whom)': False, 'contains(nature)': False, 'contains(so)': False, 'contains(plays)': False, 'contains(turning)': False, 'contains(culture)': False, 'contains(classic)': False, 'contains(fire)': False, 'contains(reviews)': False, 'contains(long)': False, 'contains(mix)': False, 'contains(fairly)': True, 'contains(neither)': False, 'contains(earlier)': False, 'contains(be)': True, 
'contains(apes)': False, 'contains(title)': False, 'contains(annoying)': False, 'contains(dull)': False, 'contains(search)': False, 'contains(oh)': False, 'contains(person)': False, 'contains(truth)': False, 'contains(taking)': False, 'contains(breaking)': False, 'contains(attractive)': True, 'contains(babe)': False, 'contains(trek)': False, 'contains(important)': False, 'contains(writing)': False, 'contains(comedy)': True, 'contains(responsible)': False, 'contains(expect)': False, 'contains(family)': False, 'contains(routine)': False, 'contains(immediately)': False, 'contains(pleasure)': False, 'contains(starship)': False, 'contains(entertainment)': False, 'contains(sexual)': False, 'contains(relationship)': False, 'contains(super)': False, 'contains(meeting)': False, 'contains(whose)': False, 'contains(desire)': False, 'contains(sub)': False, 'contains(guard)': True, 'contains(sharp)': False, 'contains(appealing)': False, 'contains(visually)': False, 'contains(credits)': False, 'contains(front)': False, 'contains(plus)': False, 'contains(stupid)': False, 'contains(hoping)': False, 'contains(clear)': False, 'contains(sitting)': False, 'contains(smith)': False, 'contains(growing)': False, 'contains(looking)': False, 'contains(scary)': False, 'contains(weird)': False, 'contains(cartoon)': False, 'contains(contrived)': False, 'contains(married)': False, 'contains(gore)': False, 'contains(without)': False, 'contains(college)': False, 'contains(terrific)': False, 'contains(documentary)': False, 'contains(run)': False, 'contains(emotional)': False, 'contains(after)': False, 'contains(david)': False, 'contains(tony)': False, 'contains(flaws)': False, 'contains(sandler)': False, 'contains(track)': False, 'contains(country)': False, 'contains(pain)': False, 'contains(suspects)': False, 'contains(engaging)': False, 'contains(mind)': False, 'contains(door)': False, 'contains(set)': False, 'contains(flicks)': False, 'contains(before)': False, 'contains(older)': False, 
'contains(age)': False, 'contains(who)': True, 'contains(exciting)': False, 'contains(cross)': False, 'contains(blame)': False, 'contains(cinematographer)': False, 'contains(f)': False, 'contains(small)': False, 'contains(product)': False, 'contains(released)': False, 'contains(unfortunately)': False, 'contains(review)': False, 'contains(stories)': False, 'contains(100)': False, 'contains(partner)': False, 'contains(drug)': True, 'contains(call)': False, 'contains(ray)': False, 'contains(buddy)': False, 'contains(effects)': False, 'contains(book)': False, 'contains(academy)': False, 'contains(affair)': False, 'contains(air)': False, 'contains(stephen)': False, 'contains(loves)': False, 'contains(law)': False, 'contains(course)': True, 'contains(check)': False, 'contains(screen)': True, 'contains(breaks)': False, 'contains(1996)': False, 'contains(gives)': False, 'contains(computer)': False, 'contains(disappointing)': False, 'contains(angry)': False, 'contains(chris)': False, 'contains(rose)': False, 'contains(alex)': False, 'contains(humour)': False, 'contains(james)': False, 'contains(type)': False, 'contains(way)': True, 'contains(shows)': False, 'contains(full)': False, 'contains(exception)': False, 'contains(visual)': False, 'contains(private)': False, 'contains(certain)': False, 'contains(hit)': False, 'contains(thought)': False, 'contains(creative)': False, 'contains(manner)': False, 'contains(script)': False, 'contains(hopes)': False, 'contains(worth)': True, 'contains(main)': False, 'contains(magic)': False, 'contains(headed)': False, 'contains(half)': False, 'contains(hilarious)': True, 'contains(forget)': False, 'contains(brief)': True, 'contains(plane)': False, 'contains(feeling)': False, 'contains(dvd)': False, 'contains(minutes)': False, 'contains(creepy)': False, 'contains(sets)': False, 'contains(picture)': False, 'contains(did)': False, 'contains(science)': False, 'contains())': True, 'contains(lives)': False, 'contains(animals)': False, 
'contains($)': False, 'contains(speed)': False, 'contains(just)': True, 'contains(tarantino)': False, 'contains(extreme)': False, 'contains(trust)': False, 'contains(involves)': True, 'contains(cheesy)': False, 'contains(there)': True, 'contains(especially)': True, 'contains(smart)': False, 'contains(spends)': False, 'contains(spice)': False, 'contains(dennis)': False, 'contains(rather)': False, 'contains(actions)': False, 'contains(sometimes)': False, 'contains(funniest)': False, 'contains(play)': False, 'contains(experience)': False, 'contains(constantly)': False, 'contains(dad)': False, 'contains(genre)': False, 'contains(usual)': False, 'contains(ultimate)': False, 'contains(weak)': False, 'contains(sympathetic)': False, 'contains(goofy)': False, 'contains(miller)': False, 'contains(woody)': False, 'contains(;)': False, 'contains(him)': True, 'contains(talking)': False, 'contains(public)': False, 'contains(twice)': False, 'contains(disney)': False, 'contains(onto)': False, 'contains(middle)': False, 'contains(been)': False, 'contains(dying)': False, 'contains(brown)': False, 'contains(strike)': False, 'contains(ghost)': False, 'contains(rescue)': False, 'contains(de)': False, 'contains(flynt)': False, 'contains(court)': False, 'contains(critic)': False, 'contains(side)': False, 'contains(shots)': False, 'contains(notice)': False, 'contains(virtually)': False, 'contains(lucky)': False, 'contains(element)': False, 'contains(images)': False, 'contains(jail)': True, 'contains(superb)': False, 'contains(could)': False, 'contains(end)': False, 'contains(performance)': False, 'contains(cameo)': False, 'contains(us)': True, 'contains(made)': False, 'contains(virus)': False, 'contains(featuring)': False, 'contains(group)': False, 'contains(understand)': False, 'contains(fail)': False, 'contains(wayne)': False, 'contains(content)': False, 'contains(comes)': False, 'contains(were)': False, 'contains(t)': False, 'contains(process)': False, 'contains(deserves)': False, 
'contains(price)': False, 'contains(her)': False, 'contains(english)': False, 'contains(jane)': False, 'contains(change)': False, 'contains(critics)': False, 'contains(j)': False, 'contains(intelligence)': False, 'contains(people)': False, 'contains(news)': False, 'contains(camera)': False, 'contains(get)': True, 'contains(actually)': True, 'contains(acted)': False, 'contains(emotionally)': False, 'contains(jeff)': False, 'contains(changed)': False, 'contains(horrible)': False, 'contains(scream)': True, 'contains(survive)': False, 'contains(remember)': False, 'contains(head)': False, 'contains(teacher)': False, 'contains(urban)': False, 'contains(certainly)': False, 'contains(mention)': False, 'contains(=)': False, 'contains(ted)': False, 'contains(bus)': False, 'contains(they)': True, 'contains(dumb)': False, 'contains(marriage)': False, 'contains(terms)': False, 'contains(re)': True, 'contains(hair)': False, 'contains(shooting)': False, 'contains(keeping)': False, 'contains(lacks)': False, 'contains(self)': False, 'contains(physical)': True, 'contains(direct)': False, 'contains(okay)': False, 'contains(gets)': True, 'contains(if)': True, 'contains(driving)': False, 'contains(wrote)': False, 'contains(odd)': False, 'contains(tarzan)': False, 'contains(led)': False, 'contains(everything)': True, 'contains(stage)': False, 'contains(world)': True, 'contains(three)': False, 'contains(sea)': False, 'contains(position)': False, 'contains(crash)': False, 'contains(test)': False, 'contains(attention)': False, 'contains(development)': False, 'contains(determined)': False, 'contains(considering)': False, 'contains(have)': True, 'contains(laugh)': True, 'contains(likes)': False, 'contains(substance)': False, 'contains(an)': True, 'contains(born)': False, 'contains(sister)': False, 'contains(material)': False, 'contains(die)': False, 'contains(1)': False, 'contains(points)': False, 'contains(needs)': False, 'contains(date)': False, 'contains(how)': True, 'contains(couple)': 
False, 'contains(high)': False, 'contains(jedi)': False, 'contains(scorsese)': False, 'contains(actor)': False, 'contains(owner)': False, 'contains(devil)': False, 'contains(ultimately)': False, 'contains(christopher)': False, 'contains(20)': False, 'contains(feature)': False, 'contains(tension)': False, 'contains(boyfriend)': False, 'contains(lose)': False, 'contains(billy)': False, 'contains(successful)': False, 'contains(x)': False, 'contains(falling)': False, 'contains(mediocre)': False, 'contains(moment)': False, 'contains(information)': False, 'contains(right)': False, "contains(')": True, 'contains(sight)': False, 'contains(supporting)': False, 'contains(aren)': False, 'contains(frank)': False, 'contains(none)': False, 'contains(five)': False, 'contains(wit)': False, 'contains(early)': True, 'contains(d)': False, 'contains(followed)': False, 'contains(become)': False, 'contains(alien)': False, 'contains(telling)': False, 'contains(want)': False, 'contains(issues)': False, 'contains(1998)': False, 'contains(girls)': False, 'contains(king)': False, 'contains(treat)': False, 'contains(said)': False, 'contains(focus)': False, 'contains(famous)': False, 'contains(worked)': False, 'contains(secret)': False, 'contains(presented)': False, 'contains(soul)': False, 'contains(fighting)': False, 'contains(disaster)': False, 'contains(further)': False, 'contains(model)': False, 'contains(tries)': True, 'contains(ed)': False, 'contains(fat)': False, 'contains(touch)': False, 'contains(earth)': False, 'contains(follows)': False, 'contains(post)': False, 'contains(hate)': False, 'contains(remarkable)': False, 'contains(within)': False, 'contains(she)': True, 'contains(dies)': False, 'contains(towards)': False, 'contains(pointless)': False, 'contains(emotion)': False, 'contains(deal)': False, 'contains(seeing)': True, 'contains(keep)': False, 'contains(keeps)': True, 'contains(theme)': False, 'contains(woods)': False, 'contains(sound)': False, 'contains(little)': True, 
'contains(sorry)': False, 'contains(came)': False, 'contains(detail)': False, 'contains(buy)': False, 'contains(bit)': False, 'contains(studio)': False, 'contains(doesn)': False, 'contains(often)': True, 'contains(indeed)': False, 'contains(beginning)': False, 'contains(struggle)': False, 'contains(000)': False, 'contains(pieces)': False, 'contains(bunch)': False, 'contains(saying)': False, 'contains(concept)': False, 'contains(soldiers)': False, 'contains(humorous)': False, 'contains(comedic)': False, 'contains(peter)': False, 'contains(includes)': False, 'contains(personality)': False, 'contains(question)': False, 'contains(joe)': False, 'contains(l)': False, 'contains(home)': False, 'contains(ll)': False, 'contains(constant)': False, 'contains(year)': False, 'contains(except)': False, 'contains(trouble)': True, 'contains(guess)': False, 'contains(award)': False, 'contains(poor)': False, 'contains(impressive)': False, 'contains(cast)': False, 'contains(teen)': False, 'contains(filmmaking)': False, 'contains(catch)': False, 'contains(basically)': False, 'contains(whole)': False, 'contains(u)': False, 'contains(sadly)': False, 'contains(decade)': False, 'contains(hidden)': False, 'contains(folks)': False, 'contains(however)': True, 'contains(animal)': False, 'contains(style)': False, 'contains(surprised)': False, 'contains(admit)': False, 'contains(project)': False, 'contains(setting)': False, 'contains(dreams)': False, 'contains(recently)': False, 'contains(eccentric)': False, 'contains(cheap)': False, 'contains(graphic)': False, 'contains(had)': False, 'contains(actress)': False, 'contains(steve)': False, 'contains(shallow)': False, 'contains(jackie)': True, 'contains(girlfriend)': True, 'contains(ended)': False, 'contains(drawn)': False, 'contains(perfect)': False, 'contains(water)': False, 'contains(princess)': False, 'contains(started)': False, 'contains(accident)': False, 'contains(ups)': False, 'contains(comic)': False, 'contains(lies)': False, 
'contains(adds)': False, 'contains(sense)': False, 'contains(about)': True, 'contains(count)': False, 'contains(killing)': False, 'contains(contains)': False, 'contains(wasn)': False, 'contains(island)': False, 'contains(kills)': False, 'contains(younger)': False, 'contains(hong)': True, 'contains(then)': True, 'contains(second)': False, 'contains(directly)': False, 'contains(store)': False, 'contains(using)': False, 'contains(times)': False, 'contains(drugs)': False, 'contains(interesting)': False, 'contains(silent)': False, 'contains(contact)': False, 'contains(your)': False, 'contains(black)': False, 'contains(watched)': False, 'contains(robert)': False, 'contains(living)': False, 'contains(promise)': False, 'contains(instead)': False, 'contains(dude)': False, 'contains(adaptation)': False, 'contains(third)': True, 'contains(likable)': False, 'contains(confusing)': False, 'contains(limited)': False, 'contains(are)': True, 'contains(example)': False, 'contains(was)': False, 'contains(doctor)': False, 'contains(song)': False, 'contains(hot)': False, 'contains(device)': False, 'contains(social)': False, 'contains(originally)': False, 'contains(thin)': False, 'contains(rocky)': False, 'contains(death)': False, 'contains(identity)': False, 'contains(totally)': False, 'contains(mulan)': False, 'contains(total)': False, 'contains(exist)': False, 'contains(hard)': False, 'contains(see)': False, 'contains(perfectly)': False, 'contains(lines)': False, 'contains(carter)': False, 'contains(subject)': False, 'contains(audiences)': False, 'contains(player)': False, 'contains(rarely)': False, 'contains(incredible)': False, 'contains(plain)': False, 'contains(edward)': False, 'contains(popular)': False, 'contains(vampires)': False, 'contains(community)': False, 'contains(")': True, 'contains(natural)': False, 'contains(cute)': False, 'contains(art)': False, 'contains(-)': True, 'contains(walk)': False, 'contains(spent)': False, 'contains(street)': False, 'contains(meaning)': 
False, 'contains(wide)': False, 'contains(rising)': False, 'contains(etc)': False, 'contains(merely)': False, 'contains(opens)': False, 'contains(soon)': False, 'contains(always)': False, 'contains(seven)': False, 'contains(g)': False, 'contains(officer)': False, 'contains(standard)': False, 'contains(actors)': False, 'contains(accent)': False, 'contains(present)': False, 'contains(themselves)': False, 'contains(happen)': False, 'contains(two)': True, 'contains(martial)': False, 'contains(male)': False, 'contains(introduced)': False, 'contains(appreciate)': False, 'contains(williamson)': False, 'contains(starring)': False, 'contains(when)': True, 'contains(driver)': False, 'contains(rating)': False, 'contains(baby)': False, 'contains(during)': False, 'contains(sean)': False, 'contains(of)': True, 'contains(general)': False, 'contains(leaving)': False, 'contains(scene)': True, 'contains(deliver)': False, 'contains(r)': False, 'contains(makes)': False, 'contains(tough)': False, 'contains(hold)': False, 'contains(individual)': False, 'contains(liked)': False, 'contains(remake)': False, 'contains(bill)': False, 'contains(good)': False, 'contains(order)': False, 'contains(dealing)': False, 'contains(ones)': False, 'contains(include)': False, 'contains(effect)': False, 'contains(meant)': False, 'contains(writer)': False, 'contains(succeeds)': False, 'contains(helps)': False, 'contains(matter)': False, 'contains(typical)': False, 'contains(radio)': False, 'contains(villains)': False, 'contains(sent)': False, 'contains(sexy)': False, 'contains(happens)': False, 'contains(ago)': False, 'contains(roles)': False, 'contains(bloody)': False, 'contains(body)': False, 'contains(later)': False, 'contains(war)': False, 'contains(?)': False, 'contains(clich)': False, 'contains(game)': False, 'contains(and)': True, 'contains(soundtrack)': False, 'contains(moving)': True, 'contains(human)': False, 'contains(becomes)': False, 'contains(spend)': False, 'contains(speak)': False, 
'contains(town)': False, 'contains(carrey)': False, 'contains(allows)': False, 'contains(pair)': False, 'contains(attitude)': False, 'contains(crew)': False, 'contains(created)': False, 'contains(definitely)': False, 'contains(wonder)': False, 'contains(ends)': False, 'contains(likely)': False, 'contains(difficult)': True, 'contains(thriller)': False, 'contains(father)': False, 'contains(we)': False, 'contains(rules)': False, 'contains(office)': False, 'contains(easy)': False, 'contains(women)': False, 'contains(impression)': False, 'contains(elizabeth)': False, 'contains(haven)': False, 'contains(bizarre)': False, 'contains(room)': False, 'contains(superior)': False, 'contains(simple)': False, 'contains(pg)': False, 'contains(maybe)': False, 'contains(america)': False, 'contains(able)': False, 'contains(please)': False, 'contains(that)': True, 'contains(couldn)': False, 'contains(though)': False, 'contains(parents)': False, 'contains(lover)': False, 'contains(allen)': False, 'contains(bond)': False, 'contains(mystery)': False, 'contains(last)': False, 'contains(anti)': False, 'contains(believes)': False, 'contains(4)': False, 'contains(no)': False, 'contains(fails)': False, 'contains(here)': True, 'contains(dangerous)': True, 'contains(boat)': False, 'contains(wasted)': False, 'contains(start)': True, 'contains(dance)': False, 'contains(imagination)': False, 'contains(heart)': False, 'contains(hanks)': False, 'contains(convincing)': False, 'contains(i)': False, 'contains(huge)': False, 'contains(fate)': False, 'contains(generation)': False, 'contains(dialogue)': False, 'contains(young)': False, 'contains(effective)': False, 'contains(gangster)': True, 'contains(biggest)': False, 'contains(waste)': False, 'contains(jerry)': False, 'contains(cash)': False, 'contains(van)': False, 'contains(need)': False, 'contains(sam)': False, 'contains(animation)': False, 'contains(gags)': False, 'contains(involved)': False, 'contains(begins)': False, 'contains(effort)': False, 
'contains(wouldn)': False, 'contains(theater)': False, 'contains(enjoyed)': False, 'contains(release)': False, 'contains(austin)': False, 'contains(1997)': False, 'contains(nick)': False, 'contains(mars)': False, 'contains(stands)': False, 'contains(step)': False, 'contains(bar)': False, 'contains(still)': False, 'contains(success)': False, 'contains(unlike)': False, 'contains(job)': False, 'contains(plan)': False, 'contains(minor)': False, 'contains(seemed)': False, 'contains(reasons)': False, 'contains(season)': False, 'contains(guns)': False, 'contains(almost)': False, 'contains(cage)': False, 'contains(series)': True, 'contains(many)': True, 'contains(member)': False, 'contains(depth)': False, 'contains(straight)': False, 'contains(strength)': False, 'contains(or)': False, 'contains(offer)': False, 'contains(stone)': False, 'contains(nor)': False, 'contains(atmosphere)': False, 'contains(anyway)': False, 'contains(pure)': False, 'contains(intelligent)': False, 'contains(footage)': False, 'contains(myers)': False, 'contains(dead)': False, 'contains(men)': False, 'contains(gang)': False, 'contains(surprisingly)': False, 'contains(surprise)': False, 'contains(going)': False, 'contains(share)': False, 'contains(unique)': True, 'contains(view)': False, 'contains(fly)': False, 'contains(edge)': False, 'contains(government)': False, 'contains(anything)': False, 'contains(humans)': False, 'contains(female)': False, 'contains(known)': True, 'contains(for)': True, 'contains(oscar)': False, 'contains(viewing)': False, 'contains(things)': True, 'contains(generated)': False, 'contains(wild)': False, 'contains(trip)': False, 'contains(adventure)': False, 'contains(horror)': False, 'contains(felt)': False, 'contains(added)': False, 'contains(bring)': False, 'contains(night)': False, 'contains(names)': False, 'contains(woo)': False, 'contains(purpose)': False, 'contains(personal)': False, 'contains(like)': True, 'contains(together)': False, 'contains(])': False, 
'contains(hear)': False, 'contains(ill)': False, 'contains(beautiful)': False, 'contains(adults)': False, 'contains(movie)': True, 'contains(ground)': False, 'contains(!)': True, 'contains(30)': False, 'contains(bored)': False, 'contains(great)': True, 'contains(poorly)': False, 'contains(themes)': False, 'contains(thanks)': False, 'contains(menace)': False, 'contains(difference)': False, 'contains(catherine)': False, 'contains(decision)': False, 'contains(club)': False, 'contains(pull)': False, 'contains(saving)': False, 'contains(fully)': False, 'contains(meets)': False, 'contains(available)': False, 'contains(questions)': False, 'contains(far)': False, 'contains(thomas)': False, 'contains(truman)': False, 'contains(rent)': False, 'contains(director)': False, 'contains(surprising)': False, 'contains(heard)': False, 'contains(fascinating)': False, 'contains(naked)': False, 'contains(career)': False, 'contains(somehow)': False, 'contains(skills)': False, 'contains(90)': False, 'contains(mrs)': False, 'contains(may)': False, 'contains(british)': False, 'contains(until)': False, 'contains(ideas)': False, 'contains(man)': False, 'contains(mel)': False, 'contains(desperate)': False, 'contains(bland)': False, 'contains(house)': False, 'contains(used)': True, 'contains(unless)': False, 'contains(goal)': False, 'contains(itself)': False, 'contains(period)': False, 'contains(protagonist)': False, 'contains(incredibly)': False, 'contains(trailer)': False, 'contains(machine)': False, 'contains(use)': False, 'contains(favorite)': False, 'contains(seat)': False, 'contains(lead)': True, 'contains(imagine)': False, 'contains(9)': False, 'contains(those)': False, 'contains(acting)': False, 'contains(united)': False, 'contains(cliches)': False, 'contains(look)': True, 'contains(clever)': False, 'contains(fine)': False, 'contains(parts)': False, 'contains(their)': False, 'contains(hardly)': False, 'contains(rare)': False, 'contains(big)': False, 'contains(proves)': False, 
'contains(tom)': False, 'contains(amusing)': False, 'contains(realizes)': False, 'contains(let)': False, 'contains(cruise)': False, 'contains(boy)': False, 'contains(join)': False, 'contains(subtle)': False, 'contains(wise)': False, 'contains(coming)': False, 'contains(blair)': False, 'contains(well)': True, 'contains(brothers)': False, 'contains(players)': False, 'contains(twenty)': False, 'contains(similar)': False, 'contains(brother)': False, 'contains(quality)': False, 'contains(cops)': False, 'contains(crystal)': False, 'contains(directed)': False, 'contains(frame)': False, 'contains(mission)': False, 'contains(language)': False, 'contains(short)': False, 'contains(jennifer)': False, 'contains(box)': False, 'contains(missed)': False, 'contains(myself)': False, 'contains(inside)': False, 'contains(kids)': False, 'contains(batman)': False, 'contains(doubt)': False, 'contains(thinking)': False, 'contains(done)': False, 'contains(eventually)': False, 'contains(songs)': False, 'contains(to)': True, 'contains(talented)': False, 'contains(seriously)': False, 'contains(broken)': False, 'contains(happened)': False, 'contains(built)': False, 'contains(might)': False, 'contains(go)': False, 'contains(frightening)': False, 'contains(appearance)': False, 'contains(damon)': False, 'contains(french)': False, 'contains(around)': False, 'contains(all)': True, 'contains(place)': True, 'contains(leaves)': False, 'contains(nowhere)': False, 'contains(choice)': False, 'contains(directors)': False, 'contains(monster)': False, 'contains(student)': False, 'contains(bug)': False, 'contains(mysterious)': False, 'contains(realistic)': False, 'contains(special)': False, 'contains(fan)': False, 'contains(attempt)': False, 'contains(elements)': False, 'contains(production)': False, 'contains(doing)': False, 'contains(creature)': False, 'contains(seemingly)': False, 'contains(although)': False, 'contains(gave)': False, 'contains(fame)': False, 'contains(opinion)': False, 'contains(rate)': 
False, 'contains(bad)': False, 'contains(troopers)': False, 'contains(score)': False, 'contains(eyes)': False, 'contains(beast)': False, 'contains(charles)': False, 'contains(wanted)': False, 'contains(helen)': False, 'contains(terrible)': False, 'contains(harry)': False, 'contains(turns)': False, 'contains(uses)': False, 'contains(brilliant)': False, 'contains(others)': True, 'contains(damme)': False, 'contains(douglas)': False, 'contains(amazing)': False, 'contains(reach)': False, 'contains(history)': False, 'contains(top)': True, 'contains(wonderful)': False, 'contains(witch)': False, 'contains(son)': False, 'contains(quite)': False, 'contains(thrown)': False, 'contains(direction)': False, 'contains(free)': False, 'contains(true)': False, 'contains(forces)': False, 'contains(jr)': False, 'contains(jim)': False, 'contains(upon)': False, 'contains(screenwriter)': False, 'contains(train)': True, 'contains(cover)': False, 'contains(crap)': False, 'contains(along)': True, 'contains(files)': False, 'contains(guy)': True, 'contains(guilty)': False, 'contains(enough)': False, 'contains(detective)': False, 'contains(background)': False, 'contains(race)': False, 'contains(matthew)': False, 'contains(patch)': False, 'contains(puts)': False, 'contains(deep)': True, 'contains(double)': False, 'contains(sex)': False, 'contains(loving)': False, 'contains(numerous)': False, 'contains(scale)': False, 'contains(would)': False, 'contains(fair)': False, 'contains(willis)': False, 'contains(pictures)': False, 'contains(+)': False, 'contains(apparent)': False, 'contains(utterly)': False, 'contains(wall)': False, 'contains(shock)': False, 'contains(discovers)': False, 'contains(watching)': False, 'contains(sequence)': False, 'contains(appropriate)': False, 'contains(day)': True, 'contains(wrong)': True, 'contains(power)': False, 'contains(american)': False, 'contains(behind)': False, 'contains(realized)': False, 'contains(states)': False, 'contains(than)': False, 'contains(center)': 
False, 'contains(aspect)': False, 'contains(ice)': False, 'contains(john)': False, 'contains(laughable)': False, 'contains(victim)': False, 'contains(low)': False, 'contains(mostly)': False, 'contains(never)': True, 'contains(provided)': False, 'contains(`)': False, 'contains(major)': False, 'contains(seconds)': False, 'contains(joke)': False, 'contains(takes)': False, 'contains(problems)': True, 'contains(episode)': False, 'contains(shoot)': False, 'contains(scenes)': False, 'contains(our)': False, 'contains(baldwin)': False, 'contains(legend)': False, 'contains(large)': False, 'contains(drive)': False, 'contains(liners)': False, 'contains(ahead)': False, 'contains(rob)': False, 'contains(voice)': False, 'contains(eight)': True, 'contains(tells)': False, 'contains(singer)': False, 'contains(actual)': False, 'contains(titanic)': False, 'contains(subplot)': False, 'contains(narrative)': False, 'contains(beyond)': False, 'contains(president)': False, 'contains(e)': False, 'contains(the)': True, 'contains(getting)': True, 'contains(performances)': False, 'contains(murder)': False, 'contains(produced)': False, 'contains(falls)': False, 'contains(campbell)': False, 'contains(something)': False, 'contains(alive)': False, 'contains(s)': True, 'contains(matt)': False, 'contains(jack)': False, 'contains(target)': False, 'contains(know)': False, 'contains(fbi)': False, 'contains(nights)': False, 'contains(faces)': False, 'contains(across)': False, 'contains(every)': False, 'contains(forever)': False, 'contains(sci)': False, 'contains(into)': True, 'contains(horse)': False, 'contains(more)': False, 'contains(off)': False, 'contains(jimmy)': False, 'contains(ever)': True, 'contains(supposed)': False, 'contains(goes)': False, 'contains(brings)': False, 'contains(forced)': False, 'contains(by)': True, 'contains(sounds)': False, 'contains(pop)': False, 'contains(satire)': False, 'contains(mood)': False, 'contains(interest)': False, 'contains(equally)': False, 'contains(decided)': 
False, 'contains(boss)': True, 'contains(winning)': False, 'contains(grant)': False, 'contains(audience)': False, 'contains(vegas)': False, 'contains(winner)': False, 'contains(knows)': False, 'contains(works)': False, 'contains(television)': False, 'contains(thing)': False, 'contains(travolta)': False, 'contains(tale)': False, 'contains(took)': False, 'contains(also)': True, 'contains(note)': False, 'contains(different)': False, 'contains(caught)': False, 'contains(asks)': False, 'contains(ben)': False, 'contains(any)': False, 'contains(much)': False, 'contains(involving)': False, 'contains(acts)': False, 'contains(stock)': False, 'contains(compared)': False, 'contains(control)': False, 'contains(needed)': False, 'contains(obvious)': False, 'contains(()': True, 'contains(talent)': False, 'contains(least)': True, 'contains(reeves)': False, 'contains(forward)': False, 'contains(completely)': False, 'contains(because)': False, 'contains(land)': False, 'contains(features)': True, 'contains(yes)': False, 'contains(violence)': False, 'contains(tv)': False, 'contains(mary)': False, 'contains(fox)': False, 'contains(developed)': False, 'contains(lynch)': False, 'contains(respect)': False, 'contains(hunting)': False, 'contains(lack)': False, 'contains(result)': False, 'contains(woman)': True, 'contains(red)': False, 'contains(guys)': False, 'contains(13)': False, 'contains(hell)': False, 'contains(field)': True, 'contains(max)': False, 'contains(ex)': False, 'contains(ugly)': False, 'contains(fight)': True, 'contains(role)': False, 'contains(o)': False, 'contains(explained)': False, 'contains(waiting)': False, 'contains(inspired)': False, 'contains(ridiculous)': False, 'contains(co)': False, 'contains(create)': False, 'contains(revenge)': False, 'contains(reality)': False, 'contains(nicely)': False, 'contains(entire)': False, 'contains(details)': False, 'contains(stars)': False, 'contains(lost)': False, 'contains(found)': False, 'contains(&)': False, 'contains(grows)': 
False, 'contains(twist)': False, 'contains(line)': False, 'contains(dr)': False, 'contains(rush)': False, 'contains(york)': False, 'contains(fast)': True, 'contains(brain)': False, 'contains(fear)': False, 'contains(surprises)': False, 'contains(heavy)': False, 'contains(moral)': False, 'contains(starts)': False, 'contains(outstanding)': False, 'contains(larry)': False, 'contains(seagal)': False, 'contains(hope)': False, 'contains(tone)': False, 'contains(nearly)': False, 'contains(time)': False, 'contains(films)': False, 'contains(lee)': False, 'contains(kind)': True, 'contains(humanity)': False, 'contains(toward)': False, 'contains(list)': False, 'contains(kelly)': False, 'contains(including)': False, 'contains(prison)': False, 'contains(money)': False, 'contains(chance)': False, 'contains(boring)': False, 'contains(potential)': False, 'contains(agent)': False, 'contains(must)': False, 'contains(shot)': False, 'contains(already)': False, 'contains(longer)': False, 'contains(godzilla)': False, 'contains(hours)': False, 'contains(single)': False, 'contains(herself)': False, 'contains(but)': True, 'contains(consider)': False, 'contains(case)': False, 'contains(try)': False, 'contains(while)': True, 'contains(since)': False, 'contains(fi)': False, 'contains(laughing)': False, 'contains(support)': False, 'contains(7)': False, 'contains(either)': False, 'contains(quiet)': False, 'contains(throw)': False, 'contains(suppose)': False, 'contains(russell)': False, 'contains(ability)': False, 'contains(:)': True, 'contains(loved)': False, 'contains(lacking)': False, 'contains(mentioned)': False, 'contains(near)': False, 'contains(phone)': False, 'contains(event)': False, 'contains(suspense)': False, 'contains(worthy)': False, 'contains(finally)': False, 'contains(/)': False, 'contains(future)': False, 'contains(sad)': False, 'contains(heads)': False, 'contains(walking)': False, 'contains(several)': False, 'contains(masterpiece)': False, 'contains(message)': False, 
'contains(pick)': False, 'contains(members)': False, 'contains(final)': False, 'contains(attack)': False, 'contains(entertaining)': False, 'contains(probably)': False, 'contains(filmmakers)': False, 'contains(killed)': False, 'contains(nasty)': False, 'contains(where)': True, 'contains(tell)': False, 'contains(speaking)': False, 'contains(opposite)': False, 'contains(now)': False, 'contains(novel)': False, 'contains(roger)': False, 'contains(danny)': False, 'contains(hopkins)': False, 'contains(comparison)': False, 'contains(really)': False, 'contains(ending)': False, 'contains(only)': True, 'contains(anthony)': False, 'contains(cares)': False, 'contains(again)': False, 'contains(stuck)': False, 'contains(above)': False, 'contains(work)': False, 'contains(alan)': False, 'contains(c)': False, 'contains(haunting)': False, 'contains(steven)': False, 'contains(cannot)': False, 'contains(extremely)': False, 'contains(unfunny)': False, 'contains(fake)': False, 'contains(write)': False, 'contains(results)': False, 'contains(wars)': False, 'contains(has)': True, 'contains(musical)': False, 'contains(thoroughly)': False, 'contains(working)': False, 'contains(truly)': False, 'contains(character)': False, 'contains(story)': False, 'contains(bright)': False, 'contains(friend)': False, 'contains(plot)': True, 'contains(answer)': False, 'contains(latest)': False, 'contains(enjoy)': False, 'contains(say)': False, 'contains(as)': True, 'contains(loud)': False, 'contains(meet)': False, 'contains(learns)': False, 'contains(toy)': False, 'contains(la)': False, 'contains(rock)': False, 'contains(intriguing)': False, 'contains(cinematic)': False, 'contains(believe)': False, 'contains(add)': False, 'contains(2)': False, 'contains(watch)': True, 'contains(due)': False, 'contains(arnold)': True, 'contains(jay)': False, 'contains(image)': False, 'contains(planet)': False, 'contains(cause)': False, 'contains(appeal)': False, 'contains(out)': True, 'contains(party)': False, 'contains(lame)': 
False, 'contains(considered)': False, 'contains(bruce)': False, 'contains(complex)': False, 'contains(producer)': False, 'contains(feel)': False, 'contains(think)': False, 'contains(board)': False, 'contains(chan)': True, 'contains(excellent)': False, 'contains(live)': False, 'contains(humor)': False, 'contains(shakespeare)': False, 'contains(barely)': False, 'contains(very)': True, 'contains(fantastic)': False, 'contains(road)': False, 'contains(hospital)': False, 'contains(amount)': False, 'contains(military)': False, 'contains(children)': False, 'contains(moves)': False, 'contains(ve)': False, 'contains(ryan)': False, 'contains(form)': False, 'contains(missing)': False, 'contains(asked)': False, 'contains(name)': False, 'contains(park)': False, 'contains(ms)': False, 'contains(driven)': False, 'contains(child)': False, 'contains(confused)': False, 'contains(suspect)': False, 'contains(wedding)': False, 'contains(romantic)': False, 'contains(moments)': False, 'contains(hands)': False, 'contains(cool)': False, 'contains(latter)': False, 'contains(average)': False, 'contains(costumes)': False, 'contains(help)': True, 'contains(following)': False, 'contains(am)': False, 'contains(wonderfully)': False, 'contains(white)': False, 'contains(memorable)': False, 'contains(richard)': False, 'contains(era)': False, 'contains(another)': False, 'contains(seems)': False, 'contains(annie)': False, 'contains(managed)': False, 'contains(knew)': False, 'contains(should)': False, 'contains(worst)': False, 'contains(break)': False, 'contains(genuine)': False, 'contains(wish)': False, 'contains(once)': True, 'contains(figures)': False, 'contains(cinematography)': False, 'contains(being)': False, 'contains(joan)': False, 'contains(relationships)': False, 'contains(producers)': False, 'contains(says)': False, 'contains(michael)': False, 'contains(floor)': False, 'contains(provides)': False, 'contains(--)': True, 'contains(beauty)': False, 'contains(mr)': False, 'contains(today)': True, 
'contains(remains)': False, 'contains(these)': False, 'contains(wait)': False, 'contains(carpenter)': False, 'contains(school)': False, 'contains(thus)': False, 'contains(talents)': False, 'contains(adams)': False, 'contains(paced)': False, 'contains(kevin)': True, 'contains(face)': False, 'contains(.)': True, 'contains(expected)': False, 'contains(evil)': False, 'contains(technical)': False, 'contains(speech)': False, 'contains(gold)': False, 'contains(began)': False, 'contains(in)': True, 'contains(hasn)': False, 'contains(energy)': False, 'contains(8)': False, 'contains(mob)': False, 'contains(finds)': False, 'contains(on)': True, 'contains(real)': False, 'contains(1999)': False, 'contains(trying)': False, 'contains(action)': True, 'contains(at)': False, 'contains(sees)': True, 'contains(usually)': False, 'contains(come)': False, 'contains(building)': False, 'contains(months)': False, 'contains(faith)': False, 'contains(finding)': False, 'contains(snake)': False, 'contains(century)': False, 'contains(aliens)': False, 'contains(climax)': True, 'contains(realize)': False, 'contains(ass)': True, 'contains(past)': False, 'contains(word)': False, 'contains(feels)': False, 'contains(impossible)': False, 'contains(movies)': True, 'contains(eddie)': False, 'contains(fantasy)': False, 'contains(sheer)': False, 'contains(car)': False, 'contains(thinks)': False, 'contains(julia)': False, 'contains(miss)': True, 'contains(team)': False, 'contains(obviously)': False, 'contains(technology)': False, 'contains(decent)': False, 'contains(yeah)': False, 'contains(both)': False, 'contains(chosen)': False, 'contains(told)': False, 'contains(save)': False, 'contains(washington)': False, 'contains(professor)': False, 'contains(god)': False, 'contains(scientist)': False, 'contains(motion)': False, 'contains(cut)': False, 'contains(shown)': False, 'contains(species)': False, 'contains(quickly)': False, 'contains(this)': True, 'contains(you)': True, 'contains(nuclear)': False, 
'contains(won)': False, 'contains(brought)': False, 'contains(witty)': False, 'contains(match)': False, 'contains(fans)': False, 'contains(offers)': False, 'contains(despite)': False, 'contains(put)': False, 'contains(make)': True, 'contains(presence)': False, 'contains(played)': True, 'contains(gibson)': False, 'contains(gone)': False, 'contains(local)': False, 'contains(leader)': False, 'contains(saw)': False, 'contains(kept)': False, 'contains(making)': True, 'contains(sweet)': False, 'contains(presents)': False, 'contains(with)': True, 'contains(basic)': False, 'contains(eye)': False, 'contains(george)': False, 'contains(system)': False, 'contains(crime)': False, 'contains(premise)': False, 'contains(my)': False, 'contains(hour)': False, 'contains(portrayed)': False, 'contains(he)': True, 'contains(manage)': False, 'contains(smile)': False, 'contains(safe)': False, 'contains(henry)': False, 'contains(silly)': False, 'contains(back)': False, 'contains(security)': True, 'contains(ii)': False, 'contains(villain)': False, 'contains(*)': True, 'contains(amy)': False, 'contains(figure)': False, 'contains(entirely)': False, 'contains(west)': False, 'contains(places)': False, 'contains(outside)': False, 'contains(station)': False, 'contains(them)': True, 'contains(plans)': False, 'contains(mean)': False, 'contains(aside)': False, 'contains(running)': False, 'contains(slasher)': False, 'contains(give)': False, 'contains(changes)': False, 'contains(generally)': False, 'contains(blue)': False, 'contains(sure)': False, 'contains(pulp)': False, 'contains(kate)': False, 'contains(some)': False, 'contains(previous)': False, 'contains(powerful)': False, 'contains(window)': False, 'contains(win)': False, 'contains(move)': False, 'contains(will)': True, 'contains(parody)': False, 'contains(brian)': False, 'contains(intense)': False, 'contains(jason)': False, 'contains(nudity)': False, 'contains(girl)': False, 'contains(runs)': False, 'contains(overly)': False, 'contains(next)': 
False, 'contains(stuff)': False, 'contains(tried)': False, 'contains(sort)': False, 'contains(badly)': False, 'contains(throughout)': False, 'contains(animated)': False, 'contains(summer)': False, 'contains(professional)': False, 'contains(julie)': False, 'contains(strong)': False, 'contains(rival)': False, 'contains(care)': False, 'contains(therefore)': False, 'contains(stand)': False, 'contains(genius)': False, 'contains(meanwhile)': False, 'contains(excuse)': False, 'contains(bottom)': False, 'contains(clearly)': False, 'contains(compelling)': False, 'contains(situation)': False, 'contains(spielberg)': False, 'contains(length)': False, 'contains(himself)': False, 'contains(do)': True, 'contains(stop)': True, 'contains(easily)': False, 'contains(better)': False, 'contains(ask)': False, 'contains(read)': False, 'contains(cinema)': False, 'contains(supposedly)': False, 'contains(singing)': False, 'contains(cop)': False, 'contains(believable)': False, 'contains(lady)': False, 'contains(flat)': False, 'contains(interested)': False, 'contains(greatest)': False, 'contains(attempts)': False, 'contains(task)': False, 'contains(chase)': True, 'contains(grand)': False, 'contains(scott)': False, 'contains(charming)': False, 'contains(destroy)': False, 'contains(matrix)': False, 'contains(pace)': False, 'contains(hunt)': False, 'contains(epic)': False, 'contains(tommy)': False, 'contains(hurt)': False, 'contains(placed)': False, 'contains(gay)': False, 'contains(events)': False, 'contains(rated)': False, 'contains(business)': False, 'contains(manages)': False, 'contains(one)': True, 'contains(screenplay)': False, 'contains(bob)': False, 'contains(appears)': False, 'contains(japanese)': False, 'contains(idea)': True, 'contains(avoid)': False, 'contains(crazy)': False, 'contains(color)': False, 'contains(words)': False, 'contains(surface)': False, 'contains(failed)': False, 'contains(loses)': False, 'contains(formula)': False, 'contains(version)': False, 'contains(yet)': 
False, 'contains(possibly)': False, 'contains(minute)': False, 'contains(finale)': False, 'contains(gun)': True, 'contains(blood)': False, 'contains(allow)': False, 'contains(teenagers)': False, 'contains(williams)': False, 'contains(shouldn)': False, 'contains(somewhat)': False, 'contains(lot)': False, 'contains(few)': False, 'contains(problem)': False, 'contains(stuart)': False, 'contains(modern)': False, 'contains(too)': False, 'contains(apparently)': False, 'contains(slowly)': False, 'contains(sick)': False, 'contains(film)': False, 'contains(filled)': False, 'contains(dream)': False, 'contains(each)': True, 'contains(innocent)': False, 'contains(current)': False, 'contains(wants)': False, 'contains(5)': False, 'contains(million)': True, 'contains(heroes)': False, 'contains(slapstick)': False, 'contains(worse)': False, 'contains(chemistry)': True, 'contains(mother)': False, 'contains(force)': False, 'contains(opportunity)': False, 'contains(reveal)': False, 'contains(viewer)': False, 'contains(from)': True, 'contains(serve)': False, 'contains(heaven)': False, 'contains(bringing)': False, 'contains(its)': False, 'contains(delivers)': False, 'contains(failure)': False, 'contains(class)': False, 'contains(rest)': False, 'contains(less)': False, 'contains(returns)': False, 'contains(beat)': False, 'contains(grace)': False, 'contains(ford)': False, 'contains(over)': False, 'contains(brooks)': False, 'contains(expectations)': False, 'contains(porn)': False, 'contains(tim)': False, 'contains(slightly)': False, 'contains(overall)': False, 'contains(up)': False, 'contains(south)': False, 'contains(take)': False, 'contains(hand)': False, 'contains(written)': False, 'contains(tired)': False, 'contains(ship)': False, 'contains(taste)': False, 'contains(late)': False, 'contains(send)': False, 'contains(wife)': True, 'contains(video)': False, 'contains(having)': False, 'contains(based)': False, 'contains(hollywood)': False, 'contains(solid)': False, 'contains(show)': False, 
'contains(ride)': False, 'contains(somewhere)': False, 'contains(point)': False, 'contains(daughter)': False, 'contains(turned)': False, 'contains(hey)': False, 'contains(fun)': True, 'contains(exactly)': False, 'contains(shame)': False, 'contains(build)': False, 'contains(nothing)': False, 'contains(begin)': False, 'contains(6)': False, 'contains(level)': False, 'contains(approach)': False, 'contains(necessary)': False, 'contains(paul)': False, 'contains(sarah)': False, 'contains(pay)': False, 'contains(down)': False, 'contains(beach)': False, 'contains(portrayal)': False, 'contains(can)': False, 'contains(jones)': False, 'contains(didn)': False, 'contains(is)': True, 'contains(apart)': False, 'contains(normal)': False, 'contains(fiction)': False, 'contains(space)': False, 'contains(books)': False, 'contains(becoming)': False, 'contains(feet)': False, 'contains(phantom)': False, 'contains(star)': False, 'contains(rich)': False, 'contains(fare)': False, 'contains(it)': True, 'contains(seem)': False, 'contains(murphy)': False, 'contains(spectacular)': False, 'contains(part)': False, 'contains(mess)': False, 'contains(holds)': False, 'contains(kid)': False, 'contains(creates)': False, 'contains(,)': True, 'contains(army)': False, 'contains(don)': False, 'contains(cameron)': False, 'contains(occasionally)': False, 'contains(best)': True, 'contains(robin)': False, 'contains(laughs)': False, 'contains(yourself)': False, 'contains(sign)': False, 'contains(fault)': False, 'contains(stunning)': False, 'contains(plenty)': False, 'contains(close)': False, 'contains(dimensional)': False, 'contains(pathetic)': False, 'contains(giving)': False, 'contains(conflict)': False, 'contains(journey)': False, 'contains(first)': False, 'contains(simon)': False, 'contains(slow)': True, 'contains(powers)': False, 'contains(most)': True, 'contains(teenage)': False, 'contains(complete)': False, 'contains(characterization)': False, 'contains(sequel)': False, 'contains(theaters)': False, 
'contains(sit)': False, 'contains(away)': False, 'contains(leave)': False, 'contains(me)': True, 'contains(escape)': False, 'contains(former)': False, 'contains(willing)': False, 'contains(key)': True, 'contains(alone)': False, 'contains(original)': False, 'contains(crowd)': False, 'contains(editing)': False, 'contains(fresh)': False, 'contains(situations)': False, 'contains(relief)': False, 'contains(viewers)': False, 'contains(b)': False, 'contains(explain)': False, 'contains(looks)': True, 'contains(aspects)': False, 'contains(seen)': False, 'contains(means)': True, 'contains(four)': False, 'contains(anderson)': False, 'contains(romance)': False, 'contains(fit)': False, 'contains(10)': False, 'contains(opening)': False, 'contains(playing)': True, 'contains(design)': False, 'contains(non)': True, 'contains(nice)': False, 'contains(love)': False, 'contains(leading)': False, 'contains(stay)': False, 'contains(number)': False, 'contains(patrick)': False, 'contains(master)': False, 'contains(six)': False, 'contains(years)': False, 'contains(fashion)': False, 'contains(offensive)': False, 'contains(jean)': False, 'contains(week)': False, 'contains(boys)': False, 'contains(zero)': False, 'contains(friends)': True, 'contains(political)': False, 'contains(serial)': False, 'contains(martin)': False, 'contains(got)': True, 'contains(decides)': False, 'contains(budget)': False, 'contains(casting)': False, 'contains(between)': True, 'contains(quick)': True, 'contains(isn)': False, 'contains(strange)': False, 'contains(media)': False, 'contains(which)': True, 'contains(violent)': False, 'contains(whatever)': False, 'contains(not)': True, 'contains(christmas)': False, 'contains(decide)': False, 'contains(conclusion)': False, 'contains(absolutely)': False, 'contains(company)': False, 'contains(al)': False, 'contains(criminal)': False, 'contains(deals)': False, 'contains(days)': False, 'contains(leads)': False, 'contains(mark)': False, 'contains(kiss)': False, 'contains(dark)': 
False, 'contains(directing)': False, 'contains(central)': False, 'contains(travel)': False, 'contains(visuals)': False, 'contains(someone)': False, 'contains(fact)': False, 'contains(light)': False, 'contains(battle)': False, 'contains(dog)': False, 'contains(arts)': False, 'contains(became)': False, 'contains(storyline)': False, 'contains(burton)': False, 'contains(given)': False, 'contains(enjoyable)': True, 'contains(such)': False, 'contains(find)': False, 'contains(otherwise)': False, 'contains(kill)': False, 'contains(intended)': False, 'contains(credit)': False, 'contains(cold)': False, 'contains(named)': False, 'contains(flick)': False, 'contains(whether)': False, 'contains(camp)': False, 'contains(particularly)': False, 'contains(matters)': False, 'contains(charm)': False, 'contains(angels)': False, 'contains(anyone)': False, 'contains(honest)': False, 'contains(familiar)': False, 'contains(simply)': False, 'contains(piece)': False, 'contains(instance)': False, 'contains(showing)': False, 'contains(comedies)': False, 'contains(turn)': False, 'contains(flying)': False, 'contains(happening)': False, 'contains(food)': True, 'contains(kong)': True, 'contains(industry)': False, 'contains(taylor)': False, 'contains(dramatic)': False, 'contains(ok)': False, 'contains(life)': False, 'contains(mike)': False, 'contains(chinese)': True, 'contains(carry)': False, 'contains(touching)': False, 'contains(green)': False, 'contains(expecting)': False, 'contains(fall)': False, 'contains(filmed)': False, 'contains(ready)': False, 'contains(debut)': False, 'contains(looked)': False, 'contains(pass)': False, 'contains(drama)': False, 'contains(disturbing)': False, 'contains(old)': False, 'contains(characters)': False, 'contains(lawyer)': False, 'contains(captain)': False, 'contains(a)': True, 'contains([)': False, 'contains(his)': True, 'contains(streets)': False, 'contains(funny)': True, 'contains(same)': True, 'contains(thankfully)': False, 'contains(charlie)': False, 
'contains(hall)': False, 'contains(dollars)': False, 'contains(vampire)': False, 'contains(particular)': False, 'contains(spirit)': False, 'contains(against)': False, 'contains(writers)': False, 'contains(ways)': False, 'contains(hotel)': False, 'contains(football)': False, 'contains(prove)': False, 'contains(3)': False, 'contains(emotions)': False, 'contains(recent)': False, 'contains(society)': False, 'contains(reading)': False, 'contains(through)': False, 'contains(left)': False, 'contains(moore)': False, 'contains(twists)': False, 'contains(learn)': False, 'contains(feelings)': False, 'contains(possible)': False, 'contains(highly)': False, 'contains(even)': False, 'contains(awful)': False, 'contains(welcome)': False, 'contains(johnny)': False, 'contains(continues)': False, 'contains(everyone)': False, 'contains(under)': False, 'contains(forgotten)': False, 'contains(does)': False, 'contains(went)': False, 'contains(apartment)': False, 'contains(lots)': False, 'contains(adult)': False, 'contains(else)': False, 'contains(artist)': False, 'contains(damn)': False, 'contains(standing)': True, 'contains(new)': False, 'contains(ten)': False, 'contains(discover)': False, 'contains(queen)': False, 'contains(cult)': False, 'contains(happy)': False, 'contains(own)': True, 'contains(filmmaker)': False}
# Pair each document's feature dict with its label.
featuresets = [(document_features(d), c) for (d,c) in documents]
# Hold out the first 100 instances for testing and train on all the others.
train_set, test_set = featuresets[100:], featuresets[:100]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print("Naive Bayes accuracy with 2000 bag of words features is %s" % (nltk.classify.accuracy(classifier, test_set)))
# List the five features whose presence shifts the pos/neg likelihood ratio the most.
classifier.show_most_informative_features(5)
Naive Bayes accuracy with 2000 bag of words features is 0.83 Most Informative Features contains(outstanding) = True pos : neg = 11.3 : 1.0 contains(mulan) = True pos : neg = 9.1 : 1.0 contains(seagal) = True neg : pos = 8.1 : 1.0 contains(wonderfully) = True pos : neg = 6.4 : 1.0 contains(damon) = True pos : neg = 6.1 : 1.0
classifier.show_most_informative_features(20)
Most Informative Features contains(outstanding) = True pos : neg = 11.3 : 1.0 contains(mulan) = True pos : neg = 9.1 : 1.0 contains(seagal) = True neg : pos = 8.1 : 1.0 contains(wonderfully) = True pos : neg = 6.4 : 1.0 contains(damon) = True pos : neg = 6.1 : 1.0 contains(flynt) = True pos : neg = 5.7 : 1.0 contains(wasted) = True neg : pos = 5.6 : 1.0 contains(awful) = True neg : pos = 5.3 : 1.0 contains(poorly) = True neg : pos = 5.3 : 1.0 contains(lame) = True neg : pos = 5.2 : 1.0 contains(ridiculous) = True neg : pos = 4.9 : 1.0 contains(waste) = True neg : pos = 4.8 : 1.0 contains(era) = True pos : neg = 4.6 : 1.0 contains(allows) = True pos : neg = 4.4 : 1.0 contains(worst) = True neg : pos = 4.4 : 1.0 contains(bland) = True neg : pos = 4.3 : 1.0 contains(laughable) = True neg : pos = 4.1 : 1.0 contains(mess) = True neg : pos = 4.0 : 1.0 contains(fantastic) = True pos : neg = 4.0 : 1.0 contains(jedi) = True pos : neg = 3.9 : 1.0
We can consider the task of POS tagging as a classification task and use the classifier methodology described here. Let us revisit the POS tagging task discussed in the first lecture using the new tools we have developed.
from nltk.corpus import brown
# Count how often each 1-, 2- and 3-character word ending occurs in Brown.
suffix_fdist = nltk.FreqDist()
for word in brown.words():
    word = word.lower()
    # Slicing never raises: a word shorter than the requested length yields
    # the whole word, so short words are counted once per slice length.
    suffix_fdist.update(word[-size:] for size in (1, 2, 3))

# The 100 most frequent endings become the candidate suffix features.
common_suffixes = [ending for ending, _ in suffix_fdist.most_common(100)]
print(common_suffixes)
['e', ',', '.', 's', 'd', 't', 'he', 'n', 'a', 'of', 'the', 'y', 'r', 'to', 'in', 'f', 'o', 'ed', 'nd', 'is', 'on', 'l', 'g', 'and', 'ng', 'er', 'as', 'ing', 'h', 'at', 'es', 'or', 're', 'it', '``', 'an', "''", 'm', ';', 'i', 'ly', 'ion', 'en', 'al', '?', 'nt', 'be', 'hat', 'st', 'his', 'th', 'll', 'le', 'ce', 'by', 'ts', 'me', 've', "'", 'se', 'ut', 'was', 'for', 'ent', 'ch', 'k', 'w', 'ld', '`', 'rs', 'ted', 'ere', 'her', 'ne', 'ns', 'ith', 'ad', 'ry', ')', '(', 'te', '--', 'ay', 'ty', 'ot', 'p', 'nce', "'s", 'ter', 'om', 'ss', ':', 'we', 'are', 'c', 'ers', 'uld', 'had', 'so', 'ey']
def pos_features(word, suffixes=None):
    """Return boolean suffix features for *word*.

    The word is lowercased once and tested against every candidate suffix.

    Args:
        word: The token to describe.
        suffixes: Iterable of suffix strings to test. When omitted, the
            module-level ``common_suffixes`` list is used.

    Returns:
        A dict mapping ``'endswith(<suffix>)'`` to ``True`` or ``False``,
        one entry per suffix.
    """
    if suffixes is None:
        suffixes = common_suffixes
    # Lowercase once instead of once per suffix.
    lowered = word.lower()
    return {
        'endswith(%s)' % suffix: lowered.endswith(suffix)
        for suffix in suffixes
    }
# Tagged tokens from the Brown 'news' category, using the universal tagset.
tagged_words = brown.tagged_words(categories='news', tagset='universal')
# One (suffix feature dict, tag) instance per tagged token.
featuresets = [(pos_features(n), g) for (n,g) in tagged_words]
# Reserve the first 10% of the instances for testing and train on the remainder.
size = int(len(featuresets) * 0.1)
train_set, test_set = featuresets[size:], featuresets[:size]
classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)
0.7011437095972153
classifier.classify(pos_features('cats'))
'NOUN'
classifier.classify(pos_features('books'))
'NOUN'
classifier.show_most_informative_features(20)
Most Informative Features endswith(the) = True DET : NOUN = 3416.9 : 1.0 endswith(.) = True . : ADP = 2481.6 : 1.0 endswith(to) = True PRT : ADJ = 2138.0 : 1.0 endswith(f) = True ADP : VERB = 2050.5 : 1.0 endswith(he) = True DET : NOUN = 1808.9 : 1.0 endswith(and) = True CONJ : ADV = 1642.0 : 1.0 endswith(a) = True DET : VERB = 1597.1 : 1.0 endswith(of) = True ADP : NOUN = 1406.9 : 1.0 endswith(his) = True DET : NOUN = 728.0 : 1.0 endswith(ut) = True CONJ : DET = 694.7 : 1.0 endswith(nd) = True CONJ : NUM = 636.1 : 1.0 endswith(hat) = True PRON : NOUN = 570.6 : 1.0 endswith(ey) = True PRON : VERB = 549.0 : 1.0 endswith(i) = True PRON : ADP = 547.2 : 1.0 endswith(') = True . : VERB = 503.7 : 1.0 endswith(o) = True PRT : ADJ = 493.4 : 1.0 endswith(es) = True NOUN : ADP = 427.0 : 1.0 endswith(uld) = True VERB : NOUN = 422.5 : 1.0 endswith(we) = True PRON : NOUN = 353.5 : 1.0 endswith(ted) = True VERB : NOUN = 337.9 : 1.0
NLTK provides a common interface to different classifier algorithms. This is illustrated in the following examples.
import nltk
# Toy training set: (binary feature dict, label) pairs.
train = [
    ({'a': 1, 'b': 1, 'c': 1}, 'y'),
    ({'a': 1, 'b': 1, 'c': 1}, 'x'),
    ({'a': 1, 'b': 1, 'c': 0}, 'y'),
    ({'a': 0, 'b': 1, 'c': 1}, 'x'),
    ({'a': 0, 'b': 1, 'c': 1}, 'y'),
    ({'a': 0, 'b': 0, 'c': 1}, 'y'),
    ({'a': 0, 'b': 1, 'c': 0}, 'x'),
    ({'a': 0, 'b': 0, 'c': 0}, 'x'),
    ({'a': 0, 'b': 1, 'c': 1}, 'y'),
]

# Unlabelled instances to classify, annotated with how often each
# feature combination appears in `train`.
test = [
    {'a': 1, 'b': 0, 'c': 1},  # unseen
    {'a': 1, 'b': 0, 'c': 0},  # unseen
    {'a': 0, 'b': 1, 'c': 1},  # seen 3 times, labels=y,y,x
    {'a': 0, 'b': 1, 'c': 0},  # seen 1 time, label=x
]

classifier = nltk.classify.NaiveBayesClassifier.train(train)
sorted(classifier.labels())
['x', 'y']
classifier.classify_many(test)
['y', 'x', 'y', 'x']
# Print P(x) and P(y) for every test instance, one line per instance.
for pdist in classifier.prob_classify_many(test):
    print('%.4f %.4f' % (pdist.prob('x'), pdist.prob('y')))
0.3203 0.6797 0.5857 0.4143 0.3792 0.6208 0.6470 0.3530
classifier.show_most_informative_features()
Most Informative Features c = 0 x : y = 2.0 : 1.0 c = 1 y : x = 1.5 : 1.0 a = 1 y : x = 1.4 : 1.0 b = 0 x : y = 1.2 : 1.0 a = 0 x : y = 1.2 : 1.0 b = 1 y : x = 1.1 : 1.0
# NOTE(review): both cutoffs are set to 0, presumably so the tree keeps
# splitting on this tiny training set instead of stopping early — confirm
# against the NLTK DecisionTreeClassifier.train documentation.
classifier = nltk.classify.DecisionTreeClassifier.train(
train, entropy_cutoff=0, support_cutoff=0)
sorted(classifier.labels())
['x', 'y']
print(classifier)
c=0? .................................................. x a=0? ................................................ x a=1? ................................................ y c=1? .................................................. y
classifier.classify_many(test)
['y', 'y', 'y', 'x']
NLTK's decision tree classifier does not implement prob_classify() (and therefore offers no prob() values), because a decision tree does not provide a probabilistic interpretation of its predictions.
NLTK provides an interface to the Scikit-learn (sklearn) classifiers - including maximum entropy and SVM.
from nltk.classify import SklearnClassifier
# Toy training set: (feature dict with numeric values, label) pairs.
train_data = [({"a": 4, "b": 1, "c": 0}, "ham"),
({"a": 5, "b": 2, "c": 1}, "ham"),
({"a": 0, "b": 3, "c": 4}, "spam"),
({"a": 5, "b": 1, "c": 1}, "ham"),
({"a": 1, "b": 4, "c": 3}, "spam")]
# Unlabelled feature dicts to classify.
test_data = [{"a": 3, "b": 2, "c": 1},
{"a": 0, "b": 3, "c": 7}]
from sklearn.naive_bayes import BernoulliNB
# Wrap the scikit-learn estimator so it trains on and classifies NLTK-style feature dicts.
classif = SklearnClassifier(BernoulliNB()).train(train_data)
classif.classify_many(test_data)
['ham', 'spam']
from sklearn.svm import SVC
# Same wrapper, now around a support vector classifier with its default settings.
# NOTE(review): sparse=False presumably makes the wrapper build dense feature matrices — confirm.
classif = SklearnClassifier(SVC(), sparse=False).train(train_data)
classif.classify_many(test_data)
['ham', 'spam']
# Using the scikit-learn classifier directly, without the NLTK wrapper:
# X holds one single-feature sample per row and Y the label of each sample.
X = [[0], [1], [2], [3]]
Y = [0, 1, 2, 3]
clf = SVC(kernel='linear', C=1.0)
clf.fit(X, Y)
SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, decision_function_shape=None, degree=3, gamma='auto', kernel='linear', max_iter=-1, probability=False, random_state=None, shrinking=True, tol=0.001, verbose=False)
# Back to the NLTK wrapper, this time with the kernel and C spelled out explicitly.
classifr = SklearnClassifier(SVC(kernel='rbf', C=1.0), sparse=False).train(train_data)
classifr.classify_many(test_data)
['ham', 'spam']
from sklearn.svm import LinearSVC
# Linear SVM through the same NLTK wrapper.
# NOTE(review): the "_ova" suffix presumably refers to LinearSVC's one-vs-all multiclass scheme — confirm.
classif_ova = SklearnClassifier(LinearSVC(C=1.0), sparse=False).train(train_data)
classif_ova.classify_many(test_data)
['ham', 'spam']
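The key parameter to optimize for a given SVM kernel is the C parameter. Here is example code from sklearn that shows how to choose C by cross-validation: it scores a linear SVM on the digits dataset for a range of C values and plots the mean score together with a band of one standard deviation.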
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets, svm

# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed
# in 0.20; `sklearn.model_selection` is its replacement. Fall back to the old
# module only on installations that predate the move.
try:
    from sklearn.model_selection import cross_val_score
except ImportError:
    from sklearn.cross_validation import cross_val_score

# Handwritten-digits dataset: feature matrix and class labels.
digits = datasets.load_digits()
X = digits.data
y = digits.target

svc = svm.SVC(kernel='linear')
# Ten candidate values for C, log-spaced between 1e-10 and 1.
C_s = np.logspace(-10, 0, 10)

# Mean and standard deviation of the cross-validation scores for each C.
scores = []
scores_std = []
for C in C_s:
    svc.C = C
    # cv=3 keeps the 3-fold split that was the default when this example was
    # written (newer scikit-learn releases default to 5 folds).
    this_scores = cross_val_score(svc, X, y, cv=3, n_jobs=1)
    scores.append(np.mean(this_scores))
    scores_std.append(np.std(this_scores))

# Plot the mean score against C (log x-axis) with dashed lines one standard
# deviation above and below it.
plt.figure(1, figsize=(4, 3))
plt.clf()
plt.semilogx(C_s, scores)
plt.semilogx(C_s, np.array(scores) + np.array(scores_std), 'b--')
plt.semilogx(C_s, np.array(scores) - np.array(scores_std), 'b--')
locs, labels = plt.yticks()
plt.yticks(locs, ["%g" % loc for loc in locs])
plt.ylabel('CV score')
plt.xlabel('Parameter C')
plt.ylim(0, 1.1)
plt.show()