We apply classification tools to solve NLP tasks.
We start with a very simple task that looks at words in isolation and tries to classify them into 2 labels: gender identification. The task consists of guessing whether a name is masculine or feminine.
The classification method consists of taking as input an observation, turning this observation into a feature vector, then predicting the label of this feature vector by applying a trained classifier model.
To prepare for this procedure, we must train a classifier. In supervised learning, a classifier is learned by generalizing a set of observed pairs (observationi, labeli) where [i = 1..N].
%matplotlib inline
def gender_features(word):
return {'last_letter': word[-1]}
gender_features('Shrek')
{'last_letter': 'k'}
from nltk.corpus import names
labeled_names = ([(name, 'male') for name in names.words('male.txt')] + [(name, 'female') for name in names.words('female.txt')])
import random
random.shuffle(labeled_names)
print("There are %s samples in the dataset." % (len(labeled_names)))
There are 7944 samples in the dataset.
import nltk
featuresets = [(gender_features(n), gender) for (n, gender) in labeled_names]
train_set, test_set = featuresets[500:], featuresets[:500]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print("Neo is classified as %s" % (classifier.classify(gender_features('Neo'))))
print("Trinity is classified as %s" % (classifier.classify(gender_features('Trinity'))))
Neo is classified as male Trinity is classified as female
print(nltk.classify.accuracy(classifier, test_set))
0.744
classifier.show_most_informative_features(5)
Most Informative Features last_letter = 'a' female : male = 34.5 : 1.0 last_letter = 'k' male : female = 32.7 : 1.0 last_letter = 'f' male : female = 27.7 : 1.0 last_letter = 'p' male : female = 12.6 : 1.0 last_letter = 'v' male : female = 10.5 : 1.0
def gender_features2(name):
features = {}
features["first_letter"] = name[0].lower()
features["last_letter"] = name[-1].lower()
for letter in 'abcdefghijklmnopqrstuvwxyz':
features["count(%s)" % letter] = name.lower().count(letter)
features["has(%s)" % letter] = (letter in name.lower())
return features
gender_features2('John')
{'first_letter': 'j', 'last_letter': 'n', 'count(a)': 0, 'has(a)': False, 'count(b)': 0, 'has(b)': False, 'count(c)': 0, 'has(c)': False, 'count(d)': 0, 'has(d)': False, 'count(e)': 0, 'has(e)': False, 'count(f)': 0, 'has(f)': False, 'count(g)': 0, 'has(g)': False, 'count(h)': 1, 'has(h)': True, 'count(i)': 0, 'has(i)': False, 'count(j)': 1, 'has(j)': True, 'count(k)': 0, 'has(k)': False, 'count(l)': 0, 'has(l)': False, 'count(m)': 0, 'has(m)': False, 'count(n)': 1, 'has(n)': True, 'count(o)': 1, 'has(o)': True, 'count(p)': 0, 'has(p)': False, 'count(q)': 0, 'has(q)': False, 'count(r)': 0, 'has(r)': False, 'count(s)': 0, 'has(s)': False, 'count(t)': 0, 'has(t)': False, 'count(u)': 0, 'has(u)': False, 'count(v)': 0, 'has(v)': False, 'count(w)': 0, 'has(w)': False, 'count(x)': 0, 'has(x)': False, 'count(y)': 0, 'has(y)': False, 'count(z)': 0, 'has(z)': False}
featuresets = [(gender_features2(n), gender) for (n, gender) in labeled_names]
train_set, test_set = featuresets[500:], featuresets[:500]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))
0.772
train_names = labeled_names[1500:]
devtest_names = labeled_names[500:1500]
test_names = labeled_names[:500]
train_set = [(gender_features(n), gender) for (n, gender) in train_names]
devtest_set = [(gender_features(n), gender) for (n, gender) in devtest_names]
test_set = [(gender_features(n), gender) for (n, gender) in test_names]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, devtest_set))
0.774
errors = []
for (name, tag) in devtest_names:
guess = classifier.classify(gender_features(name))
if guess != tag:
errors.append( (tag, guess, name) )
for (tag, guess, name) in sorted(errors):
print('correct=%-8s guess=%-8s name=%-30s' % (tag, guess, name))
correct=female guess=male name=Addis correct=female guess=male name=Aleen correct=female guess=male name=Allyn correct=female guess=male name=Ardeen correct=female guess=male name=Ariel correct=female guess=male name=Arleen correct=female guess=male name=Aurel correct=female guess=male name=Avis correct=female guess=male name=Avrit correct=female guess=male name=Berget correct=female guess=male name=Bev correct=female guess=male name=Bidget correct=female guess=male name=Bill correct=female guess=male name=Bliss correct=female guess=male name=Brenn correct=female guess=male name=Brooks correct=female guess=male name=Caitlin correct=female guess=male name=Candis correct=female guess=male name=Carmel correct=female guess=male name=Cathleen correct=female guess=male name=Charis correct=female guess=male name=Charlot correct=female guess=male name=Chris correct=female guess=male name=Clair correct=female guess=male name=Dael correct=female guess=male name=Dallas correct=female guess=male name=Daniel correct=female guess=male name=Dawn correct=female guess=male name=Doralin correct=female guess=male name=Ealasaid correct=female guess=male name=Elizabet correct=female guess=male name=Ellyn correct=female guess=male name=Eveleen correct=female guess=male name=Flower correct=female guess=male name=Germain correct=female guess=male name=Gladys correct=female guess=male name=Glen correct=female guess=male name=Glynis correct=female guess=male name=Gretel correct=female guess=male name=Honor correct=female guess=male name=Imojean correct=female guess=male name=Ingaberg correct=female guess=male name=Janel correct=female guess=male name=Joan correct=female guess=male name=Joann correct=female guess=male name=Jordan correct=female guess=male name=Joselyn correct=female guess=male name=Kaitlynn correct=female guess=male name=Kass correct=female guess=male name=Koren correct=female guess=male name=Kristel correct=female guess=male name=Lark correct=female guess=male name=Lian correct=female guess=male name=Lillis correct=female guess=male name=Linnet correct=female guess=male name=Lorilyn correct=female guess=male name=Lorrin correct=female guess=male name=Lynnet correct=female guess=male name=Maegan correct=female guess=male name=Mairead correct=female guess=male name=Marget correct=female guess=male name=Mariam correct=female guess=male name=Marijo correct=female guess=male name=Marilin correct=female guess=male name=Marlo correct=female guess=male name=Mel correct=female guess=male name=Mellisent correct=female guess=male name=Michal correct=female guess=male name=Mikako correct=female guess=male name=Mirabel correct=female guess=male name=Miran correct=female guess=male name=Morgan correct=female guess=male name=Morgen correct=female guess=male name=Nan correct=female guess=male name=Nanon correct=female guess=male name=Noreen correct=female guess=male name=Philis correct=female guess=male name=Phillis correct=female guess=male name=Phylys correct=female guess=male name=Quentin correct=female guess=male name=Raynell correct=female guess=male name=Rhiamon correct=female guess=male name=Rosamond correct=female guess=male name=Rozalin correct=female guess=male name=Sharl correct=female guess=male name=Shaylyn correct=female guess=male name=Sheril correct=female guess=male name=Shirleen correct=female guess=male name=Siobhan correct=female guess=male name=Starlin correct=female guess=male name=Stoddard correct=female guess=male name=Theo correct=female guess=male name=Wandis correct=male guess=female name=Abbey correct=male guess=female name=Adolph correct=male guess=female name=Adolphe correct=male guess=female name=Ajai correct=male guess=female name=Alfonse correct=male guess=female name=Alix correct=male guess=female name=Alley correct=male guess=female name=Alphonse correct=male guess=female name=Amory correct=male guess=female name=Andrea correct=male guess=female name=Andrey correct=male guess=female name=Antony correct=male guess=female name=Ari correct=male guess=female name=Arne correct=male guess=female name=Artie correct=male guess=female name=Bela correct=male guess=female name=Bertie correct=male guess=female name=Bjorne correct=male guess=female name=Bradly correct=male guess=female name=Buddy correct=male guess=female name=Burke correct=male guess=female name=Carlyle correct=male guess=female name=Claire correct=male guess=female name=Cobby correct=male guess=female name=Dana correct=male guess=female name=Dani correct=male guess=female name=Dante correct=male guess=female name=Dennie correct=male guess=female name=Dickey correct=male guess=female name=Dietrich correct=male guess=female name=Dominique correct=male guess=female name=Dudley correct=male guess=female name=Durante correct=male guess=female name=Elijah correct=male guess=female name=Elmore correct=male guess=female name=Enoch correct=male guess=female name=Felice correct=male guess=female name=Ferdy correct=male guess=female name=Gabe correct=male guess=female name=Gabriele correct=male guess=female name=Gale correct=male guess=female name=Gayle correct=male guess=female name=Geoffrey correct=male guess=female name=Godfree correct=male guess=female name=Grove correct=male guess=female name=Gustave correct=male guess=female name=Guthry correct=male guess=female name=Harry correct=male guess=female name=Harvey correct=male guess=female name=Hermy correct=male guess=female name=Hersch correct=male guess=female name=Hugh correct=male guess=female name=Ignace correct=male guess=female name=Izzy correct=male guess=female name=Jamie correct=male guess=female name=Jeffrey correct=male guess=female name=Jermaine correct=male guess=female name=Jermayne correct=male guess=female name=Jerome correct=male guess=female name=Jessie correct=male guess=female name=Jimmy correct=male guess=female name=Joe correct=male guess=female name=Jorge correct=male guess=female name=Judith correct=male guess=female name=Kane correct=male guess=female name=Keith correct=male guess=female name=Kerry correct=male guess=female name=Kory correct=male guess=female name=Lane correct=male guess=female name=Lawerence correct=male guess=female name=Lemmie correct=male guess=female name=Lyle correct=male guess=female name=Maurise correct=male guess=female name=Meredeth correct=male guess=female name=Morrie correct=male guess=female name=Mortie correct=male guess=female name=Mugsy correct=male guess=female name=Murphy correct=male guess=female name=Nicky correct=male guess=female name=Nikki correct=male guess=female name=Noah correct=male guess=female name=Normie correct=male guess=female name=Orville correct=male guess=female name=Ozzie correct=male guess=female name=Paige correct=male guess=female name=Pearce correct=male guess=female name=Randie correct=male guess=female name=Randolph correct=male guess=female name=Reilly correct=male guess=female name=Rex correct=male guess=female name=Rey correct=male guess=female name=Rikki correct=male guess=female name=Roddie correct=male guess=female name=Rodolph correct=male guess=female name=Rodrique correct=male guess=female name=Sammie correct=male guess=female name=Samuele correct=male guess=female name=Saundra correct=male guess=female name=Sax correct=male guess=female name=Say correct=male guess=female name=Scotti correct=male guess=female name=Selby correct=male guess=female name=Shane correct=male guess=female name=Shayne correct=male guess=female name=Shea correct=male guess=female name=Shelby correct=male guess=female name=Skye correct=male guess=female name=Slade correct=male guess=female name=Spike correct=male guess=female name=Stanley correct=male guess=female name=Stearne correct=male guess=female name=Tallie correct=male guess=female name=Tanny correct=male guess=female name=Temple correct=male guess=female name=Terrance correct=male guess=female name=Terrence correct=male guess=female name=Toddy correct=male guess=female name=Tracie correct=male guess=female name=Tremayne correct=male guess=female name=Troy correct=male guess=female name=Tuckie correct=male guess=female name=Tye correct=male guess=female name=Waite correct=male guess=female name=Wallache correct=male guess=female name=Winnie correct=male guess=female name=Witty correct=male guess=female name=Wolfy correct=male guess=female name=Woody correct=male guess=female name=Yancy correct=male guess=female name=Yuri correct=male guess=female name=Zachary correct=male guess=female name=Zebadiah correct=male guess=female name=Zechariah
def gender_features(word):
return {'suffix1': word[-1:],
'suffix2': word[-2:]}
train_set = [(gender_features(n), gender) for (n, gender) in train_names]
devtest_set = [(gender_features(n), gender) for (n, gender) in devtest_names]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, devtest_set))
0.78
We now turn our attention to classifying full documents as opposed to single words in isolation.
The task seems more challenging, but simple methods can achieve surprisingly good results when the task is well defined. Consider the task of predicting whether a movie review is positive or negative. This is a task called sentiment analysis and is a hot practical task in the era of user-generated content (UGC) on the Web.
A good dataset is available in NLTK to experiment with this task.
from nltk.corpus import movie_reviews
documents = [(list(movie_reviews.words(fileid)), category)
for category in movie_reviews.categories()
for fileid in movie_reviews.fileids(category)]
random.shuffle(documents)
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
word_features = [w for (w, c) in all_words.most_common(2000)]
def document_features(document):
document_words = set(document)
features = {}
for word in word_features:
features['contains(%s)' % word] = (word in document_words)
return features
print("There are %d documents in the movie reviews dataset." % (len(documents)))
print("There are %d distinct words in the dataset." % (all_words.B()))
print("There are %d tokens in the dataset." % (all_words.N()))
all_words.tabulate(20)
all_words.plot(100)
There are 2000 documents in the movie reviews dataset. There are 39768 distinct words in the dataset. There are 1583820 tokens in the dataset. , the . a and of to ' is in s " it that - ) ( as with for 77717 76529 65876 38106 35576 34123 31937 30585 25195 21822 18513 17612 16107 15924 15595 11781 11664 11378 10792 9961
<AxesSubplot:xlabel='Samples', ylabel='Counts'>
print(document_features(movie_reviews.words('pos/cv957_8737.txt')))
{'contains(,)': True, 'contains(the)': True, 'contains(.)': True, 'contains(a)': True, 'contains(and)': True, 'contains(of)': True, 'contains(to)': True, "contains(')": True, 'contains(is)': True, 'contains(in)': True, 'contains(s)': True, 'contains(")': True, 'contains(it)': True, 'contains(that)': True, 'contains(-)': True, 'contains())': True, 'contains(()': True, 'contains(as)': True, 'contains(with)': True, 'contains(for)': True, 'contains(his)': True, 'contains(this)': True, 'contains(film)': False, 'contains(i)': False, 'contains(he)': True, 'contains(but)': True, 'contains(on)': True, 'contains(are)': True, 'contains(t)': False, 'contains(by)': True, 'contains(be)': True, 'contains(one)': True, 'contains(movie)': True, 'contains(an)': True, 'contains(who)': True, 'contains(not)': True, 'contains(you)': True, 'contains(from)': True, 'contains(at)': False, 'contains(was)': False, 'contains(have)': True, 'contains(they)': True, 'contains(has)': True, 'contains(her)': False, 'contains(all)': True, 'contains(?)': False, 'contains(there)': True, 'contains(like)': True, 'contains(so)': False, 'contains(out)': True, 'contains(about)': True, 'contains(up)': False, 'contains(more)': False, 'contains(what)': True, 'contains(when)': True, 'contains(which)': True, 'contains(or)': False, 'contains(she)': True, 'contains(their)': False, 'contains(:)': True, 'contains(some)': False, 'contains(just)': True, 'contains(can)': False, 'contains(if)': True, 'contains(we)': False, 'contains(him)': True, 'contains(into)': True, 'contains(even)': False, 'contains(only)': True, 'contains(than)': False, 'contains(no)': False, 'contains(good)': False, 'contains(time)': False, 'contains(most)': True, 'contains(its)': False, 'contains(will)': True, 'contains(story)': False, 'contains(would)': False, 'contains(been)': False, 'contains(much)': False, 'contains(character)': False, 'contains(also)': True, 'contains(get)': True, 'contains(other)': True, 'contains(do)': True, 'contains(two)': True, 'contains(well)': True, 'contains(them)': True, 'contains(very)': True, 'contains(characters)': False, 'contains(;)': False, 'contains(first)': False, 'contains(--)': True, 'contains(after)': False, 'contains(see)': False, 'contains(!)': True, 'contains(way)': True, 'contains(because)': False, 'contains(make)': True, 'contains(life)': False, 'contains(off)': False, 'contains(too)': False, 'contains(any)': False, 'contains(does)': False, 'contains(really)': False, 'contains(had)': False, 'contains(while)': True, 'contains(films)': False, 'contains(how)': True, 'contains(plot)': True, 'contains(little)': True, 'contains(where)': True, 'contains(people)': False, 'contains(over)': False, 'contains(could)': False, 'contains(then)': True, 'contains(me)': True, 'contains(scene)': True, 'contains(man)': False, 'contains(bad)': False, 'contains(my)': False, 'contains(never)': True, 'contains(being)': False, 'contains(best)': True, 'contains(these)': False, 'contains(don)': False, 'contains(new)': False, 'contains(doesn)': False, 'contains(scenes)': False, 'contains(many)': True, 'contains(director)': False, 'contains(such)': False, 'contains(know)': False, 'contains(were)': False, 'contains(movies)': True, 'contains(through)': False, 'contains(here)': True, 'contains(action)': True, 'contains(great)': True, 'contains(re)': True, 'contains(another)': False, 'contains(love)': False, 'contains(go)': False, 'contains(made)': False, 'contains(us)': True, 'contains(big)': False, 'contains(end)': False, 'contains(something)': False, 'contains(back)': False, 'contains(*)': True, 'contains(still)': False, 'contains(world)': True, 'contains(seems)': False, 'contains(work)': False, 'contains(those)': False, 'contains(makes)': False, 'contains(now)': False, 'contains(before)': False, 'contains(however)': True, 'contains(between)': True, 'contains(few)': False, 'contains(/)': False, 'contains(down)': False, 'contains(every)': False, 'contains(though)': False, 'contains(better)': False, 'contains(real)': False, 'contains(audience)': False, 'contains(enough)': False, 'contains(seen)': False, 'contains(take)': False, 'contains(around)': False, 'contains(both)': False, 'contains(going)': False, 'contains(year)': False, 'contains(performance)': False, 'contains(why)': False, 'contains(should)': False, 'contains(role)': False, 'contains(isn)': False, 'contains(same)': True, 'contains(old)': False, 'contains(gets)': True, 'contains(your)': False, 'contains(may)': False, 'contains(things)': True, 'contains(think)': False, 'contains(years)': False, 'contains(last)': False, 'contains(comedy)': True, 'contains(funny)': True, 'contains(actually)': True, 'contains(ve)': False, 'contains(long)': False, 'contains(look)': True, 'contains(almost)': False, 'contains(own)': True, 'contains(thing)': False, 'contains(fact)': False, 'contains(nothing)': False, 'contains(say)': False, 'contains(right)': False, 'contains(john)': False, 'contains(although)': False, 'contains(played)': True, 'contains(find)': False, 'contains(script)': False, 'contains(come)': False, 'contains(ever)': True, 'contains(cast)': False, 'contains(since)': False, 'contains(did)': False, 'contains(star)': False, 'contains(plays)': False, 'contains(young)': False, 'contains(show)': False, 'contains(comes)': False, 'contains(m)': False, 'contains(part)': False, 'contains(original)': False, 'contains(actors)': False, 'contains(screen)': True, 'contains(without)': False, 'contains(again)': False, 'contains(acting)': False, 'contains(three)': False, 'contains(day)': True, 'contains(each)': True, 'contains(point)': False, 'contains(lot)': False, 'contains(least)': True, 'contains(takes)': False, 'contains(guy)': True, 'contains(quite)': False, 'contains(himself)': False, 'contains(away)': False, 'contains(during)': False, 'contains(family)': False, 'contains(effects)': False, 'contains(course)': True, 'contains(goes)': False, 'contains(minutes)': False, 'contains(interesting)': False, 'contains(might)': False, 'contains(far)': False, 'contains(high)': False, 'contains(rather)': False, 'contains(once)': True, 'contains(must)': False, 'contains(anything)': False, 'contains(place)': True, 'contains(set)': False, 'contains(yet)': False, 'contains(watch)': True, 'contains(d)': False, 'contains(making)': True, 'contains(our)': False, 'contains(wife)': True, 'contains(hard)': False, 'contains(always)': False, 'contains(fun)': True, 'contains(didn)': False, 'contains(ll)': False, 'contains(seem)': False, 'contains(special)': False, 'contains(bit)': False, 'contains(times)': False, 'contains(trying)': False, 'contains(hollywood)': False, 'contains(instead)': False, 'contains(give)': False, 'contains(want)': False, 'contains(picture)': False, 'contains(kind)': True, 'contains(american)': False, 'contains(job)': False, 'contains(sense)': False, 'contains(woman)': True, 'contains(home)': False, 'contains(having)': False, 'contains(series)': True, 'contains(actor)': False, 'contains(probably)': False, 'contains(help)': True, 'contains(half)': False, 'contains(along)': True, 'contains(men)': False, 'contains(everything)': True, 'contains(pretty)': False, 'contains(becomes)': False, 'contains(sure)': False, 'contains(black)': False, 'contains(together)': False, 'contains(dialogue)': False, 'contains(money)': False, 'contains(become)': False, 'contains(gives)': False, 'contains(given)': False, 'contains(looking)': False, 'contains(whole)': False, 'contains(watching)': False, 'contains(father)': False, 'contains(`)': False, 'contains(feel)': False, 'contains(everyone)': False, 'contains(music)': False, 'contains(wants)': False, 'contains(sex)': False, 'contains(less)': False, 'contains(done)': False, 'contains(horror)': False, 'contains(got)': True, 'contains(death)': False, 'contains(perhaps)': False, 'contains(city)': False, 'contains(next)': False, 'contains(especially)': True, 'contains(play)': False, 'contains(girl)': False, 'contains(mind)': False, 'contains(10)': False, 'contains(moments)': False, 'contains(looks)': True, 'contains(completely)': False, 'contains(2)': False, 'contains(reason)': False, 'contains(mother)': False, 'contains(whose)': False, 'contains(line)': False, 'contains(night)': False, 'contains(human)': False, 'contains(until)': False, 'contains(rest)': False, 'contains(performances)': False, 'contains(different)': False, 'contains(evil)': False, 'contains(small)': False, 'contains(james)': False, 'contains(simply)': False, 'contains(couple)': False, 'contains(put)': False, 'contains(let)': False, 'contains(anyone)': False, 'contains(ending)': False, 'contains(case)': False, 'contains(several)': False, 'contains(dead)': False, 'contains(michael)': False, 'contains(left)': False, 'contains(thought)': False, 'contains(school)': False, 'contains(shows)': False, 'contains(humor)': False, 'contains(true)': False, 'contains(lost)': False, 'contains(written)': False, 'contains(itself)': False, 'contains(friend)': False, 'contains(entire)': False, 'contains(getting)': True, 'contains(town)': False, 'contains(turns)': False, 'contains(soon)': False, 'contains(someone)': False, 'contains(second)': False, 'contains(main)': False, 'contains(stars)': False, 'contains(found)': False, 'contains(use)': False, 'contains(problem)': False, 'contains(friends)': True, 'contains(tv)': False, 'contains(top)': True, 'contains(name)': False, 'contains(begins)': False, 'contains(called)': False, 'contains(based)': False, 'contains(comic)': False, 'contains(david)': False, 'contains(head)': False, 'contains(else)': False, 'contains(idea)': True, 'contains(either)': False, 'contains(wrong)': True, 'contains(unfortunately)': False, 'contains(later)': False, 'contains(final)': False, 'contains(hand)': False, 'contains(alien)': False, 'contains(house)': False, 'contains(group)': False, 'contains(full)': False, 'contains(used)': True, 'contains(tries)': True, 'contains(often)': True, 'contains(against)': False, 'contains(war)': False, 'contains(sequence)': False, 'contains(keep)': False, 'contains(turn)': False, 'contains(playing)': True, 'contains(boy)': False, 'contains(behind)': False, 'contains(named)': False, 'contains(certainly)': False, 'contains(live)': False, 'contains(believe)': False, 'contains(under)': False, 'contains(works)': False, 'contains(relationship)': False, 'contains(face)': False, 'contains(hour)': False, 'contains(run)': False, 'contains(style)': False, 'contains(said)': False, 'contains(despite)': False, 'contains(person)': False, 'contains(finally)': False, 'contains(shot)': False, 'contains(book)': False, 'contains(doing)': False, 'contains(tell)': False, 'contains(maybe)': False, 'contains(nice)': False, 'contains(son)': False, 'contains(perfect)': False, 'contains(side)': False, 'contains(seeing)': True, 'contains(able)': False, 'contains(finds)': False, 'contains(children)': False, 'contains(days)': False, 'contains(past)': False, 'contains(summer)': False, 'contains(camera)': False, 'contains(won)': False, 'contains(including)': False, 'contains(mr)': False, 'contains(kids)': False, 'contains(lives)': False, 'contains(directed)': False, 'contains(moment)': False, 'contains(game)': False, 'contains(running)': False, 'contains(fight)': True, 'contains(supposed)': False, 'contains(video)': False, 'contains(car)': False, 'contains(matter)': False, 'contains(kevin)': True, 'contains(joe)': False, 'contains(lines)': False, 'contains(worth)': True, 'contains(=)': False, 'contains(daughter)': False, 'contains(earth)': False, 'contains(starts)': False, 'contains(need)': False, 'contains(entertaining)': False, 'contains(white)': False, 'contains(start)': True, 'contains(writer)': False, 'contains(dark)': False, 'contains(short)': False, 'contains(self)': False, 'contains(worst)': False, 'contains(nearly)': False, 'contains(opening)': False, 'contains(try)': False, 'contains(upon)': False, 'contains(care)': False, 'contains(early)': True, 'contains(violence)': False, 'contains(throughout)': False, 'contains(team)': False, 'contains(production)': False, 'contains(example)': False, 'contains(beautiful)': False, 'contains(title)': False, 'contains(exactly)': False, 'contains(jack)': False, 'contains(review)': False, 'contains(major)': False, 'contains(drama)': False, 'contains(&)': False, 'contains(problems)': True, 'contains(sequences)': False, 'contains(obvious)': False, 'contains(version)': False, 'contains(screenplay)': False, 'contains(known)': True, 'contains(killer)': False, 'contains(wasn)': False, 'contains(robert)': False, 'contains(disney)': False, 'contains(already)': False, 'contains(close)': False, 'contains(classic)': False, 'contains(others)': True, 'contains(hit)': False, 'contains(kill)': False, 'contains(deep)': True, 'contains(five)': False, 'contains(order)': False, 'contains(act)': False, 'contains(simple)': False, 'contains(fine)': False, 'contains(themselves)': False, 'contains(heart)': False, 'contains(roles)': False, 'contains(jackie)': True, 'contains(direction)': False, 'contains(eyes)': False, 'contains(four)': False, 'contains(question)': False, 'contains(sort)': False, 'contains(sometimes)': False, 'contains(knows)': False, 'contains(supporting)': False, 'contains(coming)': False, 'contains(voice)': False, 'contains(women)': False, 'contains(truly)': False, 'contains(save)': False, 'contains(jokes)': False, 'contains(computer)': False, 'contains(child)': False, 'contains(o)': False, 'contains(boring)': False, 'contains(tom)': False, 'contains(level)': False, 'contains(1)': False, 'contains(body)': False, 'contains(guys)': False, 'contains(genre)': False, 'contains(brother)': False, 'contains(strong)': False, 'contains(stop)': True, 'contains(room)': False, 'contains(space)': False, 'contains(lee)': False, 'contains(ends)': False, 'contains(beginning)': False, 'contains(ship)': False, 'contains(york)': False, 'contains(attempt)': False, 'contains(thriller)': False, 'contains(scream)': True, 'contains(peter)': False, 'contains(aren)': False, 'contains(husband)': False, 'contains(fiction)': False, 'contains(happens)': False, 'contains(hero)': False, 'contains(novel)': False, 'contains(note)': False, 'contains(hope)': False, 'contains(king)': False, 'contains(yes)': False, 'contains(says)': False, 'contains(tells)': False, 'contains(quickly)': False, 'contains(romantic)': False, 'contains(dog)': False, 'contains(oscar)': False, 'contains(stupid)': False, 'contains(possible)': False, 'contains(saw)': False, 'contains(lead)': True, 'contains(career)': False, 'contains(murder)': False, 'contains(extremely)': False, 'contains(manages)': False, 'contains(god)': False, 'contains(mostly)': False, 'contains(wonder)': False, 'contains(particularly)': False, 'contains(future)': False, 'contains(fans)': False, 'contains(sound)': False, 'contains(worse)': False, 'contains(piece)': False, 'contains(involving)': False, 'contains(de)': False, 'contains(appears)': False, 'contains(planet)': False, 'contains(paul)': False, 'contains(involved)': False, 'contains(mean)': False, 'contains(none)': False, 'contains(taking)': False, 'contains(hours)': False, 'contains(laugh)': True, 'contains(police)': False, 'contains(sets)': False, 'contains(attention)': False, 'contains(co)': False, 'contains(hell)': False, 'contains(eventually)': False, 'contains(single)': False, 'contains(fall)': False, 'contains(falls)': False, 'contains(material)': False, 'contains(emotional)': False, 'contains(power)': False, 'contains(late)': False, 'contains(lack)': False, 'contains(dr)': False, 'contains(van)': False, 'contains(result)': False, 'contains(elements)': False, 'contains(meet)': False, 'contains(smith)': False, 'contains(science)': False, 'contains(experience)': False, 'contains(bring)': False, 'contains(wild)': False, 'contains(living)': False, 'contains(theater)': False, 'contains(interest)': False, 'contains(leads)': False, 'contains(word)': False, 'contains(feature)': False, 'contains(battle)': False, 'contains(girls)': False, 'contains(alone)': False, 'contains(obviously)': False, 'contains(george)': False, 'contains(within)': False, 'contains(usually)': False, 'contains(enjoy)': False, 'contains(guess)': False, 'contains(among)': True, 'contains(taken)': False, 'contains(feeling)': False, 'contains(laughs)': False, 'contains(aliens)': False, 'contains(talk)': True, 'contains(chance)': False, 'contains(talent)': False, 'contains(3)': False, 'contains(middle)': False, 'contains(number)': False, 'contains(easy)': False, 'contains(across)': False, 'contains(needs)': False, 'contains(attempts)': False, 'contains(happen)': False, 'contains(television)': False, 'contains(chris)': False, 'contains(deal)': False, 'contains(poor)': False, 'contains(form)': False, 'contains(girlfriend)': True, 'contains(viewer)': False, 'contains(release)': False, 'contains(killed)': False, 'contains(forced)': False, 'contains(whether)': False, 'contains(wonderful)': False, 'contains(feels)': False, 'contains(oh)': False, 'contains(tale)': False, 'contains(serious)': False, 'contains(expect)': False, 'contains(except)': False, 'contains(light)': False, 'contains(success)': False, 'contains(features)': True, 'contains(premise)': False, 'contains(happy)': False, 'contains(words)': False, 'contains(leave)': False, 'contains(important)': False, 'contains(meets)': False, 'contains(history)': False, 'contains(giving)': False, 'contains(crew)': False, 'contains(type)': False, 'contains(call)': False, 'contains(turned)': False, 'contains(released)': False, 'contains(parents)': False, 'contains(art)': False, 'contains(impressive)': False, 'contains(mission)': False, 'contains(working)': False, 'contains(seemed)': False, 'contains(score)': False, 'contains(told)': False, 'contains(recent)': False, 'contains(robin)': False, 'contains(basically)': False, 'contains(entertainment)': False, 'contains(america)': False, 'contains($)': False, 'contains(surprise)': False, 'contains(apparently)': False, 'contains(easily)': False, 'contains(ryan)': False, 'contains(cool)': False, 'contains(stuff)': False, 'contains(cop)': False, 'contains(change)': False, 'contains(williams)': False, 'contains(crime)': False, 'contains(office)': False, 'contains(parts)': False, 'contains(somehow)': False, 'contains(sequel)': False, 'contains(william)': False, 'contains(cut)': False, 'contains(die)': False, 'contains(jones)': False, 'contains(credits)': False, 'contains(batman)': False, 'contains(suspense)': False, 'contains(brings)': False, 'contains(events)': False, 'contains(reality)': False, 'contains(whom)': False, 'contains(local)': False, 'contains(talking)': False, 'contains(difficult)': True, 'contains(using)': False, 'contains(went)': False, 'contains(writing)': False, 'contains(remember)': False, 'contains(near)': False, 'contains(straight)': False, 'contains(hilarious)': True, 'contains(ago)': False, 'contains(certain)': False, 'contains(ben)': False, 'contains(kid)': False, 'contains(wouldn)': False, 'contains(slow)': True, 'contains(blood)': False, 'contains(mystery)': False, 'contains(complete)': False, 'contains(red)': False, 'contains(popular)': False, 'contains(effective)': False, 'contains(am)': False, 'contains(fast)': True, 'contains(flick)': False, 'contains(due)': False, 'contains(runs)': False, 'contains(gone)': False, 'contains(return)': False, 'contains(presence)': False, 'contains(quality)': False, 'contains(dramatic)': False, 'contains(filmmakers)': False, 'contains(age)': False, 'contains(brothers)': False, 'contains(business)': False, 'contains(general)': False, 'contains(rock)': False, 'contains(sexual)': False, 'contains(present)': False, 'contains(surprisingly)': False, 'contains(anyway)': False, 'contains(uses)': False, 'contains(4)': False, 'contains(personal)': False, 'contains(figure)': False, 'contains(smart)': False, 'contains(ways)': False, 'contains(decides)': False, 'contains(annoying)': False, 'contains(begin)': False, 'contains(couldn)': False, 'contains(somewhat)': False, 'contains(shots)': False, 'contains(rich)': False, 'contains(minute)': False, 'contains(law)': False, 'contains(previous)': False, 'contains(jim)': False, 'contains(successful)': False, 'contains(harry)': False, 'contains(water)': False, 'contains(similar)': False, 'contains(absolutely)': False, 'contains(motion)': False, 'contains(former)': False, 'contains(strange)': False, 'contains(came)': False, 'contains(follow)': False, 'contains(read)': False, 'contains(project)': False, 'contains(million)': True, 'contains(secret)': False, 'contains(starring)': False, 'contains(clear)': False, 'contains(familiar)': False, 'contains(romance)': False, 'contains(intelligent)': False, 'contains(third)': True, 'contains(excellent)': False, 'contains(amazing)': False, 'contains(party)': False, 'contains(budget)': False, 'contains(eye)': False, 'contains(actress)': False, 'contains(prison)': False, 'contains(latest)': False, 'contains(means)': True, 'contains(company)': False, 'contains(towards)': False, 'contains(predictable)': False, 'contains(powerful)': False, 'contains(nor)': False, 'contains(bob)': False, 'contains(beyond)': False, 'contains(visual)': False, 'contains(leaves)': False, 'contains(r)': False, 'contains(nature)': False, 'contains(following)': False, 'contains(villain)': False, 'contains(leaving)': False, 'contains(animated)': False, 'contains(low)': False, 'contains(myself)': False, 'contains(b)': False, 'contains(bill)': False, 'contains(sam)': False, 'contains(filled)': False, 'contains(wars)': False, 'contains(questions)': False, 'contains(cinema)': False, 'contains(message)': False, 'contains(box)': False, 'contains(moving)': True, 'contains(herself)': False, 'contains(country)': False, 'contains(usual)': False, 'contains(martin)': False, 'contains(definitely)': False, 'contains(add)': False, 'contains(large)': False, 'contains(clever)': False, 'contains(create)': False, 'contains(felt)': False, 'contains(stories)': False, 'contains(brilliant)': False, 'contains(ones)': False, 'contains(giant)': False, 'contains(situation)': False, 'contains(murphy)': False, 'contains(break)': False, 'contains(opens)': False, 'contains(scary)': False, 'contains(doubt)': False, 'contains(drug)': True, 'contains(bunch)': False, 'contains(thinking)': False, 'contains(solid)': False, 'contains(effect)': False, 'contains(learn)': False, 'contains(move)': False, 'contains(force)': False, 'contains(potential)': False, 'contains(seriously)': False, 'contains(follows)': False, 'contains(above)': False, 'contains(saying)': False, 'contains(huge)': False, 'contains(class)': False, 'contains(plan)': False, 'contains(agent)': False, 'contains(created)': False, 'contains(unlike)': False, 'contains(pay)': False, 'contains(non)': True, 'contains(married)': False, 'contains(mark)': False, 'contains(sweet)': False, 'contains(perfectly)': False, 'contains(ex)': False, 'contains(realize)': False, 'contains(audiences)': False, 'contains(took)': False, 'contains(decent)': False, 'contains(likely)': False, 'contains(dream)': False, 'contains(view)': False, 'contains(scott)': False, 'contains(subject)': False, 'contains(understand)': False, 'contains(happened)': False, 'contains(enjoyable)': True, 'contains(studio)': False, 'contains(immediately)': False, 'contains(open)': False, 'contains(e)': False, 'contains(points)': False, 'contains(heard)': False, 'contains(viewers)': False, 'contains(cameron)': False, 'contains(truman)': False, 'contains(bruce)': False, 'contains(frank)': False, 'contains(private)': False, 'contains(stay)': False, 'contains(fails)': False, 'contains(impossible)': False, 'contains(cold)': False, 'contains(richard)': False, 'contains(overall)': False, 'contains(merely)': False, 'contains(exciting)': False, 'contains(mess)': False, 'contains(chase)': True, 'contains(free)': False, 'contains(ten)': False, 'contains(neither)': False, 'contains(wanted)': False, 'contains(gun)': True, 'contains(appear)': False, 'contains(carter)': False, 'contains(escape)': False, 'contains(ultimately)': False, 'contains(+)': False, 'contains(fan)': False, 'contains(inside)': False, 'contains(favorite)': False, 'contains(haven)': False, 'contains(modern)': False, 'contains(l)': False, 'contains(wedding)': False, 'contains(stone)': False, 'contains(trek)': False, 'contains(brought)': False, 'contains(trouble)': True, 'contains(otherwise)': False, 'contains(tim)': False, 'contains(5)': False, 'contains(allen)': False, 'contains(bond)': False, 'contains(society)': False, 'contains(liked)': False, 'contains(dumb)': False, 'contains(musical)': False, 'contains(stand)': False, 'contains(political)': False, 'contains(various)': False, 'contains(talented)': False, 'contains(particular)': False, 'contains(west)': False, 'contains(state)': False, 'contains(keeps)': True, 'contains(english)': False, 'contains(silly)': False, 'contains(u)': False, 'contains(situations)': False, 'contains(park)': False, 'contains(teen)': False, 'contains(rating)': False, 'contains(slightly)': False, 'contains(steve)': False, 'contains(truth)': False, 'contains(air)': False, 'contains(element)': False, 'contains(joke)': False, 'contains(spend)': False, 'contains(key)': True, 'contains(biggest)': False, 'contains(members)': False, 'contains(effort)': False, 'contains(government)': False, 'contains(focus)': False, 'contains(eddie)': False, 'contains(soundtrack)': False, 'contains(hands)': False, 'contains(earlier)': False, 'contains(chan)': True, 'contains(purpose)': False, 'contains(today)': True, 'contains(showing)': False, 'contains(memorable)': False, 'contains(six)': False, 'contains(cannot)': False, 'contains(max)': False, 'contains(offers)': False, 'contains(rated)': False, 'contains(mars)': False, 'contains(heavy)': False, 'contains(totally)': False, 'contains(control)': False, 'contains(credit)': False, 'contains(fi)': False, 'contains(woody)': False, 'contains(ideas)': False, 'contains(sci)': False, 'contains(wait)': False, 'contains(sit)': False, 'contains(female)': False, 'contains(ask)': False, 'contains(waste)': False, 'contains(terrible)': False, 'contains(depth)': False, 'contains(simon)': False, 'contains(aspect)': False, 'contains(list)': False, 'contains(mary)': False, 'contains(sister)': False, 'contains(animation)': False, 'contains(entirely)': False, 'contains(fear)': False, 'contains(steven)': False, 'contains(moves)': False, 'contains(actual)': False, 'contains(army)': False, 'contains(british)': False, 'contains(constantly)': False, 'contains(fire)': False, 'contains(convincing)': False, 'contains(setting)': False, 'contains(gave)': False, 'contains(tension)': False, 'contains(street)': False, 'contains(8)': False, 'contains(brief)': True, 'contains(ridiculous)': False, 'contains(cinematography)': False, 'contains(typical)': False, 'contains(nick)': False, 'contains(screenwriter)': False, 'contains(ability)': False, 'contains(spent)': False, 'contains(quick)': True, 'contains(violent)': False, 'contains(atmosphere)': False, 'contains(subtle)': False, 'contains(expected)': False, 'contains(fairly)': True, 'contains(seven)': False, 'contains(killing)': False, 'contains(tone)': False, 'contains(master)': False, 'contains(disaster)': False, 'contains(lots)': False, 'contains(thinks)': False, 'contains(song)': False, 'contains(cheap)': False, 'contains(suddenly)': False, 'contains(background)': False, 'contains(club)': False, 'contains(willis)': False, 'contains(whatever)': False, 'contains(highly)': False, 'contains(sees)': True, 'contains(complex)': False, 'contains(greatest)': False, 'contains(impact)': False, 'contains(beauty)': False, 'contains(front)': False, 'contains(humans)': False, 'contains(indeed)': False, 'contains(flat)': False, 'contains(grace)': False, 'contains(wrote)': False, 'contains(amusing)': False, 'contains(ii)': False, 'contains(mike)': False, 'contains(further)': False, 'contains(cute)': False, 'contains(dull)': False, 'contains(minor)': False, 'contains(recently)': False, 'contains(hate)': False, 'contains(outside)': False, 'contains(plenty)': False, 'contains(wish)': False, 'contains(godzilla)': False, 'contains(college)': False, 'contains(titanic)': False, 'contains(sounds)': False, 'contains(telling)': False, 'contains(sight)': False, 'contains(double)': False, 'contains(cinematic)': False, 'contains(queen)': False, 'contains(hold)': False, 'contains(meanwhile)': False, 'contains(awful)': False, 'contains(clearly)': False, 'contains(theme)': False, 'contains(hear)': False, 'contains(x)': False, 'contains(amount)': False, 'contains(baby)': False, 'contains(approach)': False, 'contains(dreams)': False, 'contains(shown)': False, 'contains(island)': False, 'contains(reasons)': False, 'contains(charm)': False, 'contains(miss)': True, 'contains(longer)': False, 'contains(common)': False, 'contains(sean)': False, 'contains(carry)': False, 'contains(believable)': False, 'contains(realistic)': False, 'contains(chemistry)': True, 'contains(possibly)': False, 'contains(casting)': False, 'contains(carrey)': False, 'contains(french)': False, 'contains(trailer)': False, 'contains(tough)': False, 'contains(produced)': False, 'contains(imagine)': False, 'contains(choice)': False, 'contains(ride)': False, 'contains(somewhere)': False, 'contains(hot)': False, 'contains(race)': False, 'contains(road)': False, 'contains(leader)': False, 'contains(thin)': False, 'contains(jerry)': False, 'contains(slowly)': False, 'contains(delivers)': False, 'contains(detective)': False, 'contains(brown)': False, 'contains(jackson)': False, 'contains(member)': False, 'contains(provide)': False, 'contains(president)': False, 'contains(puts)': False, 'contains(asks)': False, 'contains(critics)': False, 'contains(appearance)': False, 'contains(famous)': False, 'contains(okay)': False, 'contains(intelligence)': False, 'contains(energy)': False, 'contains(sent)': False, 'contains(spielberg)': False, 'contains(development)': False, 'contains(etc)': False, 'contains(language)': False, 'contains(blue)': False, 'contains(proves)': False, 'contains(vampire)': False, 'contains(seemingly)': False, 'contains(basic)': False, 'contains(caught)': False, 'contains(decide)': False, 'contains(opportunity)': False, 'contains(incredibly)': False, 'contains(images)': False, 'contains(band)': False, 'contains(j)': False, 'contains(writers)': False, 'contains(knew)': False, 'contains(interested)': False, 'contains(considering)': False, 'contains(boys)': False, 'contains(thanks)': False, 'contains(remains)': False, 'contains(climax)': True, 'contains(event)': False, 'contains(directing)': False, 'contains(conclusion)': False, 'contains(leading)': False, 'contains(ground)': False, 'contains(lies)': False, 'contains(forget)': False, 'contains(alive)': False, 'contains(tarzan)': False, 'contains(century)': False, 'contains(provides)': False, 'contains(trip)': False, 'contains(partner)': False, 'contains(central)': False, 'contains(tarantino)': False, 'contains(period)': False, 'contains(pace)': False, 'contains(yourself)': False, 'contains(worked)': False, 'contains(ready)': False, 'contains(date)': False, 'contains(thus)': False, 'contains(1998)': False, 'contains(terrific)': False, 'contains(write)': False, 'contains(average)': False, 'contains(onto)': False, 'contains(songs)': False, 'contains(occasionally)': False, 'contains(doctor)': False, 'contains(stands)': False, 'contains(hardly)': False, 'contains(monster)': False, 'contains(led)': False, 'contains(mysterious)': False, 'contains(details)': False, 'contains(wasted)': False, 'contains(apart)': False, 'contains(aside)': False, 'contains(store)': False, 'contains(billy)': False, 'contains(boss)': True, 'contains(travolta)': False, 'contains(producer)': False, 'contains(pull)': False, 'contains(consider)': False, 'contains(pictures)': False, 'contains(becoming)': False, 'contains(cage)': False, 'contains(loud)': False, 'contains(looked)': False, 'contains(officer)': False, 'contains(twenty)': False, 'contains(system)': False, 'contains(contains)': False, 'contains(julia)': False, 'contains(subplot)': False, 'contains(missing)': False, 'contains(personality)': False, 'contains(building)': False, 'contains(learns)': False, 'contains(hong)': True, 'contains(la)': False, 'contains(apartment)': False, 'contains(7)': False, 'contains(bizarre)': False, 'contains(powers)': False, 'contains(flaws)': False, 'contains(catch)': False, 'contains(lawyer)': False, 'contains(shoot)': False, 'contains(student)': False, 'contains(unique)': True, 'contains(000)': False, 'contains(admit)': False, 'contains(concept)': False, 'contains(needed)': False, 'contains(thrown)': False, 'contains(christopher)': False, 'contains(laughing)': False, 'contains(green)': False, 'contains(twists)': False, 'contains(matthew)': False, 'contains(touch)': False, 'contains(waiting)': False, 'contains(victim)': False, 'contains(cover)': False, 'contains(machine)': False, 'contains(danny)': False, 'contains(mention)': False, 'contains(search)': False, 'contains(1997)': False, 'contains(win)': False, 'contains(door)': False, 'contains(manner)': False, 'contains(train)': True, 'contains(saving)': False, 'contains(share)': False, 'contains(image)': False, 'contains(discovers)': False, 'contains(normal)': False, 'contains(cross)': False, 'contains(fox)': False, 'contains(returns)': False, 'contains(adult)': False, 'contains(adds)': False, 'contains(answer)': False, 'contains(adventure)': False, 'contains(lame)': False, 'contains(male)': False, 'contains(odd)': False, 'contains(singer)': False, 'contains(deserves)': False, 'contains(gore)': False, 'contains(states)': False, 'contains(include)': False, 'contains(equally)': False, 'contains(months)': False, 'contains(barely)': False, 'contains(directors)': False, 'contains(introduced)': False, 'contains(fashion)': False, 'contains(social)': False, 'contains(1999)': False, 'contains(news)': False, 'contains(hair)': False, 'contains(dance)': False, 'contains(innocent)': False, 'contains(camp)': False, 'contains(teacher)': False, 'contains(became)': False, 'contains(sad)': False, 'contains(witch)': False, 'contains(includes)': False, 'contains(nights)': False, 'contains(jason)': False, 'contains(julie)': False, 'contains(latter)': False, 'contains(food)': True, 'contains(jennifer)': False, 'contains(land)': False, 'contains(menace)': False, 'contains(rate)': False, 'contains(storyline)': False, 'contains(contact)': False, 'contains(jean)': False, 'contains(elizabeth)': False, 'contains(fellow)': False, 'contains(changes)': False, 'contains(henry)': False, 'contains(hill)': False, 'contains(pulp)': False, 'contains(gay)': False, 'contains(tried)': False, 'contains(surprised)': False, 'contains(literally)': False, 'contains(walk)': False, 'contains(standard)': False, 'contains(90)': False, 'contains(forward)': False, 'contains(wise)': False, 'contains(enjoyed)': False, 'contains(discover)': False, 'contains(pop)': False, 'contains(anderson)': False, 'contains(offer)': False, 'contains(recommend)': False, 'contains(public)': False, 'contains(drive)': False, 'contains(c)': False, 'contains(toy)': False, 'contains(charming)': False, 'contains(fair)': False, 'contains(chinese)': True, 'contains(rescue)': False, 'contains(terms)': False, 'contains(mouth)': False, 'contains(lucas)': False, 'contains(accident)': False, 'contains(dies)': False, 'contains(decided)': False, 'contains(edge)': False, 'contains(footage)': False, 'contains(culture)': False, 'contains(weak)': False, 'contains(presented)': False, 'contains(blade)': False, 'contains(younger)': False, 'contains(douglas)': False, 'contains(natural)': False, 'contains(born)': False, 'contains(generally)': False, 'contains(teenage)': False, 'contains(older)': False, 'contains(horrible)': False, 'contains(addition)': False, 'contains(sadly)': False, 'contains(creates)': False, 'contains(disturbing)': False, 'contains(roger)': False, 'contains(detail)': False, 'contains(devil)': False, 'contains(debut)': False, 'contains(track)': False, 'contains(developed)': False, 'contains(week)': False, 'contains(russell)': False, 'contains(attack)': False, 'contains(explain)': False, 'contains(rarely)': False, 'contains(fully)': False, 'contains(prove)': False, 'contains(exception)': False, 'contains(jeff)': False, 'contains(twist)': False, 'contains(gang)': False, 'contains(winning)': False, 'contains(jr)': False, 'contains(species)': False, 'contains(issues)': False, 'contains(fresh)': False, 'contains(rules)': False, 'contains(meaning)': False, 'contains(inspired)': False, 'contains(heroes)': False, 'contains(desperate)': False, 'contains(fighting)': False, 'contains(filmed)': False, 'contains(faces)': False, 'contains(alan)': False, 'contains(bright)': False, 'contains(ass)': True, 'contains(flying)': False, 'contains(kong)': True, 'contains(rush)': False, 'contains(forces)': False, 'contains(charles)': False, 'contains(numerous)': False, 'contains(emotions)': False, 'contains(involves)': True, 'contains(patrick)': False, 'contains(weird)': False, 'contains(apparent)': False, 'contains(information)': False, 'contains(revenge)': False, 'contains(jay)': False, 'contains(toward)': False, 'contains(surprising)': False, 'contains(twice)': False, 'contains(editing)': False, 'contains(calls)': False, 'contains(lose)': False, 'contains(vegas)': False, 'contains(stage)': False, 'contains(intended)': False, 'contains(gags)': False, 'contains(opinion)': False, 'contains(likes)': False, 'contains(crazy)': False, 'contains(owner)': False, 'contains(places)': False, 'contains(pair)': False, 'contains(genuine)': False, 'contains(epic)': False, 'contains(speak)': False, 'contains(throw)': False, 'contains(appeal)': False, 'contains(gibson)': False, 'contains(captain)': False, 'contains(military)': False, 'contains(20)': False, 'contains(blair)': False, 'contains(nowhere)': False, 'contains(length)': False, 'contains(nicely)': False, 'contains(cause)': False, 'contains(pass)': False, 'contains(episode)': False, 'contains(kiss)': False, 'contains(arnold)': True, 'contains(please)': False, 'contains(hasn)': False, 'contains(phone)': False, 'contains(filmmaking)': False, 'contains(formula)': False, 'contains(boyfriend)': False, 'contains(talents)': False, 'contains(creating)': False, 'contains(kelly)': False, 'contains(buy)': False, 'contains(wide)': False, 'contains(fantasy)': False, 'contains(mood)': False, 'contains(heads)': False, 'contains(pathetic)': False, 'contains(lacks)': False, 'contains(loved)': False, 'contains(asked)': False, 'contains(mrs)': False, 'contains(witty)': False, 'contains(shakespeare)': False, 'contains(mulan)': False, 'contains(generation)': False, 'contains(affair)': False, 'contains(pieces)': False, 'contains(task)': False, 'contains(rare)': False, 'contains(kept)': False, 'contains(cameo)': False, 'contains(fascinating)': False, 'contains(ed)': False, 'contains(fbi)': False, 'contains(burton)': False, 'contains(incredible)': False, 'contains(accent)': False, 'contains(artist)': False, 'contains(superior)': False, 'contains(academy)': False, 'contains(thomas)': False, 'contains(spirit)': False, 'contains(technical)': False, 'contains(confusing)': False, 'contains(poorly)': False, 'contains(target)': False, 'contains(lover)': False, 'contains(woo)': False, 'contains(mentioned)': False, 'contains(theaters)': False, 'contains(plane)': False, 'contains(confused)': False, 'contains(dennis)': False, 'contains(rob)': False, 'contains(appropriate)': False, 'contains(christmas)': False, 'contains(considered)': False, 'contains(legend)': False, 'contains(shame)': False, 'contains(soul)': False, 'contains(matt)': False, 'contains(campbell)': False, 'contains(process)': False, 'contains(bottom)': False, 'contains(sitting)': False, 'contains(brain)': False, 'contains(creepy)': False, 'contains(13)': False, 'contains(forever)': False, 'contains(dude)': False, 'contains(crap)': False, 'contains(superb)': False, 'contains(speech)': False, 'contains(ice)': False, 'contains(journey)': False, 'contains(masterpiece)': False, 'contains(intriguing)': False, 'contains(names)': False, 'contains(pick)': False, 'contains(speaking)': False, 'contains(virtually)': False, 'contains(award)': False, 'contains(worthy)': False, 'contains(marriage)': False, 'contains(deliver)': False, 'contains(cash)': False, 'contains(magic)': False, 'contains(respect)': False, 'contains(product)': False, 'contains(necessary)': False, 'contains(suppose)': False, 'contains(silent)': False, 'contains(pointless)': False, 'contains(station)': False, 'contains(affleck)': False, 'contains(dimensional)': False, 'contains(charlie)': False, 'contains(allows)': False, 'contains(avoid)': False, 'contains(meant)': False, 'contains(cops)': False, 'contains(attitude)': False, 'contains(relationships)': False, 'contains(hits)': False, 'contains(stephen)': False, 'contains(spends)': False, 'contains(relief)': False, 'contains(physical)': True, 'contains(count)': False, 'contains(reviews)': False, 'contains(appreciate)': False, 'contains(cliches)': False, 'contains(holds)': False, 'contains(pure)': False, 'contains(plans)': False, 'contains(limited)': False, 'contains(failed)': False, 'contains(pain)': False, 'contains(impression)': False, 'contains(unless)': False, 'contains(sub)': False, 'contains([)': False, 'contains(total)': False, 'contains(creature)': False, 'contains(viewing)': False, 'contains(loves)': False, 'contains(princess)': False, 'contains(kate)': False, 'contains(rising)': False, 'contains(woods)': False, 'contains(baldwin)': False, 'contains(angry)': False, 'contains(drawn)': False, 'contains(step)': False, 'contains(matrix)': False, 'contains(themes)': False, 'contains(satire)': False, 'contains(arts)': False, 'contains(])': False, 'contains(remake)': False, 'contains(wall)': False, 'contains(moral)': False, 'contains(color)': False, 'contains(ray)': False, 'contains(stuck)': False, 'contains(touching)': False, 'contains(wit)': False, 'contains(tony)': False, 'contains(hanks)': False, 'contains(continues)': False, 'contains(damn)': False, 'contains(nobody)': False, 'contains(cartoon)': False, 'contains(keeping)': False, 'contains(realized)': False, 'contains(criminal)': False, 'contains(unfunny)': False, 'contains(comedic)': False, 'contains(martial)': False, 'contains(disappointing)': False, 'contains(anti)': False, 'contains(graphic)': False, 'contains(stunning)': False, 'contains(actions)': False, 'contains(floor)': False, 'contains(emotion)': False, 'contains(soldiers)': False, 'contains(edward)': False, 'contains(comedies)': False, 'contains(driver)': False, 'contains(expectations)': False, 'contains(added)': False, 'contains(mad)': False, 'contains(angels)': False, 'contains(shallow)': False, 'contains(suspect)': False, 'contains(humorous)': False, 'contains(phantom)': False, 'contains(appealing)': False, 'contains(device)': False, 'contains(design)': False, 'contains(industry)': False, 'contains(reach)': False, 'contains(fat)': False, 'contains(blame)': False, 'contains(united)': False, 'contains(sign)': False, 'contains(portrayal)': False, 'contains(rocky)': False, 'contains(finale)': False, 'contains(grand)': False, 'contains(opposite)': False, 'contains(hotel)': False, 'contains(match)': False, 'contains(damme)': False, 'contains(speed)': False, 'contains(ok)': False, 'contains(loving)': False, 'contains(field)': True, 'contains(larry)': False, 'contains(urban)': False, 'contains(troopers)': False, 'contains(compared)': False, 'contains(apes)': False, 'contains(rose)': False, 'contains(falling)': False, 'contains(era)': False, 'contains(loses)': False, 'contains(adults)': False, 'contains(managed)': False, 'contains(dad)': False, 'contains(therefore)': False, 'contains(pg)': False, 'contains(results)': False, 'contains(guns)': False, 'contains(radio)': False, 'contains(lady)': False, 'contains(manage)': False, 'contains(spice)': False, 'contains(naked)': False, 'contains(started)': False, 'contains(intense)': False, 'contains(humanity)': False, 'contains(wonderfully)': False, 'contains(slasher)': False, 'contains(bland)': False, 'contains(imagination)': False, 'contains(walking)': False, 'contains(willing)': False, 'contains(horse)': False, 'contains(rent)': False, 'contains(mix)': False, 'contains(generated)': False, 'contains(g)': False, 'contains(utterly)': False, 'contains(scientist)': False, 'contains(washington)': False, 'contains(notice)': False, 'contains(players)': False, 'contains(teenagers)': False, 'contains(moore)': False, 'contains(board)': False, 'contains(price)': False, 'contains(frightening)': False, 'contains(tommy)': False, 'contains(spectacular)': False, 'contains(bored)': False, 'contains(jane)': False, 'contains(join)': False, 'contains(producers)': False, 'contains(johnny)': False, 'contains(zero)': False, 'contains(vampires)': False, 'contains(adaptation)': False, 'contains(dollars)': False, 'contains(parody)': False, 'contains(documentary)': False, 'contains(dvd)': False, 'contains(wayne)': False, 'contains(post)': False, 'contains(exist)': False, 'contains(matters)': False, 'contains(chosen)': False, 'contains(mel)': False, 'contains(attractive)': True, 'contains(plain)': False, 'contains(trust)': False, 'contains(safe)': False, 'contains(reading)': False, 'contains(hoping)': False, 'contains(protagonist)': False, 'contains(feelings)': False, 'contains(fate)': False, 'contains(finding)': False, 'contains(feet)': False, 'contains(visuals)': False, 'contains(spawn)': False, 'contains(compelling)': False, 'contains(hall)': False, 'contains(sympathetic)': False, 'contains(featuring)': False, 'contains(difference)': False, 'contains(professional)': False, 'contains(drugs)': False, 'contains(ford)': False, 'contains(shooting)': False, 'contains(gold)': False, 'contains(patch)': False, 'contains(build)': False, 'contains(boat)': False, 'contains(cruise)': False, 'contains(honest)': False, 'contains(media)': False, 'contains(flicks)': False, 'contains(bug)': False, 'contains(bringing)': False, 'contains(dangerous)': True, 'contains(watched)': False, 'contains(grant)': False, 'contains(smile)': False, 'contains(plus)': False, 'contains(shouldn)': False, 'contains(decision)': False, 'contains(visually)': False, 'contains(allow)': False, 'contains(starship)': False, 'contains(roberts)': False, 'contains(dying)': False, 'contains(portrayed)': False, 'contains(turning)': False, 'contains(believes)': False, 'contains(changed)': False, 'contains(shock)': False, 'contains(destroy)': False, 'contains(30)': False, 'contains(crowd)': False, 'contains(broken)': False, 'contains(tired)': False, 'contains(fail)': False, 'contains(south)': False, 'contains(died)': False, 'contains(cult)': False, 'contains(fake)': False, 'contains(vincent)': False, 'contains(identity)': False, 'contains(sexy)': False, 'contains(hunt)': False, 'contains(jedi)': False, 'contains(flynt)': False, 'contains(alex)': False, 'contains(engaging)': False, 'contains(serve)': False, 'contains(snake)': False, 'contains(yeah)': False, 'contains(expecting)': False, 'contains(100)': False, 'contains(decade)': False, 'contains(ups)': False, 'contains(constant)': False, 'contains(current)': False, 'contains(survive)': False, 'contains(jimmy)': False, 'contains(buddy)': False, 'contains(send)': False, 'contains(brooks)': False, 'contains(goofy)': False, 'contains(likable)': False, 'contains(humour)': False, 'contains(technology)': False, 'contains(files)': False, 'contains(babe)': False, 'contains(aspects)': False, 'contains(presents)': False, 'contains(kills)': False, 'contains(supposedly)': False, 'contains(eight)': True, 'contains(sandler)': False, 'contains(hospital)': False, 'contains(test)': False, 'contains(hidden)': False, 'contains(brian)': False, 'contains(books)': False, 'contains(promise)': False, 'contains(determined)': False, 'contains(professor)': False, 'contains(welcome)': False, 'contains(pleasure)': False, 'contains(succeeds)': False, 'contains(individual)': False, 'contains(annie)': False, 'contains(mob)': False, 'contains(ted)': False, 'contains(virus)': False, 'contains(content)': False, 'contains(gary)': False, 'contains(direct)': False, 'contains(contrived)': False, 'contains(carpenter)': False, 'contains(scale)': False, 'contains(sick)': False, 'contains(nasty)': False, 'contains(conflict)': False, 'contains(haunting)': False, 'contains(ghost)': False, 'contains(filmmaker)': False, 'contains(japanese)': False, 'contains(helps)': False, 'contains(fare)': False, 'contains(lucky)': False, 'contains(ultimate)': False, 'contains(window)': False, 'contains(support)': False, 'contains(goal)': False, 'contains(provided)': False, 'contains(genius)': False, 'contains(winner)': False, 'contains(taylor)': False, 'contains(fantastic)': False, 'contains(faith)': False, 'contains(lynch)': False, 'contains(fit)': False, 'contains(catherine)': False, 'contains(ms)': False, 'contains(paced)': False, 'contains(breaks)': False, 'contains(al)': False, 'contains(frame)': False, 'contains(travel)': False, 'contains(badly)': False, 'contains(available)': False, 'contains(cares)': False, 'contains(reeves)': False, 'contains(crash)': False, 'contains(driving)': False, 'contains(press)': False, 'contains(seagal)': False, 'contains(amy)': False, 'contains(9)': False, 'contains(headed)': False, 'contains(instance)': False, 'contains(excuse)': False, 'contains(offensive)': False, 'contains(narrative)': False, 'contains(fault)': False, 'contains(bus)': False, 'contains(f)': False, 'contains(extreme)': False, 'contains(miller)': False, 'contains(guilty)': False, 'contains(grows)': False, 'contains(overly)': False, 'contains(liners)': False, 'contains(forgotten)': False, 'contains(ahead)': False, 'contains(accept)': False, 'contains(porn)': False, 'contains(directly)': False, 'contains(helen)': False, 'contains(began)': False, 'contains(lord)': False, 'contains(folks)': False, 'contains(mediocre)': False, 'contains(bar)': False, 'contains(surface)': False, 'contains(super)': False, 'contains(failure)': False, 'contains(6)': False, 'contains(acted)': False, 'contains(quiet)': False, 'contains(laughable)': False, 'contains(sheer)': False, 'contains(security)': True, 'contains(emotionally)': False, 'contains(season)': False, 'contains(stuart)': False, 'contains(jail)': True, 'contains(deals)': False, 'contains(cheesy)': False, 'contains(court)': False, 'contains(beach)': False, 'contains(austin)': False, 'contains(model)': False, 'contains(outstanding)': False, 'contains(substance)': False, 'contains(nudity)': False, 'contains(slapstick)': False, 'contains(joan)': False, 'contains(reveal)': False, 'contains(placed)': False, 'contains(check)': False, 'contains(beast)': False, 'contains(hurt)': False, 'contains(bloody)': False, 'contains(acts)': False, 'contains(fame)': False, 'contains(meeting)': False, 'contains(nuclear)': False, 'contains(1996)': False, 'contains(strength)': False, 'contains(center)': False, 'contains(funniest)': False, 'contains(standing)': True, 'contains(damon)': False, 'contains(clich)': False, 'contains(position)': False, 'contains(desire)': False, 'contains(driven)': False, 'contains(seat)': False, 'contains(stock)': False, 'contains(wondering)': True, 'contains(realizes)': False, 'contains(dealing)': False, 'contains(taste)': False, 'contains(routine)': False, 'contains(comparison)': False, 'contains(cinematographer)': False, 'contains(seconds)': False, 'contains(singing)': False, 'contains(gangster)': True, 'contains(responsible)': False, 'contains(football)': False, 'contains(remarkable)': False, 'contains(hunting)': False, 'contains(adams)': False, 'contains(fly)': False, 'contains(suspects)': False, 'contains(treat)': False, 'contains(hopes)': False, 'contains(heaven)': False, 'contains(myers)': False, 'contains(struggle)': False, 'contains(costumes)': False, 'contains(beat)': False, 'contains(happening)': False, 'contains(skills)': False, 'contains(ugly)': False, 'contains(figures)': False, 'contains(thoroughly)': False, 'contains(ill)': False, 'contains(surprises)': False, 'contains(player)': False, 'contains(rival)': False, 'contains(guard)': True, 'contains(anthony)': False, 'contains(strike)': False, 'contains(community)': False, 'contains(streets)': False, 'contains(hopkins)': False, 'contains(ended)': False, 'contains(originally)': False, 'contains(sarah)': False, 'contains(creative)': False, 'contains(characterization)': False, 'contains(thankfully)': False, 'contains(growing)': False, 'contains(sharp)': False, 'contains(williamson)': False, 'contains(eccentric)': False, 'contains(explained)': False, 'contains(hey)': False, 'contains(claire)': False, 'contains(steal)': False, 'contains(inevitable)': False, 'contains(joel)': False, 'contains(core)': False, 'contains(weren)': False, 'contains(sorry)': False, 'contains(built)': False, 'contains(anne)': False, 'contains(breaking)': False, 'contains(villains)': False, 'contains(critic)': False, 'contains(lets)': False, 'contains(visit)': False, 'contains(followed)': False}
featuresets = [(document_features(d), c) for (d,c) in documents]
train_set, test_set = featuresets[100:], featuresets[:100]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print("Naive Bayes accuracy with 2000 bag of words features is %s" % (nltk.classify.accuracy(classifier, test_set)))
classifier.show_most_informative_features(5)
Naive Bayes accuracy with 2000 bag of words features is 0.82 Most Informative Features contains(outstanding) = True pos : neg = 13.8 : 1.0 contains(seagal) = True neg : pos = 12.4 : 1.0 contains(mulan) = True pos : neg = 8.9 : 1.0 contains(wonderfully) = True pos : neg = 6.7 : 1.0 contains(damon) = True pos : neg = 5.8 : 1.0
classifier.show_most_informative_features(20)
Most Informative Features contains(outstanding) = True pos : neg = 13.8 : 1.0 contains(seagal) = True neg : pos = 12.4 : 1.0 contains(mulan) = True pos : neg = 8.9 : 1.0 contains(wonderfully) = True pos : neg = 6.7 : 1.0 contains(damon) = True pos : neg = 5.8 : 1.0 contains(wasted) = True neg : pos = 5.7 : 1.0 contains(flynt) = True pos : neg = 5.6 : 1.0 contains(awful) = True neg : pos = 5.6 : 1.0 contains(lame) = True neg : pos = 5.4 : 1.0 contains(poorly) = True neg : pos = 5.4 : 1.0 contains(waste) = True neg : pos = 4.9 : 1.0 contains(ridiculous) = True neg : pos = 4.7 : 1.0 contains(terrific) = True pos : neg = 4.3 : 1.0 contains(worst) = True neg : pos = 4.3 : 1.0 contains(era) = True pos : neg = 4.2 : 1.0 contains(mess) = True neg : pos = 4.2 : 1.0 contains(unfunny) = True neg : pos = 4.1 : 1.0 contains(bland) = True neg : pos = 4.1 : 1.0 contains(superb) = True pos : neg = 3.9 : 1.0 contains(jedi) = True pos : neg = 3.9 : 1.0
We can consider the task of POS tagging as a classification task and use the classifier methodology described here. Let us revisit the POS tagging task discussed in the first lecture using the new tools we have developed.
from nltk.corpus import brown
suffix_fdist = nltk.FreqDist()
for word in brown.words():
word = word.lower()
suffix_fdist[word[-1:]] += 1
suffix_fdist[word[-2:]] += 1
suffix_fdist[word[-3:]] += 1
common_suffixes = [suffix for (suffix, count) in suffix_fdist.most_common(100)]
print(common_suffixes)
['e', ',', '.', 's', 'd', 't', 'he', 'n', 'a', 'of', 'the', 'y', 'r', 'to', 'in', 'f', 'o', 'ed', 'nd', 'is', 'on', 'l', 'g', 'and', 'ng', 'er', 'as', 'ing', 'h', 'at', 'es', 'or', 're', 'it', '``', 'an', "''", 'm', ';', 'i', 'ly', 'ion', 'en', 'al', '?', 'nt', 'be', 'hat', 'st', 'his', 'th', 'll', 'le', 'ce', 'by', 'ts', 'me', 've', "'", 'se', 'ut', 'was', 'for', 'ent', 'ch', 'k', 'w', 'ld', '`', 'rs', 'ted', 'ere', 'her', 'ne', 'ns', 'ith', 'ad', 'ry', ')', '(', 'te', '--', 'ay', 'ty', 'ot', 'p', 'nce', "'s", 'ter', 'om', 'ss', ':', 'we', 'are', 'c', 'ers', 'uld', 'had', 'so', 'ey']
def pos_features(word):
features = {}
for suffix in common_suffixes:
features['endswith(%s)' % suffix] = word.lower().endswith(suffix)
return features
tagged_words = brown.tagged_words(categories='news', tagset='universal')
featuresets = [(pos_features(n), g) for (n,g) in tagged_words]
size = int(len(featuresets) * 0.1)
train_set, test_set = featuresets[size:], featuresets[:size]
classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)
0.7011437095972153
classifier.classify(pos_features('cats'))
'NOUN'
classifier.classify(pos_features('books'))
'NOUN'
classifier.show_most_informative_features(20)
Most Informative Features endswith(the) = True DET : NOUN = 3416.9 : 1.0 endswith(.) = True . : ADP = 2481.6 : 1.0 endswith(to) = True PRT : ADJ = 2138.0 : 1.0 endswith(f) = True ADP : VERB = 2050.5 : 1.0 endswith(he) = True DET : NOUN = 1808.9 : 1.0 endswith(and) = True CONJ : ADV = 1642.0 : 1.0 endswith(a) = True DET : VERB = 1597.1 : 1.0 endswith(of) = True ADP : NOUN = 1406.9 : 1.0 endswith(his) = True DET : NOUN = 728.0 : 1.0 endswith(ut) = True CONJ : DET = 694.7 : 1.0 endswith(nd) = True CONJ : NUM = 636.1 : 1.0 endswith(hat) = True PRON : NOUN = 570.6 : 1.0 endswith(ey) = True PRON : VERB = 549.0 : 1.0 endswith(i) = True PRON : ADP = 547.2 : 1.0 endswith(') = True . : VERB = 503.7 : 1.0 endswith(o) = True PRT : ADJ = 493.4 : 1.0 endswith(es) = True NOUN : ADP = 427.0 : 1.0 endswith(uld) = True VERB : NOUN = 422.5 : 1.0 endswith(we) = True PRON : NOUN = 353.5 : 1.0 endswith(ted) = True VERB : NOUN = 337.9 : 1.0
NLTK provides a common interface to different classifier algorithms. This is illustrated in the following examples.
import nltk
train = [
(dict(a=1,b=1,c=1), 'y'),
(dict(a=1,b=1,c=1), 'x'),
(dict(a=1,b=1,c=0), 'y'),
(dict(a=0,b=1,c=1), 'x'),
(dict(a=0,b=1,c=1), 'y'),
(dict(a=0,b=0,c=1), 'y'),
(dict(a=0,b=1,c=0), 'x'),
(dict(a=0,b=0,c=0), 'x'),
(dict(a=0,b=1,c=1), 'y'),
]
test = [
(dict(a=1,b=0,c=1)), # unseen
(dict(a=1,b=0,c=0)), # unseen
(dict(a=0,b=1,c=1)), # seen 3 times, labels=y,y,x
(dict(a=0,b=1,c=0)), # seen 1 time, label=x
]
classifier = nltk.classify.NaiveBayesClassifier.train(train)
sorted(classifier.labels())
['x', 'y']
classifier.classify_many(test)
['y', 'x', 'y', 'x']
for pdist in classifier.prob_classify_many(test):
print('%.4f %.4f' % (pdist.prob('x'), pdist.prob('y')))
0.3203 0.6797 0.5857 0.4143 0.3792 0.6208 0.6470 0.3530
classifier.show_most_informative_features()
Most Informative Features c = 0 x : y = 2.0 : 1.0 c = 1 y : x = 1.5 : 1.0 a = 1 y : x = 1.4 : 1.0 b = 0 x : y = 1.2 : 1.0 a = 0 x : y = 1.2 : 1.0 b = 1 y : x = 1.1 : 1.0
classifier = nltk.classify.DecisionTreeClassifier.train(
train, entropy_cutoff=0, support_cutoff=0)
sorted(classifier.labels())
['x', 'y']
print(classifier)
c=0? .................................................. x a=0? ................................................ x a=1? ................................................ y c=1? .................................................. y
classifier.classify_many(test)
['y', 'y', 'y', 'x']
There is no prob() method for decision tree classifiers, as they do not provide a probability interpretation.
NLTK provides an interface to the Scikit-learn (sklearn) classifiers - including maximum entropy and SVM.
from nltk.classify import SklearnClassifier
train_data = [({"a": 4, "b": 1, "c": 0}, "ham"),
({"a": 5, "b": 2, "c": 1}, "ham"),
({"a": 0, "b": 3, "c": 4}, "spam"),
({"a": 5, "b": 1, "c": 1}, "ham"),
({"a": 1, "b": 4, "c": 3}, "spam")]
test_data = [{"a": 3, "b": 2, "c": 1},
{"a": 0, "b": 3, "c": 7}]
from sklearn.naive_bayes import BernoulliNB
classif = SklearnClassifier(BernoulliNB()).train(train_data)
classif.classify_many(test_data)
['ham', 'spam']
from sklearn.svm import SVC
classif = SklearnClassifier(SVC(gamma='scale'), sparse=False).train(train_data)
classif.classify_many(test_data)
['ham', 'spam']
# Using the sklearn classifier:
X = [[0], [1], [2], [3]]
Y = [0, 1, 2, 3]
clf = SVC(kernel='linear', C=1.0)
clf.fit(X, Y)
SVC(kernel='linear')
classifr = SklearnClassifier(SVC(kernel='rbf', C=1.0, gamma='scale'), sparse=False).train(train_data)
classifr.classify_many(test_data)
['ham', 'spam']
from sklearn.svm import LinearSVC
classif_ova = SklearnClassifier(LinearSVC(C=1.0), sparse=False).train(train_data)
classif_ova.classify_many(test_data)
['ham', 'spam']
The key parameter to optimize for a given SVM kernel is the C parameter. Here is example code from sklearn that shows how to optimize C on a development set.
%matplotlib inline
import numpy as np
from sklearn import model_selection, datasets, svm
digits = datasets.load_digits()
X = digits.data
y = digits.target
svc = svm.SVC(kernel='linear')
C_s = np.logspace(-10, 0, 10)
scores = list()
scores_std = list()
for C in C_s:
svc.C = C
this_scores = model_selection.cross_val_score(svc, X, y, n_jobs=1, cv=5)
scores.append(np.mean(this_scores))
scores_std.append(np.std(this_scores))
# Do the plotting
import matplotlib.pyplot as plt
plt.figure(1, figsize=(4, 3))
plt.clf()
plt.semilogx(C_s, scores)
plt.semilogx(C_s, np.array(scores) + np.array(scores_std), 'b--')
plt.semilogx(C_s, np.array(scores) - np.array(scores_std), 'b--')
locs, labels = plt.yticks()
plt.yticks(locs, list(map(lambda x: "%g" % x, locs)))
plt.ylabel('CV score')
plt.xlabel('Parameter C')
plt.ylim(0, 1.1)
plt.show()