from nltk.corpus import reuters
cats = reuters.categories()
print("Reuters has %d categories:\n%s" % (len(cats), cats))
Reuters has 90 categories:
['acq', 'alum', 'barley', 'bop', 'carcass', 'castor-oil', 'cocoa', 'coconut', 'coconut-oil', 'coffee', 'copper', 'copra-cake', 'corn', 'cotton', 'cotton-oil', 'cpi', 'cpu', 'crude', 'dfl', 'dlr', 'dmk', 'earn', 'fuel', 'gas', 'gnp', 'gold', 'grain', 'groundnut', 'groundnut-oil', 'heat', 'hog', 'housing', 'income', 'instal-debt', 'interest', 'ipi', 'iron-steel', 'jet', 'jobs', 'l-cattle', 'lead', 'lei', 'lin-oil', 'livestock', 'lumber', 'meal-feed', 'money-fx', 'money-supply', 'naphtha', 'nat-gas', 'nickel', 'nkr', 'nzdlr', 'oat', 'oilseed', 'orange', 'palladium', 'palm-oil', 'palmkernel', 'pet-chem', 'platinum', 'potato', 'propane', 'rand', 'rape-oil', 'rapeseed', 'reserves', 'retail', 'rice', 'rubber', 'rye', 'ship', 'silver', 'sorghum', 'soy-meal', 'soy-oil', 'soybean', 'strategic-metal', 'sugar', 'sun-meal', 'sun-oil', 'sunseed', 'tea', 'tin', 'trade', 'veg-oil', 'wheat', 'wpi', 'yen', 'zinc']
print(reuters.readme())
The Reuters-21578 benchmark corpus, ApteMod version

This is a publicly available version of the well-known Reuters-21578 "ApteMod" corpus for text categorization. It has been used in publications like these:

* Yiming Yang and X. Liu. "A re-examination of text categorization methods". 1999. Proceedings of 22nd Annual International SIGIR. http://citeseer.nj.nec.com/yang99reexamination.html

* Thorsten Joachims. "Text categorization with support vector machines: learning with many relevant features". 1998. Proceedings of ECML-98, 10th European Conference on Machine Learning. http://citeseer.nj.nec.com/joachims98text.html

ApteMod is a collection of 10,788 documents from the Reuters financial newswire service, partitioned into a training set with 7769 documents and a test set with 3019 documents. The total size of the corpus is about 43 MB. It is also available for download from http://kdd.ics.uci.edu/databases/reuters21578/reuters21578.html , which includes a more extensive history of the data revisions.

The distribution of categories in the ApteMod corpus is highly skewed, with 36.7% of the documents in the most common category, and only 0.0185% (2 documents) in each of the five least common categories. In fact, the original data source is even more skewed---in creating the corpus, any categories that did not contain at least one document in the training set and one document in the test set were removed from the corpus by its original creator.

In the ApteMod corpus, each document belongs to one or more categories. There are 90 categories in the corpus. The average number of categories per document is 1.235, and the average number of documents per category is about 148, or 1.37% of the corpus.

-Ken Williams
ken@mathforum.org

Copyright & Notification (extracted from the README at the UCI address above)

The copyright for the text of newswire articles and Reuters annotations in the Reuters-21578 collection resides with Reuters Ltd. Reuters Ltd. and Carnegie Group, Inc. have agreed to allow the free distribution of this data *for research purposes only*. If you publish results based on this data set, please acknowledge its use, refer to the data set by the name "Reuters-21578, Distribution 1.0", and inform your readers of the current location of the data set (see "Availability & Questions").
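The readme's headline figures are easy to cross-check against the corpus reader. A minimal sketch (note that reuters.categories() also accepts a fileid, so labels can be counted per document):

fileids = reuters.fileids()
# Sum label counts across documents; the readme quotes 10,788 documents
# and an average of 1.235 categories per document.
n_labels = sum(len(reuters.categories(f)) for f in fileids)
print("%d documents, %.3f categories per document" % (len(fileids), n_labels / len(fileids)))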
total = len(reuters.paras())
total_multi = 0
# Note: reuters.paras() yields paragraphs (11887 of them), not files --
# len(reuters.fileids()) is 10788 -- so these averages differ slightly
# from the per-document figures in the readme.
for c in cats:
    lc = len(reuters.paras(categories=[c]))
    total_multi += lc
    print("%s ---- %d documents out of %d" % (c, lc, total))
print("Articles belong to %.4f categories on average" % (total_multi / total))
print("There are %.4f articles per category on average" % (total / len(cats)))
acq ---- 2661 documents out of 11887
alum ---- 62 documents out of 11887
barley ---- 58 documents out of 11887
bop ---- 120 documents out of 11887
carcass ---- 69 documents out of 11887
castor-oil ---- 2 documents out of 11887
cocoa ---- 81 documents out of 11887
coconut ---- 6 documents out of 11887
coconut-oil ---- 7 documents out of 11887
coffee ---- 141 documents out of 11887
copper ---- 73 documents out of 11887
copra-cake ---- 3 documents out of 11887
corn ---- 270 documents out of 11887
cotton ---- 65 documents out of 11887
cotton-oil ---- 3 documents out of 11887
cpi ---- 108 documents out of 11887
cpu ---- 6 documents out of 11887
crude ---- 658 documents out of 11887
dfl ---- 3 documents out of 11887
dlr ---- 223 documents out of 11887
dmk ---- 15 documents out of 11887
earn ---- 4211 documents out of 11887
fuel ---- 23 documents out of 11887
gas ---- 65 documents out of 11887
gnp ---- 146 documents out of 11887
gold ---- 131 documents out of 11887
grain ---- 640 documents out of 11887
groundnut ---- 9 documents out of 11887
groundnut-oil ---- 2 documents out of 11887
heat ---- 20 documents out of 11887
hog ---- 23 documents out of 11887
housing ---- 23 documents out of 11887
income ---- 22 documents out of 11887
instal-debt ---- 8 documents out of 11887
interest ---- 570 documents out of 11887
ipi ---- 60 documents out of 11887
iron-steel ---- 56 documents out of 11887
jet ---- 5 documents out of 11887
jobs ---- 73 documents out of 11887
l-cattle ---- 8 documents out of 11887
lead ---- 30 documents out of 11887
lei ---- 19 documents out of 11887
lin-oil ---- 2 documents out of 11887
livestock ---- 102 documents out of 11887
lumber ---- 16 documents out of 11887
meal-feed ---- 51 documents out of 11887
money-fx ---- 835 documents out of 11887
money-supply ---- 230 documents out of 11887
naphtha ---- 6 documents out of 11887
nat-gas ---- 113 documents out of 11887
nickel ---- 10 documents out of 11887
nkr ---- 4 documents out of 11887
nzdlr ---- 4 documents out of 11887
oat ---- 16 documents out of 11887
oilseed ---- 185 documents out of 11887
orange ---- 32 documents out of 11887
palladium ---- 4 documents out of 11887
palm-oil ---- 41 documents out of 11887
palmkernel ---- 3 documents out of 11887
pet-chem ---- 33 documents out of 11887
platinum ---- 15 documents out of 11887
potato ---- 6 documents out of 11887
propane ---- 6 documents out of 11887
rand ---- 3 documents out of 11887
rape-oil ---- 8 documents out of 11887
rapeseed ---- 27 documents out of 11887
reserves ---- 83 documents out of 11887
retail ---- 29 documents out of 11887
rice ---- 64 documents out of 11887
rubber ---- 51 documents out of 11887
rye ---- 3 documents out of 11887
ship ---- 295 documents out of 11887
silver ---- 30 documents out of 11887
sorghum ---- 37 documents out of 11887
soy-meal ---- 28 documents out of 11887
soy-oil ---- 26 documents out of 11887
soybean ---- 124 documents out of 11887
strategic-metal ---- 28 documents out of 11887
sugar ---- 170 documents out of 11887
sun-meal ---- 2 documents out of 11887
sun-oil ---- 7 documents out of 11887
sunseed ---- 16 documents out of 11887
tea ---- 13 documents out of 11887
tin ---- 30 documents out of 11887
trade ---- 524 documents out of 11887
veg-oil ---- 128 documents out of 11887
wheat ---- 304 documents out of 11887
wpi ---- 32 documents out of 11887
yen ---- 70 documents out of 11887
zinc ---- 36 documents out of 11887
Articles belong to 1.2333 categories on average
There are 132.0778 articles per category on average
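The skew the readme describes is easy to surface by ranking categories, this time counting files rather than paragraphs. A quick sketch:

# Rank categories by how many files carry them; 'earn' should come first.
by_size = sorted(cats, key=lambda c: len(reuters.fileids(categories=[c])), reverse=True)
for c in by_size[:3]:
    print("%s: %d documents" % (c, len(reuters.fileids(categories=[c]))))
# The five rarest categories hold 2 documents each, per the readme.
print(by_size[-5:])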
from nltk.probability import FreqDist
fd = FreqDist(reuters.words())
len(fd)
41600
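FreqDist can also rank the vocabulary. Before any filtering, the top slots typically go to punctuation and function words:

fd.most_common(10)   # raw counts; punctuation and stopwords dominate at this stage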
import inspect
print(inspect.signature(reuters.paras))
print(inspect.signature(reuters.fileids))
(fileids=None, categories=None)
(categories=None)
reuters.fileids(categories=['yen'])
['test/14913', 'test/15400', 'test/15432', 'test/15454', 'test/15455', 'test/15483', 'test/15503', 'test/15549', 'test/18363', 'test/18370', 'test/19061', 'test/20862', 'test/21542', 'test/21573', 'training/10364', 'training/10679', 'training/10681', 'training/10684', 'training/10689', 'training/10696', 'training/10718', 'training/10762', 'training/10766', 'training/10769', 'training/10770', 'training/10804', 'training/11203', 'training/11254', 'training/11764', 'training/11772', 'training/12145', 'training/12470', 'training/13544', 'training/14767', 'training/1926', 'training/2178', 'training/2190', 'training/2286', 'training/2354', 'training/3419', 'training/3421', 'training/3532', 'training/4633', 'training/4675', 'training/4680', 'training/4703', 'training/4709', 'training/5204', 'training/5206', 'training/5271', 'training/6338', 'training/6357', 'training/872', 'training/9149', 'training/9213', 'training/9222', 'training/9698', 'training/9701', 'training/9946']
reuters.paras(fileids=['test/14913'])
[[['BANK', 'OF', 'JAPAN', 'INTERVENES', 'SOON', 'AFTER', 'TOKYO', 'OPENING', 'The', 'Bank', 'of', 'Japan', 'bought', 'a', 'small', 'amount', 'of', 'dollars', 'shortly', 'after', 'the', 'opening', 'at', 'around', '145', '.', '30', 'yen', ',', 'dealers', 'said', '.'], ['The', 'central', 'bank', 'intervened', 'as', 'a', 'medium', '-', 'sized', 'trading', 'house', 'sold', 'dollars', ',', 'putting', 'pressure', 'on', 'the', 'U', '.', 'S', '.', 'Currency', ',', 'they', 'said', '.'], ['The', 'dollar', 'was', 'also', 'supported', 'by', 'a', 'major', 'electrical', 'consumer', 'goods', 'company', ',', 'which', 'was', 'a', 'speculative', 'dollar', 'buyer', 'at', 'around', '145', '.', '25', 'yen', ',', 'they', 'added', '.'], ['The', 'dollar', 'opened', 'at', '145', '.', '33', 'yen', 'against', '145', '.', '60', '/', '70', 'in', 'New', 'York', 'and', '145', '.', '25', 'at', 'the', 'close', 'here', 'yesterday', '.']]]
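So paras() returns a list of paragraphs, each a list of sentences, each a list of word tokens. The reader exposes flatter views of the same file as well:

doc = 'test/14913'
print(len(reuters.paras(fileids=[doc])))   # 1 paragraph, as shown above
print(len(reuters.sents(fileids=[doc])))   # 4 sentences
print(len(reuters.words(fileids=[doc])))   # flat token count (not shown above)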
# Reuters fileids look like 'training/10364' or 'test/14913'.
def isTest(fileid):
    return fileid[:4] == 'test'
isTest('test/12345')
True
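With the helper in place, the official split is easy to reproduce, and the counts should match the readme's 7769/3019 partition:

train_ids = [f for f in reuters.fileids() if not isTest(f)]
test_ids = [f for f in reuters.fileids() if isTest(f)]
print(len(train_ids), len(test_ids))   # expected: 7769 3019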
from nltk.stem.porter import PorterStemmer

token_dict = {}
stemmer = PorterStemmer()

def stem_tokens(tokens, stemmer):
    # Apply the Porter stemmer to every token in the list.
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed

# Collect stemmed tokens for the training documents only. Note that
# [0][0] keeps just the first sentence of the first paragraph of each file.
for file in reuters.fileids():
    if not isTest(file):
        token_dict[file] = stem_tokens(reuters.paras(fileids=[file])[0][0], stemmer)
stemmer.stem("investigation")
'investig'
len(token_dict)
7769
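One entry per training file, as expected. It can help to eyeball a value (which key comes first depends on corpus order, so the output is not shown here):

# Peek at one stemmed first sentence; illustration only.
some_id = next(iter(token_dict))
print(some_id, token_dict[some_id][:10])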
from sklearn.feature_extraction.text import TfidfVectorizer

# Join each document's stemmed tokens back into one string; the vectorizer
# re-tokenizes internally and drops English stopwords.
tfidf = TfidfVectorizer(stop_words='english', input='content')
tfs = tfidf.fit_transform([" ".join(l) for l in token_dict.values()])
tfs
<7769x12286 sparse matrix of type '<class 'numpy.float64'>' with 161180 stored elements in Compressed Sparse Row format>
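The column indices are positions in the fitted vocabulary and can be mapped back to terms. Note the method is get_feature_names_out() in scikit-learn >= 1.0, while older releases spell it get_feature_names():

terms = tfidf.get_feature_names_out()
print(len(terms))    # 12286, matching the matrix's column count
print(terms[2064])   # should be 'bahia', the heaviest entry in row 0 (see below)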
# Iterating over a one-row slice yields the row itself; printing it lists
# the stored (row, column) pairs with their tf-idf weights.
for t in tfs[0]:
    print(t)
  (0, 11962)	0.16109044483318327
  (0, 9800)	0.039934857482404405
  (0, 10330)	0.20614190626164752
  (0, 3366)	0.22876680148598158
  (0, 9521)	0.19783379090629347
  (0, 6852)	0.13450049673491865
  (0, 5946)	0.22876680148598158
  (0, 8042)	0.19162837770017166
  (0, 11035)	0.21165172895530673
  (0, 3357)	0.16450162954192926
  (0, 9028)	0.16190008557273025
  (0, 6087)	0.13737488138368684
  (0, 6391)	0.10447552148283092
  (0, 4351)	0.1413340348834968
  (0, 10242)	0.15188840994516276
  (0, 4265)	0.17451330516949679
  (0, 1574)	0.22876680148598158
  (0, 12280)	0.18902683373097268
  (0, 11960)	0.10666057755064468
  (0, 3511)	0.12686029977183355
  (0, 10175)	0.21875512585841406
  (0, 9550)	0.3161876463025691
  (0, 3306)	0.29569619997374236
  (0, 2064)	0.42330345791061347
tfidf.inverse_transform(tfs[0])
[array(['weekli', 'said', 'smith', 'comissaria', 'restor', 'level', 'humid', 'normal', 'temporao', 'come', 'prospect', 'improv', 'januari', 'earli', 'sinc', 'drought', 'allevi', 'zone', 'week', 'continu', 'shower', 'review', 'cocoa', 'bahia'], dtype='<U21')]
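The same mapping lets us rank a document's terms by weight directly. A short sketch with NumPy, assuming the objects fitted above are still in scope:

import numpy as np

terms = tfidf.get_feature_names_out()
row = tfs[0].toarray().ravel()            # dense copy of document 0's weights
for i in np.argsort(row)[::-1][:3]:
    # Expect 'bahia', 'review', 'cocoa', the largest weights printed above.
    print("%s: %.4f" % (terms[i], row[i]))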
tfidf
TfidfVectorizer(stop_words='english')