"""Build an NLTK corpus from Twitter messages (or any directory of text files).

Every ``*.txt`` file in ``path`` is read, the contents are joined into a
single string, tokenized with ``nltk.word_tokenize`` and wrapped in an
``nltk.Text`` object.
"""
import glob
import os

# Raw string: the backslashes are Windows path separators, not escape
# sequences (the value is the same as before, minus the escape warnings).
path = r'E:\Corpora\Twitter\plaintext\mostfrequent'


def read_corpus(directory, pattern='*.txt', encoding=None):
    """Return the contents of every matching file, joined by single spaces.

    Args:
        directory: Directory that holds the text files.
        pattern: Glob pattern selecting the files to read.
        encoding: Text encoding passed to ``open``; ``None`` keeps the
            platform default, which is what a plain ``open(name)`` uses.

    Returns:
        One string containing all file contents separated by ``' '``, or
        ``''`` when no file matches.
    """
    contents = []
    # glob.glob() returns names in arbitrary order; sort them so the corpus
    # is built the same way on every run.
    for filename in sorted(glob.glob(os.path.join(directory, pattern))):
        # The context manager closes each file as soon as it has been read.
        with open(filename, encoding=encoding) as handle:
            contents.append(handle.read())
    return ' '.join(contents)


def build_text(raw):
    """Tokenize *raw* and return the tokens wrapped in an ``nltk.Text``."""
    # Imported here so that reading the files does not require NLTK.
    import nltk

    tokens = nltk.word_tokenize(raw)
    return nltk.Text(tokens)


if __name__ == '__main__':
    raw = read_corpus(path)
    text = build_text(raw)
    # A bare ``len(text)`` only shows a value in the interactive shell.
    print(len(text))
 
# Blog page footer (not part of the script): "No comments:" / "Post a Comment"