From: http://blog.ynada.com/tag/nltk
# PREP
# load nltk
import nltk
# download corpora and other data (this opens the interactive NLTK downloader)
nltk.download()
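# or download only what's needed, e.g. the book examples and the punkt tokenizer models
# (a sketch; these are package ids in the NLTK downloader)
nltk.download('book')
nltk.download('punkt')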
# load submodules (the list of submodules is in the NLTK documentation). it's usually simplest to just import everything.
from nltk import *
from nltk.book import *
from nltk.corpus import gutenberg
from nltk.corpus import *
# etc
# look at the files inside a corpus, e.g. the gutenberg collection (the list of corpora is in the NLTK documentation)
nltk.corpus.gutenberg.fileids()
# or just gutenberg.fileids()
# get an individual text from a corpus. to put the whole corpus into one variable, use the corpus's words() with no fileid argument (see the sketch below)
alice_words = nltk.corpus.gutenberg.words('carroll-alice.txt')
alice = nltk.Text(alice_words)
# or just alice = nltk.Text(nltk.corpus.gutenberg.words('carroll-alice.txt'))
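# for example, the whole gutenberg collection in one Text object (a sketch; the variable name is just illustrative)
gutenberg_all = nltk.Text(nltk.corpus.gutenberg.words())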
# import something from a local file
f = open('document.txt')
raw = f.read()
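# a variant sketch: the with statement closes the file automatically
# (in Python 3, you can also pass encoding='utf-8' to open())
with open('document.txt') as f:
    raw = f.read()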
# import something from a webpage
# (in Python 3 the import is: from urllib.request import urlopen, and .read() returns bytes,
# so decode it with .decode('utf8'))
from urllib import urlopen
url = "http://www.gutenberg.org/files/2554/2554.txt"
raw = urlopen(url).read()
# we might comment out the line below if the source is plain text rather than HTML
raw = nltk.clean_html(raw)
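# note: nltk.clean_html() was removed in NLTK 3. a rough alternative sketch, assuming the
# bs4 (BeautifulSoup) package is installed:
from bs4 import BeautifulSoup
html = urlopen(url).read()
raw = BeautifulSoup(html, "html.parser").get_text()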
# tokenize and get ready for use
tokens = nltk.word_tokenize(raw)
text = nltk.Text(tokens)
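# sentences can be tokenized the same way (a sketch)
sents = nltk.sent_tokenize(raw)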
# PROCEDURES
# tokens in a text
len(text)
# types in a text
len(set(text))
# count occurrences of a word
text.count("word")
# concordance
text.concordance("someword")
# similarity: words that appear in contexts similar to the given word
text.similar("monstrous")
# common contexts
text.common_contexts(["someword", "otherword"])
# lexical dispersion
text.dispersion_plot(["someword", "otherword", "notherword"])
# type-token ratio: distinct word types divided by total tokens
# (in Python 2, wrap one operand in float() to avoid integer division)
len(set(text)) / len(text)
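# the same ratio as a reusable helper (a sketch; the function name is just illustrative)
def lexical_diversity(t):
    return len(set(t)) / len(t)

lexical_diversity(text)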
# frequency distribution: the 50 most frequent words in the text…
# (in older NLTK, fdist.keys() was frequency-sorted; in NLTK 3, use most_common())
fdist = FreqDist(text)
vocabulary = fdist.most_common(50)
vocabulary
# …and generate a cumulative frequency plot for those words
fdist.plot(50, cumulative=True)
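# other handy FreqDist queries (a sketch): words occurring only once, and the count of one word
fdist.hapaxes()[:20]
fdist['someword']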