Monday, January 24, 2011

NLTK: e-philology cheat sheet

From: http://blog.ynada.com/tag/nltk

# PREP

# load nltk
import nltk

# download stuff
nltk.download()
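# you can also fetch individual packages by identifier instead of using the
# interactive downloader; 'punkt' and 'gutenberg' below are just example names
nltk.download('punkt')      # sentence/word tokenizer models
nltk.download('gutenberg')  # sample of the Project Gutenberg corpus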

# load submodules (the full list is in the NLTK documentation). it's usually simplest to just import everything.
from nltk import *
from nltk.book import *
from nltk.corpus import gutenberg
from nltk.corpus import *
# etc

# look at the files inside a corpus, e.g. the gutenberg collection (the full list of built-in corpora is in the NLTK documentation)
nltk.corpus.gutenberg.fileids()
# or just gutenberg.fileids()
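
# a quick sketch (not in the original post) for getting a feel for a corpus:
# word and sentence counts for every file in the gutenberg collection
# (Python 2 print syntax, like the rest of this post)
for fileid in gutenberg.fileids():
    print fileid, len(gutenberg.words(fileid)), len(gutenberg.sents(fileid))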

# get an individual text from a corpus. to load the whole corpus into one variable, call corpus.words() with no file argument
alice_words = nltk.corpus.gutenberg.words('carroll-alice.txt')
alice = nltk.Text(alice_words)
# or just alice = nltk.Text(nltk.corpus.gutenberg.words('carroll-alice.txt'))

# import something from a local file
f = open('document.txt')
raw = f.read()
f.close()

# import something from a webpage (on Python 3, urlopen lives in urllib.request)
from urllib import urlopen
url = "http://www.gutenberg.org/files/2554/2554.txt"
raw = urlopen(url).read()
# if the source is HTML rather than plain text, strip the markup
raw = nltk.clean_html(raw)
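# note: newer NLTK releases removed clean_html and suggest a real HTML parser;
# a rough equivalent, assuming BeautifulSoup (bs4) is installed, would be
from bs4 import BeautifulSoup
raw = BeautifulSoup(raw, 'html.parser').get_text()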

# tokenize and get ready for use
tokens = nltk.word_tokenize(raw)
text = nltk.Text(tokens)
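
# not in the original post: sentence-level tokenization is also available
# (uses the punkt models downloaded earlier)
sents = nltk.sent_tokenize(raw)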

# PROCEDURES

# tokens in a text
len(text)

# types in a text
len(set(text))

# count occurrences of a word
text.count("word")

# concordance
text.concordance("someword")
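# not in the original post: the display can be tuned with the optional
# width and lines arguments
text.concordance("someword", width=100, lines=30)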

# similar words (words that appear in similar contexts)
text.similar("monstrous")

# common contexts
text.common_contexts(["someword", "otherword"])

# lexical dispersion plot (needs matplotlib)
text.dispersion_plot(["someword", "otherword", "notherword"])

# lexical diversity: tokens per type (the type-token ratio is the inverse)
len(text) / len(set(text))
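
# the same measures as explicit floats (a sketch; Python 2 truncates int/int)
lexical_diversity = float(len(text)) / len(set(text))   # tokens per type
type_token_ratio = float(len(set(text))) / len(text)    # types per token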

# compute a list of the most frequent words in the corpus
fdist = FreqDist(text)
vocabulary = fdist.keys()
vocabulary[:50]
# …and generate a cumulative frequency plot for those words
fdist.plot(50, cumulative=True)
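
# note for newer NLTK versions, where FreqDist.keys() is no longer sorted by
# frequency: most_common() gives the ranked list instead
fdist.most_common(50)
# and indexing gives the raw count of a single word
fdist['someword']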
