kenkovlog

けんこふたんっオフィシャユブヨグッ
アンッ!アンッ!アンッ!アンッ!

chapter 5 of NLP with Python

Part of Speech

This code extracts the top 30 words for some parts of speech from Arch Wiki pages.

#! /usr/bin/env python
# coding:utf-8

from __future__ import division, print_function
import nltk
import urllib
import pprint


def raw_html(url):
    """Fetch *url* and return the page text with HTML markup stripped.

    Arguments:
        url -- String; address of the page to download
    Return:
        String -- plain text extracted from the fetched HTML
    """
    # NOTE(review): ``urllib.urlopen`` is Python 2 only (moved to
    # ``urllib.request.urlopen`` in Python 3) -- confirm target interpreter
    html = urllib.urlopen(url).read()
    # NOTE(review): ``nltk.clean_html`` was removed in NLTK 3 (it now raises
    # NotImplementedError), so this function requires NLTK 2.x
    return nltk.clean_html(html)


def lemmalize(word, method="wordnet"):
    """
    Reduce a word to a base form.

    Arguments:
        word -- String; lowercased before processing
        method -- "wordnet" (WordNetLemmatizer) or "porter" (PorterStemmer)
    Return:
        String -- the lemma (wordnet) or the stem (porter) of the word
    Raises:
        ValueError -- if *method* is not a supported name
    """
    if method == "wordnet":
        return nltk.WordNetLemmatizer().lemmatize(word.lower())
    elif method == "porter":
        # BUG FIX: PorterStemmer has no ``lemmatize`` method; the original
        # called ``mtd.lemmatize(...)`` here and crashed with AttributeError.
        return nltk.PorterStemmer().stem(word.lower())
    # raise instead of assert: asserts are stripped under ``python -O``
    raise ValueError("argument error of limmalize: %r" % method)


def text_lemmalize(text):
    """Lemmatize every token in *text* with the default (wordnet) method.

    Arguments:
        text -- iterable of word strings
    Return:
        list of lemmatized strings (one per input token)
    """
    return [lemmalize(token) for token in text]


def text_pos(raw):
    """Tokenize *raw* and POS-tag the result.

    Arguments:
        raw -- String; plain text
    Return:
        list of (token, POS-tag) pairs from ``nltk.pos_tag``
    """
    tokens = nltk.word_tokenize(raw)
    return nltk.pos_tag(tokens)


def arch_fd(url):
    """
    Build a frequency distribution of lemmatized words on a web page.

    Arguments:
        url -- String; page to fetch (markup is stripped by ``raw_html``)
    Return:
        nltk.FreqDist over the lowercased, lemmatized, alphabetic,
        non-stopword tokens of the page
    """
    # PERF: use a set so the per-token membership test below is O(1)
    # instead of an O(n) scan of the stopword list
    stopwords = set(nltk.corpus.stopwords.words('english'))
    raw = raw_html(url)
    text = nltk.word_tokenize(raw)
    # lemmatize the lowercased tokens
    # NOTE(review): ``str.lower`` assumes byte-string tokens (Python 2);
    # unicode tokens would raise TypeError here -- confirm inputs
    wln = nltk.WordNetLemmatizer()
    stem_text = [wln.lemmatize(i) for i in map(str.lower, text)]
    fd = nltk.FreqDist([i for i in stem_text
                        if i.isalpha() and i not in stopwords])
    return fd


if __name__ == '__main__':
    urls = [
        # BUG FIX: the first URL was missing the ``wiki.`` subdomain and so,
        # unlike the other three, did not point at the Arch wiki
        "https://wiki.archlinux.org/index.php/Beginners'_Guide",
        "https://wiki.archlinux.org/index.php/Lvm",
        "https://wiki.archlinux.org/index.php/Python",
        "https://wiki.archlinux.org/index.php/Haskell_Package_Guidelines",
    ]
    # loop-invariant: load the stopword list once, as a set for O(1) lookups,
    # instead of rebuilding a list on every URL iteration
    stopwords = set(nltk.corpus.stopwords.words('english'))

    def _top30(tagged, tag_prefix):
        """Return the 30 most frequent (word, tag) pairs whose tag starts
        with *tag_prefix* (e.g. "N" for nouns), excluding stopwords."""
        pairs = [(word, pos) for (word, pos) in tagged
                 if pos.startswith(tag_prefix) and word not in stopwords]
        # NLTK 2: FreqDist.items() is sorted by decreasing frequency
        return nltk.FreqDist(pairs).items()[:30]

    for url in urls:
        pprint.pprint(url)
        raw = raw_html(url)
        # lemmatize each token while keeping its POS tag
        tp = [(lemmalize(word), pos) for (word, pos) in text_pos(raw)]
        pprint.pprint(_top30(tp, "N"))  # top-30 nouns
        pprint.pprint(_top30(tp, "V"))  # top-30 verbs

defaultdict

The Python standard library provides a useful dictionary constructor: collections.defaultdict. NLTK also provides several useful classes that construct dictionary-like structures. Although the NLTK structures have more methods, we can achieve similar structures by using defaultdict without NLTK.

They correspond as follows;

nltk.FreqDist <--> defaultdict(int)
nltk.Index <--> defaultdict(list)

In the case of nltk.FreqDist

d = defaultdict(int)
for i in words:
    d[i] += 1

In the case of nltk.Index

d = defaultdict(list)
for i in words:
    d[key(i)].append(i)

where key(i) computes the grouping key for each word (for example, its sorted letters when building an anagram index).
けんこふたん