kenkovlog

けんこふたんっオフィシャユブヨグッ
アンッ!アンッ!アンッ!アンッ!

Chapter 3 of NLP with Python

I read Chapter 3 of the NLP book Natural Language Processing with Python, and I wrote a script to compute a word frequency distribution.

Code

#! /usr/bin/env python
# coding:utf-8

from __future__ import division, print_function
import nltk
import urllib
import pprint


def raw_html(url):
    """Fetch *url* and return its content after ``nltk.clean_html``.

    The response is always closed, even if reading fails, so the
    underlying connection is not leaked.

    :param url: address of the page to download.
    :returns: whatever ``nltk.clean_html`` returns for the downloaded
        body (presumably the page text with markup removed -- see the
        NLTK documentation).
    """
    response = urllib.urlopen(url)
    # try/finally rather than ``with``: the Python 2 urlopen result is
    # not a context manager.
    try:
        html = response.read()
    finally:
        response.close()
    return nltk.clean_html(html)


def arch_fd(url):
    """Return a frequency distribution of the content words at *url*.

    The page is fetched with ``raw_html``, tokenized with
    ``nltk.word_tokenize``, lower-cased and lemmatized with
    ``nltk.WordNetLemmatizer``.  English stopwords and tokens that are
    not purely alphabetic are discarded.

    :param url: address of the page to analyse.
    :returns: an ``nltk.FreqDist`` built from the remaining lemmas.
    """
    # A set makes each membership test O(1); the corpus reader hands
    # back a list.
    stopwords = set(nltk.corpus.stopwords.words('english'))
    raw = raw_html(url)
    # Call .lower() on each token (instead of str.lower) so both byte
    # and unicode tokens are accepted.
    words = [token.lower() for token in nltk.word_tokenize(raw)]
    # Drop stopwords *before* lemmatizing: otherwise "does" and "has"
    # are turned into "doe" and "ha", which are no longer recognised as
    # stopwords and leak into the counts.
    content_words = [w for w in words
                     if w.isalpha() and w not in stopwords]
    # WordNet lemmatization is used here (rather than Porter stemming).
    wln = nltk.WordNetLemmatizer()
    lemmas = [wln.lemmatize(w) for w in content_words]
    # Filter once more in case a lemma itself is a stopword.
    fd = nltk.FreqDist([lemma for lemma in lemmas
                        if lemma not in stopwords and lemma.isalpha()])
    return fd


if __name__ == '__main__':
    # Number of leading (word, count) pairs to show for each page.
    top_n = 30
    # Pages to analyse.
    # NOTE(review): the first address has no "wiki." host prefix unlike
    # the others -- confirm that this is the intended page.
    pages = (
        "https://archlinux.org/index.php/Beginners'_Guide",
        "https://wiki.archlinux.org/index.php/Lvm",
        "https://wiki.archlinux.org/index.php/Python",
        "https://wiki.archlinux.org/index.php/Haskell_Package_Guidelines",
    )
    for page in pages:
        # Print the address as a heading, then the first entries of its
        # frequency distribution.
        pprint.pprint(page)
        distribution = arch_fd(page)
        leading_items = distribution.items()[:top_n]
        pprint.pprint(leading_items)

Result

It seems that the contents of these pages were well reflected.

"https://archlinux.org/index.php/Beginners'_Guide"
[('package', 13),
 ('arch', 10),
 ('bug', 9),
 ('linux', 8),
 ('systemd', 8),
 ('initscripts', 7),
 ('install', 7),
 ('medium', 7),
 ('wiki', 5),
 ('available', 4),
 ('community', 4),
 ('installation', 4),
 ('new', 4),
 ('official', 4),
 ('support', 4),
 ('system', 4),
 ('check', 3),
 ('fix', 3),
 ('guide', 3),
 ('list', 3),
 ('news', 3),
 ('profile', 3),
 ('repository', 3),
 ('start', 3),
 ('time', 3),
 ('user', 3),
 ('also', 2),
 ('boot', 2),
 ('booted', 2),
 ('change', 2)]
'https://wiki.archlinux.org/index.php/Lvm'
[('volume', 115),
 ('logical', 57),
 ('partition', 52),
 ('lvm', 45),
 ('group', 37),
 ('use', 30),
 ('physical', 28),
 ('create', 26),
 ('disk', 25),
 ('need', 23),
 ('filesystem', 21),
 ('snapshot', 15),
 ('want', 15),
 ('data', 13),
 ('space', 12),
 ('linux', 11),
 ('make', 11),
 ('system', 11),
 ('arch', 10),
 ('free', 10),
 ('command', 9),
 ('name', 9),
 ('one', 9),
 ('using', 9),
 ('doe', 8),
 ('filesystems', 8),
 ('may', 8),
 ('page', 8),
 ('used', 8),
 ('module', 7)]
'https://wiki.archlinux.org/index.php/Python'
[('python', 62),
 ('version', 15),
 ('gt', 9),
 ('binding', 8),
 ('install', 7),
 ('language', 7),
 ('package', 7),
 ('available', 6),
 ('http', 6),
 ('change', 5),
 ('latest', 5),
 ('may', 5),
 ('old', 5),
 ('page', 5),
 ('use', 5),
 ('also', 4),
 ('development', 4),
 ('eric', 4),
 ('following', 4),
 ('ha', 4),
 ('program', 4),
 ('repository', 4),
 ('run', 4),
 ('script', 4),
 ('spyder', 4),
 ('widget', 4),
 ('application', 3),
 ('archwiki', 3),
 ('aur', 3),
 ('content', 3)]
'https://wiki.archlinux.org/index.php/Haskell_Package_Guidelines'
[('haskell', 38),
 ('package', 35),
 ('library', 12),
 ('use', 11),
 ('aur', 10),
 ('guideline', 9),
 ('http', 9),
 ('repository', 9),
 ('available', 7),
 ('build', 7),
 ('gt', 7),
 ('install', 7),
 ('arch', 6),
 ('community', 6),
 ('dependency', 6),
 ('page', 6),
 ('quot', 6),
 ('runhaskell', 6),
 ('setup', 6),
 ('example', 5),
 ('extra', 5),
 ('ghc', 5),
 ('hackage', 5),
 ('pkgbuild', 5),
 ('pkgdir', 5),
 ('pkgname', 5),
 ('provided', 5),
 ('tool', 5),
 ('archhaskell', 4),
 ('cd', 4)]
けんこふたん