from urllib.request import *
import nltk
from bs4 import BeautifulSoup
# Download the Project Gutenberg HTML edition of "Crime and Punishment",
# strip the markup, and explore the resulting text with NLTK.
url = "http://www.gutenberg.org/files/2554/2554-h/2554-h.htm"

# Use a context manager so the HTTP response is closed once it has been read.
with urlopen(url) as response:
    raw = response.read()
print(type(raw))
print(len(raw))
print(raw[:75])

# Extract the text content from the HTML; get_text() already returns a str,
# so no extra str() conversion is needed.
raw = BeautifulSoup(raw, "lxml").get_text()

# Keep only the span between the header and footer markers.
# str.find/str.rfind return -1 when the marker is absent; used directly as a
# slice bound that would silently produce a wrong (usually near-empty) result,
# so fall back to the corresponding end of the text instead.
start = raw.find("The Project Gutenberg EBook of Crime")
if start == -1:
    start = 0
end = raw.rfind("End of Project Gutenberg’s Crime")
if end == -1:
    end = len(raw)
raw = raw[start:end]
print(type(raw))
print(len(raw))

# Split the text into word and punctuation tokens.
# NOTE(review): word_tokenize needs the NLTK tokenizer models to be
# downloaded beforehand (e.g. via nltk.download) -- confirm for this setup.
tokens = nltk.word_tokenize(raw)
print(type(tokens))
print(len(tokens))
print(tokens[:10])

# Wrap the token list in an nltk.Text to get the corpus exploration helpers.
test = nltk.Text(tokens)
print(type(test))
print(test[:100])

# collocations() and concordance() print their results themselves and return
# None, so they are called directly; wrapping them in print() would only emit
# a spurious "None" line after the real output.
test.collocations()
test.concordance("gene")