Python 自然语言处理学习(1)

xiaoxiao2021-02-28  69

# Download the HTML edition of "Crime and Punishment" from Project Gutenberg,
# strip the markup, and explore the resulting text with NLTK.
from urllib.request import urlopen

import nltk
from bs4 import BeautifulSoup

url = "http://www.gutenberg.org/files/2554/2554-h/2554-h.htm"

# Use a context manager so the HTTP response is closed once the body is read.
with urlopen(url) as response:
    raw = response.read()
print(type(raw))
print(len(raw))
print(raw[:75])

# Extract the plain text contained in the HTML document.
soup = BeautifulSoup(raw, "lxml")
raw = soup.get_text()

# Trim the text to the span between the start and end markers.
# str.find / str.rfind return -1 when the marker is missing; slicing with -1
# would silently cut the text in the wrong place, so fall back to the
# corresponding end of the document instead.
start = raw.find("The Project Gutenberg EBook of Crime")
end = raw.rfind("End of Project Gutenberg’s Crime")
if start == -1:
    start = 0
if end == -1:
    end = len(raw)
raw = raw[start:end]
print(type(raw))
print(len(raw))

# Tokenize the text into words.
# NOTE(review): word_tokenize presumably needs NLTK tokenizer data to be
# installed beforehand (e.g. via nltk.download) -- confirm for your setup.
tokens = nltk.word_tokenize(raw)
print(type(tokens))
print(len(tokens))
print(tokens[:10])

# Wrap the token list in an nltk.Text to get the text-exploration helpers.
test = nltk.Text(tokens)
print(type(test))
print(test[:100])

# collocations() and concordance() print their results themselves and return
# None, so they are called directly rather than wrapped in print().
test.collocations()
test.concordance("gene")
转载请注明原文地址: https://www.6miu.com/read-74851.html

最新回复(0)