HTMLParser
方法注解
from html.parser
import HTMLParser
class MyParser(HTMLParser):
    """Skeleton ``HTMLParser`` subclass that lists every overridable handler.

    All handlers are empty placeholders except ``handle_startendtag``, which
    defers to the base-class implementation. Copy the class and fill in the
    handlers that matter for the document being parsed.
    """

    def __init__(self):
        super().__init__()

    def handle_startendtag(self, tag, attrs):
        """Hand self-closing tags (``<br/>``) to the base-class handler."""
        HTMLParser.handle_startendtag(self, tag, attrs)

    def handle_starttag(self, tag, attrs):
        """Placeholder for opening tags; ``attrs`` is a list of (name, value)."""

    def handle_endtag(self, tag):
        """Placeholder for closing tags."""

    def handle_charref(self, name):
        """Placeholder for numeric character references such as ``&#62;``."""

    def handle_data(self, data):
        """Placeholder for text found between tags."""

    def handle_comment(self, data):
        """Placeholder for ``<!-- ... -->`` comments."""

    def handle_decl(self, decl):
        """Placeholder for declarations such as ``<!DOCTYPE html>``."""

    def handle_entityref(self, name):
        """Placeholder for named entity references such as ``&gt;``."""

    def handle_pi(self, data):
        """Placeholder for processing instructions such as ``<?proc ...>``."""
Douban电影内容爬取
import requests
from html.parser
import HTMLParser
class MovieParser(HTMLParser):
    """Collect movie records from ``<li>`` tags that carry ``data-*`` attributes.

    Every ``<li>`` start tag with a non-empty ``data-title`` attribute yields
    one dict with the keys ``title``, ``score``, ``director`` and ``actors``.
    The dicts are appended to ``self.moives`` in document order.
    """

    def __init__(self):
        super().__init__()
        # The misspelled name is kept because existing callers read it;
        # new code should prefer the ``movies`` property below.
        self.moives = []

    @property
    def movies(self):
        """Correctly spelled, read-only alias for the ``moives`` list."""
        return self.moives

    @staticmethod
    def _attr(attrs, name):
        """Return the value of the first attribute called *name*, else None.

        :param attrs: list of ``(name, value)`` pairs passed to the handler
        :param name: attribute name to look up
        """
        return next((value for key, value in attrs if key == name), None)

    def handle_starttag(self, tag, attrs):
        """Record one movie for each ``<li>`` that has a ``data-title``.

        :param tag: name of the tag being opened
        :param attrs: list of ``(name, value)`` pairs for that tag
        """
        if tag != 'li':
            return
        title = self._attr(attrs, 'data-title')
        if not title:
            # A missing or empty title means this <li> is not a movie entry.
            return
        score = self._attr(attrs, 'data-score')
        self.moives.append({
            'title': title,
            # A missing score is stored as the text "None", not the None object.
            'score': "None" if score is None else score,
            'director': self._attr(attrs, 'data-director'),
            'actors': self._attr(attrs, 'data-actors'),
        })

    def error(self, message):
        """Ignore parser error reports.

        NOTE(review): this hook appears to be used only by older versions of
        ``html.parser``; confirm before removing it.
        """
        pass
def my_movies(url, timeout=10):
    """Download *url* and return the movies found in its HTML.

    :param url: address of the page to fetch
    :param timeout: seconds to wait for the server before giving up
    :return: the list of movie dicts collected by ``MovieParser``, or
        ``None`` when the HTTP request fails
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (compatible; MSIE 6.0; Windows NT 5.1;SV1)'}
    try:
        # Without a timeout, requests can block forever on a stalled server.
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as exc:
        # Only network/HTTP failures are expected here. Anything else
        # (including Ctrl-C) is allowed to propagate instead of being hidden.
        print('发生异常', exc)
        return None

    mp = MovieParser()
    try:
        mp.feed(response.text)
    finally:
        # close() makes the parser process any input it is still buffering.
        mp.close()
    return mp.moives
def save_file(path, text):
    """Write *text* to *path* as UTF-8, replacing any existing content.

    :param path: destination file path
    :param text: string to store
    :return: None
    """
    out = open(path, mode='w', encoding='UTF-8')
    with out:
        out.write(text)
if __name__ == '__main__':
    # Local import: only the script entry point needs JSON serialization.
    import json

    url = "https://movie.douban.com/cinema/nowplaying/chongqing/"
    movies = my_movies(url)
    # json.dumps escapes quotes inside values and writes None as null, so the
    # file is valid JSON even when a title contains ' or " (swapping quote
    # characters in str(list) is not). ensure_ascii=False keeps non-ASCII
    # titles readable in the file.
    save_file("d:/upload/movies.json", json.dumps(movies, ensure_ascii=False))