Python学习笔记——20170829

xiaoxiao2021-02-28  105

HTMLParser

# 方法注解 -- notes on each HTMLParser handler method
from html.parser import HTMLParser


class MyParser(HTMLParser):
    """Skeleton ``HTMLParser`` subclass listing every overridable handler.

    Every handler is a no-op placeholder meant to be filled in by a subclass;
    the comments describe which piece of markup triggers each one.

    Args:
        convert_charrefs: Forwarded unchanged to ``HTMLParser``. With the
            default ``True`` the parser decodes character references itself
            and delivers them through ``handle_data``, so ``handle_charref``
            and ``handle_entityref`` are not called for ordinary text. Pass
            ``False`` to receive those two callbacks.
    """

    def __init__(self, *, convert_charrefs=True):
        super().__init__(convert_charrefs=convert_charrefs)

    def handle_startendtag(self, tag, attrs):
        """Handle a self-closing tag such as ``<br/>``.

        Delegates to the base class, whose default implementation calls
        ``handle_starttag`` followed by ``handle_endtag``.
        """
        super().handle_startendtag(tag, attrs)

    def handle_starttag(self, tag, attrs):
        """Handle an opening tag such as ``<a>``; ``attrs`` is a list of
        ``(name, value)`` pairs."""

    def handle_endtag(self, tag):
        """Handle a closing tag such as ``</a>``."""

    def handle_charref(self, name):
        """Handle a numeric character reference, i.e. text starting with
        ``&#`` such as ``&#62;`` (``name`` would be ``'62'``)."""

    def handle_data(self, data):
        """Handle the text between tags, e.g. ``baidu`` in
        ``<a href="http://www.baidu.com">baidu</a>``."""

    def handle_comment(self, data):
        """Handle a comment such as ``<!-- note -->``."""

    def handle_decl(self, decl):
        """Handle a declaration starting with ``<!`` such as
        ``<!DOCTYPE HTML>``."""

    def handle_entityref(self, name):
        """Handle a named character reference such as ``&nbsp;`` (``name``
        would be ``'nbsp'``)."""

    def handle_pi(self, data):
        """Handle a processing instruction such as ``<?instruction>``."""

Douban电影内容爬取

import json
from html.parser import HTMLParser


class MovieParser(HTMLParser):
    """Collect movie records from ``<li>`` tags that carry ``data-*`` attributes.

    Every ``<li>`` start tag with a non-empty ``data-title`` attribute becomes
    a dict with the keys ``title``, ``score``, ``director`` and ``actors``,
    appended to ``self.movies`` in document order.
    """

    def __init__(self):
        super().__init__()
        self.movies = []

    @property
    def moives(self):
        """Alias kept for callers that still use the original misspelling."""
        return self.movies

    @moives.setter
    def moives(self, value):
        self.movies = value

    def handle_starttag(self, tag, attrs):
        """Record one movie when ``tag`` is an ``<li>`` with a ``data-title``."""
        if tag != 'li':
            return
        # Reversed so that, if an attribute is repeated, its first occurrence
        # wins -- the same result as scanning the list from the left.
        values = dict(reversed(attrs))
        title = values.get('data-title')
        if not title:
            return
        score = values.get('data-score')
        self.movies.append({
            'title': title,
            # A missing score is stored as the string "None", not null.
            'score': "None" if score is None else score,
            'director': values.get('data-director'),
            'actors': values.get('data-actors'),
        })

    def error(self, message):
        """Ignore parser errors.

        Kept as a no-op so malformed markup never aborts parsing on Python
        versions whose parser base class still calls this hook.
        """


def parse_movies(html):
    """Parse ``html`` and return the list of movie dicts found in it.

    ``close()`` is called before the result is returned so that any text
    still buffered by the parser is processed first.
    """
    parser = MovieParser()
    parser.feed(html)
    parser.close()
    return parser.movies


def my_movies(url, timeout=10):
    """Download ``url`` and return the movies listed on the page.

    :param url: page address
    :param timeout: seconds to wait for the server before giving up
    :return: list of movie dicts, or ``None`` when the request fails
    """
    # Third-party dependency, imported here so the parser and the file
    # helpers remain importable when ``requests`` is not installed.
    import requests

    headers = {'User-Agent': 'Mozilla/5.0 (compatible; MSIE 6.0; Windows NT 5.1;SV1)'}
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as err:
        # Only network/HTTP failures are handled; programming errors propagate.
        print(f'发生异常: {err}')
        return None
    return parse_movies(response.text)


def movies_to_json(movies):
    """Serialize ``movies`` as JSON text, keeping non-ASCII characters readable.

    Real JSON encoding replaces the earlier quote-swapping of ``str(list)``,
    which broke on values containing quotes and wrote bare ``None``.
    """
    return json.dumps(movies, ensure_ascii=False)


def save_file(path, text):
    """Write ``text`` to ``path`` as UTF-8, replacing any existing file.

    :param path: destination path
    :param text: content to write
    :return: None
    """
    with open(path, 'w', encoding='UTF-8') as file:
        file.write(text)


def main():
    """Fetch the Chongqing now-playing page and store the movies as JSON."""
    url = "https://movie.douban.com/cinema/nowplaying/chongqing/"
    movies = my_movies(url)
    if movies is None:
        # The request failed; do not overwrite the output with a bogus value.
        return
    save_file("d:/upload/movies.json", movies_to_json(movies))


if __name__ == '__main__':
    main()
转载请注明原文地址: https://www.6miu.com/read-18013.html

最新回复(0)