def main():
    """Fetch the board page at maoyan.com/board/4 and print what comes back.

    The page is retrieved through ``get_one_page`` (defined elsewhere in
    this file); whatever that helper returns is printed unchanged.
    """
    board_url = 'http://maoyan.com/board/4?'
    print(get_one_page(board_url))


if __name__ == '__main__':
    main()
打印出的结果:页面上共有 10 部电影,下面是其中一部(排名第 10)的 HTML 片段:
# Sample of the HTML returned for one movie (rank 10), kept for reference:
#
# <i class="board-index board-index-10">10</i>
# <a href="/films/1212" title="千与千寻" class="image-link" data-act="boarditem-click" data-val="{movieId:1212}">
#   <img src="//ms0.meituan.net/mywww/image/loading_2.e3d934bf.png" alt="" class="poster-default" />
#   <img data-src="http://p0.meituan.net/movie/9bf7d7b81001a9cf8adbac5a7cf7d766132425.jpg@160w_220h_1e_1c" alt="千与千寻" class="board-img" />
# </a>
# <div class="board-item-main">
#   <div class="board-item-content">
#     <div class="movie-item-info">
#       <p class="name"><a href="/films/1212" title="千与千寻" data-act="boarditem-click" data-val="{movieId:1212}">千与千寻</a></p>
#       <p class="star"> 主演:柊瑠美,入野自由,夏木真理 </p>
#       <p class="releasetime">上映时间:2001-07-20(日本)</p>
#     </div>
#     <div class="movie-item-number score-num">
#       <p class="score"><i class="integer">9.</i><i class="fraction">3</i></p>
#
# The fragments the regular expression anchors on, shown for the rank-8 entry:
#
# <i class="board-index board-index-8">8</i>
# <img data-src="http://p0.meituan.net/movie/99/678407.jpg@160w_220h_1e_1c" alt="龙猫" class="board-img" />
# <p class="name"><a href="/films/123" title="龙猫" data-act="boarditem-click" data-val="{movieId:123}">龙猫</a></p>
# <p class="star">
# <p class="releasetime">上映时间:1988-04-16(日本)</p>
# <p class="score"><i class="integer">9.</i><i class="fraction">2</i></p>
#
# The pattern captures the rank (8 here), then the image URL, title, cast,
# release time and the two halves of the score.


def parse_one_page(html):
    """Find every ``<dd>...</dd>`` movie entry in *html* and print the matches.

    Args:
        html: Page markup as a string.

    Prints a list of 7-tuples of strings, one per entry:
    ``(rank, image_url, title, cast, release_time, score_integer,
    score_fraction)``. The captured text is printed raw, without stripping.
    Returns ``None``.
    """
    # Raw strings keep ``\d`` as a regex escape instead of an (invalid)
    # string escape. re.S lets ``.`` match newlines as well; without it the
    # pattern could not span the line breaks inside one entry.
    pattern = re.compile(
        r'<dd>.*?board-index.*?>(\d+)</i>.*?data-src="(.*?)".*?name"><a'
        r'.*?>(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>'
        r'.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>',
        re.S,
    )
    items = pattern.findall(html)
    print(items)


def main():
    """Download the board page and print the entries parsed from it."""
    url = 'http://maoyan.com/board/4?'
    html = get_one_page(url)
    parse_one_page(html)


if __name__ == '__main__':
    main()
打印出来的结果:
def parse_one_page(html):
    """Yield one dict per ``<dd>...</dd>`` movie entry found in *html*.

    Args:
        html: Page markup as a string.

    Yields:
        Dicts with the string-valued keys ``index``, ``image``, ``title``,
        ``actor``, ``time`` and ``score``, in that order.
    """
    # Raw strings keep ``\d`` as a regex escape instead of an (invalid)
    # string escape. re.S lets ``.`` match newlines as well; without it the
    # pattern could not span the line breaks inside one entry.
    pattern = re.compile(
        r'<dd>.*?board-index.*?>(\d+)</i>.*?data-src="(.*?)".*?name"><a'
        r'.*?>(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>'
        r'.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>',
        re.S,
    )
    for rank, image, title, cast, release, integer, fraction in pattern.findall(html):
        yield {
            'index': rank,
            'image': image,
            'title': title,
            # The captured text looks like '\n主演:张国荣,张丰毅,巩俐\n':
            # strip() removes the surrounding whitespace and the slice drops
            # the three-character '主演:' label.
            'actor': cast.strip()[3:],
            # Same idea: drop the five-character '上映时间:' label.
            'time': release.strip()[5:],
            # The score arrives split as e.g. '9.' and '6'.
            'score': integer + fraction,
        }


def write_to_file(content):
    """Append *content* to ``result.txt`` as one JSON document per line.

    ``json.dumps`` is used with its defaults, so non-ASCII characters are
    written as ``\\uXXXX`` escape sequences.
    """
    # The with-statement closes the file; no explicit close() is needed.
    with open('result.txt', 'a') as f:
        f.write(json.dumps(content) + '\n')  # serialize the dict to a string


def main():
    """Download the board page, then print and persist every parsed entry.

    An earlier draft only printed each entry; this version also appends it
    to ``result.txt`` through ``write_to_file``.
    """
    url = 'http://maoyan.com/board/4?'
    html = get_one_page(url)
    for item in parse_one_page(html):
        print(item)
        write_to_file(item)

# The output file ends up at the following path:
/Users/****/Documents/wangyiyun/result.txt 但文件里的中文都被写成了 \uXXXX 形式的 Unicode 转义序列,需要在打开文件时指定 encoding='utf-8',并在 json.dumps 中设置 ensure_ascii=False:
def write_to_file(content):
    """Append *content* to ``result.txt`` as one UTF-8 JSON document per line.

    ``ensure_ascii=False`` makes ``json.dumps`` emit non-ASCII characters
    as-is rather than as ``\\uXXXX`` escapes, and the file is opened with an
    explicit UTF-8 encoding so those characters are stored readably.
    """
    # The with-statement closes the file; no explicit close() is needed.
    with open('result.txt', 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')  # dict -> string


# With the two changes above the output file contains readable text.

if __name__ == '__main__':
    # Build offset = 0, 10, 20 ... 90, one per board page.
    # NOTE(review): this requires a main() that accepts the offset as its
    # single argument; that definition is not shown here - confirm.
    offsets = [i * 10 for i in range(10)]

    # The sequential form of this driver is simply
    #     for offset in offsets: main(offset)
    # Running that loop AND the pool would fetch every page twice and append
    # each record to result.txt twice, so only the pooled form is executed.
    pool = Pool()  # presumably multiprocessing.Pool (a process pool) - confirm import
    try:
        # Hand each offset to main() in a pool worker; the author reports
        # this as noticeably faster than the sequential loop.
        pool.map(main, offsets)
    finally:
        pool.close()
        pool.join()