Python crawler: scraping Maoyan movie data


# Define a function that fetches the Maoyan movie page

import requests


def main():
    url = 'http://maoyan.com/board/4?offset=0'
    html = requests.get(url).text
    print(html)


if __name__ == '__main__':
    main()
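
In practice, maoyan.com often serves a verification page or an empty body to the default python-requests User-Agent, so the request may need a browser-like header. A minimal sketch of that variation (the header string and the get_page name are my own, not from the original post):

import requests

# A browser-style User-Agent; without it Maoyan may refuse the request.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0 Safari/537.36'
}


def get_page(url):
    # Return the page HTML, or None if the server does not answer with 200.
    response = requests.get(url, headers=HEADERS, timeout=10)
    if response.status_code == 200:
        return response.text
    return None


if __name__ == '__main__':
    print(get_page('http://maoyan.com/board/4?offset=0'))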

# Use a regular expression to extract the information we want

""" < dd > < i class ="board-index board-index-10">10</i> < a href = "/films/2760" title = "魂断蓝桥" class ="image-link" data-act="boarditem-click" data-val="{movieId:2760}" > < img src = "//ms0.meituan.net/mywww/image/loading_2.e3d934bf.png" alt = "" class ="poster-default" / > < img data - src = "http://p0.meituan.net/movie/46c29a8b8d8424bdda7715e6fd779c66235684.jpg@160w_220h_1e_1c" alt = "魂断蓝桥" class ="board-img" / >< / a > < div class ="board-item-main" > < div class ="board-item-content" > < div class ="movie-item-info" > < p class ="name" > < a href="/films/2760" title="魂断蓝桥" data-act="boarditem-click" data-val="{movieId:2760}" > 魂断蓝桥 < / a > < / p > < p class ="star" >主演:费雯·丽, 罗伯特·泰勒, 露塞尔·沃特森< / p > < p class ="releasetime" > 上映时间:1940-05-17(美国) < / p > < / div > < div class ="movie-item-number score-num" > < p class ="score" > < i class ="integer" > 9. < / i > < i class ="fraction" > 2 < / i > < / p > < / div >< / div >< / div > < / dd > """ import re reg = r'<dd>.*?>(.*?)</i>.*?data-src="(.*?)".*?title="(.*?)".*?主演:(.*?)</p>.*?' \ r'上映时间:(.*?)</p>.*?integer.*?>(.*?)</i>.*?fraction.*?>(.*?)</i>.*?' reg = re.compile(reg, re.S) items = re.findall(reg, html) print(items)

# Loop over the list and turn each item into a dictionary

for item in items:
    index = item[0]
    image = item[1]
    title = item[2]
    actor = item[3]
    time = item[4]
    score = item[5] + item[6]
    dict1 = {'index': index, 'image': image, 'title': title,
             'actor': actor, 'time': time, 'score': score}
    print(dict1)
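
The same conversion reads a little more cleanly with tuple unpacking; this is just an equivalent sketch of the step above, assuming items is the list returned by re.findall:

def parse_items(items):
    # Yield one dict per movie, unpacking the 7-tuple captured by the regex.
    for index, image, title, actor, time, integer, fraction in items:
        yield {
            'index': index,
            'image': image,
            'title': title,
            'actor': actor.strip(),
            'time': time.strip(),
            'score': integer + fraction,
        }


for movie in parse_items(items):
    print(movie)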

# Save the extracted data to a file

import json

with open('result.txt', 'a', encoding='utf-8') as f:
    # ensure_ascii=False keeps the Chinese text readable; the trailing
    # newline keeps one JSON object per line.
    f.write(json.dumps(dict1, ensure_ascii=False) + '\n')
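
Because the file is opened in append mode, re-running the crawl keeps adding records to result.txt. If a fresh dump is wanted each time, the old file can be removed first; a small optional sketch:

import os

# Start from a clean file so repeated runs do not accumulate duplicates.
if os.path.exists('result.txt'):
    os.remove('result.txt')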

# Use a loop to fetch every page of the Maoyan board

def main():
    for i in range(10):
        url = 'http://maoyan.com/board/4?offset=' + str(i * 10)
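
Each board page lists 10 movies and is addressed through the offset query parameter, so offsets 0, 10, 20, ..., 90 cover the whole Top 100. A quick check of the URLs the loop produces:

for i in range(10):
    print('http://maoyan.com/board/4?offset=' + str(i * 10))
# http://maoyan.com/board/4?offset=0
# http://maoyan.com/board/4?offset=10
# ...
# http://maoyan.com/board/4?offset=90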

# The final code, put together

import json
import re
from time import sleep

import requests


def main():
    for i in range(10):
        url = 'http://maoyan.com/board/4?offset=' + str(i * 10)
        html = requests.get(url).text
        reg = r'<dd>.*?>(.*?)</i>.*?data-src="(.*?)".*?title="(.*?)"' \
              r'.*?主演:(.*?)</p>.*?上映时间:(.*?)</p>.*?integer.*?>' \
              r'(.*?)</i>.*?fraction.*?>(.*?)</i>.*?'
        reg = re.compile(reg, re.S)
        items = re.findall(reg, html)
        for item in items:
            # print(item)
            index = item[0]
            image = item[1]
            title = item[2]
            actor = item[3]
            time = item[4]
            score = item[5] + item[6]
            dict1 = {'index': index, 'image': image, 'title': title,
                     'actor': actor, 'time': time, 'score': score}
            sleep(1)
            with open('result.txt', 'a', encoding='utf-8') as f:
                # one JSON object per line
                f.write(json.dumps(dict1, ensure_ascii=False) + '\n')


if __name__ == '__main__':
    main()
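
Since every record ends up as one JSON object per line, the saved data is easy to load back for further processing. As a quick follow-up (not part of the original post), the records can be reloaded and sorted by score:

import json

with open('result.txt', encoding='utf-8') as f:
    movies = [json.loads(line) for line in f if line.strip()]

# Sort by the numeric score, highest first, and show the top five.
movies.sort(key=lambda m: float(m['score']), reverse=True)
for m in movies[:5]:
    print(m['index'], m['title'], m['score'])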

 
