Scraping the Douban Movie Top 250 with Python


# -*- coding: utf-8 -*-
import os

import requests
import xlsxwriter
from bs4 import BeautifulSoup

# Two-dimensional list holding the collected movie information
MovieInfo = []

# Create the Excel workbook "DouBanTop.xlsx"
workxlsx = xlsxwriter.Workbook('DouBanTop.xlsx')
# Add a worksheet
worksheet = workxlsx.add_worksheet()

# Set up row heights, column widths and cell formats
worksheet.set_row(0, 60)
worksheet.set_row(1, 60)
worksheet.set_column(1, 1, 35)
worksheet.set_column(2, 2, 78)
head_format = workxlsx.add_format({'bold': True, 'align': 'center', 'valign': 'vcenter'})
title_format = workxlsx.add_format({'bold': True, 'align': 'center', 'valign': 'vcenter',
                                    'font_size': 20, 'font_color': 'Green'})
text_format = workxlsx.add_format({'align': 'center', 'valign': 'vcenter', 'font_size': 14})
worksheet.merge_range(1, 1, 1, 2, '')
worksheet.merge_range(0, 0, 0, 3, 'DouBan Movie Top 250', title_format)


# Fetch the HTML source of a page
def GetHtmlText(url):
    try:
        r = requests.get(url)
        r.raise_for_status()
        r.encoding = 'utf-8'
        return r.text
    except requests.RequestException:
        return ''


# Extract movie information from one page of results
def FindInformation(soup):
    ols = soup.find_all('ol', {'class': 'grid_view'})
    for ol in ols:
        lis = ol.find_all('li')
        for li in lis:
            OneMovieInfo = []
            MovieName = ''
            # Rank and poster
            picdiv = li.find('div', {'class': 'pic'})
            MovieRank = int(picdiv.find('em').string)
            OneMovieInfo.append(MovieRank)
            img = picdiv.find('img')
            imgurl = img['src']
            ImgName = str(MovieRank) + '.jpg'
            OneMovieInfo.append(ImgName)
            # Save the poster image to the working directory
            imgcontent = requests.get(imgurl).content
            with open(os.path.join(os.getcwd(), ImgName), 'wb') as wf:
                wf.write(imgcontent)
            # Chinese title plus, if present, the alternative title
            infodiv = li.find('div', {'class': 'hd'})
            spans = infodiv.find_all('span', {'class': 'title'})
            for span in range(len(spans)):
                if span == 0:
                    if len(spans) == 2:
                        MovieName += spans[span].string + ' / '
                    else:
                        MovieName += spans[span].string
                else:
                    # Strip the leading "&nbsp;/&nbsp;" from the second title span
                    MovieName += spans[span].string[3:]
            OneMovieInfo.append(MovieName)
            # Rating
            ratingspan = li.find('span', {'class': 'rating_num'})
            OneMovieInfo.append(float(ratingspan.string))
            MovieInfo.append(OneMovieInfo)


if __name__ == '__main__':
    # Header row; index 2 stays empty because columns 1 and 2 are merged for the movie column
    SheetHead = ['排名', '电影', '', '评分']  # Rank, Movie, (merged), Rating
    for head in range(len(SheetHead)):
        if head == 2:
            continue
        worksheet.write(1, head, SheetHead[head], head_format)
    # Ten pages in total, 25 movies per page
    pages = 10
    for page in range(pages):
        url = 'https://movie.douban.com/top250?start=' + str(page * 25) + '&filter='
        html = GetHtmlText(url)
        soup = BeautifulSoup(html, 'html.parser')
        FindInformation(soup)
    # Write the collected information into the sheet
    for movie in range(len(MovieInfo)):
        worksheet.set_row(movie + 2, 290)
        for info in range(len(MovieInfo[movie])):
            if info == 1:
                # Column 1 holds the poster image rather than text
                worksheet.insert_image(movie + 2, info, MovieInfo[movie][info])
                continue
            worksheet.write(movie + 2, info, MovieInfo[movie][info], text_format)
    workxlsx.close()
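A note on robustness: GetHtmlText above sends requests with the default python-requests User-Agent, which Douban often rejects, so the script may silently return empty pages. A minimal sketch of a hardened fetch function is shown below; it assumes only that sending a browser-like User-Agent is enough, and the header value is purely illustrative.

# Sketch: reuse a Session carrying a browser-like User-Agent (assumption:
# Douban rejects the default python-requests User-Agent).
import requests

HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}  # illustrative value
session = requests.Session()
session.headers.update(HEADERS)

def GetHtmlText(url):
    try:
        # Same logic as the original, but with shared headers and a timeout
        r = session.get(url, timeout=10)
        r.raise_for_status()
        r.encoding = 'utf-8'
        return r.text
    except requests.RequestException:
        return ''

If this variant is used, the poster download (requests.get(imgurl) inside FindInformation) would need to go through the same session for the same reason.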

Run result:

Please credit the original source when reposting: https://www.6miu.com/read-3200349.html
