import os

import requests
import xlsxwriter
from bs4 import BeautifulSoup

# Accumulates one [rank, image filename, name, rating] list per movie,
# filled in by FindInformation().
MovieInfo = []

# Workbook layout: row 0 is a merged full-width title banner, row 1 holds
# the column headers, and the data rows start at row 2.
workxlsx = xlsxwriter.Workbook('DouBanTop.xlsx')
worksheet = workxlsx.add_worksheet()
worksheet.set_row(0, 60)
worksheet.set_row(1, 60)
worksheet.set_column(1, 1, 35)  # column 1: poster image
worksheet.set_column(2, 2, 78)  # column 2: movie name

head_format = workxlsx.add_format(
    {'bold': True, 'align': 'center', 'valign': 'vcenter'})
title_format = workxlsx.add_format(
    {'bold': True, 'align': 'center', 'valign': 'vcenter',
     'font_size': 20, 'font_color': 'Green'})
text_format = workxlsx.add_format(
    {'align': 'center', 'valign': 'vcenter', 'font_size': 14})

# Merge the poster/name header pair (row 1, cols 1-2) and the banner
# across the whole table width (row 0, cols 0-3).
worksheet.merge_range(1, 1, 1, 2, '')
worksheet.merge_range(0, 0, 0, 3, 'DouBan Movie Top 250', title_format)
def GetHtmlText(url):
    """Fetch *url* and return the response body decoded as UTF-8 text.

    Returns an empty string on any network/HTTP failure so the caller can
    feed the result straight into BeautifulSoup without extra checks.
    """
    try:
        # Timeout so a dead host cannot hang the whole scrape.
        r = requests.get(url, timeout=10)
        r.raise_for_status()
        # Douban serves UTF-8; override requests' charset guess.
        r.encoding = 'utf-8'
        return r.text
    except requests.RequestException:
        # Narrow except: only network/HTTP errors are treated as "no page";
        # the original bare `except:` also swallowed KeyboardInterrupt.
        return ''
def FindInformation(soup):
    """Parse one Top-250 listing page and record every movie on it.

    For each movie, appends ``[rank, image filename, name, rating]`` to the
    module-level ``MovieInfo`` list.

    Side effect: downloads each poster image into the current working
    directory as ``<rank>.jpg``.
    """
    for ol in soup.find_all('ol', {'class': 'grid_view'}):
        for li in ol.find_all('li'):
            OneMovieInfo = []

            # Rank and poster live in the 'pic' div.
            picdiv = li.find('div', {'class': 'pic'})
            MovieRank = int(picdiv.find('em').string)
            OneMovieInfo.append(MovieRank)

            imgurl = picdiv.find('img')['src']
            ImgName = str(MovieRank) + '.jpg'
            OneMovieInfo.append(ImgName)

            # Download the poster; a timeout keeps one slow image from
            # stalling the entire run.
            imgcontent = requests.get(imgurl, timeout=10).content
            with open(os.path.join(os.getcwd(), ImgName), 'wb') as wf:
                wf.write(imgcontent)

            # Title spans: the first is the primary title; an optional
            # second span is the alternate title whose text starts with a
            # 3-character " / " separator (stripped via [3:]).
            MovieName = ''
            infodiv = li.find('div', {'class': 'hd'})
            spans = infodiv.find_all('span', {'class': 'title'})
            for idx, span in enumerate(spans):
                if idx == 0:
                    MovieName += span.string
                    if len(spans) == 2:
                        MovieName += ' / '
                else:
                    MovieName += span.string[3:]
            OneMovieInfo.append(MovieName)

            ratingspan = li.find('span', {'class': 'rating_num'})
            OneMovieInfo.append(float(ratingspan.string))

            MovieInfo.append(OneMovieInfo)
if __name__ == '__main__':
    # Header row (worksheet row 1). Column 2 stays empty because the
    # name header cell spans columns 1-2 via the merge_range set up above.
    SheetHead = ['排名', '电影', '', '评分']
    for col, head in enumerate(SheetHead):
        if col == 2:
            continue  # covered by the merged '电影' header cell
        worksheet.write(1, col, head, head_format)

    # Douban paginates 25 movies per page; 10 pages cover the Top 250.
    pages = 10
    for page in range(pages):
        url = ('https://movie.douban.com/top250?start='
               + str(page * 25) + '&filter=')
        html = GetHtmlText(url)
        soup = BeautifulSoup(html, 'html.parser')
        FindInformation(soup)

    # One worksheet row per movie: rank | poster | name | rating.
    for row, movie in enumerate(MovieInfo):
        worksheet.set_row(row + 2, 290)  # tall row so the poster fits
        for col, info in enumerate(movie):
            if col == 1:
                # Column 1 holds the downloaded poster file.
                worksheet.insert_image(row + 2, col, info)
            else:
                worksheet.write(row + 2, col, info, text_format)

    workxlsx.close()
# 运行结果 (run output) omitted.
# Reprinted from: https://www.6miu.com/read-3200349.html