import os
from hashlib import md5
from multiprocessing.pool import Pool
from urllib.parse import urlencode

import requests
from requests import codes
def get_page(offset):
    """Fetch one page of Toutiao '街拍' (street photo) search results.

    Args:
        offset: pagination offset passed straight through to the search API
            (multiples of 20, one page per value).

    Returns:
        The decoded JSON payload as a dict on HTTP 200, otherwise None
        (non-200 status, connection failure, or timeout).
    """
    params = {
        'offset': offset,
        'format': 'json',
        'keyword': '街拍',
        'autoload': 'true',
        'count': '20',
        'cur_tab': '1',
        'from': 'search_tab'
    }
    base_url = 'https://www.toutiao.com/search_content/?'
    url = base_url + urlencode(params)
    try:
        # Timeout keeps a stalled connection from hanging a pool worker forever.
        resp = requests.get(url, timeout=10)
        if codes.ok == resp.status_code:
            return resp.json()
    except requests.RequestException:
        # RequestException covers ConnectionError plus the Timeout raised by
        # the timeout= argument above.
        return None
def get_images(json):
    """Yield one {'image': url, 'title': title} dict per image on a result page.

    NOTE: the parameter name shadows the stdlib ``json`` module; it is kept
    as-is for backward compatibility with existing callers.

    Args:
        json: the dict returned by get_page(), or None on fetch failure.

    Yields:
        Dicts with a reconstructed large-image CDN URL and the article title.
    """
    if not json:
        # get_page() returns None on failure; yield nothing instead of crashing.
        return
    data = json.get('data')
    if not data:
        return
    for item in data:
        # Entries carrying a 'cell_type' are non-article cells (ads etc.); skip.
        if item.get('cell_type') is not None:
            continue
        title = item.get('title')
        # 'image_list' may be absent on image-less results; treat as empty.
        for image in item.get('image_list') or []:
            url_temp = image.get('url')
            yield {
                # The API returns a thumbnail URL; its last 23 characters are
                # the image id, re-rooted here onto the large-image CDN path.
                'image': 'https:' + '//p3.pstatp.com/large/pgc-image/' + url_temp[-23:],
                'title': title
            }
def save_image(item):
    """Download item['image'] into img/<title>/<md5(content)>.jpg.

    Files are named by content hash, so re-running the scraper skips images
    that were already downloaded. Failures are logged, never raised.

    Args:
        item: a dict produced by get_images() with 'image' and 'title' keys.
    """
    img_path = 'img' + os.path.sep + item.get('title')
    # exist_ok avoids the check-then-create race when several pool workers
    # hit the same title directory at once.
    os.makedirs(img_path, exist_ok=True)
    try:
        # Timeout keeps a stalled download from hanging a pool worker forever.
        resp = requests.get(item.get('image'), timeout=10)
        if codes.ok == resp.status_code:
            # Content-hash file name makes the download idempotent.
            file_path = img_path + os.path.sep + '{file_name}.{file_suffix}'.format(
                file_name=md5(resp.content).hexdigest(),
                file_suffix='jpg')
            if not os.path.exists(file_path):
                with open(file_path, 'wb') as f:
                    f.write(resp.content)
                print('Downloaded image path is %s' % file_path)
            else:
                print('Already Downloaded', file_path)
    except requests.RequestException:
        # Covers ConnectionError plus the Timeout introduced above.
        print('Failed to Save Image,item %s' % item)
def main(offset):
    """Pool-worker entry point: fetch one result page and save all its images.

    Args:
        offset: pagination offset forwarded to get_page().
    """
    json = get_page(offset)
    if json is None:
        # Fetch failed (connection error or non-200); nothing to process.
        return
    for item in get_images(json):
        print(item)
        save_image(item)
# Inclusive range of result pages to scrape.
GROUP_START = 0
GROUP_END = 3

if __name__ == '__main__':
    # One offset per page: 0, 20, 40, ... — the API pages in steps of 20.
    offsets = [page * 20 for page in range(GROUP_START, GROUP_END + 1)]
    pool = Pool()
    pool.map(main, offsets)
    pool.close()
    pool.join()
# Note: this code is quite old; comments were never written at the time.
# Reposted from (please credit the original source): https://www.6miu.com/read-4931428.html