# -*- coding: utf-8 -*-
import threading
from urllib.parse import quote

import requests
# Semaphore with capacity 1: at most one download thread runs at a time.
# Acquired in main() before each worker starts, released in download_pics().
threading_lock = threading.BoundedSemaphore(value=1)

# Percent-encoded default search keyword ("校花").
kw = quote('校花')
#https://www.duitang.com/napi/blog/list/by_search/?kw=校花&start=48&_=1501895019883
# Fetch the full response body returned for a single URL
def get_page(url, timeout=10):
    """Return the response body of ``url`` decoded as UTF-8.

    Parameters
    ----------
    url : str
        Address to fetch.
    timeout : float
        Seconds before the request is aborted.  The original call had no
        timeout, so a stalled server would hang the crawler forever.

    Raises
    ------
    requests.RequestException
        On any network failure or timeout.
    """
    response = requests.get(url, timeout=timeout)
    return response.content.decode('utf-8')
# Extract every delimited substring from one page of response text.
def findall_in_page(page, startpart, endpart):
    """Return all substrings of ``page`` enclosed between ``startpart`` and ``endpart``.

    Scans left to right; each match starts right after an occurrence of
    ``startpart`` and ends at the next ``endpart``.  Fixes two issues in the
    original: the same ``find`` was computed twice per iteration, and a
    ``startpart`` with no closing ``endpart`` appended a bogus slice
    (``page[start:-1]``) instead of being ignored.
    """
    all_strings = []
    end = 0
    while True:
        start = page.find(startpart, end)
        if start == -1:          # no further opening delimiter — done
            break
        start += len(startpart)
        end = page.find(endpart, start)
        if end == -1:            # opening delimiter never closed — stop cleanly
            break
        all_strings.append(page[start:end])
    return all_strings
# Collect the image URLs contained in every downloaded page.
def pic_urls_from_page(pages):
    """Return all image URLs found across ``pages`` (the ``"path"`` JSON fields)."""
    return [
        url
        for blob in pages
        for url in findall_in_page(blob, 'path":"', '"')
    ]
# Download every search-result page for one keyword.
def get_pages_from_label(label):
    """Fetch the raw duitang search-result pages for ``label``.

    The keyword is percent-encoded, then the API is queried at offsets
    0, 100, ..., 3500; the decoded response bodies are returned in order.
    """
    encoded = quote(label)
    template = 'https://www.duitang.com/napi/blog/list/by_search/?kw={}&start={}&limit=1000'
    return [
        get_page(template.format(encoded, offset))
        for offset in range(0, 3600, 100)
    ]
def download_pics(img_url, n):
    """Download ``img_url`` to ``duitang_interface_img_dir/images/<n>.<ext>``.

    Runs in a worker thread.  The module-level ``threading_lock`` semaphore
    is released in a ``finally`` block — the original released it only on
    success, so a single failed download left the semaphore held and
    deadlocked main()'s next ``acquire()``.
    """
    import os
    try:
        # timeout so a stalled server cannot hang the worker forever
        r = requests.get(img_url, timeout=30)
        ext = img_url.split('.')[-1]
        directory = 'duitang_interface_img_dir/images/'
        # the script never created the target directory; open() would fail
        os.makedirs(directory, exist_ok=True)
        path = directory + str(n) + '.' + ext
        with open(path, 'wb') as f:
            f.write(r.content)
    finally:
        threading_lock.release()
def main():
    """Crawl duitang for '校花' images and download each one in a worker thread.

    Flow: fetch all search-result pages, extract every image URL, then spawn
    one download thread per URL.  ``threading_lock`` (a BoundedSemaphore of 1)
    is acquired before each thread starts and released inside download_pics,
    throttling downloads to one at a time.
    """
    print('start')
    all_pages = get_pages_from_label('校花')
    print('获取所有页面信息完毕')
    all_img_urls = pic_urls_from_page(all_pages)
    print('获取所有图片url完毕')
    print('开始下载')
    # enumerate replaces the original hand-maintained counter
    for n, img_url in enumerate(all_img_urls, start=1):
        print('正在下载第', n, '张')
        threading_lock.acquire()
        t = threading.Thread(target=download_pics, args=(img_url, n))
        t.start()
    print('all done')


# Guard the entry point so importing this module does not start a crawl.
if __name__ == '__main__':
    main()