批量爬取27270美女栏目图片

xiaoxiao2021-02-28  6

批量爬取27270美女栏目图片

运行了一个晚上,小水管太慢了,才爬了几万张图片。

做了一下重复抓取:下载失败时最多重试八次

写了一下日志,但是想了一下还是注释掉了

代码里面有很多修修改改的痕迹,

如果愿意的话可以拿去把这个程序修改一下

采集的网页是:http://www.27270.com/

当前使用的python版本是python3.5.2

# -*- coding:utf-8 -*-
"""Bulk image crawler for the 27270.com "beauty" column.

Walks the paginated index at http://www.27270.com/ent/meinvtupian/,
collects every album link, then downloads each album's images into
.\woman\<album title>\ using two worker processes.

Fixes applied during review:
  * ``class flag`` stored its state in a local instead of ``self.f`` and
    mis-declared ``set_f`` as a ``@staticmethod`` taking ``self``.
  * HTTP header names contained spaces ('Accept - Encoding', ...), which
    requests sends verbatim — normalized to the real header names.
  * ``get_page_href`` mixed ``str``/``int`` in ``new_num`` (the error path
    did ``new_num -= 1`` on a value that could be a string, then indexed
    ``a_index`` with an int key while all keys are strings).
  * ``__main__`` sliced the second half as ``[len//2 + 1:]``, silently
    skipping one album — now ``[len//2:]``.
  * ``get_index``/``get_start_end`` no longer set ``.encoding`` on the
    sentinel string ``'error'`` returned by a failed fetch.
  * ``logger.warn`` (deprecated) -> ``logger.warning``.
"""
import os
import sys
import time
import random
import logging
import requests
import multiprocessing
from multiprocessing import Pool  # kept: imported by the original file
from bs4 import BeautifulSoup

img_href = []                      # album <a> tags collected from every index page
a_index = {}                       # page label -> index-page URL ('1', '2', ..., '末页')
html_index = ''
error_num = []
error_href = []
error_path = []
index = {'start': '', 'end': ''}   # first / last index-page numbers (strings)
url_index = 'http://www.27270.com/ent/meinvtupian/'
# get_page_href / get_child_href retry by recursing, so allow deep recursion
sys.setrecursionlimit(1000000)

# File logger: everything at INFO and above goes to test.log.
logger = logging.getLogger("AppName")
formatter = logging.Formatter('%(asctime)s %(levelname)-8s: %(message)s')
file_handler = logging.FileHandler("test.log")
file_handler.setFormatter(formatter)
logger.addHandler(file_handler)
logger.setLevel(logging.INFO)


class flag(object):
    """Simple mutable boolean flag (unused by the crawl loop itself).

    Note: this class name shadows nothing needed at runtime, but in the
    original file it also shadowed a module-level ``flag = 'true'`` string;
    that dead assignment has been dropped.
    """

    def __init__(self):
        # fix: the original bound a *local* ``f`` instead of ``self.f``
        self.f = True

    def get_f(self):
        return self.f

    def set_f(self):
        # fix: was wrongly decorated @staticmethod while taking ``self``
        self.f = False


def is_folder(file_name=''):
    """Create the folder <cwd><file_name> if it does not exist yet."""
    cwd = os.getcwd() + file_name
    if not os.path.exists(cwd):
        os.mkdir(cwd)
        print('已创建图片存储文件夹%s' % file_name)


def get_url(url='', host=''):
    """GET *url* with browser-like headers.

    Returns the ``requests.Response`` on success or the sentinel string
    ``'error'`` on any exception (logged to test.log).  Sleeps 1-4 s after
    every request to throttle the crawl.
    """
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/64.0.3282.167 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,'
                  'image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
    }
    if host != '':
        header['Host'] = host
    try:
        response = requests.get(url, headers=header, timeout=30)
    except Exception:
        response = 'error'
        logger.error('%s \t\t get error' % url)
    finally:
        # throttle: be gentle with the site (and the reviewer's small pipe)
        time.sleep(random.randint(1, 4))
    return response


def download_img(url='http://t1.27270.com/uploads/tu/201802/726/e6e5afe62c.jpg',
                 name='', the_path='', num=8):
    """Download one image to .\woman\<the_path><name>.<ext>.

    Retries recursively up to *num* times when the fetch fails.
    """
    response = get_url(url, host='t2.hddhhn.com')
    if response != 'error':
        cwd = os.getcwd() + r'\woman'
        # keep the original extension; name the file after its index
        file_name = name + '.' + url.split('/')[-1].split('.')[-1]
        logger.warning('%s \t\t download...' % url)
        with open(cwd + '\\' + the_path + file_name, 'wb') as f:
            f.write(response.content)
    elif num > 0:
        return download_img(url, name=name, the_path=the_path, num=num - 1)
    else:
        print('download error')


def get_index(url_index):
    """Fetch one index page and return its decoded HTML ('' on error)."""
    response = get_url(url_index)
    if response == 'error':
        return ''
    response.encoding = 'gb2312'
    return response.text


def get_start_end(url=''):
    """Populate ``index['start']`` / ``index['end']`` and seed ``a_index``.

    Reads the pagination bar (div.NewPages) of the first page, follows the
    '末页' (last page) link, and records both page numbers as strings.
    """
    response = get_url(url)
    response.encoding = 'gb2312'
    soup = BeautifulSoup(response.text, "html.parser")
    pager_links = soup.find("div", class_="NewPages").find('ul').find_all('a', target='_self')
    for link in pager_links:
        a_index[link.string] = url_index + link['href']
    soup = BeautifulSoup(get_index(url_index), "html.parser")
    index['start'] = soup.find("div", class_="NewPages").find('li', class_='thisclass').find('a').string
    response = get_url(a_index['末页'])
    response.encoding = 'gb2312'
    soup = BeautifulSoup(response.text, "html.parser")
    index['end'] = soup.find("div", class_="NewPages").find('li', class_='thisclass').find('a').string


def get_page_href(url=''):
    """Recursively walk the index pagination, filling ``a_index``.

    Follows the highest-numbered page link seen so far until the last page
    number (``index['end']``) is reached.  On a fetch error the same URL is
    retried (the original decremented a possibly-string counter and crashed).
    """
    new_num = 0
    response = get_url(url)
    if response != 'error':
        response.encoding = 'gb2312'
        soup = BeautifulSoup(response.text, "html.parser")
        pager_links = soup.find("div", class_="NewPages").find('ul').find_all('a', target='_self')
        for link in pager_links:
            a_index[link.string] = url_index + link['href']
            # fix: keep new_num an int consistently
            if str(link.string).isdigit() and int(link.string) > new_num:
                new_num = int(link.string)
        print('已进行:%.2f%%' % (new_num * 100 / int(index['end'])))
        if new_num >= int(index['end']):
            return
        # fix: a_index keys are strings, so index with str(new_num)
        get_page_href(a_index[str(new_num)])
    else:
        print('page error')
        get_page_href(url)


def get_father_img(url_index_child):
    """Return the album <a class='MMPic'> tags of one index page ('' on error)."""
    album_links = ''
    response = get_url(url_index_child)
    if response != 'error':
        response.encoding = 'gb2312'
        soup = BeautifulSoup(response.text, "html.parser")
        album_links = soup.find('div', class_='MeinvTuPianBox').find('ul').find_all('a', class_='MMPic')
    return album_links


def download_children_img(url, title):
    """Download every image of one album into .\woman\<title>\ ."""
    num = 0
    global child_img_href
    child_img_href = {'1': url}       # page number -> album page URL
    get_child_href(url, '0', title)
    print('%d张图片,正在下载\n' % len(child_img_href))
    for key, val in child_img_href.items():
        try:
            response = get_url(val)
            if response != 'error':
                response.encoding = 'gb2312'
                soup = BeautifulSoup(response.text, "html.parser")
                href = str(soup.find('div', class_='articleV4Body').find('img')['src'])
                is_folder(r'\woman\\' + title)
                download_img(href, str(num), title + '\\')
                num += 1
        except Exception:
            print('下载图片失败')


def get_child_href(url_index_child, max_index, file_name=''):
    """Recursively collect all paging URLs of one album into ``child_img_href``.

    *max_index* is read from the pager's hidden <li pageinfo=...>; recursion
    stops once the highest page number collected reaches it.  On a fetch
    error the first page is retried via ``child_img_href['1']``.
    """
    num = '0'
    response = get_url(url_index_child)
    if response != 'error':
        if file_name != '':
            is_folder(r'\woman\\' + file_name)
        response.encoding = 'gb2312'
        soup = BeautifulSoup(response.text, "html.parser")
        max_index = soup.find('div', class_='page-tag oh').find('ul').find('li', class_='hide')['pageinfo']
        current = soup.find("div", class_="page-tag oh").find('ul').find('li', class_='thisclass')
        for sibling in current.next_siblings:
            if str(sibling.string).isdigit():
                if int(sibling.string) > int(num):
                    num = int(sibling.string)
                # page links are relative, so rebuild against the album's dir
                child_img_href[str(sibling.string)] = \
                    '/'.join(url_index_child.split('/')[:-1]) + '/' + sibling.find('a')['href']
        if int(num) >= int(max_index):
            return
    else:
        num = str(int(num) + 1)
    get_child_href(child_img_href[str(num)], max_index)


def download_url_all():
    """Visit every index page in ``a_index``, collecting album tags into img_href."""
    page_no = 1
    for key, value in a_index.items():
        album_links = get_father_img(value)
        print('%d / %s' % (page_no, len(a_index)))
        for album in album_links:
            img_href.append(album)
        page_no += 1
    print('zhong:%d' % int(len(img_href) / 2))


def func(all_href):
    """Worker: download every album tag in *all_href*."""
    for album in all_href:
        download_children_img(album['href'], album['title'])


if __name__ == '__main__':
    get_start_end(url_index)
    get_page_href(url_index)
    # drop the navigation pseudo-pages; pop() so a missing key is not fatal
    for label in ('首页', '末页', '上一页', '下一页'):
        a_index.pop(label, None)
    is_folder(r'\woman')
    download_url_all()
    half = int(len(img_href) / 2)
    img_href_first = img_href[:half]
    # fix: the original sliced [half + 1:], silently skipping one album
    img_href_second = img_href[half:]
    p1 = multiprocessing.Process(target=func, args=(img_href_first,))
    p2 = multiprocessing.Process(target=func, args=(img_href_second,))
    p1.start()
    p2.start()
    p1.join()
    p2.join()
    input('end')

转载请注明原文地址: https://www.6miu.com/read-1600092.html

最新回复(0)