This is a crawler utility class I wrote myself. Its features include: fetching pages via GET/POST requests, cookie handling, regex and XPath parsing of page content, and a simple search crawler.
Apart from lxml, everything it uses comes from the Python standard library.
If you don't have lxml, you can install it with pip, or simply delete the from lxml import etree line together with the getXpath method (a sketch of making the import optional follows the install command below):
pip install lxml
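If you would rather keep getXpath around, one alternative to deleting it outright is to guard the import. This is only a suggested sketch, not part of the original script:

# Hypothetical variant: degrade gracefully when lxml is missing instead of removing getXpath.
try:
    from lxml import etree
except ImportError:
    etree = None  # getXpath would then need to check "if etree is None" and return [] (or raise)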
#! /usr/bin/python
# coding=utf-8
import sys
import urllib
import urllib2
import re
import os
import cookielib
import json
from lxml import etree


class requestPars:
    PROXY = 'proxy'
    USER_AGENT = 'userAgent'
    DATA = 'data'
    COOKIE = 'cookie'


# general-purpose crawler helpers
class crawlerTool:
    # class-level log buffer shared by all calls
    log = ''

    def __init__(self):
        pass

    # basic page fetch; returns the page body as a string
    # getPage(url,data=xx) getPage(url,requestPars.=xx)
    @staticmethod
    def getPage(url, proxy=None, data=None, referer=None, cookie=None, userAgent=None, cookiePath=None):
        # print url
        crawlerTool.log = crawlerTool.log + url
        page_buf = ''
        for i in range(1):
            try:
                if proxy:
                    handlers = [urllib2.ProxyHandler({'http': 'http://%s/' % proxy, 'https': 'http://%s/' % proxy})]
                    opener = urllib2.build_opener(*handlers)
                else:
                    opener = urllib2.build_opener()
                method = urllib2.Request(url, data)
                if referer: method.add_header('Referer', referer)
                if cookiePath: method.add_header('Cookie', crawlerTool.readCookie(cookiePath))
                if cookie: method.add_header('Cookie', cookie)
                if userAgent:
                    method.add_header('User-Agent', userAgent)
                else:
                    method.add_header('User-Agent', 'Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36')
                method.add_header('Accept-Language', 'en-US,en;q=0.5')
                result = opener.open(method, timeout=10)
                page_buf = result.read()
                return page_buf
            except urllib2.URLError, reason:
                crawlerTool.log = crawlerTool.log + str(reason)
                return str(reason)
            except Exception, reason:
                crawlerTool.log = crawlerTool.log + str(reason)
                raise Exception(reason)

    # getPageByJson: POST a JSON body; data may be a dict or an already-encoded string
    # getPage(url,data=xx) getPage(url,requestPars.=xx)
    @staticmethod
    def getPageByJson(url, proxy=None, data={}, referer=None, cookie=None, userAgent=None, cookiePath=None):
        crawlerTool.log = crawlerTool.log + url
        page_buf = ''
        for i in range(1):
            try:
                if proxy:
                    handlers = [urllib2.ProxyHandler({'http': 'http://%s/' % proxy, 'https': 'http://%s/' % proxy})]
                    opener = urllib2.build_opener(*handlers)
                else:
                    opener = urllib2.build_opener()
                if type(data) == type({}): data = json.dumps(data)
                method = urllib2.Request(url, data=data)  # note: Python None serializes to JSON null
                method.add_header('Content-Type', 'application/json')
                if referer: method.add_header('Referer', referer)
                if cookiePath: method.add_header('Cookie', crawlerTool.readCookie(cookiePath))
                if cookie: method.add_header('Cookie', cookie)
                if userAgent:
                    method.add_header('User-Agent', userAgent)
                else:
                    method.add_header('User-Agent', 'Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36')
                method.add_header('Accept-Language', 'en-US,en;q=0.5')
                result = opener.open(method, timeout=10)
                page_buf = result.read()
                return page_buf
            except urllib2.URLError, reason:
                crawlerTool.log = crawlerTool.log + str(reason)
                return str(reason)
            except Exception, reason:
                crawlerTool.log = crawlerTool.log + str(reason)
                raise Exception(reason)

    # return the first capture group of the first regex match, or '' if there is no match
    @staticmethod
    def getRegex(pattern, content):
        group = re.search(pattern, content)
        if group:
            return group.groups()[0]
        else:
            return ''

    # evaluate an XPath expression; should validate the input type or add exception handling
    @staticmethod
    def getXpath(xpath, content):
        tree = etree.HTML(content)
        out = []
        results = tree.xpath(xpath)
        for result in results:
            if 'ElementStringResult' in str(type(result)):
                out.append(result)
            else:
                out.append(etree.tostring(result))
        return out

    # follow redirects and return the final URL
    @staticmethod
    def getDirectUrl(url):
        u = urllib2.urlopen(url)
        redirectUrl = u.geturl()
        return redirectUrl

    # fetch a page and return its details (content, status code, cookies) as a dict
    @staticmethod
    def getPageDetail(url, proxy=None, data=None, referer=None, cookie=None, userAgent=None, cookiePath=None):
        PageDetail = {}
        # print url
        crawlerTool.log = crawlerTool.log + url + '\n'
        page_buf = ''
        n = 1
        for i in range(n):
            try:
                getCookie = cookielib.CookieJar()
                cookieHandler = urllib2.HTTPCookieProcessor(getCookie)
                if proxy:
                    handlers = [urllib2.ProxyHandler({'http': 'http://%s/' % proxy, 'https': 'http://%s/' % proxy}), cookieHandler]
                    opener = urllib2.build_opener(*handlers)
                else:
                    opener = urllib2.build_opener(cookieHandler)
                method = urllib2.Request(url, data)
                if referer: method.add_header('Referer', referer)
                if cookiePath: method.add_header('Cookie', crawlerTool.readCookie(cookiePath))
                if cookie: method.add_header('Cookie', cookie)
                if userAgent:
                    method.add_header('User-Agent', userAgent)
                else:
                    method.add_header('User-Agent', 'Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36')
                method.add_header('Accept-Language', 'en-US,en;q=0.5')
                result = opener.open(method, timeout=10)
                # print str(result.headers)
                page_buf = result.read()
                PageDetail['pageContent'] = page_buf
                PageDetail['code'] = 200
                cookie_str = ''
                for item in getCookie:
                    cookie_str += item.name + "=" + item.value + "; "
                PageDetail['cookie'] = cookie_str
                # print 'getcookie:'+cookie_str
                break
            except urllib2.HTTPError, e:
                # print e.reason
                PageDetail['code'] = e.code
                # Baidu still sets cookies on a 403, so keep them even on an HTTP error
                PageDetail['cookie'] = e.headers.get('Set-Cookie', '')
            except urllib2.URLError, reason:
                crawlerTool.log = crawlerTool.log + str(reason)
                PageDetail['code'] = 1003
                # print 'URLError'+str(reason)
                break
            except Exception, reason:
                if i == n - 1:  # only log on the last attempt
                    crawlerTool.log = crawlerTool.log + str(reason)
                    # print 'Error'+str(reason)
                    break
        return PageDetail

    # save a cookie string to a file, creating missing parent directories;
    # to keep cookies in separate files you could use cookielib.MozillaCookieJar(filename) instead
    @staticmethod
    def saveCookie(cookie, path):
        if os.path.isdir(path):
            crawlerTool.log = crawlerTool.log + 'path cant be dir\n'
            sys.exit(0)
        try:
            if not os.path.exists(path):
                parent_path = os.path.dirname(path)
                if not os.path.exists(parent_path): os.makedirs(parent_path)  # create intermediate directories
                with open(path, 'w') as f:
                    f.write(cookie)
            else:
                with open(path, 'w') as f:
                    f.write(cookie)
        except:
            sys.exit(0)

    # read a saved cookie string; returns '' if the file does not exist
    @staticmethod
    def readCookie(path):
        if not os.path.isfile(path):
            crawlerTool.log = crawlerTool.log + 'cookie not find\n'
            return ''
        else:
            with open(path, 'r') as f:
                return f.read()


# takes a maximum page count and a keyword, returns a list of hosting URLs;
# this search only covers playlist links
def keywordSearch(maxPageNum, keyword, proxy=''):
    try:
        # print keyword,'do list search'
        keyword = keyword.replace(' ', '+')
        urlDepth0 = 'https://www.youtube.com/results?search_query=' + keyword
        finalResult = []
        for pageNum in range(maxPageNum):
            pageDepth0 = crawlerTool.getPage(urlDepth0, proxy=proxy)
            urlDepth1 = re.findall('class="yt-lockup-title\s*"><a href="(/watch\?v=[\w_-]+&list=[^"]+)"', pageDepth0)
            urlDepth0 = 'https://www.youtube.com' + crawlerTool.getRegex('<a href="(.*?)"[^>]+"><span class="yt-uix-button-content">Next', pageDepth0)
            urlListDepth1 = []
            for url in urlDepth1:
                url = url.replace('&amp;', '&')
                url = 'https://www.youtube.com' + url
                if url not in urlListDepth1:
                    urlListDepth1.append(url)
            urlListDepth2 = []
            for url in urlListDepth1:
                # print 'open listUrl:',url
                pageDepth1 = crawlerTool.getPage(url, proxy=proxy).replace('&amp;', '&')
                urlsDepth2 = re.findall('(/watch\?v=[^"]*)\&index=\d+', pageDepth1)
                for urlDepth2 in urlsDepth2:
                    urlDepth2 = 'http://www.youtube.com' + urlDepth2
                    if urlDepth2 not in urlListDepth2:
                        finalResult.append(urlDepth2)
                        urlListDepth2.append(urlDepth2)
        # print len(finalResult),finalResult
        return finalResult
    except:
        print 'do listSearch failed'


def main():
    pass


if __name__ == '__main__':
    ct = crawlerTool()
    data = {
        "keyid": "abcdefghijk2ml2n83",
        "website": "Kuwo",
        "url": "http://www.filebox.com",
        "author": "bb",
        "author_url": "http://www.filebox.com/?v=293280JUN0102",
        "post_date": "2015-03-20 1:12:50",
        "hide_flag2": 0,
        "duration": 225
    }
    print json.dumps(data)
    print ct.getPageByJson('http://192.168.1.72:8080/VTServiceFK/service/updateVideoInfo', data=data)
    sys.exit()
    print ct.getDirectUrl('http://v.qq.com/page/c/b/4/c0361j0fab4.html')
    keywordSearch(1, "simpsons full episode")
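For reference, here is a minimal usage sketch of the class above. The target URL, regex, XPath, and cookie path are placeholders I made up rather than values from the original script, and it assumes the code lives in the same file as crawlerTool:

# Hypothetical usage example; all URLs and expressions below are placeholders.
page = crawlerTool.getPage('http://example.com/', userAgent='Mozilla/5.0')
title = crawlerTool.getRegex(r'<title>(.*?)</title>', page)  # first capture group, '' if no match
anchors = crawlerTool.getXpath('//a', page)                  # serialized <a> elements
crawlerTool.saveCookie('sessionid=abc123', '/tmp/cookies/example.txt')
page2 = crawlerTool.getPage('http://example.com/', cookiePath='/tmp/cookies/example.txt')
print title, len(anchors), len(page2)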