#coding=gb18030
import requests
from lxml import etree
from gevent import monkey,pool;monkey.patch_all()
import gevent
import os
import time
import xlwt
import urllib2
import string
# Workbook and its single sheet named 'job'; cell_overwrite_ok=True lets the
# same cell be written more than once without xlwt raising.
xl = xlwt.Workbook()
st = xl.add_sheet('job', cell_overwrite_ok=True)

# Print the wall-clock start time (HH:MM:SS).
print(time.strftime('%H:%M:%S'))

# Greenlet pool created with a size of 30.
p = pool.Pool(30)
# Return the URL of one search-result (listing) page.
def get_pages(page):
    """Yield the search-result URL for one page of the "python" job listing.

    This is a generator that yields exactly one URL, so callers iterate over
    it (``for url in get_pages(n)``).

    Args:
        page: Page number; it is converted with ``str()`` and spliced into
            the URL path just before ``.html``.

    Yields:
        The listing-page URL as a plain ASCII string.
    """
    # NOTE: the parameter after "cotype" is "&degreefrom". When this URL is
    # copied through an HTML renderer the leading "&deg" tends to be decoded
    # as the degree-sign entity, which silently drops the parameter and puts
    # a non-ASCII character into the URL -- keep it spelled out in ASCII.
    url = (
        'http://search.51job.com/list/040000,000000,0000,00,9,99,python,2,'
        + str(page)
        + '.html?lang=c&stype=1&postchannel=0000&workyear=99&cotype=99'
        '&degreefrom=99&jobterm=99&companysize=99&lonlat=0,0&radius=-1'
        '&ord_field=0&confirmdate=9&fromType=1&dibiaoid=0&address=&line='
        '&specialarea=00&from=&welfare='
    )
    yield url
# Return the links to the job-detail pages found on a listing page.
def get_links(link):
    """Yield the job-detail links found on one search-result page.

    The page is downloaded with a browser-like ``User-Agent`` header, parsed
    with ``etree.HTML`` and queried for the ``href`` of every ``<a>`` element
    whose ``onmousedown`` attribute is the empty string.

    Args:
        link: URL of the listing page to download.

    Yields:
        Each matching ``href`` value, in document order.

    Raises:
        urllib2.URLError: propagated from ``urllib2.urlopen`` when the page
            cannot be fetched.
    """
    header = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; Trident/7.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)'}
    req = urllib2.Request(link, headers=header)
    resp = urllib2.urlopen(req)
    try:
        response = resp.read()
    finally:
        # Release the connection even when read() fails part-way.
        resp.close()

    html = etree.HTML(response)
    if html is None:
        # etree.HTML returns None when there is nothing to parse
        # (e.g. an empty body); there are no links to report then.
        return

    # NOTE: the attribute name must be plain ASCII "onmousedown". Look-alike
    # Greek omicron characters (a common artefact of copying code from blog
    # pages) make the expression match nothing.
    for href in html.xpath('//a[@onmousedown=""]/@href'):
        yield href
# Write one job-detail page into the spreadsheet.
def write_excel(url):
    """Fetch one job-detail page and write its fields into one sheet row.

    Relies on two module globals: the sheet ``st`` and the row counter
    ``excel_row``, which is incremented to claim the next row.

    Columns written, in this order:
        0: part of the ``<title>`` text, from just after the marker
           ``u'招聘_'`` up to 12 characters before the end, stripped
        1: first ``<h1>`` text
        2: text nodes under ``span.sp4``, joined with commas
        3: first text of ``span.lname``
        4: text under ``div`` with class ``"bmsg inbox"``, joined and stripped
        5: text of the second ``<strong>`` text node on the page
        6: non-blank text nodes under ``div`` with class
           ``"bmsg job_msg inbox"``, concatenated

    The function is best-effort: any ordinary error (network failure,
    missing element, ...) is reported on stdout and the job ends; cells
    already written for that row are kept.

    Args:
        url: URL of the job-detail page.
    """
    global excel_row
    # Yield once to the gevent hub before doing any work.
    gevent.sleep(0)
    try:
        # Claim a row and keep it in a local. The blocking urlopen() below
        # lets other greenlets run and bump the global counter, so reading
        # ``excel_row`` again afterwards would make concurrent jobs write
        # into each other's rows.
        excel_row += 1
        row = excel_row

        res = urllib2.urlopen(urllib2.Request(url))
        try:
            respon = res.read()
        finally:
            # Release the connection even when read() fails part-way.
            res.close()
        sou = etree.HTML(respon)

        st_title = sou.xpath('//title/text()')
        # Skip past the 3-character marker inside the page title.
        ss = st_title[0].find(u'招聘_') + 3
        st.write(row, 0, st_title[0][ss:-12].strip())
        st.write(row, 1, sou.xpath('//h1/text()')[0])
        st.write(row, 2, ','.join(sou.xpath('//span[@class="sp4"]//text()')))
        st.write(row, 3, sou.xpath('//span[@class="lname"]/text()')[0])
        dz = sou.xpath('//div[@class="bmsg inbox"]//text()')
        st.write(row, 4, ''.join(dz).strip())
        st.write(row, 5, sou.xpath('//strong/text()')[1])
        detail = sou.xpath('//div[@class="bmsg job_msg inbox"]//text()[normalize-space()]')
        st.write(row, 6, ''.join(detail))
    except Exception as exc:
        # Stay best-effort, but say what was skipped. Catching Exception
        # (not a bare except) lets KeyboardInterrupt and greenlet kills
        # propagate instead of being swallowed.
        print('write_excel: skipped %r: %r' % (url, exc))
# Row counter used by write_excel(), which increments it before writing.
excel_row = 0

# Lazily walk listing pages 1..20 and every detail link found on each one;
# a listing page is only downloaded when the loop below reaches it.
detail_urls = (
    detail_url
    for page_no in range(1, 21)
    for listing_url in get_pages(page_no)
    for detail_url in get_links(listing_url)
)

# Schedule one write_excel() greenlet per detail link on the shared pool.
th = [p.spawn(write_excel, detail_url) for detail_url in detail_urls]

# Wait for the pool, then for every spawned greenlet, before saving.
p.join()
gevent.joinall(th)

# Save the workbook to its hard-coded path and print the finish time.
xl.save('d:\\job_python.xls')
print(time.strftime('%H:%M:%S'))