# encoding: utf-8
from bs4
import BeautifulSoup
import urllib2
import re
def header(url):
    """Fetch *url* with a browser-like User-Agent and return the response body.

    Args:
        url: Address to request.

    Returns:
        Whatever ``read()`` on the opened response returns (the page content).

    Any error raised by ``urllib2.urlopen`` propagates to the caller.
    """
    user_agent = (
        'Mozilla/5.0 (Windows NT 6.1; WOW64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/45.0.2454.101 Safari/537.36'
    )
    # Send the actual User-Agent value, not the literal text 'user_agent'.
    request_headers = {'User-Agent': user_agent}
    request = urllib2.Request(url, headers=request_headers)
    # No second positional argument: urlopen's second parameter is the POST
    # body, so passing 'html.parser' there turned every fetch into a POST.
    response = urllib2.urlopen(request)
    try:
        return response.read()
    finally:
        # Release the connection even if reading fails.
        response.close()
def file_save(filename, text):
    """Write *text* to *filename*, replacing any existing content.

    Args:
        filename: Path of the file to (over)write.
        text: Content to write.
    """
    # The context manager closes the file even if write() raises.
    with open(filename, 'w') as out_file:
        out_file.write(text)
def load_url(url, begin_page, end_page):
    """Fetch and process every page from *begin_page* to *end_page* inclusive.

    Each page URL is built by appending the page number to *url*; the page
    is downloaded with ``header`` and its content handed to ``st_html``.
    """
    page_numbers = range(begin_page, end_page + 1)
    for page_number in page_numbers:
        # Build the page URL by appending the page number to the base URL.
        page_url = url + str(page_number)
        st_html(header(page_url))
def st_html(text):
    """Parse *text* as HTML and print the text of each ``target="_blank"`` tag."""
    document = BeautifulSoup(text, 'html.parser')
    wanted_attrs = {'target': '_blank'}
    for element in document.find_all(attrs=wanted_attrs):
        # Single-argument parenthesised print outputs the same as the
        # Python 2 print statement.
        print(element.get_text())
# Key points:
#   print i.get_text()  -- gets the string (text content) inside element i
#   print i['href']     -- gets the element's URL (its href attribute)