写csv文件 · 抓取页面图片① · 抓取页面图片② · 为爬虫添加代理IP · 获取页面内嵌链接 · 字典的相关用法
August 31, 2017 8:36 AM
写csv文件
import csv
from urllib.request import urlopen

from bs4 import BeautifulSoup

# Download the article and parse it while the connection is open; the
# with-block closes the HTTP response as soon as parsing is done.
with urlopen("http://en.wikipedia.org/wiki/Comparison_of_text_editors") as html:
    bsObj = BeautifulSoup(html, "lxml")

# Only the first table carrying the "wikitable" class is exported.
# An IndexError here means the page contained no such table.
table = bsObj.find_all("table", {"class": "wikitable"})[0]
rows = table.find_all("tr")

# newline='' leaves line-ending handling to the csv module; the with-block
# closes the file even if writing a row fails.
with open("editors.csv", "wt", newline='', encoding='utf-8') as csvFile:
    writer = csv.writer(csvFile)
    for row in rows:
        # One CSV row per <tr>: the text of each data and header cell.
        csvRow = [cell.get_text() for cell in row.find_all(['td', 'th'])]
        writer.writerow(csvRow)
抓取页面图片①
import urllib.request
# Fetch the image bytes; the with-block closes the HTTP response once the
# body has been read instead of leaving the connection open.
with urllib.request.urlopen(
        'http://imgsrc.baidu.com/forum/w=580/sign=fdcdb5b2314e251fe2f7e4f09784c9c2/16391f30e924b89915f86eb06f061d950b7bf677.jpg') as response:
    cat_img = response.read()

# Binary mode: the payload is raw image data, not text.
with open('picture.jpg', 'wb') as f:
    f.write(cat_img)
抓取页面图片②
import urllib.request
import re
def getHtml(url):
    """Return the raw response body of *url* as ``bytes``.

    The response is opened in a ``with`` block so the underlying
    connection is closed as soon as the body has been read.

    Args:
        url: Any URL that ``urllib.request.urlopen`` can open.

    Returns:
        The undecoded response body.
    """
    with urllib.request.urlopen(url) as page:
        return page.read()
def getImg(html):
    """Download every ``.jpg`` referenced in *html* into the working directory.

    Image URLs are the captures of ``src="(.+?\\.jpg)" pic_ext``; they are
    saved as ``0.jpg``, ``1.jpg``, ... in the order they were matched.

    Args:
        html: Page markup as ``str`` or ``bytes``. Bytes (which is what
            ``urlopen(...).read()`` yields) are decoded as UTF-8 with
            undecodable bytes replaced, because a ``str`` pattern cannot be
            matched against ``bytes``.

    Returns:
        The list of matched image URLs, in download order (empty when
        nothing matched).
    """
    if isinstance(html, bytes):
        html = html.decode('utf-8', errors='replace')

    imglist = re.findall(r'src="(.+?\.jpg)" pic_ext', html)
    for x, imgurl in enumerate(imglist):
        # urlretrieve lives in urllib.request on Python 3, not in urllib.
        urllib.request.urlretrieve(imgurl, '%s.jpg' % x)
    return imglist
# Fetch the forum thread, pass its markup to getImg, and print whatever
# getImg returns.
html = getHtml("http://tieba.baidu.com/p/2460150866")
print(getImg(html))
为爬虫添加代理IP
import urllib.request
import random
url = 'http://whatismyip.com.tw'
iplist = [
    '121.201.97.136:80',
    '117.135.164.170:80',
    '58.247.31.230:80',
]

# Pick one proxy at random for this run and use it for http:// requests.
proxy_support = urllib.request.ProxyHandler({'http': random.choice(iplist)})
opener = urllib.request.build_opener(proxy_support)
# Installing the opener makes urllib.request.urlopen use it by default.
urllib.request.install_opener(opener)

# The with-block closes the HTTP response once the body has been read.
with urllib.request.urlopen(url) as response:
    html = response.read().decode('utf-8')
print(html)
import re
from urllib.request import urlopen

import requests
from bs4 import BeautifulSoup

# Collect the links found inside the page's <div class="pg"> element.
# The with-block closes the HTTP response once the body has been read.
with urlopen("http://bbs.gfan.com/android-8397839-1-1.html") as page:
    rawtext = page.read()
soup = BeautifulSoup(rawtext, "html.parser")

targetDiv = soup.find('div', {'class': 'pg'})
# find() returns None when the page has no such div; treat that as
# "no links" instead of failing with AttributeError on None.
catalogLinks = targetDiv.find_all('a') if targetDiv is not None else []

# The first anchor is skipped; every remaining href is kept.
indexlist = [link.get('href') for link in catalogLinks[1:]]
for index in indexlist:
    print(index)
字典的相关用法
# A nested structure: "post" maps to a dict, "replys" to a list of dicts.
test = {
    "post": {"content": ""},
    "replys": [{"content": ""}],
}

# Overwrite a value inside the nested dict.
test["post"]["content"] = "xx"

# Overwrite an existing key of the first reply ...
test["replys"][0]["content"] = "yy"
# ... and add a key it did not have before.
test["replys"][0]["value"] = "zz"

# Append a second reply with its own set of keys.
test["replys"].append({"content": "", "title": "", "publish_date": ""})
def store(measurements):
    """Serialize *measurements* as JSON into ``measurements.json``.

    The file is created (or overwritten) in the current working directory.

    Args:
        measurements: Any object the ``json`` module can serialize.

    Raises:
        TypeError: If *measurements* contains a value ``json`` cannot encode.
    """
    import json

    with open('measurements.json', 'w') as f:
        # Write the argument itself, not a module-level variable.
        f.write(json.dumps(measurements))
# When run as a script, write the example dictionary `test` to disk.
if __name__ == "__main__":
    store(test)