use

xiaoxiao2021-02-28 161

写csv文件 · 抓取页面图片① · 抓取页面图片② · 为爬虫添加代理ip · 获取页面内嵌链接 · 字典的相关用法

August 31, 2017 8:36 AM

写csv文件

import csv
from urllib.request import urlopen

from bs4 import BeautifulSoup

# Download the Wikipedia comparison page and parse it with the lxml backend.
# The response is read while the context manager is still open, so the
# connection is released as soon as parsing is done.
with urlopen("http://en.wikipedia.org/wiki/Comparison_of_text_editors") as html:
    bsObj = BeautifulSoup(html, "lxml")

# The main comparison table is the first "wikitable" on the page.
table = bsObj.find_all("table", {"class": "wikitable"})[0]
rows = table.find_all("tr")

# newline='' leaves line-ending handling to the csv module, which avoids
# blank lines between rows on Windows.
with open("editors.csv", "wt", newline='', encoding='utf-8') as csvFile:
    writer = csv.writer(csvFile)
    for row in rows:
        # One CSV row per <tr>; header (<th>) and data (<td>) cells are both kept.
        writer.writerow([cell.get_text() for cell in row.find_all(['td', 'th'])])

抓取页面图片①

import urllib.request

# Download one image and store the raw bytes on disk.
# The context manager closes the HTTP response even if the read fails.
with urllib.request.urlopen('http://imgsrc.baidu.com/forum/w=580/sign=fdcdb5b2314e251fe2f7e4f09784c9c2/16391f30e924b89915f86eb06f061d950b7bf677.jpg') as response:
    cat_img = response.read()

# Binary mode: the payload is image data, not text.
with open('picture.jpg', 'wb') as f:
    f.write(cat_img)

抓取页面图片②

import re
import urllib.request


def getHtml(url):
    """Fetch *url* and return the raw response body as bytes."""
    with urllib.request.urlopen(url) as page:
        return page.read()


def getImg(html):
    """Download every matching .jpg referenced in *html*.

    Images are saved in the current directory as ``0.jpg``, ``1.jpg``, ...
    in the order they appear in the page.

    Args:
        html: Page source, either ``bytes`` (as returned by ``getHtml``)
            or ``str``.

    Returns:
        The list of image URLs that were downloaded (empty when the page
        contains no match).
    """
    # The pattern is a str, so bytes input must be decoded first; matching a
    # str pattern against bytes raises TypeError.
    if isinstance(html, bytes):
        html = html.decode('utf-8', errors='ignore')

    imgre = re.compile(r'src="(.+?\.jpg)" pic_ext')
    imglist = imgre.findall(html)
    for x, imgurl in enumerate(imglist):
        # In Python 3 urlretrieve lives in urllib.request, not urllib.
        urllib.request.urlretrieve(imgurl, '%s.jpg' % x)
    return imglist


if __name__ == "__main__":
    html = getHtml("http://tieba.baidu.com/p/2460150866")
    print(getImg(html))

为爬虫添加代理ip

import random
import re
import urllib.request
from urllib.request import urlopen

import requests
from bs4 import BeautifulSoup

# NOTE(review): `requests` and `re` are imported by the original snippet but
# are not used below; kept so nothing that relies on them breaks.

# --- Route requests through a randomly chosen proxy -------------------------
url = 'http://whatismyip.com.tw'
iplist = ['121.201.97.136:80', '117.135.164.170:80', '58.247.31.230:80']

proxy_support = urllib.request.ProxyHandler({'http': random.choice(iplist)})
opener = urllib.request.build_opener(proxy_support)
# install_opener() makes this opener the process-wide default, so every
# later urlopen() call in this script also goes through the proxy.
urllib.request.install_opener(opener)

with urllib.request.urlopen(url) as response:
    html = response.read().decode('utf-8')
print(html)

# --- Collect the links embedded in a page -----------------------------------
with urlopen("http://bbs.gfan.com/android-8397839-1-1.html") as page:
    rawtext = page.read()
soup = BeautifulSoup(rawtext, "html.parser")

# The links of interest sit inside <div class="pg">; find() returns None
# when the page has no such div, so fall back to an empty list instead of
# raising AttributeError.
targetDiv = soup.find('div', {'class': 'pg'})
catalogLinks = targetDiv.find_all('a') if targetDiv is not None else []

# The first anchor is skipped, as in the original snippet.
indexlist = [link.get('href') for link in catalogLinks[1:]]
for index in indexlist:
    print(index)

字典的相关用法

import json

# Nested dict/list structure used to demonstrate item access and mutation.
test = {
    "post": {
        "content": "",
    },
    "replys": [
        {
            "content": "",
        },
    ],
}

# Assign into nested dicts, add a new key, and append a new list element.
test["post"]["content"] = "xx"
test["replys"][0]["content"] = "yy"
test["replys"][0]["value"] = "zz"
test["replys"].append({"content": "", "title": "", "publish_date": ""})


def store(measurements):
    """Serialize *measurements* as JSON into ``measurements.json``.

    The file is created in the current working directory and overwritten
    if it already exists.

    Args:
        measurements: Any JSON-serializable object.
    """
    with open('measurements.json', 'w') as f:
        # Write the argument itself, not the module-level `test` dict.
        json.dump(measurements, f)


if __name__ == "__main__":
    store(test)

转载请注明原文地址: https://www.6miu.com/read-17974.html

技术

最新回复(0)