Today we'll practice scraping one website and then distill a template that works for similar sites (classmates from Computing '17: just change the BeautifulSoup attributes and you're basically set, and switching to XPath also works; remember to like, follow, and bookmark!). We'll use http://www.simm.cas.cn/xwzx/kydt/ as the example. The goal is to scrape each news item's title, publish time, article link, image link, and source.
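On the XPath option mentioned above, here is a minimal sketch of what the list-page extraction could look like with lxml. The class name lefttitle3 is taken from the BeautifulSoup script further down; the XPath expressions themselves are only an illustration and have not been verified against the live site.

# Rough XPath equivalent of the list-page extraction, using lxml instead of BeautifulSoup.
# The class name "lefttitle3" comes from the full script below; the expressions are illustrative.
import requests
from lxml import html

headers = {"User-Agent": "Mozilla/5.0"}  # shortened UA; the full script carries a complete Chrome UA string
page = requests.get("http://www.simm.cas.cn/xwzx/kydt/", headers=headers)
tree = html.fromstring(page.content)
titles = tree.xpath('//a[@class="lefttitle3"]/text()')  # news titles on the list page
links = tree.xpath('//a[@class="lefttitle3"]/@href')    # relative links to each article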
We mainly use the requests, re, BeautifulSoup, and json modules. The full code is below; if anything is wrong or could be improved, I hope everyone will point it out. The final saved result is shown at the end.
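Both functions in the script fetch and parse pages the same way; a minimal sketch of that building block follows (the get_soup helper is hypothetical and not defined in the script itself; the User-Agent header simply mimics desktop Chrome so the site serves the normal page):

import requests
from bs4 import BeautifulSoup

headers = {"User-Agent": "Mozilla/5.0"}  # shortened here; the full script uses a complete Chrome UA string

def get_soup(url):
    # fetch the page and hand the raw bytes to BeautifulSoup with the lxml parser
    resp = requests.get(url, headers=headers, timeout=10)
    resp.raise_for_status()  # fail loudly on 4xx/5xx instead of parsing an error page
    return BeautifulSoup(resp.content, "lxml")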
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import requests
import re
from bs4 import BeautifulSoup
import json

baseurl = "http://www.simm.cas.cn/xwzx/kydt/"
headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36"}


def reptile_news(url):
    """Scrape one article page: title, publish time, body text and image link."""
    book = {}
    html = requests.get(url, headers=headers)
    soup = BeautifulSoup(html.content, "lxml")
    book["title"] = soup.find("td", class_="newtitle").text  # article title
    # str() of the ResultSet joins the tags with ", ", which the regex below relies on
    html = str(soup.find_all("td", class_="hui12_sj2"))
    book["time"] = re.search("发表日期:</td>, <td[^>]+>(.*)</td>", html).group(1)  # publish date
    content_items = soup.find("div", class_="TRS_Editor").find_all("font")  # body paragraphs
    content = ""
    for i in content_items:
        content = content + re.sub("<[^>]+>", "", str(i)) + "<br><br>"  # strip tags, keep paragraph breaks
    book["content"] = "".join(content.split())
    img_url = baseurl + url.split("/")[-2]  # directory part of the image link, e.g. .../kydt/201810
    try:
        img = img_url + soup.find("div", class_="TRS_Editor").img["src"][1:]  # "./W0..." -> absolute URL
    except Exception:
        img = "NULL"  # the article has no image
    book["img_url"] = img
    return book


def reptile_list(url):
    """Scrape one list page: collect every article link and parse each article."""
    lists = []
    html = requests.get(url, headers=headers)
    soup = BeautifulSoup(html.content, "lxml")
    items = soup.find_all("a", class_="lefttitle3")
    for i in items:
        book = {}  # a fresh dict per article, otherwise every entry would point to the same object
        book["news_url"] = baseurl + i["href"][1:]
        book.update(reptile_news(book["news_url"]))
        lists.append(book)
        print(book)
    return lists


if __name__ == "__main__":
    limit = 10  # scrape 10 list pages of news
    lists = []
    url = baseurl
    for i in range(0, limit):
        lists.extend(reptile_list(url))  # collect this page's articles
        html = requests.get(url, headers=headers)
        soup = BeautifulSoup(html.content, "lxml")
        html_url = str(soup.find_all("a", class_="h12"))
        # follow the "下一页" (next page) link to reach the next list page
        url = baseurl + re.search("href=([^>]*) id=[^>]+>下一页</a>", html_url).group(1)[1:-1]
    with open("/Users/caipeng/PycharmProjects/practice/simm_base.json", "w", encoding="utf-8") as f:
        json.dump(lists, f, ensure_ascii=False)

The final saved result:
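Roughly, each record in simm_base.json has the shape sketched below; the keys come straight from reptile_news and reptile_list above, while the values here are placeholders rather than real scraped data.

# Illustrative shape of one saved record; all values are placeholders, not real data.
record = {
    "news_url": "<absolute article URL>",    # baseurl + relative href from the list page
    "title": "<article title>",              # text of the td.newtitle cell
    "time": "<publish date>",                # parsed from the 发表日期 cell
    "content": "<body text, paragraphs separated by <br><br>>",
    "img_url": "<absolute image URL, or 'NULL' when the article has no image>",
}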
