Scraping Zhihu article information with selenium + python + BeautifulSoup


This post walks through scraping Zhihu article information with selenium + python + BeautifulSoup. The script logs in with Selenium, scrolls the recommendation feed to trigger lazy loading, parses each article card out of the page source with BeautifulSoup, prints a summary to the console, and saves the records to a CSV file.

```python
# -*- coding: utf-8 -*-
# Scrape Zhihu recommended articles
# 2017/8/6
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import NoSuchElementException
from bs4 import BeautifulSoup
import csv
import os
import time
import re

driver = webdriver.Chrome()

# Log in to Zhihu
def putcookies(account, password):
    try:
        driver.get('https://www.zhihu.com/#signin')
        WebDriverWait(driver, 10).until(EC.presence_of_element_located(
            (By.CSS_SELECTOR, "body > div.index-main > div > div.desk-front.sign-flow.clearfix.sign-flow-simple > div.index-tab-navs > div > a.active")))
        # Switch from the QR-code view to the account/password form
        button = driver.find_element_by_css_selector('body > div.index-main > div > div.desk-front.sign-flow.clearfix.sign-flow-simple > div.view.view-signin > div.qrcode-signin-container > div.qrcode-signin-step1 > div.qrcode-signin-cut-button > span')
        button.click()
        form = driver.find_element_by_css_selector('body > div.index-main > div > div.desk-front.sign-flow.clearfix.sign-flow-simple > div.view.view-signin > form > div.group-inputs > div.account.input-wrapper > input[type="text"]')
        pas = driver.find_element_by_css_selector('body > div.index-main > div > div.desk-front.sign-flow.clearfix.sign-flow-simple > div.view.view-signin > form > div.group-inputs > div.verification.input-wrapper > input[type="password"]')
        sub = driver.find_element_by_css_selector('body > div.index-main > div > div.desk-front.sign-flow.clearfix.sign-flow-simple > div.view.view-signin > form > div.button-wrapper.command > button')
        form.send_keys(account)
        pas.send_keys(password)
        sub.click()
        try:
            # If a captcha blocks the login, the logged-in search bar never
            # appears; ask the user to type the captcha by hand, then retry.
            print('请手动输入验证码')
            driver.implicitly_wait(10)
            driver.find_element_by_css_selector('#root > div > div:nth-child(2) > header > div > div.SearchBar > button')
        except NoSuchElementException:
            sub.click()
    except Exception:
        putcookies(account, password)  # retry the whole login on any failure

# Scroll the page num times to make the feed lazy-load more cards
def change_page(num):
    WebDriverWait(driver, 10).until(EC.presence_of_element_located(
        (By.CSS_SELECTOR, '#root > div > div:nth-child(2) > header > div > div.SearchBar > button')))
    for i in range(num):
        driver.execute_script('window.scrollTo(0, document.body.scrollHeight)')
        time.sleep(3)

# Parse the page source
def findinf(html):
    soup = BeautifulSoup(html, 'lxml')
    r = re.compile(r'(\d+)')
    links = soup.find_all('div', class_='Card TopstoryItem')
    for link in links:
        try:
            maininf = link.find(class_='Feed-meta-item').get_text()[-3:]  # topic
            writer = link.find(class_='AuthorInfo-head').get_text()       # author
        except Exception:
            continue
        try:
            # .string is None when the tag has child elements, hence the fallback
            intd = link.find('div', class_='RichText AuthorInfo-badgeText').string or ''  # author bio
        except Exception:
            intd = ''
        title = link.find('h2', class_='ContentItem-title').get_text()  # title
        href = 'https://www.zhihu.com' + link.find('h2', class_='ContentItem-title').a['href']  # article link
        try:
            support = link.find(class_='Button VoteButton VoteButton--up').get_text()  # upvotes
        except Exception:
            support = link.find(class_='Button LikeButton ContentItem-action').get_text()
        try:
            talking = r.match(link.find('button', class_='Button ContentItem-action Button--plain').get_text()[:-3]).group()  # comment count
        except Exception:
            talking = ''
        content = link.find('span', class_='RichText CopyrightRichText-richText').get_text()  # summary
        yield {
            'maininf': maininf,
            'writer': writer,
            'intd': intd,
            'title': title,
            'support': support,
            'talking': talking,
            'content': content,
            'href': href,
        }

# Create the output directory if it does not exist
def make(path):
    if not os.path.exists(path):
        os.makedirs(path)
```
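Note that change_page scrolls a fixed number of times with a hard-coded three-second sleep, so a slow connection can miss cards while a fast one wastes time. A common refinement is to keep scrolling until document.body.scrollHeight stops growing. The sketch below is one way to do that; the function name scroll_until_stable and its parameters are my own, not part of the original script:

```python
import time

def scroll_until_stable(driver, max_rounds=20, pause=2):
    # Scroll to the bottom repeatedly until the page height stops growing,
    # i.e. the feed has no more cards to lazy-load (or max_rounds is hit).
    last_height = driver.execute_script('return document.body.scrollHeight')
    for _ in range(max_rounds):
        driver.execute_script('window.scrollTo(0, document.body.scrollHeight)')
        time.sleep(pause)  # give the feed a moment to append new cards
        new_height = driver.execute_script('return document.body.scrollHeight')
        if new_height == last_height:
            break  # nothing new was loaded; stop early
        last_height = new_height
```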
The remaining functions write the records to CSV and tie the whole run together:

```python
# Save the data
def save_to_csv(inf, path):
    # newline='' avoids blank rows from csv.writer on Windows, and utf-8-sig
    # lets Excel open the Chinese headers correctly. Because the file is
    # opened in append mode, the header row is re-written on every run.
    with open(path + '知乎文章信息概要采集.csv', 'a', newline='', encoding='utf-8-sig') as f:
        writer = csv.writer(f)
        writer.writerow(['标题', '作者', '话题', '作者个人介绍', '点赞数', '评论数', '文章链接', '摘要'])
        try:
            for i in inf:
                writer.writerow([i['title'], i['writer'], i['maininf'], i['intd'],
                                 i['support'], i['talking'], i['href'], i['content']])
        except Exception:
            pass

# Main routine
def main(account, password, num):
    path = 'D:/数据/知乎文章/'
    putcookies(account, password)
    change_page(num)
    inf = list(findinf(driver.page_source))  # parse once, reuse for printing and saving
    make(path)
    print('---' * 43)
    print('{:^60}'.format('知乎文章概要'))
    print("***" * 43)
    for i in inf:
        print('标题:{:<10s}'.format(i['title']))
        print('作者:{:>3s}'.format(i['writer']), end=' ' * 5)
        print("话题:{:>3s}".format(i['maininf']))
        print('作者个人介绍:')
        print('{:<5s}'.format(i['intd']))
        print('点赞数:{:<2s}'.format(i['support']), end=' ' * 5)
        print("评论数:{:3s}".format(i['talking']))
        print("文章链接:" + i['href'])
        print("摘要:")
        print('{:<5s}'.format(i['content']))
        print('---' * 43)
    save_to_csv(inf, path)

# Entry point
if __name__ == '__main__':
    num = int(input('请输入要爬取的页面数:'))
    account = input("请输入知乎账号:")
    password = input("请输入知乎密码:")
    time_start = time.time()
    main(account, password, num)
    print("^^^" * 43)
    print("共耗时{}秒".format(time.time() - time_start))
    driver.quit()
```
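One caveat for anyone running this today: the script was written against Selenium 3, and Selenium 4 removed the find_element_by_* helper methods. On a current install each lookup has to use the find_element(By.CSS_SELECTOR, ...) form instead, for example:

```python
from selenium.webdriver.common.by import By

# Selenium 3, as used in the script above:
# driver.find_element_by_css_selector('#root > div > div:nth-child(2) > header > div > div.SearchBar > button')
# Selenium 4 equivalent:
driver.find_element(By.CSS_SELECTOR, '#root > div > div:nth-child(2) > header > div > div.SearchBar > button')
```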
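Because findinf only takes a page-source string, the parser can also be exercised without opening a browser. In the snippet below, the class names are copied from the selectors above, but the markup and all the values are a hand-written mock of one feed card, invented purely for illustration:

```python
mock_html = '''
<div class="Card TopstoryItem">
  <div class="Feed-meta-item">编程话题</div>
  <div class="AuthorInfo-head">某作者</div>
  <h2 class="ContentItem-title"><a href="/p/12345">示例标题</a></h2>
  <button class="Button VoteButton VoteButton--up">赞同 42</button>
  <button class="Button ContentItem-action Button--plain">12 条评论</button>
  <span class="RichText CopyrightRichText-richText">正文摘要</span>
</div>
'''

# Feed the mock card through the same generator the crawler uses.
for item in findinf(mock_html):
    print(item['title'], item['support'], item['talking'])
# prints: 示例标题 赞同 42 12
```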
