Note: this is for learning purposes only.
Below are two common ways to crawl Weibo: driving a browser with Selenium + PhantomJS, and parsing the API.
The most important thing is to set the user_agent; without it, link navigation will not work.
Of course, you can set plenty of other parameters as well:
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities

user_agent = (
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/45.0.2454.101 Safari/537.36 QIHU 360SE'
)
dcap = dict(DesiredCapabilities.PHANTOMJS)
dcap['phantomjs.page.settings.userAgent'] = user_agent
driver = webdriver.PhantomJS(desired_capabilities=dcap)

To log in you need to enter the username and password and then click the login button; fortunately no CAPTCHA is required at this point.
Generally, we first poke around in the browser's console to find the elements we want:
document.getElementById('loginname').value = '123'
document.getElementsByName('password')[0].value = '123'
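Those console snippets can be reused directly from Selenium through execute_script. A minimal sketch, assuming driver is the browser instance created above and the login page has already been loaded (the credential values are placeholders):

# Fill the login form by running the same JavaScript from Selenium
driver.execute_script("document.getElementById('loginname').value = 'your weibo ID'")
driver.execute_script("document.getElementsByName('password')[0].value = 'your password'")
# The login button can also be clicked from JavaScript (same trick as in the full script below)
driver.execute_script('document.getElementsByClassName("W_btn_a btn_32px")[0].click()')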
Below are two ways to log in. I ran into CAPTCHA problems during login, but after adding a few seconds of sleep after typing the username and the password respectively, the login surprisingly succeeded. Could it be that typing too fast gets flagged?
import time

username = 'your weibo ID'
password = 'your password'

# driver = webdriver.PhantomJS(desired_capabilities=dcap)
driver = webdriver.Chrome()
# driver.set_window_size(1280, 2400)
driver.get('https://www.weibo.com/')
time.sleep(10)

driver.find_element_by_id('loginname').send_keys(username)
time.sleep(5)
driver.find_element_by_name('password').send_keys(password)
time.sleep(2)
driver.find_element_by_xpath('//div[contains(@class,"login_btn")][1]/a').click()

# wait = WebDriverWait(driver, 10)
# u_id = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#loginname')))
# p_word = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#pl_login_form > div > div:nth-child(3) > div.info_list.password > div > input')))
# login = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#pl_login_form > div > div:nth-child(3) > div.info_list.login_btn > a')))
# u_id.send_keys(username)
# time.sleep(5)
# p_word.send_keys(password)
# time.sleep(2)
# login.click()

time.sleep(15)
html = driver.page_source
print(html)

The first method is simple, but you have to locate the elements yourself and rely on sleep() to wait for the page to load: if the wait is too long, crawling becomes inefficient; if it is too short, the page may not have finished loading.
The second method involves a bit more code and a few extra imports, but it does not require analyzing the page source by hand and it waits for the page to load automatically; you only need to find the element in the Chrome DevTools, then right-click > Copy > Copy selector.
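For reference, here is that explicit-wait login (the commented-out lines in the block above) as a standalone sketch; the CSS selectors were copied from Chrome DevTools at the time and may have changed since:

import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

username = 'your weibo ID'
password = 'your password'

driver = webdriver.Chrome()
driver.get('https://www.weibo.com/')

wait = WebDriverWait(driver, 10)
# Wait until the username field, password field and login button are actually ready
u_id = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#loginname')))
p_word = wait.until(EC.presence_of_element_located(
    (By.CSS_SELECTOR, '#pl_login_form > div > div:nth-child(3) > div.info_list.password > div > input')))
login = wait.until(EC.element_to_be_clickable(
    (By.CSS_SELECTOR, '#pl_login_form > div > div:nth-child(3) > div.info_list.login_btn > a')))

u_id.send_keys(username)
time.sleep(5)   # pause a little, otherwise a CAPTCHA may be triggered
p_word.send_keys(password)
time.sleep(2)
login.click()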
And just like that, the login succeeded!
When browsing someone else's Weibo without logging in, you can only flip through one page before being asked to log in.
What's left is to analyze the page source according to your needs and pull out the content you want.
The code:
import hashlib
import threading
import time
import re
from lxml import etree
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from pybloom import BloomFilter
from collections import deque

user_agent = (
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/45.0.2454.101 Safari/537.36 QIHU 360SE'
)
username = 'your weibo ID'
password = 'your password'

dcap = dict(DesiredCapabilities.PHANTOMJS)
dcap['phantomjs.page.settings.userAgent'] = user_agent

# Crawler for personal home pages (the feeds)
# feeds_crawler = webdriver.PhantomJS(desired_capabilities=dcap)
feeds_crawler = webdriver.Chrome()
feeds_crawler.set_window_size(1280, 2400)

# Crawler for the profile pages: follows, fans, posts
# user_crawler = webdriver.PhantomJS(desired_capabilities=dcap)
user_crawler = webdriver.Chrome()
user_crawler.set_window_size(1280, 2400)

domain = 'weibo.com'
url_home = 'http://' + domain
download_bf = BloomFilter(1024 * 1024 * 16, 0.01)
cur_queue = deque()
seed_user = 'http://weibo.com/yaochen'

# Thresholds for deciding whether a user is worth crawling
min_mblogs_allowed = 100           # minimum number of posts a user must have
max_follow_fans_ratio_allowed = 3  # the follows/fans ratio must not exceed 3


def extract_user(users):
    print('extract user')
    for i in range(0, 20):
        for user_element in user_crawler.find_elements_by_xpath('//*[contains(@class, "follow_item")]'):
            tried = 0
            while tried < 3:
                try:
                    user = {}
                    user['follows'] = re.findall('(\d+)', user_element.find_element_by_xpath('.//div[@class="info_connect"]/span').text)[0]
                    user['follows_link'] = user_element.find_element_by_xpath('.//div[@class="info_connect"]/span//a').get_attribute('href')
                    user['fans'] = re.findall('(\d+)', user_element.find_elements_by_xpath('.//div[@class="info_connect"]/span')[1].text)[0]
                    user['fans_link'] = user_element.find_elements_by_xpath('.//div[@class="info_connect"]/span//a')[1].get_attribute('href')
                    user['mblogs'] = re.findall('(\d+)', user_element.find_elements_by_xpath('.//div[@class="info_connect"]/span')[2].text)[0]
                    user_link = user_element.find_element_by_xpath('.//div[contains(@class,"info_name")]/a')
                    user['link'] = re.findall('(.+)\?', user_link.get_attribute('href'))[0]
                    if user['link'][:4] != 'http':
                        user['link'] = domain + user['link']
                    user['name'] = user_link.text
                    user['icon'] = re.findall('/([^/]+)$', user_element.find_element_by_xpath('.//dt[@class="mod_pic"]/a/img').get_attribute('src'))[0]
                    # name = user_element.find_element_by_xpath('.//a[@class="S_txt1"]')
                    print('--------------------')
                    print(user['name'] + ' follows: ' + user['follows'] + ' blogs:' + user['mblogs'])
                    print(user['link'])
                    # Skip the user if the post count is below the threshold
                    # or the follows/fans ratio exceeds the threshold
                    if int(user['mblogs']) < min_mblogs_allowed or int(user['follows']) / int(user['fans']) > max_follow_fans_ratio_allowed:
                        break
                    enqueueUrl(user['link'])
                    users.append(user)
                    break
                except Exception:
                    time.sleep(1)
                    tried += 1
        if go_next_page(user_crawler) is False:
            return users


def scroll_to_bottom():
    print('scroll down !!')
    for i in range(50):
        feeds_crawler.execute_script('window.scrollTo(0, document.body.scrollHeight)')
        html = feeds_crawler.page_source
        res = etree.HTML(html)
        next_page_url = res.xpath('//a[contains(@class,"page next")]')
        if len(next_page_url) > 0:
            return next_page_url[0].get('href')
        if len(re.findall('点击重新载入', html)) > 0:
            print('scrolling failed, reload it')
            feeds_crawler.find_element_by_link_text('点击重新载入').click()
        time.sleep(1)


def go_next_page(cur_driver):
    try:
        next_page = cur_driver.find_element_by_xpath('//a[contains(@class, "page next")]').get_attribute('href')
        print('next page is ' + next_page)
        cur_driver.get(next_page)
        time.sleep(3)
        return True
    except Exception:
        print('next page is not found')
        return False


def extract_feed(feeds):
    for i in range(20):
        scroll_to_bottom()
        # Extract the content of each post
        for element in feeds_crawler.find_elements_by_class_name('WB_detail'):
            tried = 0
            while tried < 3:
                try:
                    feed = {}
                    feed['time'] = element.find_element_by_xpath('.//div[@class="WB_from S_txt2"]').text
                    feed['content'] = element.find_element_by_class_name('WB_text').text
                    feed['image_names'] = []
                    for image in element.find_elements_by_xpath('.//li[contains(@class,"WB_pic")]/img'):
                        feed['image_names'].append(re.findall('/([^/]+)$', image.get_attribute('src')))
                    feeds.append(feed)
                    print('--------------------')
                    print(feed['time'])
                    print(feed['content'])
                    break
                except Exception:
                    tried += 1
                    time.sleep(1)
        if go_next_page(feeds_crawler) is False:
            return feeds


def enqueueUrl(url):
    # Add a URL to the crawl queue if it has not been seen before
    try:
        md5v = hashlib.md5(url.encode('gb2312')).hexdigest()  # in Python 3 the string must be encoded to bytes first
        if md5v not in download_bf:
            print(url + ' is added to queue')
            cur_queue.append(url)
            download_bf.add(md5v)
    except ValueError:
        print('enqueueUrl err !!!!!')


def Login(username, password):
    '''
    Log in on the Weibo home page with both drivers.
    :param username: your weibo id
    :param password: your password
    :return:
    '''
    feeds_crawler.get(url=url_home)
    user_crawler.get(url=url_home)
    time.sleep(8)
    print('find click button to login')
    feeds_crawler.find_element_by_id('loginname').send_keys(username)
    feeds_crawler.find_element_by_name('password').send_keys(password)
    time.sleep(3)
    # Click the login button
    feeds_crawler.find_element_by_xpath('//div[contains(@class, "login_btn")][1]/a').click()
    # Alternatively, run a snippet of JavaScript with execute_script:
    # feeds_crawler.execute_script('document.getElementsByClassName("W_btn_a btn_32px")[0].click()')

    # The second crawler has to log in as well
    user_crawler.find_element_by_id('loginname').send_keys(username)
    user_crawler.find_element_by_name('password').send_keys(password)
    time.sleep(3)
    # Execute click()
    user_crawler.find_element_by_xpath('//div[contains(@class,"login_btn")][1]/a').click()


def dequeUrl():
    # Pop the queued URLs one at a time
    return cur_queue.popleft()


def get_element_by_xpath(cur_driver, path):
    tried = 0
    while tried < 6:
        html = cur_driver.page_source
        res = etree.HTML(html)
        elements = res.xpath(path)
        if len(elements) == 0:
            time.sleep(1)
            tried += 1
            continue
        return elements


def fetch_user(url):
    print('Downloading ' + url)
    feeds_crawler.get(url)
    time.sleep(5)
    # Extract the account name
    account_name = get_element_by_xpath(feeds_crawler, '//h1')[0].text
    photo = get_element_by_xpath(feeds_crawler, '//p[@class="photo_wrap"]/img')[0].get('src')
    account_photo = re.findall('/([^/]+)$', photo)
    # Extract the link to the user's "follows" page
    follows_link = get_element_by_xpath(feeds_crawler, '//a[@class="t_link S_txt1"]')[0].get('href')
    print('account: ' + account_name)
    print('follows link is ' + follows_link)
    follows_link = 'http:' + follows_link
    user_crawler.get(follows_link)

    feeds = []
    users = []
    # Run the two extraction jobs in separate threads
    t_feeds = threading.Thread(target=extract_feed, name=None, args=(feeds,))
    # t_users = threading.Thread(target=extract_user, name=None, args=(users,))
    t_feeds.setDaemon(True)
    # t_users.setDaemon(True)
    t_feeds.start()
    # t_users.start()
    t_feeds.join()
    # t_users.join()


def crawl():
    while True:
        url = dequeUrl()
        fetch_user(url)


def main():
    enqueueUrl(seed_user)
    Login(username, password)
    crawl()


if __name__ == '__main__':
    main()

Note: in Python 3, when hashing with hashlib, the content has to be encoded to bytes first:
This hash is used to check whether a URL has already been crawled; the mmh library covered earlier would work just as well.
>>> import hashlib
>>> url = 'http://www.weibo.com'
>>> md5v = hashlib.md5(url).hexdigest()
Traceback (most recent call last):
  File "<pyshell#4>", line 1, in <module>
    md5v = hashlib.md5(url).hexdigest()
TypeError: Unicode-objects must be encoded before hashing
>>> md5v = hashlib.md5(url.encode('gb2312')).hexdigest()
>>> md5v2 = hashlib.md5(b'www.baidu.com').hexdigest()
>>> md5v
'17d7b29a31328702848d2d42ae79a240'
>>> md5v2
'dab19e82e1f9a681ee73346d3e7a575e'
>>> md5v3 = hashlib.md5(url.encode('gb2312')).hexdigest()
>>> md5v3
'17d7b29a31328702848d2d42ae79a240'
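As mentioned above, the mmh (MurmurHash) library can replace md5 for this kind of deduplication; mmh3.hash accepts a str directly, so no manual encoding is needed. A minimal sketch, using a plain set instead of the Bloom filter for brevity:

import mmh3

seen_hashes = set()

def is_new(url):
    # mmh3.hash returns a signed 32-bit integer for the given string
    h = mmh3.hash(url)
    if h in seen_hashes:
        return False
    seen_hashes.add(h)
    return True

print(is_new('http://www.weibo.com'))  # True the first time
print(is_new('http://www.weibo.com'))  # False afterwards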
About Weibo images:
In practice, only the image filename stays the same; the storage domain and the resolution segment of the URL may change. So when saving results it is enough to store just the filename, and rebuild the full URL whenever it is needed.
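A minimal sketch of that idea; the host and size segment used to rebuild the URL ('wx1.sinaimg.cn', 'mw690') are placeholder assumptions for illustration, not values taken from Weibo's current setup:

import re

def image_name(src):
    # Keep only the filename, using the same regex as the crawler above
    return re.findall('/([^/]+)$', src)[0]

def image_url(name, host='wx1.sinaimg.cn', size='mw690'):
    # Rebuild a usable URL from the stored filename; adjust host/size as needed
    return 'https://{}/{}/{}'.format(host, size, name)

name = image_name('https://wx1.sinaimg.cn/mw690/abc123.jpg')
print(name)             # abc123.jpg
print(image_url(name))  # https://wx1.sinaimg.cn/mw690/abc123.jpg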
Downloading the images themselves was covered earlier, so it won't be repeated here; there are also plenty of examples online.