A Python crawler that recursively fetches HTML pages and saves them following the site's directory structure

xiaoxiao, 2025-10-20

The crawler uses Python's requests and lxml libraries, both of which can be installed with pip: pip install requests; pip install lxml.

lxml's installation guide is at https://lxml.de/installation.html; alternatively, lxml can be installed with apt-get install python-lxml.
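If the installation succeeded, a quick sanity check like the following confirms that both libraries can be imported (the version numbers will of course vary with your environment):

# verify that requests and lxml are importable and report their versions
import requests
from lxml import etree

print(requests.__version__)   # e.g. 2.x
print(etree.__version__)      # e.g. 4.x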

The code is as follows:

# -*- coding: utf-8 -*-
import os

import requests
from lxml import html

headers = {
    'Host': 'docs.qed-it.com',
    'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
    # NOTE: advertising gzip in the request headers does not mean the
    # response body will be compressed; that depends on whether the
    # target site compresses its pages.
    'Accept-Encoding': 'gzip, deflate, sdch, br',
    'Connection': 'keep-alive',
    'Pragma': 'no-cache',
    'Cache-Control': 'no-cache',
    'Upgrade-Insecure-Requests': '1',
    #'Authorization': 'Basic cWVkdGVzdGVyOmc1djhLUlZjdXBwNA==',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
}

cookies = dict(Cookie='__cfduid=da58e11c538b3d7981894a4763b5bb4db1539678780; _ga=GA1.2.1749017230.1539678802; _gid=GA1.2.986082824.1540429323')

# URLs that have already been crawled in this run
crawled_urls = []


def setup_dir(dir_name):
    """Create the directory if it does not exist yet."""
    if not os.path.exists(dir_name):
        print('mkdir', dir_name)
        os.makedirs(dir_name)
    else:
        print('%s existed' % dir_name)


def save(text, filename='temp', path='output'):
    """Save the page content under path/filename."""
    setup_dir(path)
    fpath = os.path.join(path, filename)
    with open(fpath, 'w', encoding='utf-8') as f:
        print('output:', fpath)
        f.write(text)


def fix_url(prefix_url, url):
    """Build the complete URL, e.g. http://www.xxx.com/xxx/yyy.html.

    Anything after the trailing 'html' (such as a '#fragment') is dropped.
    """
    ind = url.rfind('html')
    if ind != -1:
        url = url[:ind + 4]
    return prefix_url + url


def fix_prefix_url(url):
    """Strip the file name so the prefix can be reused for the recursion."""
    ind = url.rfind('/')
    return url[:ind] + '/'


def get_path(suffix_url):
    """Get the save path (the directory part) of the file."""
    ind = suffix_url.rfind('/')
    return suffix_url[:ind] if ind != -1 else ''


def fix_filename(suffix_url):
    """Extract the file name from the given URL."""
    ind0 = suffix_url.rfind('/') + 1  # 0 when there is no '/'
    ind1 = suffix_url.rfind('html')
    ind1 = ind1 + 4 if ind1 != -1 else len(suffix_url)
    return suffix_url[ind0:ind1]


def is_html(url):
    """Only HTML pages are crawled."""
    return url.rfind('html') != -1


def crawl(prefix_url, url, current_path):
    """Recursive crawl.

    Note: only follows links of the form <a class="reference internal">.
    """
    # only crawl HTML pages
    if not is_html(url):
        return
    # on each recursion step, recompute the save path and the prefix URL
    file_path = os.path.join(current_path, get_path(url))
    file_name = fix_filename(url)
    target_url = fix_url(prefix_url, url)
    prefix_url = fix_prefix_url(target_url)
    # skip URLs that have already been crawled in this run
    if target_url in crawled_urls:
        print(target_url + ' has been crawled')
        return
    # also skip URLs whose output file already exists on disk
    if os.path.exists(os.path.join(file_path, file_name)):
        print(target_url + ' has been crawled')
        return
    print('will crawl ' + target_url)
    crawled_urls.append(target_url)
    # fetch the page, sending the prepared headers;
    # verify=False disables TLS certificate checking
    resp = requests.get(target_url, headers=headers, cookies=cookies,
                        auth=('qedtester', 'g5v8KRVcupp4'), verify=False)
    # save the decoded page text to file_path/file_name
    save(resp.text, file_name, file_path)
    # parse the raw response bytes and extract the hyperlinks with lxml
    html_tree = html.fromstring(resp.content)
    links = html_tree.xpath('//a[@class="reference internal"]')
    print('size of target_urls: ' + str(len(links)))
    for link in links:
        suffix_url = link.attrib['href']
        # recursive crawl
        crawl(prefix_url, suffix_url, file_path)


if __name__ == '__main__':
    print('Crawl is running...')
    prefix_url = 'https://docs.qed-it.com/docs/sdk-docs/en/latest/'
    # the first page to crawl
    url = 'index.html'
    current_path = os.getcwd()
    crawl(prefix_url, url, current_path)
    print('Crawl comes back.')
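To make the URL handling concrete, here is a small illustrative check of the helper functions, run with them in scope; the relative href used is hypothetical and chosen to show that a trailing '#fragment' is stripped:

# hypothetical href; the assertions document the expected mapping
prefix_url = 'https://docs.qed-it.com/docs/sdk-docs/en/latest/'
suffix_url = 'getting_started/index.html#setup'

assert fix_url(prefix_url, suffix_url) == \
    'https://docs.qed-it.com/docs/sdk-docs/en/latest/getting_started/index.html'
assert get_path(suffix_url) == 'getting_started'
assert fix_filename(suffix_url) == 'index.html'
assert fix_prefix_url(fix_url(prefix_url, suffix_url)) == \
    'https://docs.qed-it.com/docs/sdk-docs/en/latest/getting_started/'

In other words, the crawl mirrors the site's layout on disk: this page would be saved as getting_started/index.html under the directory from which the script is started.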
