Why use a non-relational database
Message digests

An MD5 digest is 32 hexadecimal characters (16 bytes), a SHA1 digest is 40 characters (20 bytes), and a SHA256 digest is 64 characters (32 bytes). The hashlib module provides all of these algorithms:

```python
import hashlib

# Create an MD5 object; sha1 and sha256 objects are created the same way
hasher = hashlib.md5()
link = 'hello'
# Encode the string to bytes before feeding it to the digest
hasher.update(link.encode('utf-8'))
# Get the digest as a hexadecimal string
print(hasher.hexdigest())
```

Deserialization: restoring a character or byte sequence back into an object.
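To make the serialization/deserialization note concrete, here is a minimal sketch using the standard-library pickle module; the sample object is invented for illustration:

```python
import pickle

data = {'name': 'hello', 'tags': ['a', 'b']}

# Serialization: turn a Python object into a byte sequence
serialized = pickle.dumps(data)

# Deserialization: restore the byte sequence back into an object
restored = pickle.loads(serialized)
assert restored == data
```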
Data compression

Data must be serialized (turned into bytes) before it can be compressed. The zlib module provides compress(data) to compress a byte sequence and decompress to restore it.
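A minimal sketch of serializing and then compressing data with pickle and zlib; the sample record is invented for illustration:

```python
import pickle
import zlib

record = {'url': 'http://example.com', 'title': 'hello'}

# Serialize first, then compress the resulting bytes
serialized = pickle.dumps(record)
compressed = zlib.compress(serialized)

# Decompress and deserialize to get the original object back
original = pickle.loads(zlib.decompress(compressed))
assert original == record
```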
The example below ties these pieces together: starting from a seed URL it fetches pages, extracts links with a regular expression, uses an MD5 digest of each link as its key, and persists the results to MySQL through pymysql.

```python
import hashlib
import re
import ssl
from urllib.error import URLError
from urllib.request import urlopen

import pymysql
from pymysql import MySQLError


# Decode the page bytes with the given character sets
# (not every site declares utf-8 as its charset)
def decode_page(page_bytes, charsets=('utf-8',)):
    page_html = None
    for charset in charsets:
        try:
            page_html = page_bytes.decode(charset)
            break
        except UnicodeDecodeError:
            pass
            # logging.error('Decode:', error)
    return page_html


# Fetch the HTML of a page (retries the given number of times via recursion)
def get_page_html(seed_url, *, retry_times=3, charsets=('utf-8',)):
    page_html = None
    try:
        if seed_url.startswith('http://') or seed_url.startswith('https://'):
            page_html = decode_page(urlopen(seed_url).read(), charsets)
    except URLError:
        # logging.error('URL:', error)
        if retry_times > 0:
            return get_page_html(seed_url, retry_times=retry_times - 1,
                                 charsets=charsets)
    return page_html


# Extract the needed parts from the page (usually links,
# specified through a regular expression)
def get_matched_parts(page_html, pattern_str, pattern_ignore_case=re.I):
    pattern_regex = re.compile(pattern_str, pattern_ignore_case)
    return pattern_regex.findall(page_html) if page_html else []


# Run the crawler and persist the extracted data
def start_crawl(seed_url, match_pattern, *, max_depth=-1):
    conn = pymysql.connect(host='localhost', port=3306,
                           database='crawler', user='root',
                           password='123456', charset='utf8')
    try:
        with conn.cursor() as cursor:
            url_list = [seed_url]
            # Maps each URL to the depth at which it was found,
            # which also prevents crawling the same URL twice
            visited_url_list = {seed_url: 0}
            while url_list:
                current_url = url_list.pop(0)
                depth = visited_url_list[current_url]
                if depth != max_depth:
                    page_html = get_page_html(current_url,
                                              charsets=('utf-8', 'gbk', 'gb2312'))
                    links_list = get_matched_parts(page_html, match_pattern)
                    param_list = []
                    for link in links_list:
                        # Skip links that have already been visited or queued
                        if link not in visited_url_list and link not in url_list:
                            visited_url_list[link] = depth + 1
                            page_html = get_page_html(link,
                                                      charsets=('utf-8', 'gbk', 'gb2312'))
                            # headings = get_matched_parts(page_html, r'<h1>(.*)<span')
                            # if headings:
                            #     param_list.append((headings[0], link))
                            # Create an MD5 object
                            hasher = hashlib.md5()
                            # Encode the link to bytes before digesting it
                            hasher.update(link.encode('utf-8'))
                            # Append the (digest, page) tuple to the list
                            param_list.append((hasher.hexdigest(), page_html))
                    # Persist the data in batch; the statement runs once per tuple
                    cursor.executemany('insert into tb_result values (default, %s, %s)',
                                       param_list)
                    conn.commit()
    except MySQLError as err:
        pass
        # logging.error('[SQL]:', err)
    finally:
        conn.close()


def main():
    # Skip HTTPS certificate verification
    ssl._create_default_https_context = ssl._create_unverified_context
    start_crawl('http://sports.sohu.com/nba_a.shtml',
                r'<a[^>]+href=["\'](.*?)["\']',
                max_depth=2)


if __name__ == '__main__':
    main()
```

Commonly used User-Agent strings (any of these can be sent in the request headers to impersonate a browser):

Android
- Mozilla/5.0 (Linux; Android 4.1.1; Nexus 7 Build/JRO03D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Safari/535.19
- Mozilla/5.0 (Linux; U; Android 4.0.4; en-gb; GT-I9300 Build/IMM76D) AppleWebKit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30
- Mozilla/5.0 (Linux; U; Android 2.2; en-gb; GT-P1000 Build/FROYO) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1

Firefox
- Mozilla/5.0 (Windows NT 6.2; WOW64; rv:21.0) Gecko/20100101 Firefox/21.0
- Mozilla/5.0 (Android; Mobile; rv:14.0) Gecko/14.0 Firefox/14.0

Google Chrome
- Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.94 Safari/537.36
- Mozilla/5.0 (Linux; Android 4.0.4; Galaxy Nexus Build/IMM76B) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.133 Mobile Safari/535.19

iOS
- Mozilla/5.0 (iPad; CPU OS 5_0 like Mac OS X) AppleWebKit/534.46 (KHTML, like Gecko) Version/5.1 Mobile/9A334 Safari/7534.48.3
- Mozilla/5.0 (iPod; U; CPU like Mac OS X; en) AppleWebKit/420.1 (KHTML, like Gecko) Version/3.0 Mobile/3A101a Safari/419.3

The next example uses requests and BeautifulSoup: it sends a custom User-Agent (here posing as Baiduspider; a browser string like the ones above works too) through an HTTP proxy, and collects the question links from the Zhihu explore page.

```python
import re
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup


def main():
    # Pose as Baidu's crawler (a regular browser User-Agent works as well)
    headers = {'user-agent': 'Baiduspider'}
    # Route the request through a proxy server so our own IP stays hidden
    proxies = {
        'http': 'http://61.135.217.7:80'
    }
    base_url = 'https://www.zhihu.com/'
    # urljoin resolves the relative path against the base URL
    seed_url = urljoin(base_url, 'explore')
    resp = requests.get(seed_url, headers=headers, proxies=proxies)
    soup = BeautifulSoup(resp.text, 'lxml')
    href_regex = re.compile(r'^/question')
    link_set = set()
    for a_tag in soup.find_all('a', {'href': href_regex}):
        if 'href' in a_tag.attrs:
            href = a_tag.attrs['href']
            full_url = urljoin(base_url, href)
            link_set.add(full_url)
    print(link_set)


if __name__ == '__main__':
    main()
```
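As a small sketch of how the User-Agent strings listed above might be used, the snippet below picks one at random for each request so traffic looks less uniform; the shortened user_agents list and the target URL are only illustrative:

```python
import random

import requests

# A few of the User-Agent strings listed above (shortened for illustration)
user_agents = [
    'Mozilla/5.0 (Windows NT 6.2; WOW64; rv:21.0) Gecko/20100101 Firefox/21.0',
    'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/27.0.1453.94 Safari/537.36',
    'Mozilla/5.0 (iPad; CPU OS 5_0 like Mac OS X) AppleWebKit/534.46 (KHTML, like Gecko) '
    'Version/5.1 Mobile/9A334 Safari/7534.48.3',
]

# Choose a different User-Agent for each request
headers = {'user-agent': random.choice(user_agents)}
resp = requests.get('https://www.zhihu.com/explore', headers=headers)
print(resp.status_code)
```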