Python学习笔记——20170831

xiaoxiao2021-02-28  65

同程旅游网（ly.com）爬虫练习

# Library installation:
#   pip install requests
#   pip install beautifulsoup4
#   pip install lxml   (needed because BeautifulSoup is asked for the 'lxml' parser)
#
# Code:
import json
import os

import requests
from bs4 import BeautifulSoup


def get_html(url):
    """Download the HTML source of a page.

    :param url: address of the page to fetch
    :return: the decoded HTML text when the server answers with HTTP 200,
        otherwise None (an error message is printed)
    """
    headers = {"User-Agent": "Mozilla/5.0 (compatible; MSIE 6.0; Windows NT 5.1;SV1)"}
    try:
        # Without a timeout requests waits indefinitely on a stalled server.
        response = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException:
        print("网络访问出错")
        return None
    if response.status_code != 200:
        print("网络访问出错")
        return None
    # Let requests guess the charset from the body instead of trusting headers.
    response.encoding = response.apparent_encoding
    return response.text


def parse_html(html):
    """Extract the tour listings from the page source.

    :param html: HTML text of the listing page
    :return: list of dicts with the keys 'type', 'title', 'price' and
        'satisfied'; 'satisfied' is the string 'None' when the listing has
        no ``p.sat-num`` element
    """
    soup = BeautifulSoup(html, 'lxml')
    items = soup.select("#tagList > ul > li > div.line-info > div")
    info_list = []
    for each in items:
        # Each item is re-parsed as a standalone document, so the selectors
        # below may also match the item's own root element (Tag.select only
        # looks at descendants).
        each_soup = BeautifulSoup(str(each), 'lxml')
        info = {
            'type': each_soup.select('div.line-imgbox > span')[0].string,
            'title': each_soup.select('p.line-title > b')[0].string,
            # The price text is the node right after the <em> element.
            'price': each_soup.select('div.line-pricebox > div > p')[0].em.next_sibling,
        }
        sat_nodes = each_soup.select('p.sat-num')
        if sat_nodes:
            # The satisfaction text is the node right before the <em> element.
            info['satisfied'] = sat_nodes[0].em.previous_sibling
        else:
            info['satisfied'] = 'None'
        info_list.append(info)
    return info_list


def save_file(path, text):
    """Write text to a file, creating the parent directory if needed.

    :param path: destination file path
    :param text: text content to write (encoded as UTF-8)
    :return: None
    """
    # str.strip() removes a *set of characters* from both ends, so it cannot
    # be used to cut the file name off the path; use os.path.dirname instead.
    dir_path = os.path.dirname(path)
    if dir_path:
        # A bare file name has no directory part; nothing to create then.
        os.makedirs(dir_path, exist_ok=True)
    with open(path, 'w', encoding='UTF-8') as file:
        file.write(text)


if __name__ == '__main__':
    url = 'https://www.ly.com/dujia/taiguo-lvyou/f394/'
    html = get_html(url)
    if html is not None:
        info_list = parse_html(html)
        # json.dumps escapes quotes and writes None as null, which the former
        # str(...).replace("'", '"') approach could not do. default=str is
        # deliberate: any bs4 node that is not a plain string is stored as
        # its markup text.
        save_file(
            './lyinfo.json',
            json.dumps(info_list, ensure_ascii=False, default=str),
        )
转载请注明原文地址: https://www.6miu.com/read-66986.html

最新回复(0)