同城旅游网 爬虫练习
类库安装:

    pip install requests
    pip install beautifulsoup4

代码:
import json
import os

import requests
from bs4 import BeautifulSoup
def get_html(url, timeout=10):
    """Fetch the HTML source of a page.

    :param url: page URL to fetch
    :param timeout: seconds before the request is aborted; new
        backward-compatible parameter — the original call had no timeout
        and could hang forever on a stalled connection
    :return: decoded HTML text, or None when the HTTP status is not 200
    """
    headers = {
        # Old IE user-agent kept from the original script, presumably to
        # avoid trivial bot filtering — TODO confirm it is still needed.
        "User-Agent":
            "Mozilla/5.0 (compatible; MSIE 6.0; Windows NT 5.1;SV1)"}
    response = requests.get(url, headers=headers, timeout=timeout)
    # Re-decode with the encoding guessed from the body; Chinese sites
    # frequently mislabel the charset in their headers.
    response.encoding = response.apparent_encoding
    if response.status_code == 200:
        return response.text
    print("网络访问出错")
    # Explicit None makes the error path visible to callers.
    return None
def parse_html(html):
    """Parse the tour-list HTML and extract one record per tour item.

    :param html: HTML source text of the listing page
    :return: list of dicts with keys 'type', 'title', 'price', 'satisfied'
    """
    soup = BeautifulSoup(html, 'lxml')
    items = soup.select("#tagList > ul > li > div.line-info > div")
    info_list = []
    for each in items:
        # select() works directly on a Tag; the original rebuilt a whole
        # BeautifulSoup object from str(each) for every item, which is
        # pure wasted work with identical results.
        info = {
            'type': each.select('div.line-imgbox > span')[0].string,
            'title': each.select('p.line-title > b')[0].string,
            # price text sits right after the <em> currency marker
            'price': each.select('div.line-pricebox > div > p')[0].em.next_sibling,
        }
        sat = each.select('p.sat-num')
        if sat:
            # satisfaction percentage precedes the <em> element
            info['satisfied'] = sat[0].em.previous_sibling
        else:
            # keep the original's string placeholder (not the None object)
            info['satisfied'] = 'None'
        info_list.append(info)
    return info_list
def save_file(path, text):
    """Write *text* to *path* as UTF-8, creating the parent directory if needed.

    :param path: destination file path
    :param text: text content to write
    :return: None
    """
    # Bug fix: the original computed the directory with
    # path.strip(file_name), but str.strip treats its argument as a
    # CHARACTER SET, not a suffix, so the result was garbage for most
    # paths (it only worked here by accident). os.path.dirname is correct.
    dir_path = os.path.dirname(path)
    # Guard the empty string (bare filename) and use makedirs so nested
    # directories work; os.mkdir fails when intermediate dirs are missing.
    if dir_path and not os.path.exists(dir_path):
        os.makedirs(dir_path)
    with open(path, 'w', encoding='UTF-8') as file:
        file.write(text)
if __name__ == '__main__':
    url = 'https://www.ly.com/dujia/taiguo-lvyou/f394/'
    html = get_html(url)
    # get_html returns None on HTTP errors; the original crashed with a
    # TypeError inside parse_html in that case.
    if html is not None:
        info_list = parse_html(html)
        # Bug fix: str(info_list).replace("'", '"') is not a JSON
        # serializer — it breaks on apostrophes inside the scraped text
        # and emits the invalid token None instead of null. json.dumps
        # produces correct quoting/escaping; ensure_ascii=False keeps the
        # Chinese text readable, matching the original output.
        save_file('./lyinfo.json', json.dumps(info_list, ensure_ascii=False))