I don't know how to scrape map coordinates directly from the listing pages (scripts shared on 神箭手 apparently can grab the coordinates directly), so for now I am scraping the other housing-related information from Lianjia first. The main body of the script is adapted from this post by Tsukasa鱼, to whom many thanks.
### Initial setup

```python
import os
os.chdir("D:/")
import json
import requests
import re
import pandas as pd
import time
import random
from itertools import chain
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
import datetime
import numpy as np
import pymongo


def generate_allurl(user_in_nub, url):  # generate the list-page URLs
    for url_next in range(1, int(user_in_nub)):
        yield url.format(url_next)


def get_allurl_LJ(page_url, wildcard):  # parse a list page and extract the detail-page URLs
    ua = UserAgent()
    headers = {'User-Agent': ua.random}  # request headers disguised with fake_useragent
    time.sleep(random.uniform(0.5, 1.0))  # random pause to avoid errors from fetching too quickly
    get_url = requests.get(page_url, headers=headers)
    if get_url.status_code == 200:
        re_set = re.compile(wildcard)
        re_get = re.findall(re_set, get_url.text)
        return re_get


def replace_column_LJ(info, info_column):  # strip redundant characters from the scraped fields
    for i in info_column:
        if i not in info:  # skip fields that were not found on the page
            continue
        info[i] = info[i].replace(' ', '')
        info[i] = info[i].replace('\r', '')
        info[i] = info[i].replace('\n', '')
        info[i] = info[i].replace('㎡', '')
        info[i] = info[i].replace('元/平米', '')
    return info


def open_url_LJ(re_get):  # parse a detail page and pull out the required fields
    ua = UserAgent()
    headers = {'User-Agent': ua.random}  # request headers disguised with fake_useragent
    time.sleep(random.uniform(0.5, 1.0))  # random pause to avoid errors from fetching too quickly
    res = requests.get(re_get, headers=headers)
    info = {}
    if res.status_code == 200 and "unitPriceValue" in res.text:
        soup = BeautifulSoup(res.text, 'lxml')
        info['挂牌网址'] = re_get
        info['爬取时间'] = time.time()
        if "main" in res.text:
            info['房屋标题'] = soup.select('.main')[0].text
        info['总计售价'] = soup.select('.total')[0].text
        info['单方售价'] = soup.select('.unitPriceValue')[0].text
        info['建筑年代'] = soup.select('.subInfo')[2].text[:4]
        info['小区名称'] = soup.select('.info')[0].text
        info['所属政区'] = soup.select('.info a')[0].text
        info['所属片区'] = soup.select('.info a')[1].text
        info['链家编号'] = soup.select('.houseRecord span')[1].text.replace('举报', '')
        for i in soup.select('.base li'):  # "basic attributes" section
            i = str(i)
            if '</span>' in i:
                key, value = i.split('</span>')
                info[key[24:]] = value.rsplit('</li>')[0]
        for i in soup.select('.transaction li'):  # "transaction attributes" section
            i = str(i)
            if '</span>' in i and '抵押信息' not in i:
                key, value, drop = i.split('</span>')
                info[key[25:]] = value[7:].rsplit('</span>')[0]
        replace_column_LJ(info, ['套内面积', '建筑面积', '单方售价'])  # strip redundant characters from these fields
    else:
        print('failed:', re_get)
    return info


# Collect all detail-page URLs to be crawled
wb_ttl = []
for i in generate_allurl(101, 'https://sz.ke.com/ershoufang/pg{}co32ng1hu1nb1/'):
    # url = 'https://sz.ke.com/ershoufang/pg{}co32ng1hu1nb1/'  # Beike: newest listings, excluding commercial, parking spots and basements
    # url = 'http://sz.lianjia.com/ershoufang/pg{}ng1hu1nb1tt2/'  # Lianjia: newest listings, excluding commercial, parking spots and basements
    wb_tmp = get_allurl_LJ(i, '<li.*?class="clear">.*?<a.*?class="img.*?".*?href="(.*?)"')  # regex for detail-page links on a list page
    if wb_tmp:  # skip list pages that failed to load
        wb_ttl.append(wb_tmp)  # builds a 2-D list across iterations
wb_ttl_alt = list(chain(*wb_ttl))  # flatten the 2-D list into 1-D
len_tmp = len(wb_ttl_alt)  # total number of detail pages to crawl
print("Need to crawl %d detail pages" % len_tmp)

# Crawl the specified fields from every detail page in turn
myclient = pymongo.MongoClient(host='mongodb://localhost:27017/')  # connection
mydb = myclient['db_WebCrwr']  # database db_WebCrwr
mycol = mydb["cl_LJ_xhbn"]  # collection cl_LJ_xhbn
# dct_dt_tmp = {}  # alternative: an empty dict to collect the rows instead of writing to MongoDB
c = 0
for i in wb_ttl_alt:
    print(i)
    info = open_url_LJ(i)
    if info:  # skip pages that failed to parse instead of inserting empty documents
        mycol.insert_one(info)  # write straight into MongoDB
    # dct_dt_tmp[i] = open_url_LJ(i)  # alternative: store into the dict defined above
    c = c + 1
    if c % 1000 == 1:
        print(str(datetime.datetime.now().strftime('%Y-%m-%d %H:%M')) + "; step %d of %d in total" % (c, len_tmp))
    time.sleep(2)  # avoid errors from fetching too quickly
```

This has been running for a few weeks without problems. Next, I will match the 小区名称 (neighborhood name) field against coordinates from the Baidu Maps API to build the map visualization; a rough sketch of that step follows below.
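As a preview of that next step, here is a minimal sketch of how the stored 小区名称 values could be geocoded. It assumes the Baidu Maps v3 geocoding endpoint (`https://api.map.baidu.com/geocoding/v3/`) and a valid application key (`ak`); the key placeholder, the `geocode_baidu` helper, and the 经度/纬度 field names are illustrative, not part of the original script.

```python
import time
import requests
import pymongo

BAIDU_AK = "your-ak-here"  # placeholder: apply for an application key on the Baidu Maps open platform


def geocode_baidu(address, city="深圳"):
    """Query the Baidu Maps v3 geocoding API (assumed endpoint) for an address;
    returns (lng, lat) on success, or None if nothing is found."""
    resp = requests.get(
        "https://api.map.baidu.com/geocoding/v3/",
        params={"address": address, "city": city, "output": "json", "ak": BAIDU_AK},
    )
    data = resp.json()
    if data.get("status") == 0:  # status 0 means the lookup succeeded
        loc = data["result"]["location"]
        return loc["lng"], loc["lat"]
    return None


# Read back the scraped documents and attach coordinates keyed by 小区名称
myclient = pymongo.MongoClient(host="mongodb://localhost:27017/")
mycol = myclient["db_WebCrwr"]["cl_LJ_xhbn"]

for name in mycol.distinct("小区名称"):
    coords = geocode_baidu(name)
    if coords:
        mycol.update_many({"小区名称": name}, {"$set": {"经度": coords[0], "纬度": coords[1]}})
    time.sleep(0.5)  # stay well under the API's request quota
```

Note that this endpoint returns coordinates in Baidu's own bd09ll system by default, so they plot directly on Baidu basemaps but would need conversion before being overlaid on a WGS-84 map.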