Python实战---抓取淘宝美食

xiaoxiao2021-02-28 49

系统：win10，Spyder3，MySQL

话不多说，先上代码：

# -*- coding: utf-8 -*-
"""Scrape Taobao search results for a keyword with Selenium and pyquery.

The script opens Chrome, searches taobao.com for ``config.SEARCH_ITEM``,
walks through the first result pages and appends every product found to
``result.txt`` (optionally to MySQL as well, see ``get_products``).
"""
import os
import re
import sys

import MySQLdb
from pyquery import PyQuery as pq
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Directory that contains config.py. A raw string keeps the backslashes
# literal instead of relying on "\P" / "\T" being invalid escapes.
sys.path.append(r"E:\PythonProject\TbMeishi")
import config  # noqa: E402  (must come after the sys.path tweak)

# How many extra attempts a step gets after a timeout before giving up.
MAX_RETRIES = 3
# Highest result page to visit (the original listing hard-coded pages 2-3).
MAX_PAGES = 3

# Location of chromedriver.
abspath = os.path.abspath(
    r"C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe"
)
# Start-up option that hides the "Chrome is being controlled by automated
# test software" info bar.
option = webdriver.ChromeOptions()
option.add_argument('disable-infobars')
# To reuse your own profile, point Chrome at your user data directory:
# option.add_argument('--user-data-dir=C:/Users/Administrator/AppData/Local/Google/Chrome/User Data/Default')
# Open the Chrome browser.
browser = webdriver.Chrome(executable_path=abspath, chrome_options=option)
# Every explicit wait below gives up with TimeoutException after 10 seconds.
wait = WebDriverWait(browser, 10)


def search_url(max_retries=MAX_RETRIES):
    """Open taobao.com, search for ``config.SEARCH_ITEM``, return the pager text.

    Args:
        max_retries: number of additional attempts after a timeout.

    Returns:
        The text of the pager's "total" element (a ``str`` that contains the
        total number of result pages).

    Raises:
        TimeoutException: if every attempt timed out.
    """
    for attempt in range(max_retries + 1):
        try:
            browser.get('https://www.taobao.com')
            # The page counts as loaded once the search box is present.
            search_box = wait.until(
                EC.presence_of_element_located((By.CSS_SELECTOR, '#q'))
            )
            submit = wait.until(
                EC.element_to_be_clickable(
                    (By.CSS_SELECTOR,
                     '#J_TSearchForm > div.search-button > button')
                )
            )
            search_box.send_keys(config.SEARCH_ITEM)
            submit.click()
            # Wait for the pager; note presence_of_element_located (single
            # element), not presence_of_all_elements_located.
            total = wait.until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR,
                     '#mainsrp-pager > div > div > div > div.total')
                )
            )
            return total.text
        except TimeoutException:
            # Retry with a loop instead of unbounded recursion.
            if attempt == max_retries:
                raise


def next_page(page_number, max_retries=MAX_RETRIES):
    """Scrape the page currently shown, then jump to ``page_number``.

    Args:
        page_number: the result page to navigate to.
        max_retries: number of additional attempts after a timeout.

    Raises:
        TimeoutException: if every attempt timed out.
    """
    scraped = False
    for attempt in range(max_retries + 1):
        try:
            # The "go to page" input box of the pager.
            page_input = wait.until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR,
                     '#mainsrp-pager > div > div > div > div.form > input')
                )
            )
            # Scrape the current page only once, so a retry of the
            # navigation does not write the same products again.
            if not scraped:
                get_products()
                scraped = True
            submit = wait.until(
                EC.element_to_be_clickable(
                    (By.CSS_SELECTOR,
                     '#mainsrp-pager > div > div > div > div.form > span.btn.J_Submit')
                )
            )
            page_input.clear()
            page_input.send_keys(str(page_number))
            submit.click()
            # Navigation is done when the highlighted pager item shows the
            # requested page number.
            wait.until(
                EC.text_to_be_present_in_element(
                    (By.CSS_SELECTOR,
                     '#mainsrp-pager > div > div > div > ul > li.item.active > span'),
                    str(page_number),
                )
            )
            return
        except TimeoutException:
            if attempt == max_retries:
                raise


def get_products():
    """Parse every product on the current page and persist it.

    Raises:
        TimeoutException: if the product list does not appear in time.
    """
    # Make sure the product list has been rendered.
    wait.until(
        EC.presence_of_all_elements_located(
            (By.CSS_SELECTOR, '#mainsrp-itemlist .items .item')
        )
    )
    # Parse the rendered page source with pyquery.
    doc = pq(browser.page_source)
    for item in doc('#mainsrp-itemlist .items .item').items():
        price_text = item.find('.price').text()
        product = {
            'image': item.find('.pic .img').attr('src'),
            # First character plus everything from index 3 on; slicing with
            # [:1] instead of [0] keeps an empty price from raising.
            'price': price_text[:1] + price_text[3:],
            # Drop the trailing three characters of the deal count text.
            'deal': item.find('.deal-cnt').text()[:-3],
            'title': item.find('.title').text(),
            'shop': item.find('.shop').text(),
            'location': item.find('.location').text(),
        }
        # Uncomment to store the product in MySQL as well:
        # save_to_mysql(config.TABLE_NAME, product)
        write_to_file(product)


def save_to_mysql(Tablename, dic):
    """Insert one product into MySQL, creating the table if it is missing.

    Args:
        Tablename: name of the target table (trusted, taken from config).
        dic: mapping of column name to value; the ``image`` column is
            created as VARCHAR(200), every other column as VARCHAR(50).

    MySQL errors are printed rather than raised, as in the original listing.
    """
    # Identifiers cannot be bound as parameters; they come from our own
    # config and dict keys. The scraped values are passed as parameters so
    # quotes or percent signs in them cannot break the statement.
    column_defs = ', '.join(
        '%s %s' % (key, 'VARCHAR(200)' if key == 'image' else 'VARCHAR(50)')
        for key in dic
    )
    column_names = ', '.join(dic)
    placeholders = ', '.join(['%s'] * len(dic))
    try:
        # The database itself must already exist.
        conn = MySQLdb.connect(
            config.SQL_URL, config.SQL_USER, config.SQL_PASSWORD,
            config.SQL_DB, charset="utf8",
        )
    except MySQLdb.Error as e:
        print("MySQL Error %d : %s" % (e.args[0], e.args[1]))
        return
    try:
        cur = conn.cursor()
        try:
            cur.execute(
                "CREATE TABLE IF NOT EXISTS %s (%s)" % (Tablename, column_defs)
            )
            cur.execute(
                "INSERT INTO %s (%s) VALUES (%s)"
                % (Tablename, column_names, placeholders),
                tuple(dic.values()),
            )
            conn.commit()
        finally:
            cur.close()
    except MySQLdb.Error as e:
        print("MySQL Error %d : %s" % (e.args[0], e.args[1]))
    finally:
        # Always release the connection, even when a statement failed.
        conn.close()


def write_to_file(content):
    """Append ``content`` (a dict) to result.txt as one line of text."""
    # The dict is written via str(); the with-block closes the file.
    with open('result.txt', 'a', encoding='utf-8') as f:
        f.write(str(content) + '\n')


# NOTE: each line of result.txt is the str() of a dict. To turn a line back
# into a dict, prefer ast.literal_eval(line) over eval(line): eval executes
# arbitrary code found in the file.


def main():
    """Run the search, scrape up to MAX_PAGES result pages, close the browser."""
    try:
        total_text = search_url()
        print(total_text)
        # Pull the first run of digits out of the pager text.
        match = re.search(r'(\d+)', total_text)
        if match is None:
            raise ValueError('no page count found in %r' % total_text)
        total = int(match.group(1))
        print(total)
        # Never ask for a page beyond the last one that exists.
        last_page = min(total, MAX_PAGES)
        for page in range(2, last_page + 1):
            next_page(page)
        # next_page() scrapes *before* it navigates, so the page we ended
        # up on still has to be scraped here.
        get_products()
    finally:
        # quit() ends the WebDriver session even if scraping failed.
        browser.quit()


if __name__ == '__main__':
    main()

最终结果可以存入数据库，也可以存入 result.txt 文件（上面的代码默认只调用 write_to_file；如需同时写入 MySQL，请取消 get_products 中 save_to_mysql 那一行的注释），结果如下：

config.py代码如下：

# -*- coding: utf-8 -*-
"""Settings for the Taobao scraper: MySQL connection and search keyword."""

# MySQL connection settings.
SQL_URL = 'localhost'
SQL_DB = 'taobao'
# NOTE(review): SQL_TABLE is not referenced by the scraper listing in this
# article; its (commented-out) MySQL call uses TABLE_NAME. Confirm which one
# is intended.
SQL_TABLE = 'product'
SQL_USER = 'root'
# Placeholder; replace with the real password.
SQL_PASSWORD = '****'

# Keyword typed into the Taobao search box.
SEARCH_ITEM = '美食'

# Table the scraped products are written to.
TABLE_NAME = 'meishi'

其中在编码的时候，数据库乱码。这个问题通过上一篇博客得到解决。

转载请注明原文地址: https://www.6miu.com/read-2633039.html

技术

最新回复(0)