# -*- coding: utf-8 -*-
"""
Created on Tue Apr 11 11:19:52 2017

@author: cube
"""
import re
import time

import pandas as pd
from pyquery import PyQuery as pq
from selenium import webdriver
from selenium.common.exceptions import StaleElementReferenceException
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


# Single Chrome session shared by every function in this module.
# NOTE: this launches a real browser window as soon as the module is loaded.
browser = webdriver.Chrome()
# Explicit-wait helper bound to that session; the second argument (10) is the
# timeout after which wait.until(...) gives up with a TimeoutException.
wait = WebDriverWait(browser, 10)


def search(max_retries=None):
    """Open Taobao, search for '美食', and return the total page count.

    The whole sequence (load the home page, fill in the search box, submit,
    wait for the pager) is retried from the start whenever any of the waits
    times out.

    Args:
        max_retries: Maximum number of retries after a timeout. ``None``
            (the default) keeps retrying indefinitely, which matches the
            historical behaviour of this function.

    Returns:
        int: The first integer found in the text of the pager's "total"
        element.

    Raises:
        TimeoutException: If ``max_retries`` is set and every attempt
            timed out.
        ValueError: If the pager text contains no digits.
    """
    attempts = 0
    while True:
        try:
            browser.get('http://www.taobao.com')
            # Wait until the search box exists and the submit button is clickable.
            search_box = wait.until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "#q"))
            )
            submit = wait.until(
                EC.element_to_be_clickable(
                    (By.CSS_SELECTOR, "#J_TSearchForm > div.search-button > button")
                )
            )
            search_box.send_keys('美食')
            submit.click()
            total = wait.until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR, "#mainsrp-pager > div > div > div > div.total")
                )
            )
        except TimeoutException:
            # Retry in a loop rather than by recursion so that a page which
            # keeps timing out cannot exhaust the interpreter's call stack.
            attempts += 1
            if max_retries is not None and attempts > max_retries:
                raise
            continue

        # Extract the total number of pages from the pager text.
        match = re.search(r'(\d+)', total.text)
        if match is None:
            raise ValueError(
                'no page count found in pager text: %r' % (total.text,)
            )
        return int(match.group(1))


# Module-level accumulators for scraped product fields, one list per field.
# Each must be its own list object so that fields are collected independently.
image_list = []
price_list = []
deal_list = []
title_list = []
shop_list = []
location_list = []


def get_products():
    """Collect the fields of every product card on the current result page.

    Waits until at least one item is present, parses the browser's page
    source with PyQuery, and appends each product's image URL, price, deal
    count, title, shop and location to the corresponding module-level
    ``*_list`` accumulator. Every product contributes exactly one entry to
    each of the six lists, so the lists stay index-aligned.

    Returns:
        None. Results are delivered through the module-level lists.

    Raises:
        TimeoutException: If no product item appears before ``wait`` times out.
    """
    item_selector = '#mainsrp-itemlist .items .item'
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, item_selector)))
    doc = pq(browser.page_source)
    for item in doc(item_selector).items():
        product = {
            'image': item.find('.pic .img').attr('src'),
            'price': item.find('.price').text(),
            # Drops the last three characters of the deal-count text;
            # presumably a fixed textual suffix after the number - TODO confirm.
            'deal': item.find('.deal-cnt').text()[:-3],
            'title': item.find('.title').text(),
            'shop': item.find('.shop').text(),
            'location': item.find('.location').text(),
        }
        image_list.append(product['image'])
        price_list.append(product['price'])
        deal_list.append(product['deal'])
        title_list.append(product['title'])
        # shop and location were extracted but never recorded, leaving
        # shop_list / location_list shorter than the other four lists.
        shop_list.append(product['shop'])
        location_list.append(product['location'])