无视哈,存个淘宝虫

发布时间:2019-03-17 11:21:38
贴主:逆尘安
热度:1

逆尘安 2019-03-17

# -*- coding: utf-8 -*-
"""
Created on Tue Apr 11 11:19:52 2017
@author: cube
"""
import re
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import StaleElementReferenceException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from pyquery import PyQuery as pq
import pandas as pd
import time
# Module-wide Chrome WebDriver session, created as soon as this module runs.
# Both search() and get_products() drive this same instance.
browser = webdriver.Chrome()
# Shared explicit wait bound to `browser`; 10 is the timeout passed to
# WebDriverWait (seconds, per the Selenium API).
wait = WebDriverWait(browser, 10)
def search(keyword='美食', max_retries=None):
    """Open the Taobao home page, submit a search and return the page count.

    Uses the module-level ``browser`` and ``wait`` objects. Whenever one of
    the explicit waits times out, the whole sequence (page load included) is
    started again. Retrying is done with a loop rather than recursion so
    that repeated timeouts cannot exhaust the call stack.

    Args:
        keyword: Text typed into the search box. Defaults to '美食', the
            term this function previously hard-coded.
        max_retries: Number of retries allowed after a ``TimeoutException``.
            ``None`` (the default) keeps retrying without limit.

    Returns:
        int: The first run of digits found in the text of the pager's
        ``div.total`` element, i.e. the total number of result pages.

    Raises:
        TimeoutException: If ``max_retries`` is set and every attempt
            timed out.
        ValueError: If the pager text contains no digits.
    """
    retries = 0
    while True:
        try:
            browser.get('http://www.taobao.com')
            # Wait for the search box and the submit button before using them.
            search_box = wait.until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "#q")))
            submit = wait.until(
                EC.element_to_be_clickable(
                    (By.CSS_SELECTOR,
                     "#J_TSearchForm > div.search-button > button")))
            search_box.send_keys(keyword)
            submit.click()
            pager_total = wait.until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR,
                     "#mainsrp-pager > div > div > div > div.total")))
            pager_text = pager_total.text
        except TimeoutException:
            retries += 1
            if max_retries is not None and retries > max_retries:
                raise
            continue

        # Extract the total number of pages from the pager text.
        match = re.search(r'(\d+)', pager_text)
        if match is None:
            raise ValueError(
                'no page count found in pager text: {!r}'.format(pager_text))
        return int(match.group(1))
# Module-level result columns, each starting as its own empty list.
# get_products() appends the scraped values to these lists.
(image_list, price_list, deal_list,
 title_list, shop_list, location_list) = ([] for _ in range(6))
def get_products():
    """Parse the loaded result page and record every listed item.

    Waits until at least one ``#mainsrp-itemlist .items .item`` element is
    present, parses ``browser.page_source`` with PyQuery, and appends one
    value per item to each module-level column list: ``image_list``,
    ``price_list``, ``deal_list``, ``title_list``, ``shop_list`` and
    ``location_list``. All six lists are extended together so the columns
    stay the same length.

    Returns:
        None. Results are accumulated in the module-level lists.

    Raises:
        TimeoutException: Propagated from ``wait.until`` if no item element
            appears within the wait's timeout.
    """
    item_selector = '#mainsrp-itemlist .items .item'
    wait.until(
        EC.presence_of_element_located((By.CSS_SELECTOR, item_selector)))
    doc = pq(browser.page_source)

    for item in doc(item_selector).items():
        product = {
            'image': item.find('.pic .img').attr('src'),
            'price': item.find('.price').text(),
            # Drops the last three characters of the deal text; presumably a
            # fixed three-character suffix after the count -- TODO confirm.
            'deal': item.find('.deal-cnt').text()[:-3],
            'title': item.find('.title').text(),
            'shop': item.find('.shop').text(),
            'location': item.find('.location').text(),
        }
        image_list.append(product['image'])
        price_list.append(product['price'])
        deal_list.append(product['deal'])
        title_list.append(product['title'])
        # shop and location were extracted but never stored before; record
        # them so every column has one entry per item.
        shop_list.append(product['shop'])
        location_list.append(product['location'])
(0)
Copyright 2016 - 2024 XUJC ACM Team