Python爬蟲項目--爬取某寶男裝信息

1. 分析目標網站
1. 打開某寶首頁, 輸入"男裝"後點擊"搜索", 則跳轉到"男裝"的搜索界面.
2. 空白處"右擊"再點擊"檢查"審查網頁元素, 點擊"Network".
1) 找到對應的URL, URL裡的參數正是Query String Parameters的參數, 且請求方式是GET
? ??
2) 我們請求該URL得到內容就是"Response"裡的內容, 那麼點擊它來確認信息.

3) 下拉看到"男裝"字樣, 那麼再往下找, 並沒有發現有關"男裝"的商品信息.
4) 任意複製一個商品信息, 空白處右擊再點擊"查看網頁源代碼", 在源碼查找該商品, 即可看到該商品的信息.
5) 對比網頁源代碼和"Response"響應內容, 發現源代碼中的商品信息被替換, 這便是採用了JS加密
6) 如果去請求上面的URL, 得到的則是加密過的信息, 這時就可以利用Selenium庫來模擬瀏覽器, 進而得到商品信息.
2. 獲取單個商品界面
1. 請求網(wǎng)站
# -*- coding: utf-8 -*-
from selenium import webdriver  # browser driver entry point

# Driver object (a Chrome browser) shared by the functions in this listing.
browser = webdriver.Chrome()
def get_one_page():
    """Load the site's home page in the shared ``browser`` session."""
    browser.get("https://www.xxxxx.com")  # request the site
2. 輸入"男裝", 在輸入之前, 需要判斷輸入框是否存在, 如果存在則輸入"男裝", 不存在則等待顯示成功.
# -*- coding: utf-8 -*-
from selenium import webdriver
from selenium.webdriver.common.by import By  # element locator strategies
from selenium.webdriver.support.ui import WebDriverWait  # explicit waits
from selenium.webdriver.support import expected_conditions as EC  # wait conditions

# Driver object (a Chrome browser) shared by the functions in this listing.
browser = webdriver.Chrome()
def get_one_page():
    """Open the home page and type the search keyword into the search box.

    Waits up to 10 seconds for the ``#q`` input to be present before typing;
    ``WebDriverWait.until`` raises ``TimeoutException`` if it never appears.
    """
    browser.get("https://www.xxxxx.com")
    # Named search_box rather than ``input`` so the builtin is not shadowed.
    search_box = WebDriverWait(browser, 10).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, "#q")))
    search_box.send_keys("男裝")  # type the product keyword
3. 下一步就是點擊"搜索"按鈕, 按鈕具有屬性: 可點擊, 那麼加入判斷條件.
# -*- coding: utf-8 -*-
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Driver object (a Chrome browser) shared by the functions in this listing.
browser = webdriver.Chrome()
def get_one_page():
    """Open the home page, type the keyword and click the search button.

    Each lookup waits up to 10 seconds; ``WebDriverWait.until`` raises
    ``TimeoutException`` when its condition is not met in time.
    """
    browser.get("https://www.xxxxx.com")
    # Named search_box rather than ``input`` so the builtin is not shadowed.
    search_box = WebDriverWait(browser, 10).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, "#q")))
    search_box.send_keys("男裝")
    # Wait until the button is clickable, not merely present, before clicking.
    search_button = WebDriverWait(browser, 10).until(
        EC.element_to_be_clickable(
            (By.CSS_SELECTOR, "#J_TSearchForm > div.search-button > button")))
    search_button.click()
4. 獲取總的頁數, 同樣加入等待判斷.
# -*- coding: utf-8 -*-
import re

from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Driver object (a Chrome browser) shared by the functions in this listing.
browser = webdriver.Chrome()
def get_one_page():
    """Search for the keyword and return the text of the total-pages element.

    Each lookup waits up to 10 seconds; ``WebDriverWait.until`` raises
    ``TimeoutException`` when its condition is not met in time.

    Returns:
        The text of the pager's ``div.total`` element.
    """
    browser.get("https://www.xxxxx.com")
    # Named search_box rather than ``input`` so the builtin is not shadowed.
    search_box = WebDriverWait(browser, 10).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, "#q")))
    search_box.send_keys("男裝")
    search_button = WebDriverWait(browser, 10).until(
        EC.element_to_be_clickable(
            (By.CSS_SELECTOR, "#J_TSearchForm > div.search-button > button")))
    search_button.click()
    # The pager's total-pages element only exists once the results have loaded.
    total = WebDriverWait(browser, 10).until(
        EC.presence_of_element_located(
            (By.CSS_SELECTOR, "#mainsrp-pager > div > div > div > div.total")))
    return total.text
def main():
    """Run the search and print the raw text of the total-pages element."""
    pages = get_one_page()
    print(pages)


if __name__ == '__main__':
    main()
5. 打印出來的不是我們想要的結果, 利用正則表達式獲取, 最後再利用try...except捕捉異常
# -*- coding: utf-8 -*-
import re

from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Driver object (a Chrome browser) shared by the functions in this listing.
browser = webdriver.Chrome()
def get_one_page(max_retries=3):
    """Search for the keyword and return the text of the total-pages element.

    Each lookup waits up to 10 seconds. If any wait times out, the whole
    sequence is restarted from the home page.

    Args:
        max_retries: How many times to restart after a timeout before
            giving up.

    Returns:
        The text of the pager's ``div.total`` element.

    Raises:
        TimeoutException: If a wait still times out after ``max_retries``
            restarts.
    """
    retries_left = max_retries
    while True:
        try:
            browser.get("https://www.xxxxx.com")
            # Named search_box rather than ``input`` so the builtin is not shadowed.
            search_box = WebDriverWait(browser, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "#q")))
            search_box.send_keys("男裝")
            search_button = WebDriverWait(browser, 10).until(
                EC.element_to_be_clickable(
                    (By.CSS_SELECTOR, "#J_TSearchForm > div.search-button > button")))
            search_button.click()
            total = WebDriverWait(browser, 10).until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR, "#mainsrp-pager > div > div > div > div.total")))
            return total.text
        except TimeoutException:
            # A capped loop instead of unbounded self-recursion: a site that
            # never loads now surfaces the timeout rather than recursing
            # until RecursionError.
            if retries_left <= 0:
                raise
            retries_left -= 1
def main():
    """Run the search and print the total number of result pages.

    Raises:
        ValueError: If the pager text contains no number.
    """
    total_text = get_one_page()
    # Raw string: "\d" in a plain literal is an invalid escape sequence.
    match = re.search(r"\d+", total_text)
    if match is None:
        raise ValueError(
            "no page count found in pager text: {0!r}".format(total_text))
    pages = int(match.group())  # first run of digits is the page count
    print(pages)


if __name__ == '__main__':
    main()
關於Selenium的更多內容,可參看官方文檔
3. 獲取多個商品界面
採用獲取"到第 頁"輸入框方式, 切換到下一頁, 同樣是等待判斷
需要注意的是, 最後要加入判斷: 高亮是否是當前頁
def get_next_page(page, max_retries=3):
    """Jump to result page ``page`` through the pager's page-number box.

    Types the page number, clicks the submit button, then waits until the
    highlighted pager item shows that number. Each wait lasts up to
    10 seconds; on a timeout the sequence is retried.

    Args:
        page: Page number to jump to.
        max_retries: How many times to retry after a timeout before
            giving up.

    Raises:
        TimeoutException: If a wait still times out after ``max_retries``
            retries.
    """
    retries_left = max_retries
    while True:
        try:
            # Named page_box rather than ``input`` so the builtin is not shadowed.
            page_box = WebDriverWait(browser, 10).until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR, "#mainsrp-pager > div > div > div > div.form > input")))
            # Clear first: send_keys appends to whatever the box already
            # holds (a previous attempt or a prefilled number).
            page_box.clear()
            page_box.send_keys(str(page))
            submit = WebDriverWait(browser, 10).until(
                EC.element_to_be_clickable(
                    (By.CSS_SELECTOR, "#mainsrp-pager > div > div > div > div.form > span.btn.J_Submit")))
            submit.click()
            # Confirm the jump: the highlighted pager item must show the number.
            WebDriverWait(browser, 10).until(
                EC.text_to_be_present_in_element(
                    (By.CSS_SELECTOR, "#mainsrp-pager > div > div > div > ul > li.item.active > span"),
                    str(page)))
            return
        except TimeoutException:
            # Capped retry instead of unbounded self-recursion.
            if retries_left <= 0:
                raise
            retries_left -= 1
def main():
    """Run the search, then step through every remaining result page.

    Raises:
        ValueError: If the pager text contains no number.
    """
    total_text = get_one_page()
    # Raw string: "\d" in a plain literal is an invalid escape sequence.
    match = re.search(r"\d+", total_text)
    if match is None:
        raise ValueError(
            "no page count found in pager text: {0!r}".format(total_text))
    pages = int(match.group())
    # The search itself lands on page 1, so paging starts at page 2.
    for page in range(2, pages + 1):
        get_next_page(page)


if __name__ == '__main__':
    main()
4. 獲取商品信息
首先, 判斷信息是否加載成功, 緊接著獲取源碼並初始化, 進而解析.
需要注意的是, 在"get_one_page"和"get_next_page"中調用之後, 才可執行
def get_info():
    """Parse the product entries on the current results page and print them.

    Waits up to 20 seconds for at least one product entry to be present,
    then parses ``browser.page_source`` with PyQuery. It reads whatever page
    the shared ``browser`` is currently showing, so it must be called after
    navigation has finished.
    """
    WebDriverWait(browser, 20).until(EC.presence_of_element_located(
        (By.CSS_SELECTOR, "#mainsrp-itemlist .items .item")))
    doc = pq(browser.page_source)  # parse the rendered page source
    # .items() yields one PyQuery object per matched node.
    for item in doc('#mainsrp-itemlist .items .item').items():
        image = item.find(".pic .img").attr("src")
        price = item.find(".price").text().strip().replace("\n", "")
        # Drops the last two characters of the deal text
        # (presumably a unit suffix -- TODO confirm its length).
        deal = item.find(".deal-cnt").text()[:-2]
        title = item.find(".title").text().strip()
        shop = item.find(".shop").text().strip()
        location = item.find(".location").text()
        # Printed as a one-row nested list.
        print([[shop, location, title, price, deal, image]])
5. 保存到MySQL數據庫
def save_to_mysql(data):
    """Insert one product row into the ``男裝`` table of the ``spiders`` DB.

    Opens a connection, creates the table if it does not exist, inserts the
    row and commits. A database error during the insert is reported and
    rolled back instead of being raised. The connection is always closed.

    Args:
        data: Sequence of six values in column order
            (shop, location, title, price, deal, image).
    """
    # Backtick-quote the table name so it is always read as one identifier.
    table = "`{0}`".format("男裝")
    db = pymysql.connect(host="localhost", user="root", password="password",
                         port=3306, db="spiders", charset="utf8")
    try:
        cursor = db.cursor()
        # Create the table (not the database) on first use.
        cursor.execute("CREATE TABLE IF NOT EXISTS {0}(shop VARCHAR(20),location VARCHAR(10),title VARCHAR(255),price VARCHAR(20),deal VARCHAR(20), image VARCHAR(255))".format(table))
        sql = "INSERT INTO {0} values(%s,%s,%s,%s,%s,%s)".format(table)
        try:
            # The row values are bound as parameters, not formatted into the SQL.
            if cursor.execute(sql, data):
                db.commit()
                print("********已入庫**********")
        except pymysql.MySQLError:
            # Only database errors are swallowed; KeyboardInterrupt and
            # programming errors propagate.
            print("#########入庫失敗#########")
            db.rollback()  # undo the failed insert
    finally:
        db.close()  # close even if table creation fails
完整代碼
# -*- coding: utf-8 -*-
import re

import pymysql
from pyquery import PyQuery as pq
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Driver object (a Chrome browser) shared by every function below.
browser = webdriver.Chrome()
def get_one_page(name, max_retries=3):
    """Search for ``name``, scrape the first results page, return pager text.

    Each lookup waits up to 10 seconds. If any wait times out (including the
    one inside ``get_info``), the whole sequence is restarted from the home
    page.

    Args:
        name: Search keyword; also passed to ``get_info``.
        max_retries: How many times to restart after a timeout before
            giving up.

    Returns:
        The text of the pager's ``div.total`` element.

    Raises:
        TimeoutException: If a wait still times out after ``max_retries``
            restarts.
    """
    retries_left = max_retries
    while True:
        print("-----------------------------------------------獲取第一頁-------------------------------------------------------")
        try:
            browser.get("https://www.xxxxx.com")
            # Named search_box rather than ``input`` so the builtin is not shadowed.
            search_box = WebDriverWait(browser, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "#q")))
            search_box.send_keys(name)
            search_button = WebDriverWait(browser, 10).until(
                EC.element_to_be_clickable(
                    (By.CSS_SELECTOR, "#J_TSearchForm > div.search-button > button")))
            search_button.click()
            total = WebDriverWait(browser, 10).until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR, "#mainsrp-pager > div > div > div > div.total")))
            print("----即將解析第一頁信息----")
            get_info(name)
            print("----第一頁信息解析完成----")
            return total.text
        except TimeoutException:
            # Capped retry instead of unbounded self-recursion.
            if retries_left <= 0:
                raise
            retries_left -= 1
def get_next_page(page, name, max_retries=3):
    """Jump to result page ``page`` and scrape it.

    Types the page number into the pager's box, clicks the submit button,
    waits until the highlighted pager item shows that number, then calls
    ``get_info``. Each wait lasts up to 10 seconds; on a timeout the
    sequence is retried.

    Args:
        page: Page number to jump to.
        name: Passed through to ``get_info``.
        max_retries: How many times to retry after a timeout before
            giving up.

    Raises:
        TimeoutException: If a wait still times out after ``max_retries``
            retries.
    """
    retries_left = max_retries
    while True:
        print("---------------------------------------------------正在獲取第{0}頁----------------------------------------".format(page))
        try:
            # Named page_box rather than ``input`` so the builtin is not shadowed.
            page_box = WebDriverWait(browser, 10).until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR, "#mainsrp-pager > div > div > div > div.form > input")))
            # Clear first: send_keys appends to whatever the box already
            # holds (a previous attempt or a prefilled number).
            page_box.clear()
            page_box.send_keys(str(page))
            submit = WebDriverWait(browser, 10).until(
                EC.element_to_be_clickable(
                    (By.CSS_SELECTOR, "#mainsrp-pager > div > div > div > div.form > span.btn.J_Submit")))
            submit.click()
            # Confirm the jump: the highlighted pager item must show the number.
            WebDriverWait(browser, 10).until(
                EC.text_to_be_present_in_element(
                    (By.CSS_SELECTOR, "#mainsrp-pager > div > div > div > ul > li.item.active > span"),
                    str(page)))
            print("-----即將解析第{0}頁信息-----".format(page))
            get_info(name)
            print("-----第{0}頁信息解析完成-----".format(page))
            return
        except TimeoutException:
            # Capped retry instead of unbounded self-recursion.
            if retries_left <= 0:
                raise
            retries_left -= 1
def get_info(name):
    """Parse the product entries on the current results page and store them.

    Waits up to 20 seconds for at least one product entry to be present,
    parses ``browser.page_source`` with PyQuery, and passes each product to
    ``save_to_mysql`` as one row. It reads whatever page the shared
    ``browser`` is currently showing.

    Args:
        name: Passed through to ``save_to_mysql``.
    """
    WebDriverWait(browser, 20).until(EC.presence_of_element_located(
        (By.CSS_SELECTOR, "#mainsrp-itemlist .items .item")))
    doc = pq(browser.page_source)  # parse the rendered page source
    # .items() yields one PyQuery object per matched node.
    for item in doc('#mainsrp-itemlist .items .item').items():
        image = item.find(".pic .img").attr("src")
        price = item.find(".price").text().strip().replace("\n", "")
        # Drops the last two characters of the deal text
        # (presumably a unit suffix -- TODO confirm its length).
        deal = item.find(".deal-cnt").text()[:-2]
        title = item.find(".title").text().strip()
        shop = item.find(".shop").text().strip()
        location = item.find(".location").text()
        # One row per product, in the column order used by save_to_mysql.
        save_to_mysql([shop, location, title, price, deal, image], name)
def save_to_mysql(data, name):
    """Insert one product row into the table ``name`` of the ``spiders`` DB.

    Opens a connection, creates the table if it does not exist, inserts the
    row and commits. A database error during the insert is reported and
    rolled back instead of being raised. The connection is always closed.

    Args:
        data: Sequence of six values in column order
            (shop, location, title, price, deal, image).
        name: Table name.
    """
    # A table name cannot be bound as a query parameter, so quote it as an
    # identifier and double any embedded backtick.
    table = "`{0}`".format(name.replace("`", "``"))
    db = pymysql.connect(host="localhost", user="root", password="password",
                         port=3306, db="spiders", charset="utf8")
    try:
        cursor = db.cursor()
        cursor.execute("CREATE TABLE IF NOT EXISTS {0}(shop VARCHAR(20),location VARCHAR(10),title VARCHAR(255),price VARCHAR(20),deal VARCHAR(20), image VARCHAR(255))".format(table))
        sql = "INSERT INTO {0} values(%s,%s,%s,%s,%s,%s)".format(table)
        try:
            # The row values are bound as parameters, not formatted into the SQL.
            if cursor.execute(sql, data):
                db.commit()
                print("********已入庫**********")
        except pymysql.MySQLError:
            # Only database errors are swallowed; KeyboardInterrupt and
            # programming errors propagate.
            print("#########入庫失敗#########")
            db.rollback()
    finally:
        db.close()  # close even if table creation fails
def main(name):
    """Scrape every results page for the search keyword ``name``.

    Args:
        name: Search keyword; also used as the MySQL table name downstream.

    Raises:
        ValueError: If the pager text contains no number.
    """
    total_text = get_one_page(name)
    # Raw string: "\d" in a plain literal is an invalid escape sequence.
    match = re.search(r"\d+", total_text)
    if match is None:
        raise ValueError(
            "no page count found in pager text: {0!r}".format(total_text))
    pages = int(match.group())
    # get_one_page has already scraped page 1; starting at 1 would store
    # that page's rows twice.
    for page in range(2, pages + 1):
        get_next_page(page, name)


if __name__ == '__main__':
    name = "男裝"
    main(name)
# Article reposted from: Python編程學習圈
(版權歸原作者所有,侵刪)

點擊下方“閱讀原文”查看更多
