<kbd id="afajh"><form id="afajh"></form></kbd>
<strong id="afajh"><dl id="afajh"></dl></strong>
    <del id="afajh"><form id="afajh"></form></del>
        1. <th id="afajh"><progress id="afajh"></progress></th>
          <b id="afajh"><abbr id="afajh"></abbr></b>
          <th id="afajh"><progress id="afajh"></progress></th>

          Python副業400元,爬取阿里巴巴商城數據

          共 10740字,需瀏覽 22分鐘

           ·

          2022-06-06 21:31


          1、任務簡介

              首先感謝螞蟻學python群,讓我獲取到這個單子。客戶的要求是獲取阿里巴巴列表頁的商品信息,包含商品title、商品主圖片,并且需要存入xls文件保存。

          2、解決方案

               首先給出的方案是:

               2.1、通過wxPython框架寫出一個可視化界面;

               2.2、因為阿里巴巴防爬比較嚴重,所以我直接通過selenium進行用戶操作來跳過反爬機制;

               2.3、編寫瀏覽器池,方便實現多線程爬取數據;

               2.4、編寫爬數據業務邏輯。

          3、現在開始上代碼實現

               3.1 首先初始化一個瀏覽器池子

          from multiprocessing import Manager
          from time import sleep

          from tool.open_browser import open_browser


          class DriverPool:
              '''
              Fixed-size pool of browser drivers that are checked out and returned by name.

              Every driver is created once through ``open_browser`` and kept in
              ``self.drivers`` under the key ``driver_<n>``; the names of the idle
              drivers are held in a queue obtained from ``multiprocessing.Manager``.
              '''

              def __init__(self, max_nums, driver_path, ui, open_headless=0):
                  '''
                  Store the settings and open all drivers immediately.

                  :param max_nums: number of drivers to create
                  :param driver_path: passed to ``open_browser`` as ``excute_path``
                  :param ui: object attached to every driver as ``driver.ui``
                  :param open_headless: passed through to ``open_browser``
                  '''
                  self.ui = ui
                  self.drivers = {}
                  self.manager = Manager()
                  self.queue = self.manager.Queue()
                  self.max_nums = max_nums
                  self.open_headless = open_headless
                  self.CreateDriver(driver_path)

              def CreateDriver(self, driver_path):
                  '''
                  Initialise the browser pool.

                  Opens ``self.max_nums`` drivers, registers each one in
                  ``self.drivers`` and queues its name as available.

                  :param driver_path: passed to ``open_browser`` as ``excute_path``
                  :return: None
                  '''
                  for index in range(1, self.max_nums + 1):
                      name = f'driver_{index}'
                      d = open_browser(excute_path=driver_path, open_headless=self.open_headless)
                      d.ui = self.ui
                      self.drivers[name] = d
                      self.queue.put(name)

              def getDriver(self):
                  '''
                  Check out one browser, waiting until one is available.

                  :return: the driver, with its pool key stored in ``pool_name_driver``
                  '''
                  # A blocking get() waits for a returned name. The previous
                  # empty()/sleep()/recursive-call loop added one stack frame per
                  # second of waiting and could race between empty() and get().
                  name = self.queue.get()
                  driver = self.drivers[name]
                  # Remembered on the driver so the caller can hand it to putDriver().
                  driver.pool_name_driver = name
                  return driver

              def putDriver(self, name):
                  '''
                  Return a browser to the pool.

                  :param name: pool key of the driver (its ``pool_name_driver``)
                  :return: None
                  '''
                  self.queue.put(name)

              def quit(self):
                  '''
                  Close every browser; used when the run is finished.

                  :return: None
                  '''
                  for driver in self.drivers.values():
                      try:
                          driver.quit()
                      except Exception:
                          # Best effort: one driver failing to close must not stop
                          # the remaining drivers from being closed.
                          pass

               3.2 編寫UI操作界面

          ????def?intUIRun(self):
          ????????'''
          ????????初始化UI主界面
          ????????:return:
          ????????'''

          ????????pannel?=?wx.Panel(self.panel_run)
          ????????pannel.Sizer?=?wx.BoxSizer(wx.VERTICAL)
          ????????self.text?=?wx.StaticText(pannel,?-1,?'狀態(tài)欄目:',?size=(100,?40),?pos=(0,?10))
          ????????self.text_input?=?wx.StaticText(pannel,?-1,?'',?size=(900,?40),?pos=(100,?0))

          ????????wx.StaticText(pannel,?-1,?'當(dāng)前執(zhí)行ID:',?size=(100,?30),?pos=(0,?65)).SetFont(self.font)
          ????????self.text_time?=?wx.TextCtrl(pannel,?id=self.choices_id_ref,?value=self.time_str,?size=(300,?30),?pos=(150,?60),
          ?????????????????????????????????????style=wx.TE_AUTO_URL?|?wx.TE_MULTILINE)
          ????????self.reflush_text_time?=?wx.Button(pannel,?-1,?'刷新ID',?size=(100,?50),?pos=(480,?50))
          ????????self.text_time.SetFont(self.font)
          ????????self.reflush_text_time.SetForegroundColour(wx.RED)
          ????????self.reflush_text_time.SetFont(self.font)
          ????????#?self.text_time.SetForegroundColour(wx.RED)

          ????????self.text_input.SetBackgroundColour(wx.WHITE)
          ????????self.text_input.SetLabel(self.in_text)
          ????????self.text_input.SetFont(self.font)
          ????????self.text.SetFont(self.font)
          ????????wx.Button(pannel,?self.get_product,?'獲取商品保存本地',?size=(200,?100),?pos=(0,?100)).SetFont(self.font)
          ????????wx.Button(pannel,?self.save_mysql,?'保存數(shù)據(jù)庫(kù)和OSS',?size=(200,?100),?pos=(200,?100)).SetFont(self.font)
          ????????wx.Button(pannel,?self.end_process,?'結(jié)束執(zhí)行',?size=(200,?100),?pos=(400,?100)).SetFont(self.font)
          ????????self.log_text?=?wx.TextCtrl(pannel,?size=(1000,?500),?pos=(0,?210),?style=wx.TE_MULTILINE?|?wx.TE_READONLY)

          ????????wx.LogTextCtrl(self.log_text)
          ????????self.Bind(wx.EVT_BUTTON,?self.get_product_p,?id=self.get_product)
          ????????self.Bind(wx.EVT_BUTTON,?self.save_mysql_p,?id=self.save_mysql)
          ????????self.Bind(wx.EVT_BUTTON,?self.end_process_p,?id=self.end_process)
          ????????self.text_time.Bind(wx.EVT_COMMAND_LEFT_CLICK,?self.choices_id,?id=self.choices_id_ref)
          ????????self.reflush_text_time.Bind(wx.EVT_BUTTON,?self.reflush_time_evt)
          ????????self.panel_run.Sizer.Add(pannel,?flag=wx.ALL?|?wx.EXPAND,?proportion=1)

               效果圖

               3.3 編寫業務邏輯

               獲取商品列表頁數據

          global _getMainProduct, goods_info
          def _getMainProduct(data_url):
              '''
              Worker: load one list page and collect the product links on it.

              :param data_url: ``[self, url, driver_pool]`` -- the owning object, the
                  list-page URL, and the pool a driver is borrowed from
              :return: list of ``href`` values of the product-image links; an empty
                  list when the page could not be processed
              '''
              self, url, driver_pool = data_url
              c = Common(driver_pool.getDriver())
              goods_urls = []
              try:
                  self.ui.print(f'當(dāng)前獲取第{url}頁(yè)數(shù)據(jù)')
                  c.d.get(url)
                  c.wait_page_loaded(url)
                  if self.is_load_cache_cookies:
                      # Cookies are loaded after the first visit, then the page is
                      # requested again.
                      self.load_cookies(c.d)
                      c.d.get(url)
                  c.wait_page_loaded(url)
                  ele = c.find_element(By.CSS_SELECTOR, '[class="component-product-list"]')
                  links = ele.find_elements(By.CSS_SELECTOR, 'a[class="product-image"]')
                  # Assigned only once every href has been read, so a failure
                  # part-way through cannot leave element objects in the result.
                  goods_urls = [link.get_attribute('href') for link in links]
              except SystemExit:
                  sys.exit(1)
              except Exception:
                  # Any failure is reported and the (empty) result is returned.
                  self.print(f'請(qǐng)求頁(yè)面超出范圍: {url} ERROR: {traceback.format_exc()}')
              finally:
                  # Always give the driver back and report completion.
                  name = c.d.pool_name_driver
                  driver_pool.putDriver(name)
                  self.queue_print.put(f'請(qǐng)求完成:{url}')
              return goods_urls

          def getMainProduct_(self):
              '''
              Collect the product links of every list page in ``self.pageNums``.

              Builds one task per page number from ``self.pageNums[0]`` to
              ``self.pageNums[1]`` inclusive, runs ``_getMainProduct`` over them with
              ``self.pool.map_async``, and prints each completion message that arrives
              on ``self.queue_print`` while waiting.

              :param self: object providing ``pageNums``, ``url``, ``drive_pool``,
                  ``pool``, ``queue_print``, ``print`` and ``ui``
              :return: set of non-empty product links (also stored in ``self.goods_info``)
              '''
              first_page, last_page = self.pageNums[0], self.pageNums[1]
              urls = []
              for page in range(first_page, last_page + 1):
                  if self.ui.is_exit_process:
                      exit()
                  urls.append([self, self.url.format(page), self.drive_pool])

              # Progress is reported against the number of tasks actually queued.
              total = len(urls)
              done = 0
              products = []
              if urls:
                  p = self.pool.map_async(_getMainProduct, urls)
                  while True:
                      # Read the state first so that messages queued just before
                      # completion are still drained by the loop below.
                      finished = p.ready()
                      while not self.queue_print.empty():
                          done += 1
                          self.print(self.queue_print.get(), f'完成:{done}/{total}')
                      if finished:
                          break
                      # Short wait instead of spinning while the workers run.
                      p.wait(0.1)
                  products = p.get()

              goods_info = set()
              for page_links in products:
                  for link in page_links:
                      if link:
                          goods_info.add(link)
              self.goods_info = goods_info
              return goods_info
          # Run the list-page collection for the current object.
          goods_info = getMainProduct_(self)

               獲取詳情頁數據

          global goods,Common,driver_pool,goods_url,sleep,re,By
          def get_info_(self, data_info):
              '''
              Worker: open one product detail page and extract its data.

              :param self: object providing ``ui``, ``is_load_cache_cookies``,
                  ``load_cookies``, ``wz_goods_cat_id``, ``getMd5``, ``print``,
                  ``error_page`` and ``queue_print``
              :param data_info: ``(goods_url, driver_pool)`` -- the detail-page URL and
                  the pool a driver is borrowed from
              :return: dict describing the product, or ``None`` when the page failed
              '''
              if self.ui.is_exit_process:
                  exit()
              goods_url, driver_pool = data_info
              c = Common(driver_pool.getDriver())
              try:
                  c.d.get(goods_url)
                  sleep(3)
                  if self.is_load_cache_cookies:
                      self.load_cookies(c.d)
                      c.d.get(goods_url)
                  c.wait_page_loaded(goods_url)
                  # Scroll down step by step before reading the page.
                  for x in range(400, 18000, 200):
                      sleep(0.1)
                      c.d.execute_script(f'document.documentElement.scrollTop={x};')
                  is_all = c.find_element_true(By.CSS_SELECTOR, '[id="J-rich-text-description"]')  # 'J-rich-text-description'
                  if not is_all:
                      self.print(f'沒(méi)有發(fā)現(xiàn): {is_all}')
                  is_video = c.find_elements_true(By.CSS_SELECTOR, '[class="bc-video-player"]>video')
                  is_title = c.find_element_true(By.CSS_SELECTOR, '[class="module-pdp-title"]')
                  is_description = c.find_element_true(By.CSS_SELECTOR, '[name="description"]')
                  is_keywords = c.find_element_true(By.CSS_SELECTOR, '[name="keywords"]')
                  is_overview = c.find_element_true(By.CSS_SELECTOR, '[class="do-overview"]')
                  # Only referenced by the disabled category lookup below.
                  is_wz_goods_cat_id = c.find_element_true(By.CSS_SELECTOR, '[class="detail-subscribe"]')
                  wz_goods_cat_id = self.wz_goods_cat_id
                  # if is_wz_goods_cat_id:
                  #     wz_goods_cat_id = is_wz_goods_cat_id.find_elements(By.CSS_SELECTOR, '[class="breadcrumb-item"]>a')[
                  #         -1].get_attribute('href')
                  #     wz_goods_cat_id = re.search(r'(\d+)', wz_goods_cat_id).group(1)
                  # goods_id = re.search(r'(\d+)\.html$', goods_url)
                  # NOTE(review): this pattern requires a literal 'ssssss' before the
                  # digits, so it presumably never matches a real product URL and
                  # every ID falls back to the MD5 branch; the disabled line above
                  # uses a plain digit pattern -- confirm which one is intended.
                  goods_id = re.search(r'(ssssss\d+)\.html$', goods_url)
                  goods = {
                      "商品分類ID": int(wz_goods_cat_id) if wz_goods_cat_id else 0,
                      "商品ID": goods_id.group(1) if goods_id else self.getMd5(f'{time.time()}')+'其他',
                      "商品鏈接": goods_url,
                      "描述": c.find_element(By.CSS_SELECTOR, '[name="description"]').get_attribute(
                          'content') if is_description else '',
                      "標(biāo)題": is_title.get_attribute('title') if is_title else '',
                      "關(guān)鍵字": c.find_element(By.CSS_SELECTOR, '[name="keywords"]').get_attribute(
                          'content') if is_keywords else is_keywords,
                      "視頻連接": c.find_element(By.CSS_SELECTOR, '[class="bc-video-player"]>video').get_attribute(
                          'src') if is_video else '',
                      "主圖片": [],
                      "商品詳情": c.d.execute_script(
                          '''return document.querySelectorAll('[class="do-overview"]')[0].outerHTML;''') if is_overview else is_overview,
                      "商品描述": '',
                      "商品描述圖片": []
                  }

                  # Merge in the description fields produced by the helper.
                  goods_desc = getDescriptionFactory1(self, c, goods_url)
                  goods.update(goods_desc)

                  # Main images: keep the thumbnail URL and, when it carries a
                  # "<w>x<h>" size token, a second URL with that token removed.
                  m_imgs = c.find_elements(By.CSS_SELECTOR, '[class="main-image-thumb-ul"]>li')
                  for m_img in m_imgs:
                      try:
                          img = m_img.find_element(By.CSS_SELECTOR, '[class="J-slider-cover-item"]').get_attribute('src')
                          s = re.search(r'(\d+x\d+)', img)
                          img2 = None
                          if s:
                              img2 = str(img).replace(s.group(1), '')
                          goods['主圖片'].append(img)
                          if img2:
                              goods['主圖片'].append(img2)
                      except Exception:
                          # Best effort: a thumbnail that cannot be read is skipped.
                          pass
                  self.ui.status['請(qǐng)求成功商品數(shù)量'] += 1
                  return goods
              except Exception:
                  # Format the traceback once and reuse it for every report.
                  err = traceback.format_exc()
                  traceback.print_exc()
                  self.print(f'=========================\n鏈接請(qǐng)求錯(cuò)誤: {goods_url} \n {err}\n=========================')
                  self.error_page.append([goods_url, err])
                  self.ui.status['請(qǐng)求失敗商品數(shù)量'] += 1
                  return None
              finally:
                  # Always give the driver back and report completion.
                  name = c.d.pool_name_driver
                  driver_pool.putDriver(name)
                  self.queue_print.put(f'請(qǐng)求完成:{goods_url}')
          # Fetch the detail data for the current task.
          goods = get_info_(self, data_info)

               寫入excel

          ????????def?export_excel(self,?results):
          ????????'''
          ????????寫入excel方法
          ????????:param?results:?
          ????????:return:?
          ????????'''

          ????????now_dir_str?=?self.now
          ????????now_file_str?=?time.strftime('%Y_%m_%d__%H_%M_%S',?time.localtime())
          ????????img_path?=?os.path.join('data',?'xls',?now_dir_str)
          ????????if?not?os.path.exists(img_path):
          ????????????os.mkdir(img_path)
          ????????img_path?=?os.path.join('data',?'xls',?now_dir_str,?self.url_id)
          ????????if?not?os.path.exists(img_path):
          ????????????os.mkdir(img_path)
          ????????if?not?os.path.exists(img_path):
          ????????????os.mkdir(img_path)
          ????????img_path?=?os.path.join(img_path,?f"{now_file_str}.xlsx")
          ????????workbook?=?xlsxwriter.Workbook(img_path)
          ????????sheet?=?workbook.add_worksheet(name='阿里巴巴信息')
          ????????titles?=?list(results[0].keys())
          ????????for?i,?title?in?enumerate(titles):
          ????????????sheet.write_string(0,?i,?title)
          ????????for?row,?result?in?enumerate(results):
          ????????????row?=?row?+?1
          ????????????col?=?0
          ????????????for?value?in?result.values():
          ????????????????sheet.write_string(row,?col,?str(value))
          ????????????????col?+=?1

          ????????workbook.close()

          4、最后總結:

              通過上述代碼最終實現了客戶的需求。由于通過selenium執行瀏覽器操作沒有接口請求效率高,所以在最后使用了多線程,在執行效率上也做了一些提升。


          關注螞蟻老師的抖音賬號:Python導師-螞蟻

          每晚21點直播,給你講解副業、Python學習路線;


          瀏覽 194
          點(diǎn)贊
          評(píng)論
          收藏
          分享

          手機(jī)掃一掃分享

          分享
          舉報(bào)
          評(píng)論
          圖片
          表情
          推薦
          點(diǎn)贊
          評(píng)論
          收藏
          分享

          手機(jī)掃一掃分享

          分享
          舉報(bào)
          <kbd id="afajh"><form id="afajh"></form></kbd>
          <strong id="afajh"><dl id="afajh"></dl></strong>
            <del id="afajh"><form id="afajh"></form></del>
                1. <th id="afajh"><progress id="afajh"></progress></th>
                  <b id="afajh"><abbr id="afajh"></abbr></b>
                  <th id="afajh"><progress id="afajh"></progress></th>
                  大香蕉精品视频在线 | www.色自拍 | AAA高清毛片 | 韩国精品一区二区三区 | 操逼操逼操逼 |