Python獲取阿里巴巴國際站商家信息

幫一個做外貿的朋友搞的,他需要電話號去和商家溝通,提供國際貨運一條龍服務,不停地切換頁面查看手機號,比較麻煩,幫他寫個腳本,一次性獲取下來,存成Excel。現在分享一下過程,同時記錄一下他欠我一頓飯。
前言
1.啟動webdriver,并完成登錄
from selenium.webdriver import ChromeOptions
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
import re
import time
from lxml import etree
import csv


class Chrome_drive():
    """Chrome session used to crawl Alibaba.com supplier listings.

    The constructor starts Chrome with automation switches disabled and image
    loading turned off; ``get_login`` then lets the operator log in by hand.
    """

    def __init__(self):
        """Start Chrome and create the shared explicit-wait helper."""
        option = ChromeOptions()
        option.add_experimental_option('excludeSwitches', ['enable-automation'])
        option.add_experimental_option('useAutomationExtension', False)
        # Value 2 for this preference blocks image loading.
        no_image = {"profile.managed_default_content_settings.images": 2}
        option.add_experimental_option("prefs", no_image)
        # Optional switches left disabled by the author:
        #   option.add_argument(f'user-agent={ua.chrome}')                # custom user agent
        #   option.add_argument(f"--proxy-server=http://{self.ip}")      # proxy address
        #   option.add_argument('--headless')                             # no browser window
        # NOTE(review): ``executable_path`` is only accepted by older Selenium
        # releases; newer ones expect a Service object -- confirm the pinned version.
        self.browser = webdriver.Chrome(executable_path="./chromedriver", options=option)
        # Hide ``navigator.webdriver`` on every new document.
        self.browser.execute_cdp_cmd(
            'Page.addScriptToEvaluateOnNewDocument',
            {'source': 'Object.defineProperty(navigator,"webdriver",{get:()=>undefined})'},
        )
        self.browser.set_window_size(1200, 768)
        self.wait = WebDriverWait(self.browser, 12)

    def get_login(self):
        """Open the login page and block until the operator has logged in.

        The login itself is done manually in the browser window; pressing
        Enter at the console prompt resumes the script.

        Raises:
            RuntimeError: if the page reports that the account is temporarily
                unavailable. The window is closed first. Previously the code
                went on to refresh the closed window and failed with an
                unrelated driver error.
        """
        url = 'https://passport.alibaba.com/icbu_login.htm'
        self.browser.get(url)
        # Only the pause matters here; the typed value is not used.
        input("輸入1")
        if 'Your Alibaba.com account is temporarily unavailable' in self.browser.page_source:
            self.browser.close()
            raise RuntimeError('Alibaba.com account is temporarily unavailable')
        self.browser.refresh()
2.獲取頁面內容
#獲取判斷網(wǎng)頁文本的內(nèi)容:def index_page(self,page,wd):"""抓取索引頁:param page: 頁碼"""print('正在爬取第', page, '頁')url = f'https://www.alibaba.com/trade/search?page={page}&keyword={wd}&f1=y&indexArea=company_en&viewType=L&n=38'js1 = f" window.open('{url}')" # 執(zhí)行打開新的標(biāo)簽頁print(url)self.browser.execute_script(js1) # 打開新的網(wǎng)頁標(biāo)簽# 執(zhí)行打開新一個(gè)標(biāo)簽頁。self.browser.switch_to.window(self.browser.window_handles[-1]) # 此行代碼用來定位當(dāng)前頁面窗口self.buffer() # 網(wǎng)頁滑動(dòng) 成功切換#等待元素加載出來time.sleep(3)self.wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#J-items-content')))#獲取網(wǎng)頁的源代碼html = self.browser.page_sourceself.get_products(wd,html)self.close_window()def get_products(self, wd, html_text):"""提取商品數(shù)據(jù)"""e = etree.HTML(html_text)item_main = e.xpath('//div[@id="J-items-content"]//div[@class="item-main"]')items = e.xpath('//div[@id="J-items-content"]//div[@class="item-main"]')print('公司數(shù) ', len(items))for li in items:company_name = ''.join(li.xpath('./div[@class="top"]//h2[@class="title ellipsis"]/a/text()')) # 公司名稱company_phone_page = ''.join(li.xpath('./div[@class="top"]//a[@class="cd"]/@href')) # 公司電話連接product = ''.join(li.xpath('.//div[@class="value ellipsis ph"]/text()')) # 主要產(chǎn)品Attrs = li.xpath('.//div[@class="attrs"]//span[@class="ellipsis search"]/text()')length = len(Attrs)counctry = ''total_evenue = ''sell_adress = ''product_img = ''if length > 0:counctry = ''.join(Attrs[0]) # 國家if length > 1:total_evenue = ''.join(Attrs[1]) # Total 收入if length > 2:sell_adress = ''.join(Attrs[2]) # 主要銷售地if length > 3:sell_adress += '、' + ''.join(Attrs[3]) # 主要銷售地if length > 4:sell_adress += '、' + ''.join(Attrs[4]) # 主要銷售地product_img_list = li.xpath('.//div[@class="product"]/div/a/img/@src')if len(product_img_list) > 0:product_img = ','.join(product_img_list) # 產(chǎn)品圖片self.browser.get(company_phone_page)phone = ''address = ''mobilePhone = ''try:if 'Your Alibaba.com account is temporarily unavailable' in 
self.browser.page_source:self.browser.close()self.browser.find_element_by_xpath('//div[@class="sens-mask"]/a').click()phone = ''.join(re.findall('Telephone:(.*?) ', self.browser.page_source, re.S))mobilePhone = ''.join(re.findall('Mobile Phone:(.*?) ', self.browser.page_source, re.S))address = ''.join(re.findall('Address:(.*?) ', self.browser.page_source, re.S))except:print("該公司沒有電話")all_down = [wd, company_name, company_phone_page, product, counctry, phone, mobilePhone, address,total_evenue, sell_adress, product_img]save_csv(all_down)print(company_name, company_phone_page, product, counctry, phone, mobilePhone, address, total_evenue,sell_adress, product_img)
def buffer(self): #滑動(dòng)網(wǎng)頁的for i in range(33):time.sleep(0.5)self.browser.execute_script('window.scrollBy(0,380)', '') # 向下滑行300像素。def close_window(self):length=self.browser.window_handlesif len(length) > 3:self.browser.switch_to.window(self.browser.window_handles[1])self.browser.close()time.sleep(1)self.browser.switch_to.window(self.browser.window_handles[-1])
def save_csv(lise_line):
    """Append one row to ``./alibaba_com_img.csv`` (UTF-8).

    The file is opened and closed on every call so each row is flushed to
    disk immediately, and the handle is no longer left for the garbage
    collector to close.

    Args:
        lise_line: sequence of cell values for the row.
    """
    with open("./alibaba_com_img.csv", 'a', newline="", encoding="utf-8") as fh:
        csv.writer(fh).writerow(lise_line)


def main(wd='henan', pages=range(1, 32)):
    """Log in, then crawl every requested search-result page.

    Args:
        wd: search keyword (default ``'henan'``).
        pages: iterable of page numbers to crawl (default pages 1 to 31).
    """
    run = Chrome_drive()
    run.get_login()  # manual login first
    for page in pages:
        run.index_page(page, wd)


if __name__ == '__main__':
    # Write the header row, then start crawling.
    csv_title = 'wd,company_name,company_phone_page,product,counctry,phone,mobilePhone,address,total_evenue,sell_adress,product_img'.split(',')
    save_csv(csv_title)
    main()
3.獲取產品圖片
# -*- coding: utf-8 -*-
import os

import requests
import pandas as pd


def open_requests(img, img_name):
    """Download one image into ``./downloads_picture/``.

    Args:
        img: image URL without a scheme; ``'https:'`` is prepended.
        img_name: file name to store the image under.

    Raises:
        requests.RequestException: on network failure, timeout or an HTTP
            error status, so an error page is never saved as a picture.
    """
    img_url = 'https:' + img
    res = requests.get(img_url, timeout=30)
    res.raise_for_status()
    # The target directory is not created anywhere else in the script.
    os.makedirs("./downloads_picture", exist_ok=True)
    with open(f"./downloads_picture/{img_name}", 'wb') as fn:
        fn.write(res.content)


def main():
    """Download the first product picture of every row in the scraped CSV."""
    df1 = pd.read_csv('./alibaba_com_img.csv',)
    # Rows without pictures are read as NaN; drop them rather than fetching "nan".
    for imgs in df1["product_img"].dropna():
        img = str(imgs).split(',')[0]
        # Drop the first 24 characters of the URL to get the file name.
        # NOTE(review): presumably the CDN host/path prefix -- confirm.
        img_name = img[24:]
        if not img_name:
            continue
        print(img, img_name)
        try:
            open_requests(img, img_name)
        except (requests.RequestException, OSError) as exc:
            # One bad picture should not abort the remaining downloads.
            print('download failed:', img, exc)


if __name__ == '__main__':
    main()
4.將圖片插入Excel
# -*- coding: utf-8 -*-
from PIL import Image
import os
import xlwings as xw

path = 'alibaba_com.xlsx'
app = xw.App(visible=True, add_book=False)
wb = app.books.open(path)
sht = wb.sheets['Sheet1']
# Picture URLs are read from column L, starting at row 2.
# NOTE(review): a single-cell range would yield a scalar, not a list -- confirm
# the sheet always has at least two data rows.
img_list = sht.range("L2").expand('down').value
print(len(img_list))


def write_pic(cell, img_name):
    """Insert a downloaded picture at ``cell``, 70 wide and scaled in proportion.

    Args:
        cell: target cell address, e.g. ``"C2"``.
        img_name: file name inside ``./downloads_picture/``.

    Raises:
        OSError: if the picture file is missing or cannot be read as an image.
    """
    pic_path = f'./downloads_picture/{img_name}'
    print(pic_path)
    fileName = os.path.join(os.getcwd(), pic_path)
    # Only the dimensions are needed; the context manager releases the file handle.
    with Image.open(pic_path) as img:
        print(img.size)
        w, h = img.size
    x_s = 70  # target width
    y_s = h * x_s / w  # keep the aspect ratio
    anchor = sht.range(cell)
    sht.pictures.add(fileName, left=anchor.left, top=anchor.top, width=x_s, height=y_s)


if __name__ == '__main__':
    try:
        for index, imgs in enumerate(img_list):
            # Pictures go into column C, on the same row as their URL.
            cell = "C" + str(index + 2)
            img = str(imgs).split(',')[0]
            # Same file-name rule as the download script: drop the first 24 characters.
            img_name = img[24:]
            try:
                write_pic(cell, img_name)
                print(cell, img_name)
            except Exception:
                # Best effort: skip rows whose picture was never downloaded.
                print("沒有找到這個(gè)img_name的圖片", img_name)
        wb.save()
    finally:
        # Always release the workbook and the Excel process.
        wb.close()
        app.quit()
文章轉載:Python編程學習圈
(版權歸原作者所有,侵刪)

點擊下方“閱讀原文”查看更多
評論
圖片
表情





