手把手帶你爬蟲 | 爬取語錄大全
↑ 關注 + 星標,每天學Python新技能
後台回覆【大禮包】送你Python自學大禮包
作者:大頭雪糕
來源:數據分析與統計學之美
目標
爬取語錄,批量下載到本地。
項目準備
軟件:Pycharm
第三方庫:requests,fake_useragent,lxml(re 是 Python 標準庫,無需另外安裝)
網站地址:http://www.yuluju.com
網站分析
打開網站。

有很多分類,不同類型的語錄。
點擊愛情語錄,發現上方網址變化為http://www.yuluju.com/aiqingyulu/

點擊搞笑語錄,也會發生類似的變化。

判斷是否為靜態網頁。

有頁碼跳轉一般為靜態網頁。Ctrl+U查看源代碼,Ctrl+F調出搜索框,輸入一些網頁上出現的文字;如果能在源代碼中搜到,說明內容直接寫在HTML裡,即為靜態網頁。

反爬分析
同一個ip地址去多次訪問會面臨被封掉的風險,這裡採用fake_useragent,產生隨機的User-Agent請求頭進行訪問(隨機User-Agent只能降低被識別為爬蟲的機率,並不會改變ip地址)。
每一頁的鏈接分析
第一頁:http://www.yuluju.com/aiqingyulu/list_18_1.html
第二頁:http://www.yuluju.com/aiqingyulu/list_18_2.html
第三頁:http://www.yuluju.com/aiqingyulu/list_18_3.html
可以發現,每一頁的鏈接只有末尾的數字(頁碼)不同。當然這裡分析的是愛情語錄這一欄目,其它欄目也類似。
代碼實現
1.導入相對應的第三方庫,定義一個繼承object的類,並定義__init__方法和主函數main(兩者的第一個參數都是self)。
import??requests
from?fake_useragent?import?UserAgent
from?lxml?import?etree
class yulu(object):
    """Step-1 skeleton of the quote crawler for www.yuluju.com.

    Only the constructor is filled in at this step; ``main`` is a
    placeholder that later steps of the tutorial replace.
    """

    def __init__(self):
        # Trailing slash matters: later steps build list-page URLs with
        # ``self.url + 'aiqingyulu/list_18_{}.html'``.
        self.url = 'http://www.yuluju.com/'
        ua = UserAgent(verify_ssl=False)
        # One random User-Agent per crawler instance. The original wrapped
        # this assignment in a 99-iteration loop, which only kept the last
        # value and therefore had the same effect as a single assignment.
        self.headers = {'User-Agent': ua.random}

    def main(self):
        """Entry point called by the ``__main__`` guard (placeholder)."""
        pass
if __name__ == '__main__':
    # Start the interactive crawler only when run as a script, not on import.
    spider = yulu()
    spider.main()
2.交互界面
# Step-2 excerpt: this code is the first part of the body of ``main(self)``;
# indent it to the method body when pasting it into the class.

# Menu number -> (menu label, list-page path template). ``{}`` is later
# filled with the page number via ``str.format``.
categories = {
    1: ('勵志語錄', 'lizhimingyan/list_1_{}.html'),
    2: ('愛情語錄', 'aiqingyulu/list_18_{}.html'),
    3: ('搞笑語錄', 'gaoxiaoyulu/list_19_{}.html'),
    4: ('人生語錄', 'renshenggeyan/list_14_{}.html'),
    5: ('情感語錄', 'qingganyulu/list_23_{}.html'),
    6: ('經典語錄', 'jingdianyulu/list_12_{}.html'),
    7: ('傷感語錄', 'shangganyulu/list_21_{}.html'),
    8: ('名人語錄', 'mingrenmingyan/list_2_{}.html'),
    9: ('心情語錄', 'xinqingyulu/list_22_{}.html'),
}
print(''.join('{}.{}\n'.format(number, label)
              for number, (label, _) in sorted(categories.items())))

# Keep asking until the answer is one of the menu numbers. The original
# if/elif chain sent every other number to category 9 and crashed on
# non-numeric input.
while True:
    try:
        select = int(input('請輸入您的選擇:'))
    except ValueError:
        select = None
    if select in categories:
        break
    print('請輸入 1-9 之間的數字')

# Normalise the base so the join works with or without a trailing slash.
url = self.url.rstrip('/') + '/' + categories[select][1]
3.發送請求,獲取網頁。
????def?get_html(self,url):
????????response=requests.get(url,headers=self.headers)
????????html=response.content.decode('gb2312')#經(jīng)過測試這里是'gb2312'
????????return?html
4.解析網頁,獲取文本信息。
????def?parse_html(self,html):
?????#獲取每頁中的鏈接地址和標題
????????datas=re.compile('(.*?)').findall(html)
????????for?data?in?datas:
????????????host='http://www.yuluju.com'+data[0]
????????????res=requests.get(host,headers=self.headers)
????????????con=res.content.decode('gb2312')
????????????target=etree.HTML(con)
????????????#獲取文本內(nèi)容
????????????results=target.xpath('//div[@class="content"]/div/div/span/text()')
????????????filename=data[1]
????????????#保存本地
????????????with?open('F:/pycharm文件/document/'+filename+'.txt','a',encoding='utf-8')as?f:
????????????????for?result?in?results:
????????????????????f.write(result+'\n')
5.獲取多頁及主函數調用。
????def?main(self):
????????print('1.勵志語錄\n'
??????????????'2.愛情語錄\n'
??????????????'3.搞笑語錄\n'
??????????????'4.人生語錄\n'
??????????????'5.情感語錄\n'
??????????????'6.經(jīng)典語錄\n'
??????????????'7.傷感語錄\n'
??????????????'8.名人語錄\n'
??????????????'9.心情語錄\n')
????????select=int(input('請輸入您的選擇:'))
????????if?(select==1):
????????????url=self.url+'lizhimingyan/list_1_{}.html'
????????elif?(select==2):
????????????url?=?self.url?+?'aiqingyulu/list_18_{}.html'
????????elif?(select==3):
????????????url?=?self.url?+?'gaoxiaoyulu/list_19_{}.html'
????????elif?(select==4):
????????????url=self.url+'renshenggeyan/list_14_{}.html'
????????elif?(select==5):
????????????url=self.url+'qingganyulu/list_23_{}.html'
????????elif?(select==6):
????????????url=self.url+'jingdianyulu/list_12_{}.html'
????????elif?(select==7):
????????????url=self.url+'shangganyulu/list_21_{}.html'
????????elif?(select==8):
????????????url=self.url+'mingrenmingyan/list_2_{}.html'
????????else:
????????????url=self.url+'xinqingyulu/list_22_{}.html'
????????start?=?int(input('輸入開始:'))
????????end?=?int(input('輸入結(jié)束頁:'))
????????for?page?in?range(start,?end?+?1):
????????????print('第%s頁開始:...'?%?page)
????????????newUrl=url.format(page)
????????????html=self.get_html(newUrl)
????????????self.parse_html(html)
????????????print('第%s頁爬取完成!'%page)
效果顯示
打開文件目錄:

爬取其它欄目也是可以的,就不做演示了,都一樣。
完整代碼
import??requests
from?fake_useragent?import?UserAgent
import?re
from?lxml?import?etree
class yulu(object):
    """Interactive crawler that saves quote articles from www.yuluju.com.

    ``main`` asks for a category and a page range, ``get_html`` downloads a
    page, and ``parse_html`` follows the article links on a list page and
    writes each article to a UTF-8 text file.
    """

    # Menu number -> (menu label, list-page path template). ``{}`` in the
    # template is filled with the page number.
    CATEGORIES = {
        1: ('勵志語錄', 'lizhimingyan/list_1_{}.html'),
        2: ('愛情語錄', 'aiqingyulu/list_18_{}.html'),
        3: ('搞笑語錄', 'gaoxiaoyulu/list_19_{}.html'),
        4: ('人生語錄', 'renshenggeyan/list_14_{}.html'),
        5: ('情感語錄', 'qingganyulu/list_23_{}.html'),
        6: ('經典語錄', 'jingdianyulu/list_12_{}.html'),
        7: ('傷感語錄', 'shangganyulu/list_21_{}.html'),
        8: ('名人語錄', 'mingrenmingyan/list_2_{}.html'),
        9: ('心情語錄', 'xinqingyulu/list_22_{}.html'),
    }

    def __init__(self):
        self.url = 'http://www.yuluju.com/'
        ua = UserAgent(verify_ssl=False)
        # One random User-Agent per crawler instance. The original wrapped
        # this assignment in a 99-iteration loop, which only kept the last
        # value and therefore had the same effect as a single assignment.
        self.headers = {'User-Agent': ua.random}

    def get_html(self, url):
        """Download ``url`` and return the response body as text.

        Args:
            url: Absolute URL of the page to fetch.

        Returns:
            The page content decoded as GBK text; undecodable bytes are
            replaced instead of raising ``UnicodeDecodeError``.
        """
        # Without a timeout a stalled connection would block the crawl forever.
        response = requests.get(url, headers=self.headers, timeout=10)
        # GBK is a superset of the gb2312 encoding used originally, so valid
        # gb2312 pages decode the same while other characters no longer
        # abort the run.
        return response.content.decode('gbk', errors='replace')

    def parse_html(self, html, save_dir='F:/pycharm文件/document/'):
        """Download every article linked from a list page and save its text.

        Each article is appended to ``<save_dir>/<title>.txt`` as UTF-8,
        one extracted text node per line.

        Args:
            html: Decoded HTML of one list page.
            save_dir: Output directory, created when missing. The default is
                the path hard-coded in the original tutorial.
        """
        import os  # local import: the file's import block does not include os

        # Each match is (site-relative link, article title).
        # NOTE(review): the pattern printed in the article lost its HTML
        # markup (only '(.*?)' survived, which makes data[0] fail). This
        # anchor pattern is a reconstruction -- confirm it against the live
        # list page before relying on it.
        link_pattern = re.compile(
            r'<a href="(.*?)" class="title" target="_blank">(.*?)</a>')
        datas = link_pattern.findall(html)
        if not datas:
            return

        os.makedirs(save_dir, exist_ok=True)
        for href, title in datas:
            host = self.url.rstrip('/') + href
            con = self.get_html(host)
            target = etree.HTML(con)
            # Text nodes that make up the article body.
            results = target.xpath('//div[@class="content"]/div/div/span/text()')
            # Titles come from the page, so drop characters that are not
            # allowed in file names.
            filename = re.sub(r'[\\/:*?"<>|\r\n\t]+', '_', title).strip() or 'untitled'
            path = os.path.join(save_dir, filename + '.txt')
            with open(path, 'a', encoding='utf-8') as f:
                f.writelines(result + '\n' for result in results)

    def main(self):
        """Ask for a category and a page range, then crawl those list pages.

        Reads three values from standard input: the menu number (1-9), the
        first page and the last page (inclusive).
        """
        print(''.join('{}.{}\n'.format(number, label)
                      for number, (label, _) in sorted(self.CATEGORIES.items())))

        # Keep asking until the answer is one of the menu numbers. The
        # original if/elif chain sent every other number to category 9 and
        # crashed on non-numeric input.
        while True:
            try:
                select = int(input('請輸入您的選擇:'))
            except ValueError:
                select = None
            if select in self.CATEGORIES:
                break
            print('請輸入 1-9 之間的數字')

        # Normalise the base so the join works with or without a trailing slash.
        url = self.url.rstrip('/') + '/' + self.CATEGORIES[select][1]

        start = int(input('輸入開始:'))
        end = int(input('輸入結束頁:'))
        for page in range(start, end + 1):
            print('第%s頁開始:...' % page)
            newUrl = url.format(page)
            html = self.get_html(newUrl)
            self.parse_html(html)
            print('第%s頁爬取完成!' % page)
if __name__ == '__main__':
    # Start the interactive crawler only when run as a script, not on import.
    spider = yulu()
    spider.main()
◆?◆?◆ ?◆?◆
推薦閱讀
太慘了!!!這9名程序員開發50餘款APP獲利500萬,結果...
掃碼回覆「大禮包」後獲取大禮
掃碼加我微信備注「三劍客」送你上圖三本電子書
評論
圖片
表情

