用 Python 爬取起點小說網

目標
爬取一本仙俠類的小說下載并保存為txt文件到本地。本例為“大周仙吏”。

項目準備
軟件:Pycharm
第三方庫:requests,fake_useragent,lxml
網站地址:https://book.qidian.com
網站分析
打開網址:

網址變?yōu)椋?/span>https://book.qidian.com/info/1020580616#Catalog
判斷是否為靜態(tài)加載網頁,Ctrl+U打開源代碼,Ctrl+F打開搜索框,輸入:第一章。

在這里是可以找到的,判定為靜態(tài)加載。
反爬分析
同一個ip地址去多次訪問會面臨被封掉的風險,這里采用fake_useragent,產生隨機的User-Agent請求頭進行訪問。
代碼實現(xiàn)
1.導入相對應的第三方庫,定義一個class類繼承object,定義init方法繼承self,主函數(shù)main繼承self。
import??requests
from?fake_useragent?import?UserAgent
from?lxml?import?etree
class?photo_spider(object):
????def?__init__(self):
????????self.url?=?'https://book.qidian.com/info/1020580616#Catalog'
????????ua?=?UserAgent(verify_ssl=False)
????????#隨機產生user-agent
????????for?i?in?range(1,?100):
????????????self.headers?=?{
????????????????'User-Agent':?ua.random
????????????}
????def?mian(self):
?????pass
if?__name__?==?'__main__':
????spider?=?qidian()
????spider.main()
2.發(fā)送請求,獲取網頁。
????def?get_html(self,url):
????????response=requests.get(url,headers=self.headers)
????????html=response.content.decode('utf-8')
????????return?html
3.獲取圖片的鏈接地址。
import?requests
from?lxml?import?etree
from?fake_useragent?import?UserAgent
class?qidian(object):
????def?__init__(self):
????????self.url?=?'https://book.qidian.com/info/1020580616#Catalog'
????????ua?=?UserAgent(verify_ssl=False)
????????for?i?in?range(1,?100):
????????????self.headers?=?{
????????????????'User-Agent':?ua.random
????????????}
????def?get_html(self,url):
????????response=requests.get(url,headers=self.headers)
????????html=response.content.decode('utf-8')
????????return?html
????def?parse_html(self,html):
????????target=etree.HTML(html)
????????links=target.xpath('//ul[@class="cf"]/li/a/@href')#獲取鏈接
????????names=target.xpath('//ul[@class="cf"]/li/a/text()')#獲取每一章的名字
????????for?link,name?in?zip(links,names):
????????????print(name+'\t'+'https:'+link)
????def?main(self):
????????url=self.url
????????html=self.get_html(url)
????????self.parse_html(html)
if?__name__?==?'__main__':
????spider=qidian()
????spider.main()
打印結果:

4.解析鏈接,獲取每一章內容。
????def?parse_html(self,html):
????????target=etree.HTML(html)
????????links=target.xpath('//ul[@class="cf"]/li/a/@href')
????????for?link?in?links:
????????????host='https:'+link
????????????#解析鏈接地址
????????????res=requests.get(host,headers=self.headers)
????????????c=res.content.decode('utf-8')
????????????target=etree.HTML(c)
????????????names=target.xpath('//span[@class="content-wrap"]/text()')
????????????results=target.xpath('//div[@class="read-content?j_readContent"]/p/text()')
????????????for?name?in?names:
????????????????print(name)
????????????for?result?in?results:
????????????????print(result)
打印結果:(下面內容過多,只貼出一部分。)

5.保存為txt文件到本地。
?with?open('F:/pycharm文件/document/'?+?name?+?'.txt',?'a')?as?f:
??????for?result?in?results:
??????????#print(result)
??????????f.write(result+'\n')
效果顯示:

打開文件目錄:

完整代碼
import?requests
from?lxml?import?etree
from?fake_useragent?import?UserAgent
class?qidian(object):
????def?__init__(self):
????????self.url?=?'https://book.qidian.com/info/1020580616#Catalog'
????????ua?=?UserAgent(verify_ssl=False)
????????for?i?in?range(1,?100):
????????????self.headers?=?{
????????????????'User-Agent':?ua.random
????????????}
????def?get_html(self,url):
????????response=requests.get(url,headers=self.headers)
????????html=response.content.decode('utf-8')
????????return?html
????def?parse_html(self,html):
????????target=etree.HTML(html)
????????links=target.xpath('//ul[@class="cf"]/li/a/@href')
????????for?link?in?links:
????????????host='https:'+link
????????????#解析鏈接地址
????????????res=requests.get(host,headers=self.headers)
????????????c=res.content.decode('utf-8')
????????????target=etree.HTML(c)
????????????names=target.xpath('//span[@class="content-wrap"]/text()')
????????????results=target.xpath('//div[@class="read-content?j_readContent"]/p/text()')
????????????for?name?in?names:
????????????????print(name)
????????????????with?open('F:/pycharm文件/document/'?+?name?+?'.txt',?'a')?as?f:
????????????????????for?result?in?results:
????????????????????????#print(result)
????????????????????????f.write(result+'\n')
????def?main(self):
????????url=self.url
????????html=self.get_html(url)
????????self.parse_html(html)
if?__name__?==?'__main__':
????spider=qidian()
????spider.main()
更多閱讀
特別推薦

點擊下方閱讀原文加入社區(qū)會員
評論
圖片
表情
