用Python爬取B站、騰訊視頻、芒果TV和愛奇藝視頻彈幕
點擊上方“數據管道”,選擇“置頂星標”公眾號
干貨福利,第一時間送達


海量的彈幕數據不僅可以繪制此類詞云圖,還可以調用百度AI進行情感分析。那么,我們該如何獲取彈幕數據呢?本文運用Python爬取B站視頻、騰訊視頻、芒果TV和愛奇藝視頻等彈幕,讓你輕松獲取主流視頻網站彈幕數據。
一、B站視頻彈幕
1.網頁分析
本文以爬取up主硬核的半佛仙人發布的《你知道奶茶加盟到底有多坑人嗎?》視頻彈幕為例,首先通過以下步驟找到存放彈幕的真實url。

import time

import pandas as pd
import requests  # HTTP client used to fetch the danmaku pages
from bs4 import BeautifulSoup  # parses the returned markup
from tqdm import trange  # progress bar for the crawl loop
def get_bilibili_url(start, end, oid="141367679"):
    """Build one historical-danmaku API URL per day in a date range.

    Args:
        start: First day to crawl, in any form ``pandas.date_range``
            accepts (e.g. ``'9/24/2020'``).
        end: Last day to crawl (inclusive), same formats as ``start``.
        oid: Value of the ``oid`` query parameter. Defaults to the id that
            used to be hard-coded in the URL.

    Returns:
        A list of URL strings in chronological order; empty when ``end``
        is earlier than ``start``.
    """
    base = "https://api.bilibili.com/x/v2/dm/history?type=1"
    days = pd.date_range(start, end).strftime("%Y-%m-%d")
    return [f"{base}&oid={oid}&date={day}" for day in days]
def get_bilibili_danmu(url_list):
    """Download the danmaku behind each URL and save them to a text file.

    The text of every ``<d>`` element found in each response is written as
    one line of ``bilibili_danmu.txt`` (created or truncated in the current
    working directory).

    Args:
        url_list: URLs to request, e.g. the list returned by
            ``get_bilibili_url``.
    """
    headers = {
        "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36",
        # Placeholder: replace with the cookie copied from your own
        # browser's request headers.
        "cookie": "你自己的",
    }
    # The context manager closes the file even if a request raises; the
    # explicit encoding avoids platform-default codecs that cannot encode
    # Chinese text.
    with open("bilibili_danmu.txt", "w", encoding="utf-8") as file:
        for i in trange(len(url_list)):
            response = requests.get(url_list[i], headers=headers)
            response.encoding = "utf-8"
            # Name the parser explicitly so the result does not depend on
            # which optional parsers happen to be installed.
            soup = BeautifulSoup(response.text, "html.parser")
            file.writelines(f"{tag.text}\n" for tag in soup.find_all("d"))
            time.sleep(3)  # pause between consecutive requests
if __name__ == "__main__":
    start = '9/24/2020'  # first day of danmaku to crawl
    end = '9/26/2020'  # last day of danmaku to crawl
    url_list = get_bilibili_url(start, end)
    get_bilibili_danmu(url_list)
    print("彈幕爬取完成")

二、騰訊視頻彈幕
1.網頁分析

import json
import time

import pandas as pd
import requests

# Crawl Tencent Video danmaku and save them to ``tengxun_danmu.csv``.
#
# The endpoint is queried once per ``timestamp`` value (15, 45, 75, ...);
# each comment in a response becomes one row of the output table.

# Loop-invariant, so built once instead of on every iteration.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'}
# The query parameter is ``&timestamp=``; the text had been corrupted to
# ``×tamp=`` because ``&times`` was rendered as an HTML entity.
url_template = 'https://mfm.video.qq.com/danmu?otype=json&timestamp={}&target_id=5938032297%26vid%3Dx0034hxucmw&count=80'

# Collect plain dicts and build the DataFrame once at the end; growing a
# DataFrame with ``pd.concat`` per comment re-copies all previous rows.
rows = []
for page in range(15, 12399, 30):
    url = url_template.format(page)
    print("正在提取第" + str(page) + "頁")
    html = requests.get(url, headers=headers)
    # strict=False lets json accept control characters inside strings,
    # which otherwise make some responses fail to parse.
    bs = json.loads(html.text, strict=False)
    time.sleep(1)  # pause between consecutive requests
    # A response without a 'comments' key contributes no rows.
    for i in bs.get('comments') or []:
        # Column names had pinyin annotations injected by a text converter
        # (e.g. '會(huì)員等級(jí)'); the clean headers are restored here.
        rows.append({
            '彈幕': i['content'],  # danmaku text
            '會員等級': i['uservip_degree'],  # member (VIP) level
            '發布時間': i['timepoint'],  # time point of the danmaku
            '彈幕點贊': i['upcount'],  # like count
            '彈幕id': i['commentid'],  # danmaku id
        })

# NOTE: rows are now indexed 0..n-1; the per-row concat used to leave every
# row with index 0, which made the CSV's index column meaningless.
df = pd.DataFrame(rows)
df.to_csv('tengxun_danmu.csv', encoding='utf-8')
print(df.shape)
3.數據預覽


import json

import pandas as pd
import requests


def _extract_items(data, stype):
    """Turn the ``data -> items`` list of one decoded response into row dicts."""
    if not isinstance(data, dict):
        return []
    # The danmaku live under 'data' -> 'items'; a payload without them
    # yields no rows instead of raising.
    items = (data.get('data') or {}).get('items') or []
    details = []
    for item in items:
        details.append({
            'stype': stype,  # identifies the episode
            'id': item['id'],
            'uname': item.get('uname', ''),  # optional field
            'content': item['content'],  # danmaku text
            'time': item['time'],  # time of the danmaku
            'v2_up_count': item.get('v2_up_count', ''),  # optional like count
        })
    return details


def get_mangguo_danmu(num1, num2, page, date='2020/09/29'):
    """Fetch one page of Mango TV danmaku and return it as a list of dicts.

    Args:
        num1: First path segment after the date in the danmaku URL.
        num2: Second path segment after the date; also stored in each row
            as ``stype``.
        page: Page number, used as the JSON file name in the URL.
        date: ``YYYY/MM/DD`` path component of the URL. Defaults to the
            date that used to be hard-coded.

    Returns:
        A list of dicts with keys ``stype``, ``id``, ``uname``, ``content``,
        ``time`` and ``v2_up_count``. The list is empty when the request or
        JSON decoding fails, or when the payload holds no items.
    """
    url = 'https://bullet-ws.hitv.com/bullet/{}/{}/{}/{}.json'
    print("正在爬取第" + str(page) + "頁")
    danmuurl = url.format(date, num1, num2, page)
    try:
        res = requests.get(danmuurl, timeout=10)
        res.encoding = 'utf-8'
        data = json.loads(res.text)
    except (requests.RequestException, ValueError):
        # Previously execution fell through after this message and crashed
        # on the unbound ``data``; a failed page now contributes no rows.
        print("無法連接")
        return []
    return _extract_items(data, num2)
# Prompt for the key values and crawl every page
def count_danmu():
    """Interactively collect the danmaku of all pages of one video.

    Reads three values from standard input: the two numbers that are
    substituted into the danmaku URL, and the total duration, which is used
    as the number of pages to request (pages ``0`` to ``duration - 1``).

    Returns:
        A single list holding the rows returned by ``get_mangguo_danmu``
        for every page, in page order.

    Raises:
        ValueError: If the total duration entered is not an integer.
    """
    # Prompt strings had pinyin annotations injected by a text converter
    # (e.g. '第一個(gè)數(shù)字'); the clean text is restored here.
    num1 = input('第一個數字')
    num2 = input('第二個數字')
    page = int(input('輸入總時長'))
    danmu_total = []
    for i in range(page):
        danmu_total.extend(get_mangguo_danmu(num1, num2, i))
    return danmu_total
def main():
    """Crawl the danmaku interactively and save them to ``mangguo_danmu.csv``."""
    df = pd.DataFrame(count_danmu())
    df.to_csv('mangguo_danmu.csv')


if __name__ == '__main__':
    main()
3.數據預覽

四、愛奇藝彈幕
1.網頁分析
本文以爬取《樂隊的夏天第2季》第13期上視頻彈幕為例,首先通過以下步驟找到存放彈幕的真實url。


import zlib

import requests


# 1. Download the compressed danmaku file
def download_xml(url):
    """Fetch ``url`` and return its decompressed content as text.

    Args:
        url: Address of a compressed danmaku file.

    Returns:
        The decoded text produced by ``zipdecode``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    response = requests.get(url)
    # Fail with a clear HTTP error rather than handing an error page to
    # zlib, which would only report an opaque decompression failure.
    response.raise_for_status()
    bulletold = response.content  # raw (still compressed) bytes
    return zipdecode(bulletold)
def zipdecode(bulletold):
    """Decompress zlib/gzip-compressed bytes and decode them as UTF-8 text.

    Args:
        bulletold: Compressed payload as a bytes-like object.

    Returns:
        The decompressed content as ``str``.

    Raises:
        zlib.error: If the payload is not valid zlib or gzip data.
        UnicodeDecodeError: If the decompressed bytes are not valid UTF-8.
    """
    # wbits = 15 + 32: maximum window size, with automatic detection of a
    # zlib or gzip header. The input is passed as-is; copying it into a
    # bytearray first was unnecessary.
    return zlib.decompress(bulletold, 15 + 32).decode('utf-8')
import os

# The files are written into ./aiqiyi, so make sure it exists first.
os.makedirs('./aiqiyi', exist_ok=True)
# Segments are numbered from 1. Per the original author's note the episode
# is 57 minutes long and a new danmaku file is loaded every 5 minutes, so
# there are ceil(57 / 5) = 12 segments. range(1, 13) covers 1..12; the
# previous range(1, 12) stopped at 11 and skipped the last segment.
for x in range(1, 13):
    url = 'https://cmts.iqiyi.com/bullet/62/00/5981449914376200_300_' + str(x) + '.z'
    xml_text = download_xml(url)
    # One XML file per segment, read back later to extract the danmaku.
    # Mode 'w' rather than 'a+': appending on a re-run would leave two XML
    # documents in one file, which can no longer be parsed.
    with open('./aiqiyi/iqiyi' + str(x) + '.xml', 'w', encoding='utf-8') as f:
        f.write(xml_text)
# 2. Read the danmaku out of the saved XML files
from xml.dom.minidom import parse
import xml.dom.minidom


def xml_parse(file_name):
    """Return the danmaku texts stored in one XML file.

    Args:
        file_name: Path of an XML file whose ``<entry>`` elements each hold
            a ``<content>`` child carrying the danmaku text.

    Returns:
        A list with the text of every entry's first ``<content>`` element,
        in document order. Entries with no ``<content>`` element, or with an
        empty one, are skipped.
    """
    DOMTree = xml.dom.minidom.parse(file_name)
    collection = DOMTree.documentElement
    result = []
    for entry in collection.getElementsByTagName("entry"):
        contents = entry.getElementsByTagName('content')
        # Skip a missing or empty <content>; indexing [0] unconditionally
        # raised IndexError on such entries.
        if not contents or not contents[0].childNodes:
            continue
        text = contents[0].childNodes[0].data
        print(text)
        result.append(text)
    return result
import os

# Merge the danmaku of every downloaded segment into one text file, one
# danmaku per line.
with open("aiyiqi_danmu.txt", mode="w", encoding="utf-8") as f:
    # Segments are numbered 1..12 (ceil(57 / 5) per the original author's
    # note); the previous range(1, 12) left out the last one.
    for x in range(1, 13):
        path = "./aiqiyi/iqiyi" + str(x) + ".xml"
        if not os.path.exists(path):
            # A segment that was never downloaded is skipped, not fatal.
            print("missing segment file: " + path)
            continue
        for line in xml_parse(path):
            f.write(line)
            f.write("\n")
3.數據預覽

評論
圖片
表情
