Python Script for Scraping Bilibili Danmaku (No Longer Working)

This code for scraping Bilibili danmaku dates from late 2020. While reorganizing my blog recently, I tested it and found it no longer works; presumably Bilibili has changed its API.

Posting it here for now; I'll fix it up when I actually need it again 😳

import requests
from bs4 import BeautifulSoup
import pandas as pd
import json
from lxml import etree

class CrawlerForB(object):
    """Crawl Bilibili search results by keyword and download each video's danmaku."""

    def __init__(self):
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36'}

    def crawl_video_list(self, keyword, page):
        """Scrape `page` pages of search results for `keyword`, save the video
        metadata to video.csv, and crawl the danmaku of every video found."""
        vlst = []
        for i in range(page):
            url = f'https://search.bilibili.com/all?keyword={keyword}&page={i+1}'
            res = requests.get(url, headers=self.headers)
            bs = BeautifulSoup(res.text, 'html.parser')
            video_lst = bs.find_all('li', class_='video-item matrix')
            for video in video_lst:
                info = video.find('div', class_='info')
                video_type = info.find('span', class_='type hide').text
                # Slice off the leading '//' and the query string, keeping the path that ends in the BV id
                href = info.find('a', class_='title')['href'][2:37]
                bvid = href[-12:]  # the 12-character BV id at the end of the path
                title = info.find('a', class_='title')['title']
                intro = info.find('div', class_='des hide').text.strip()
                watch_num = info.find('span', class_='so-icon watch-num').text.strip()
                danmu = info.find('span', class_='so-icon hide').text.strip()
                upload_time = info.find('span', class_='so-icon time').text.strip()
                up = info.find('a', class_='up-name').text
                vlst.append([video_type, href, bvid, title, intro, watch_num, danmu, upload_time, up])
                self.crawl_danmu(bvid)
                print(f'{i}-{bvid}-{title}')
        video_info = pd.DataFrame(vlst, columns=['video_type', 'href', 'bvid', 'title', 'intro', 'watch_num', 'danmu', 'upload_time', 'up'])
        video_info.to_csv('video.csv')

    def crawl_danmu(self, bvid):
        """Resolve the video's cid from its BV id, fetch the danmaku XML, and
        append one line per danmaku (bvid, text, appearance time) to danmu2.csv."""
        pagelist_url = f'https://api.bilibili.com/x/player/pagelist?bvid={bvid}&jsonp=jsonp'
        pagelist_response = requests.get(pagelist_url, headers=self.headers)
        info = json.loads(pagelist_response.text)
        cid = info['data'][0]['cid']  # cid of the first part of the video
        danmu_xml_url = f'https://api.bilibili.com/x/v1/dm/list.so?oid={cid}'
        danmu_xml_response = requests.get(danmu_xml_url, headers=self.headers)
        # Parse the raw bytes so lxml reads the encoding from the XML declaration
        tree = etree.XML(danmu_xml_response.content)
        danmus = tree.findall('d')  # each <d> element is one danmaku
        with open('danmu2.csv', 'a', encoding='utf-8') as file:
            for d in danmus:
                text = d.text
                # The first field of the 'p' attribute is the playback time (in seconds) at which the danmaku appears
                video_time = d.get('p').split(',')[0]
                file.write(f'{bvid},{text},{video_time}\n')


if __name__ == '__main__':
    c = CrawlerForB()
    # c.crawl_video_list('keyword', 50)
    # c.crawl_danmu("BV1yi4y1E7Wa")
    c.crawl_danmu('BV1VZ4y147nP')
    print('EXIT')
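
If it's only the danmaku endpoint that broke, one possible fix is to fall back to the older comment.bilibili.com XML mirror, which serves the same <d p="...">text</d> format that crawl_danmu already parses. A minimal, untested sketch, assuming that URL still responds and that no extra cookies or Referer header are required (both unverified on my part):

import requests
from lxml import etree

def fetch_danmu(cid, headers=None):
    # Assumption: the legacy mirror at comment.bilibili.com/{cid}.xml still serves danmaku XML
    url = f'https://comment.bilibili.com/{cid}.xml'
    response = requests.get(url, headers=headers or {})
    response.raise_for_status()
    # Return (appearance time, text) pairs, same fields crawl_danmu writes out
    return [(d.get('p').split(',')[0], d.text) for d in etree.XML(response.content).findall('d')]

If this works, it drops in where crawl_danmu builds danmu_xml_url above; something to verify the next time I need the script.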