import json

import requests
import pandas as pd
from bs4 import BeautifulSoup
from lxml import etree


class CrawlerForB(object):
    """Crawler for Bilibili search results and per-video danmaku (bullet comments)."""

    def __init__(self):
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36'}

    def crawl_video_list(self, keyword, pages):
        """Scrape `pages` pages of search results for `keyword` and save them to video.csv."""
        vlst = []
        for i in range(pages):
            url = f'https://search.bilibili.com/all?keyword={keyword}&page={i + 1}'
            res = requests.get(url, headers=self.headers)
            bs = BeautifulSoup(res.text, 'html.parser')
            video_lst = bs.find_all('li', class_='video-item matrix')
            for video in video_lst:
                info = video.find('div', class_='info')
                video_type = info.find('span', class_='type hide').text
                # The href looks like '//www.bilibili.com/video/BVxxxxxxxxxx?...';
                # strip the leading '//' and keep the path up to the end of the BV id.
                href = info.find('a', class_='title')['href'][2:37]
                bvid = href[-12:]  # the 12-character BV id at the end of the path
                title = info.find('a', class_='title')['title']
                intro = info.find('div', class_='des hide').text.strip()
                watch_num = info.find('span', class_='so-icon watch-num').text.strip()
                danmu = info.find('span', class_='so-icon hide').text.strip()
                upload_time = info.find('span', class_='so-icon time').text.strip()
                up = info.find('a', class_='up-name').text
                vlst.append([video_type, href, bvid, title, intro,
                             watch_num, danmu, upload_time, up])
                self.crawl_danmu(bvid)
                print(f'{i}-{bvid}-{title}')
        video_info = pd.DataFrame(vlst, columns=['video_type', 'href', 'bvid', 'title', 'intro',
                                                 'watch_num', 'danmu', 'upload_time', 'up'])
        video_info.to_csv('video.csv', index=False)

    def crawl_danmu(self, bvid):
        """Fetch the danmaku XML for one video and append rows to danmu2.csv."""
        pagelist_url = f'https://api.bilibili.com/x/player/pagelist?bvid={bvid}&jsonp=jsonp'
        pagelist_res = requests.get(pagelist_url, headers=self.headers)
        info = json.loads(pagelist_res.text)
        cid = info['data'][0]['cid']  # cid of the first part of the video
        danmu_xml_url = f'https://api.bilibili.com/x/v1/dm/list.so?oid={cid}'
        danmu_xml_response = requests.get(danmu_xml_url, headers=self.headers)
        tree = etree.XML(danmu_xml_response.content)
        danmus = tree.findall('d')  # each <d> element is one danmaku
        with open('danmu2.csv', 'a', encoding='utf-8') as file:
            for d in danmus:
                text = d.text
                # The first comma-separated field of the 'p' attribute is the
                # playback time (in seconds) at which the danmaku appears.
                video_time = d.get('p').split(',')[0]
                file.write(f'{bvid},{text},{video_time}\n')
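
# Optional extension (an illustrative sketch, not part of the original script):
# the 'p' attribute of each <d> element packs several comma-separated fields,
# of which the script above only uses the first. Field meanings below follow
# the commonly documented danmaku XML convention; parse_danmu_attrs is a
# hypothetical helper added here for illustration.
def parse_danmu_attrs(p):
    fields = p.split(',')
    return {
        'time': float(fields[0]),     # playback offset in seconds
        'mode': int(fields[1]),       # 1-3 scrolling, 4 bottom, 5 top
        'fontsize': int(fields[2]),   # font size
        'color': int(fields[3]),      # text colour as a decimal RGB value
        'timestamp': int(fields[4]),  # send time, Unix seconds
    }
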
if __name__ == '__main__':
    c = CrawlerForB()
    # c.crawl_video_list('keyword', 50)
    # c.crawl_danmu('BV1yi4y1E7Wa')
    c.crawl_danmu('BV1VZ4y147nP')
    print('EXIT')