使用 Python Tkinter 开发一个爬取 B 站直播弹幕工具的实现代码可以分为以下步骤:
- 导入模块
import json
import queue
import re
import threading
import time
import tkinter as tk
from tkinter import scrolledtext

import requests
- 获取直播间id和弹幕服务器地址
def get_live_info(room_id, timeout=10):
    """Look up a live room and build the URL of its danmaku endpoint.

    Calls the ``room_init`` API for ``room_id`` and derives the ``/sub``
    endpoint URL from the ``base_url`` field of the JSON reply.

    Args:
        room_id: Room number as entered by the user (``str`` or ``int``).
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        A ``(room_id, post_url)`` tuple: the room id reported by the API
        (as ``str``) and the HTTPS URL of the ``/sub`` endpoint.

    Raises:
        requests.RequestException: On network failure, timeout, or a
            non-2xx HTTP status.
        ValueError: If the reply body is not valid JSON.
        KeyError, IndexError: If the reply lacks ``data.room_id`` /
            ``data.base_url`` or no host can be extracted from it.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
    # Passing the id through ``params`` URL-encodes user input and also
    # accepts an int, which plain string concatenation did not.
    res = requests.get(
        'https://api.live.bilibili.com/room/v1/Room/room_init',
        params={'id': str(room_id).strip()},
        headers=headers,
        timeout=timeout,  # without a timeout a stalled server blocks forever
    )
    res.raise_for_status()
    data = json.loads(res.text)
    room_id = str(data['data']['room_id'])
    # NOTE(review): this pattern only matches characters outside Latin-1,
    # which an ASCII host name never contains -- confirm what ``base_url``
    # actually holds; as written an ASCII value raises IndexError here.
    server_url = re.findall(r'[^\x00-\xff]+', data['data']['base_url'])[0]
    post_url = 'https://' + server_url + ':443/sub'
    return room_id, post_url
- 连接到弹幕服务器
def connect(post_url, room_id, timeout=10):
    """Send the join request for ``room_id`` to the danmaku endpoint.

    Args:
        post_url: URL of the ``/sub`` endpoint, as built by
            ``get_live_info``.
        room_id: Room id placed in the ``roomid`` field of the request.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        A ``(s, guid)`` tuple: ``s`` is the raw response body (``bytes``)
        and ``guid`` is the hex string of response bytes 16..19 taken in
        reversed byte order.

    Raises:
        requests.RequestException: On network failure, timeout, or a
            non-2xx HTTP status.
    """
    json_data = {"roomid": room_id, "protover": 1, "platform": "web", "clientver": "1.4.0"}
    res = requests.post(
        post_url,
        data=json.dumps(json_data).encode('utf-8'),
        headers={
            'Content-Type': 'application/json',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
            'Origin': 'https://live.bilibili.com',
        },
        timeout=timeout,  # without a timeout a stalled server blocks forever
    )
    res.raise_for_status()
    s = res.content
    # bytes.hex() already returns str; no extra str() conversion needed.
    guid = s[0x10:0x14][::-1].hex()
    return s, guid
- 解析弹幕信息
def _format_dm_message(dm_json):
    """Return the display line for one decoded message, or '' to skip it."""
    cmd = dm_json.get('cmd')
    try:
        if cmd == 'DANMU_MSG':
            # NOTE(review): community protocol notes describe this array
            # under the "info" key rather than "data"; both are accepted
            # here -- confirm against live traffic.
            payload = dm_json.get('data') or dm_json.get('info')
            return payload[2][1] + ':“' + payload[1] + '”\n'
        if cmd == 'SYS_MSG':
            return '系统公告:' + dm_json['data']['msg'] + '\n'
    except (KeyError, IndexError, TypeError):
        # Payload does not have the expected shape: skip this message
        # rather than discard every other message in the same buffer.
        return ''
    return ''


def parse_dm_info(s):
    """Decode every complete frame in ``s`` into display text.

    Frame layout as read by this parser: a 16-byte header whose bytes 0-3
    hold the body length (big-endian) and bytes 4-7 must equal
    ``00 00 00 10`` (bytes 8-15 are not interpreted), followed by a UTF-8
    JSON body of that length.

    Parsing stops at the first frame with a wrong marker or whose body is
    not fully present.  Frames whose body is not a JSON object are skipped.

    Args:
        s: Raw bytes received from the server (may be empty).

    Returns:
        str: The lines for all ``DANMU_MSG`` and ``SYS_MSG`` frames, in
        arrival order, concatenated; ``''`` when nothing is displayable.
    """
    header_size = 16
    marker = b'\x00\x00\x00\x10'
    total = len(s)
    offset = 0  # walk with an offset instead of re-slicing the buffer
    lines = []
    while total - offset >= header_size:
        body_len = int.from_bytes(s[offset:offset + 4], byteorder='big')
        if s[offset + 4:offset + 8] != marker:
            break
        end = offset + header_size + body_len
        if end > total:
            break  # incomplete frame
        body = s[offset + header_size:end]
        offset = end
        # errors='ignore' means decoding itself never raises.
        text = body.decode('utf-8', errors='ignore')
        try:
            dm_json = json.loads(text, strict=False)
        except ValueError:
            continue  # empty or non-JSON body
        if not isinstance(dm_json, dict):
            continue
        line = _format_dm_message(dm_json)
        if line:
            lines.append(line)
    return ''.join(lines)
- GUI设计与多线程爬取弹幕
class Application(tk.Frame):
    """Main window: room-id entry, start/stop buttons and a scrolling log.

    Crawling runs on a background thread.  Tk widgets are only touched
    from the thread running the Tk event loop: the worker places its
    output on a queue, and ``_drain_pending`` (rescheduled with ``after``)
    applies it to the widgets.
    """

    POLL_INTERVAL_MS = 100  # how often the UI drains the worker queue
    REQUEST_TIMEOUT = 10    # seconds allowed for each polling request
    FETCH_INTERVAL = 0.2    # seconds between two polling requests

    # Fixed 16-byte body sent with every polling request.
    _POLL_PAYLOAD = b'\x00\x10\x00\x01\x00\x00\x00\x03\x00\x00\x00\x01\x00\x00\x00\x01'

    def __init__(self, master=None):
        """Build the widgets and start the periodic queue drain."""
        tk.Frame.__init__(self, master)
        self.grid()
        self.room_id = tk.StringVar()
        self.room_id.set('输入房间号')
        self.flag = False                     # True while crawling is wanted
        self.t = None                         # current worker thread, if any
        self._stop_event = threading.Event()  # stop signal of the current run
        self._pending = queue.Queue()         # (kind, payload) items from workers
        self._poll_id = None                  # id of the scheduled drain callback
        self.createWidgets()
        self._drain_pending()

    def createWidgets(self):
        """Create and lay out the entry, the two buttons and the log area."""
        self.room_id_entry = tk.Entry(self, textvariable=self.room_id, width=30)
        self.room_id_entry.grid(row=0, column=0)
        self.start_button = tk.Button(self, text='开始', command=self.start_crawl)
        self.start_button.grid(row=0, column=1)
        self.stop_button = tk.Button(self, text='停止', command=self.stop_crawl, state='disabled')
        self.stop_button.grid(row=0, column=2)
        self.scrolled_text = scrolledtext.ScrolledText(self, width=80, height=20)
        self.scrolled_text.grid(row=1, columnspan=3)

    def start_crawl(self):
        """Switch the buttons to the running state and launch a worker."""
        self.start_button.config(state='disabled')
        self.stop_button.config(state='normal')
        self.flag = True
        # One Event per run: a worker that is still winding down after
        # "stop" cannot be revived by the next "start".
        self._stop_event = threading.Event()
        self.t = threading.Thread(
            target=self.crawl_dm,
            # Read the Tk variable here, on the UI thread.
            args=(self.room_id.get().strip(), self._stop_event),
            daemon=True,  # do not keep the process alive after the window closes
        )
        self.t.start()

    def stop_crawl(self):
        """Ask the worker to stop and switch the buttons back."""
        self.flag = False
        self._stop_event.set()
        self.start_button.config(state='normal')
        self.stop_button.config(state='disabled')

    def destroy(self):
        """Stop the worker and the queue drain, then destroy the widget."""
        self.flag = False
        self._stop_event.set()
        if self._poll_id is not None:
            self.after_cancel(self._poll_id)
            self._poll_id = None
        tk.Frame.destroy(self)

    def _drain_pending(self):
        """Apply queued worker output to the widgets, then reschedule."""
        appended = False
        while True:
            try:
                kind, payload = self._pending.get_nowait()
            except queue.Empty:
                break
            if kind == 'text':
                self.scrolled_text.insert(tk.END, payload)
                appended = True
            elif kind == 'finished' and payload is self._stop_event:
                # The current worker gave up on its own: reset the buttons.
                self.stop_crawl()
        if appended:
            self.scrolled_text.see(tk.END)
        self._poll_id = self.after(self.POLL_INTERVAL_MS, self._drain_pending)

    def crawl_dm(self, room_id=None, stop_event=None):
        """Worker loop: poll the danmaku endpoint and queue parsed text.

        Args:
            room_id: Room number to crawl; defaults to the entry's value.
            stop_event: Event that ends this run when set; defaults to the
                application's current stop event.
        """
        if room_id is None:
            room_id = self.room_id.get()
        if stop_event is None:
            stop_event = self._stop_event
        self._pending.put(('text', "开始爬虫\n"))
        try:
            real_room_id, post_url = get_live_info(room_id)
            s, _ = connect(post_url, real_room_id)
        except Exception as e:
            # Setup failed: report it and let the UI reset its buttons
            # instead of leaving them stuck in the "running" state.
            print(e)
            self._pending.put(('text', '{}\n'.format(e)))
            self._pending.put(('finished', stop_event))
            return
        headers = {
            'Content-Type': 'application/octet-stream',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
            'Origin': 'https://live.bilibili.com',
            'Referer': 'https://live.bilibili.com/' + str(room_id),
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Pragma': 'no-cache',
            'Cache-Control': 'no-cache'
        }
        while self.flag and not stop_event.is_set():
            try:
                res = requests.post(
                    post_url,
                    data=self._POLL_PAYLOAD,
                    headers=headers,
                    timeout=self.REQUEST_TIMEOUT,
                )
                s += res.content
                dm_msg = parse_dm_info(s)
            except Exception as e:
                print(e)
            else:
                # Only text is displayable; anything else (such as an
                # unparsed byte remainder) is dropped.
                if dm_msg and isinstance(dm_msg, str):
                    self._pending.put(('text', dm_msg))
            finally:
                # Always start the next round with an empty buffer so one
                # unparsable response cannot fail every later round.
                s = b''
            # Sleeps like time.sleep() but wakes immediately on stop.
            stop_event.wait(self.FETCH_INTERVAL)
# Create the main window, give it a title, then hand control to the
# Tk event loop (returns when the window is closed).
app = Application()
app.master.wm_title('B站直播弹幕爬虫')
app.mainloop()
以上就是使用Python Tkinter开发一个爬取B站直播弹幕工具的完整攻略、示例代码和操作结果。使用这个弹幕爬虫,你可以轻松进行B站弹幕的采集和分析,得到更多有用的信息。
本站文章如无特殊说明,均为本站原创,如若转载,请注明出处:使用python tkinter开发一个爬取B站直播弹幕工具的实现代码 - Python技术站