import base64 import functools import hashlib import itertools import json import math import re import time import urllib.parse import uuid from .common import InfoExtractor, SearchInfoExtractor from ..dependencies import Cryptodome from ..networking.exceptions import HTTPError from ..utils import ( ExtractorError, GeoRestrictedError, InAdvancePagedList, OnDemandPagedList, bool_or_none, clean_html, determine_ext, filter_dict, float_or_none, format_field, get_element_by_class, int_or_none, join_nonempty, make_archive_id, merge_dicts, mimetype2ext, parse_count, parse_qs, parse_resolution, qualities, smuggle_url, srt_subtitles_timecode, str_or_none, traverse_obj, unified_timestamp, unsmuggle_url, url_or_none, urlencode_postdata, variadic, ) class BilibiliBaseIE(InfoExtractor): _HEADERS = {'Referer': 'https://www.bilibili.com/'} _FORMAT_ID_RE = re.compile(r'-(\d+)\.m4s\?') _WBI_KEY_CACHE_TIMEOUT = 30 # exact expire timeout is unclear, use 30s for one session _wbi_key_cache = {} @property def is_logged_in(self): return bool(self._get_cookies('https://api.bilibili.com').get('SESSDATA')) def _check_missing_formats(self, play_info, formats): parsed_qualities = set(traverse_obj(formats, (..., 'quality'))) missing_formats = join_nonempty(*[ traverse_obj(fmt, 'new_description', 'display_desc', 'quality') for fmt in traverse_obj(play_info, ( 'support_formats', lambda _, v: v['quality'] not in parsed_qualities))], delim=', ') if missing_formats: self.to_screen( f'Format(s) {missing_formats} are missing; you have to login or ' f'become a premium member to download them. {self._login_hint()}') def extract_formats(self, play_info): format_names = { r['quality']: traverse_obj(r, 'new_description', 'display_desc') for r in traverse_obj(play_info, ('support_formats', lambda _, v: v['quality'])) } audios = traverse_obj(play_info, ('dash', (None, 'dolby'), 'audio', ..., {dict})) flac_audio = traverse_obj(play_info, ('dash', 'flac', 'audio')) if flac_audio: audios.append(flac_audio) formats = [{ 'url': traverse_obj(audio, 'baseUrl', 'base_url', 'url'), 'ext': mimetype2ext(traverse_obj(audio, 'mimeType', 'mime_type')), 'acodec': traverse_obj(audio, ('codecs', {str.lower})), 'vcodec': 'none', 'tbr': float_or_none(audio.get('bandwidth'), scale=1000), 'filesize': int_or_none(audio.get('size')), 'format_id': str_or_none(audio.get('id')), } for audio in audios] formats.extend({ 'url': traverse_obj(video, 'baseUrl', 'base_url', 'url'), 'ext': mimetype2ext(traverse_obj(video, 'mimeType', 'mime_type')), 'fps': float_or_none(traverse_obj(video, 'frameRate', 'frame_rate')), 'width': int_or_none(video.get('width')), 'height': int_or_none(video.get('height')), 'vcodec': video.get('codecs'), 'acodec': 'none' if audios else None, 'dynamic_range': {126: 'DV', 125: 'HDR10'}.get(int_or_none(video.get('id'))), 'tbr': float_or_none(video.get('bandwidth'), scale=1000), 'filesize': int_or_none(video.get('size')), 'quality': int_or_none(video.get('id')), 'format_id': traverse_obj( video, (('baseUrl', 'base_url'), {self._FORMAT_ID_RE.search}, 1), ('id', {str_or_none}), get_all=False), 'format': format_names.get(video.get('id')), } for video in traverse_obj(play_info, ('dash', 'video', ...))) if formats: self._check_missing_formats(play_info, formats) fragments = traverse_obj(play_info, ('durl', lambda _, v: url_or_none(v['url']), { 'url': ('url', {url_or_none}), 'duration': ('length', {functools.partial(float_or_none, scale=1000)}), 'filesize': ('size', {int_or_none}), })) if fragments: formats.append({ 'url': fragments[0]['url'], 'filesize': sum(traverse_obj(fragments, (..., 'filesize'))), **({ 'fragments': fragments, 'protocol': 'http_dash_segments', } if len(fragments) > 1 else {}), **traverse_obj(play_info, { 'quality': ('quality', {int_or_none}), 'format_id': ('quality', {str_or_none}), 'format_note': ('quality', {lambda x: format_names.get(x)}), 'duration': ('timelength', {functools.partial(float_or_none, scale=1000)}), }), **parse_resolution(format_names.get(play_info.get('quality'))), }) return formats def _get_wbi_key(self, video_id): if time.time() < self._wbi_key_cache.get('ts', 0) + self._WBI_KEY_CACHE_TIMEOUT: return self._wbi_key_cache['key'] session_data = self._download_json( 'https://api.bilibili.com/x/web-interface/nav', video_id, note='Downloading wbi sign') lookup = ''.join(traverse_obj(session_data, ( 'data', 'wbi_img', ('img_url', 'sub_url'), {lambda x: x.rpartition('/')[2].partition('.')[0]}))) # from getMixinKey() in the vendor js mixin_key_enc_tab = [ 46, 47, 18, 2, 53, 8, 23, 32, 15, 50, 10, 31, 58, 3, 45, 35, 27, 43, 5, 49, 33, 9, 42, 19, 29, 28, 14, 39, 12, 38, 41, 13, 37, 48, 7, 16, 24, 55, 40, 61, 26, 17, 0, 1, 60, 51, 30, 4, 22, 25, 54, 21, 56, 59, 6, 63, 57, 62, 11, 36, 20, 34, 44, 52, ] self._wbi_key_cache.update({ 'key': ''.join(lookup[i] for i in mixin_key_enc_tab)[:32], 'ts': time.time(), }) return self._wbi_key_cache['key'] def _sign_wbi(self, params, video_id): params['wts'] = round(time.time()) params = { k: ''.join(filter(lambda char: char not in "!'()*", str(v))) for k, v in sorted(params.items()) } query = urllib.parse.urlencode(params) params['w_rid'] = hashlib.md5(f'{query}{self._get_wbi_key(video_id)}'.encode()).hexdigest() return params def _download_playinfo(self, bvid, cid, headers=None, qn=None): params = {'bvid': bvid, 'cid': cid, 'fnval': 4048} if qn: params['qn'] = qn return self._download_json( 'https://api.bilibili.com/x/player/wbi/playurl', bvid, query=self._sign_wbi(params, bvid), headers=headers, note=f'Downloading video formats for cid {cid} {qn or ""}')['data'] def json2srt(self, json_data): srt_data = '' for idx, line in enumerate(json_data.get('body') or []): srt_data += (f'{idx + 1}\n' f'{srt_subtitles_timecode(line["from"])} --> {srt_subtitles_timecode(line["to"])}\n' f'{line["content"]}\n\n') return srt_data def _get_subtitles(self, video_id, cid, aid=None): subtitles = { 'danmaku': [{ 'ext': 'xml', 'url': f'https://comment.bilibili.com/{cid}.xml', }], } video_info = self._download_json( 'https://api.bilibili.com/x/player/v2', video_id, query={'aid': aid, 'cid': cid} if aid else {'bvid': video_id, 'cid': cid}, note=f'Extracting subtitle info {cid}', headers=self._HEADERS) if traverse_obj(video_info, ('data', 'need_login_subtitle')): self.report_warning( f'Subtitles are only available when logged in. {self._login_hint()}', only_once=True) for s in traverse_obj(video_info, ( 'data', 'subtitle', 'subtitles', lambda _, v: v['subtitle_url'] and v['lan'])): subtitles.setdefault(s['lan'], []).append({ 'ext': 'srt', 'data': self.json2srt(self._download_json(s['subtitle_url'], video_id)), }) return subtitles def _get_chapters(self, aid, cid): chapters = aid and cid and self._download_json( 'https://api.bilibili.com/x/player/v2', aid, query={'aid': aid, 'cid': cid}, note='Extracting chapters', fatal=False, headers=self._HEADERS) return traverse_obj(chapters, ('data', 'view_points', ..., { 'title': 'content', 'start_time': 'from', 'end_time': 'to', })) or None def _get_comments(self, aid): for idx in itertools.count(1): replies = traverse_obj( self._download_json( f'https://api.bilibili.com/x/v2/reply?pn={idx}&oid={aid}&type=1&jsonp=jsonp&sort=2&_=1567227301685', aid, note=f'Extracting comments from page {idx}', fatal=False), ('data', 'replies')) if not replies: return for children in map(self._get_all_children, replies): yield from children def _get_all_children(self, reply): yield { 'author': traverse_obj(reply, ('member', 'uname')), 'author_id': traverse_obj(reply, ('member', 'mid')), 'id': reply.get('rpid'), 'text': traverse_obj(reply, ('content', 'message')), 'timestamp': reply.get('ctime'), 'parent': reply.get('parent') or 'root', } for children in map(self._get_all_children, traverse_obj(reply, ('replies', ...))): yield from children def _get_episodes_from_season(self, ss_id, url): season_info = self._download_json( 'https://api.bilibili.com/pgc/web/season/section', ss_id, note='Downloading season info', query={'season_id': ss_id}, headers={'Referer': url, **self.geo_verification_headers()}) for entry in traverse_obj(season_info, ( 'result', 'main_section', 'episodes', lambda _, v: url_or_none(v['share_url']) and v['id'])): yield self.url_result(entry['share_url'], BiliBiliBangumiIE, str_or_none(entry.get('id'))) def _get_divisions(self, video_id, graph_version, edges, edge_id, cid_edges=None): cid_edges = cid_edges or {} division_data = self._download_json( 'https://api.bilibili.com/x/stein/edgeinfo_v2', video_id, query={'graph_version': graph_version, 'edge_id': edge_id, 'bvid': video_id}, note=f'Extracting divisions from edge {edge_id}') edges.setdefault(edge_id, {}).update( traverse_obj(division_data, ('data', 'story_list', lambda _, v: v['edge_id'] == edge_id, { 'title': ('title', {str}), 'cid': ('cid', {int_or_none}), }), get_all=False)) edges[edge_id].update(traverse_obj(division_data, ('data', { 'title': ('title', {str}), 'choices': ('edges', 'questions', ..., 'choices', ..., { 'edge_id': ('id', {int_or_none}), 'cid': ('cid', {int_or_none}), 'text': ('option', {str}), }), }))) # use dict to combine edges that use the same video section (same cid) cid_edges.setdefault(edges[edge_id]['cid'], {})[edge_id] = edges[edge_id] for choice in traverse_obj(edges, (edge_id, 'choices', ...)): if choice['edge_id'] not in edges: edges[choice['edge_id']] = {'cid': choice['cid']} self._get_divisions(video_id, graph_version, edges, choice['edge_id'], cid_edges=cid_edges) return cid_edges def _get_interactive_entries(self, video_id, cid, metainfo, headers=None): graph_version = traverse_obj( self._download_json( 'https://api.bilibili.com/x/player/wbi/v2', video_id, 'Extracting graph version', query={'bvid': video_id, 'cid': cid}, headers=headers), ('data', 'interaction', 'graph_version', {int_or_none})) cid_edges = self._get_divisions(video_id, graph_version, {1: {'cid': cid}}, 1) for cid, edges in cid_edges.items(): play_info = self._download_playinfo(video_id, cid, headers=headers) yield { **metainfo, 'id': f'{video_id}_{cid}', 'title': f'{metainfo.get("title")} - {next(iter(edges.values())).get("title")}', 'formats': self.extract_formats(play_info), 'description': f'{json.dumps(edges, ensure_ascii=False)}\n{metainfo.get("description", "")}', 'duration': float_or_none(play_info.get('timelength'), scale=1000), 'subtitles': self.extract_subtitles(video_id, cid), } class BiliBiliIE(BilibiliBaseIE): _VALID_URL = r'https?://(?:www\.)?bilibili\.com/(?:video/|festival/[^/?#]+\?(?:[^#]*&)?bvid=)[aAbB][vV](?P[^/?#&]+)' _TESTS = [{ 'url': 'https://www.bilibili.com/video/BV13x41117TL', 'info_dict': { 'id': 'BV13x41117TL', 'title': '阿滴英文|英文歌分享#6 "Closer', 'ext': 'mp4', 'description': '滴妹今天唱Closer給你聽! 有史以来,被推最多次也是最久的歌曲,其实歌词跟我原本想像差蛮多的,不过还是好听! 微博@阿滴英文', 'uploader_id': '65880958', 'uploader': '阿滴英文', 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$', 'duration': 554.117, 'tags': list, 'comment_count': int, 'upload_date': '20170301', 'timestamp': 1488353834, 'like_count': int, 'view_count': int, '_old_archive_ids': ['bilibili 8903802_part1'], }, }, { 'note': 'old av URL version', 'url': 'http://www.bilibili.com/video/av1074402/', 'info_dict': { 'id': 'BV11x411K7CN', 'ext': 'mp4', 'title': '【金坷垃】金泡沫', 'uploader': '菊子桑', 'uploader_id': '156160', 'duration': 308.36, 'upload_date': '20140420', 'timestamp': 1397983878, 'description': 'md5:ce18c2a2d2193f0df2917d270f2e5923', 'like_count': int, 'comment_count': int, 'view_count': int, 'tags': list, 'thumbnail': r're:^https?://.*\.(jpg|jpeg)$', '_old_archive_ids': ['bilibili 1074402_part1'], }, 'params': {'skip_download': True}, }, { 'note': 'Anthology', 'url': 'https://www.bilibili.com/video/BV1bK411W797', 'info_dict': { 'id': 'BV1bK411W797', 'title': '物语中的人物是如何吐槽自己的OP的', }, 'playlist_count': 18, 'playlist': [{ 'info_dict': { 'id': 'BV1bK411W797_p1', 'ext': 'mp4', 'title': '物语中的人物是如何吐槽自己的OP的 p01 Staple Stable/战场原+羽川', 'tags': 'count:10', 'timestamp': 1589601697, 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$', 'uploader': '打牌还是打桩', 'uploader_id': '150259984', 'like_count': int, 'comment_count': int, 'upload_date': '20200516', 'view_count': int, 'description': 'md5:e3c401cf7bc363118d1783dd74068a68', 'duration': 90.314, '_old_archive_ids': ['bilibili 498159642_part1'], }, }], }, { 'note': 'Specific page of Anthology', 'url': 'https://www.bilibili.com/video/BV1bK411W797?p=1', 'info_dict': { 'id': 'BV1bK411W797_p1', 'ext': 'mp4', 'title': '物语中的人物是如何吐槽自己的OP的 p01 Staple Stable/战场原+羽川', 'tags': 'count:10', 'timestamp': 1589601697, 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$', 'uploader': '打牌还是打桩', 'uploader_id': '150259984', 'like_count': int, 'comment_count': int, 'upload_date': '20200516', 'view_count': int, 'description': 'md5:e3c401cf7bc363118d1783dd74068a68', 'duration': 90.314, '_old_archive_ids': ['bilibili 498159642_part1'], }, }, { 'url': 'https://www.bilibili.com/video/av8903802/', 'info_dict': { 'id': 'BV13x41117TL', 'ext': 'mp4', 'title': '阿滴英文|英文歌分享#6 "Closer', 'upload_date': '20170301', 'description': 'md5:3b1b9e25b78da4ef87e9b548b88ee76a', 'timestamp': 1488353834, 'uploader_id': '65880958', 'uploader': '阿滴英文', 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$', 'duration': 554.117, 'tags': list, 'comment_count': int, 'view_count': int, 'like_count': int, '_old_archive_ids': ['bilibili 8903802_part1'], }, 'params': { 'skip_download': True, }, }, { 'note': 'video has chapter', 'url': 'https://www.bilibili.com/video/BV1vL411G7N7/', 'info_dict': { 'id': 'BV1vL411G7N7', 'ext': 'mp4', 'title': '如何为你的B站视频添加进度条分段', 'timestamp': 1634554558, 'upload_date': '20211018', 'description': 'md5:a9a3d6702b3a94518d419b2e9c320a6d', 'tags': list, 'uploader': '爱喝咖啡的当麻', 'duration': 669.482, 'uploader_id': '1680903', 'chapters': 'count:6', 'comment_count': int, 'view_count': int, 'like_count': int, 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$', '_old_archive_ids': ['bilibili 463665680_part1'], }, 'params': {'skip_download': True}, }, { 'note': 'video redirects to festival page', 'url': 'https://www.bilibili.com/video/BV1wP4y1P72h', 'info_dict': { 'id': 'BV1wP4y1P72h', 'ext': 'mp4', 'title': '牛虎年相交之际,一首传统民族打击乐《牛斗虎》祝大家新春快乐,虎年大吉!【bilibili音乐虎闹新春】', 'timestamp': 1643947497, 'upload_date': '20220204', 'description': 'md5:8681a0d4d2c06b4ae27e59c8080a7fe6', 'uploader': '叨叨冯聊音乐', 'duration': 246.719, 'uploader_id': '528182630', 'view_count': int, 'like_count': int, 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$', '_old_archive_ids': ['bilibili 893839363_part1'], }, }, { 'note': 'newer festival video', 'url': 'https://www.bilibili.com/festival/2023honkaiimpact3gala?bvid=BV1ay4y1d77f', 'info_dict': { 'id': 'BV1ay4y1d77f', 'ext': 'mp4', 'title': '【崩坏3新春剧场】为特别的你送上祝福!', 'timestamp': 1674273600, 'upload_date': '20230121', 'description': 'md5:58af66d15c6a0122dc30c8adfd828dd8', 'uploader': '果蝇轰', 'duration': 1111.722, 'uploader_id': '8469526', 'view_count': int, 'like_count': int, 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$', '_old_archive_ids': ['bilibili 778246196_part1'], }, }, { 'note': 'legacy flv/mp4 video', 'url': 'https://www.bilibili.com/video/BV1ms411Q7vw/?p=4', 'info_dict': { 'id': 'BV1ms411Q7vw_p4', 'title': '[搞笑]【动画】云南方言快乐生产线出品 p04 新烧包谷之漫游桃花岛', 'timestamp': 1458222815, 'upload_date': '20160317', 'description': '云南方言快乐生产线出品', 'duration': float, 'uploader': '一笑颠天', 'uploader_id': '3916081', 'view_count': int, 'comment_count': int, 'like_count': int, 'tags': list, 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$', '_old_archive_ids': ['bilibili 4120229_part4'], }, 'params': {'extractor_args': {'bilibili': {'prefer_multi_flv': ['32']}}}, 'playlist_count': 19, 'playlist': [{ 'info_dict': { 'id': 'BV1ms411Q7vw_p4_0', 'ext': 'flv', 'title': '[搞笑]【动画】云南方言快乐生产线出品 p04 新烧包谷之漫游桃花岛', 'duration': 399.102, }, }], }, { 'note': 'legacy mp4-only video', 'url': 'https://www.bilibili.com/video/BV1nx411u79K', 'info_dict': { 'id': 'BV1nx411u79K', 'ext': 'mp4', 'title': '【练习室】201603声乐练习《No Air》with VigoVan', 'timestamp': 1508893551, 'upload_date': '20171025', 'description': '@ZERO-G伯远\n声乐练习 《No Air》with Vigo Van', 'duration': 80.384, 'uploader': '伯远', 'uploader_id': '10584494', 'comment_count': int, 'view_count': int, 'like_count': int, 'tags': list, 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$', '_old_archive_ids': ['bilibili 15700301_part1'], }, }, { 'note': 'interactive/split-path video', 'url': 'https://www.bilibili.com/video/BV1af4y1H7ga/', 'info_dict': { 'id': 'BV1af4y1H7ga', 'title': '【互动游戏】花了大半年时间做的自我介绍~请查收!!', 'timestamp': 1630500414, 'upload_date': '20210901', 'description': 'md5:01113e39ab06e28042d74ac356a08786', 'tags': list, 'uploader': '钉宫妮妮Ninico', 'duration': 1503, 'uploader_id': '8881297', 'comment_count': int, 'view_count': int, 'like_count': int, 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$', '_old_archive_ids': ['bilibili 292734508_part1'], }, 'playlist_count': 33, 'playlist': [{ 'info_dict': { 'id': 'BV1af4y1H7ga_400950101', 'ext': 'mp4', 'title': '【互动游戏】花了大半年时间做的自我介绍~请查收!! - 听见猫猫叫~', 'timestamp': 1630500414, 'upload_date': '20210901', 'description': 'md5:db66ac7a2813a94b8291dbce990cc5b2', 'tags': list, 'uploader': '钉宫妮妮Ninico', 'duration': 11.605, 'uploader_id': '8881297', 'comment_count': int, 'view_count': int, 'like_count': int, 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$', '_old_archive_ids': ['bilibili 292734508_part1'], }, }], }, { 'note': '301 redirect to bangumi link', 'url': 'https://www.bilibili.com/video/BV1TE411f7f1', 'info_dict': { 'id': '288525', 'title': '李永乐老师 钱学森弹道和乘波体飞行器是什么?', 'ext': 'mp4', 'series': '我和我的祖国', 'series_id': '4780', 'season': '幕后纪实', 'season_id': '28609', 'season_number': 1, 'episode': '钱学森弹道和乘波体飞行器是什么?', 'episode_id': '288525', 'episode_number': 105, 'duration': 1183.957, 'timestamp': 1571648124, 'upload_date': '20191021', 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$', }, }, { 'note': 'video has subtitles, which requires login', 'url': 'https://www.bilibili.com/video/BV12N4y1M7rh', 'info_dict': { 'id': 'BV12N4y1M7rh', 'ext': 'mp4', 'title': 'md5:96e8bb42c2b432c0d4ce3434a61479c1', 'tags': list, 'description': 'md5:afde2b7ba9025c01d9e3dde10de221e4', 'duration': 313.557, 'upload_date': '20220709', 'uploader': '小夫太渴', 'timestamp': 1657347907, 'uploader_id': '1326814124', 'comment_count': int, 'view_count': int, 'like_count': int, 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$', 'subtitles': 'count:2', # login required for CC subtitle '_old_archive_ids': ['bilibili 898179753_part1'], }, 'params': {'listsubtitles': True}, 'skip': 'login required for subtitle', }, { 'url': 'https://www.bilibili.com/video/BV1jL41167ZG/', 'info_dict': { 'id': 'BV1jL41167ZG', 'title': '一场大火引发的离奇死亡!古典推理经典短篇集《不可能犯罪诊断书》!', 'ext': 'mp4', }, 'skip': 'supporter-only video', }, { 'url': 'https://www.bilibili.com/video/BV1Ks411f7aQ/', 'info_dict': { 'id': 'BV1Ks411f7aQ', 'title': '【BD1080P】狼与香辛料I【华盟】', 'ext': 'mp4', }, 'skip': 'login required', }, { 'url': 'https://www.bilibili.com/video/BV1GJ411x7h7/', 'info_dict': { 'id': 'BV1GJ411x7h7', 'title': '【官方 MV】Never Gonna Give You Up - Rick Astley', 'ext': 'mp4', }, 'skip': 'geo-restricted', }, { 'note': 'has - in the last path segment of the url', 'url': 'https://www.bilibili.com/festival/bh3-7th?bvid=BV1tr4y1f7p2&', 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) headers = self.geo_verification_headers() webpage, urlh = self._download_webpage_handle(url, video_id, headers=headers) if not self._match_valid_url(urlh.url): return self.url_result(urlh.url) headers['Referer'] = url initial_state = self._search_json(r'window\.__INITIAL_STATE__\s*=', webpage, 'initial state', video_id) is_festival = 'videoData' not in initial_state if is_festival: video_data = initial_state['videoInfo'] else: play_info_obj = self._search_json( r'window\.__playinfo__\s*=', webpage, 'play info', video_id, fatal=False) if not play_info_obj: if traverse_obj(initial_state, ('error', 'trueCode')) == -403: self.raise_login_required() if traverse_obj(initial_state, ('error', 'trueCode')) == -404: raise ExtractorError( 'This video may be deleted or geo-restricted. ' 'You might want to try a VPN or a proxy server (with --proxy)', expected=True) play_info = traverse_obj(play_info_obj, ('data', {dict})) if not play_info: if traverse_obj(play_info_obj, 'code') == 87007: toast = get_element_by_class('tips-toast', webpage) or '' msg = clean_html( f'{get_element_by_class("belongs-to", toast) or ""},' + (get_element_by_class('level', toast) or '')) raise ExtractorError( f'This is a supporter-only video: {msg}. {self._login_hint()}', expected=True) raise ExtractorError('Failed to extract play info') video_data = initial_state['videoData'] video_id, title = video_data['bvid'], video_data.get('title') # Bilibili anthologies are similar to playlists but all videos share the same video ID as the anthology itself. page_list_json = not is_festival and traverse_obj( self._download_json( 'https://api.bilibili.com/x/player/pagelist', video_id, fatal=False, query={'bvid': video_id, 'jsonp': 'jsonp'}, note='Extracting videos in anthology', headers=headers), 'data', expected_type=list) or [] is_anthology = len(page_list_json) > 1 part_id = int_or_none(parse_qs(url).get('p', [None])[-1]) if is_anthology and not part_id and self._yes_playlist(video_id, video_id): return self.playlist_from_matches( page_list_json, video_id, title, ie=BiliBiliIE, getter=lambda entry: f'https://www.bilibili.com/video/{video_id}?p={entry["page"]}') if is_anthology: part_id = part_id or 1 title += f' p{part_id:02d} {traverse_obj(page_list_json, (part_id - 1, "part")) or ""}' aid = video_data.get('aid') old_video_id = format_field(aid, None, f'%s_part{part_id or 1}') cid = traverse_obj(video_data, ('pages', part_id - 1, 'cid')) if part_id else video_data.get('cid') festival_info = {} if is_festival: play_info = self._download_playinfo(video_id, cid, headers=headers) festival_info = traverse_obj(initial_state, { 'uploader': ('videoInfo', 'upName'), 'uploader_id': ('videoInfo', 'upMid', {str_or_none}), 'like_count': ('videoStatus', 'like', {int_or_none}), 'thumbnail': ('sectionEpisodes', lambda _, v: v['bvid'] == video_id, 'cover'), }, get_all=False) metainfo = { **traverse_obj(initial_state, { 'uploader': ('upData', 'name'), 'uploader_id': ('upData', 'mid', {str_or_none}), 'like_count': ('videoData', 'stat', 'like', {int_or_none}), 'tags': ('tags', ..., 'tag_name'), 'thumbnail': ('videoData', 'pic', {url_or_none}), }), **festival_info, **traverse_obj(video_data, { 'description': 'desc', 'timestamp': ('pubdate', {int_or_none}), 'view_count': (('viewCount', ('stat', 'view')), {int_or_none}), 'comment_count': ('stat', 'reply', {int_or_none}), }, get_all=False), 'id': f'{video_id}{format_field(part_id, None, "_p%d")}', '_old_archive_ids': [make_archive_id(self, old_video_id)] if old_video_id else None, 'title': title, 'http_headers': {'Referer': url}, } is_interactive = traverse_obj(video_data, ('rights', 'is_stein_gate')) if is_interactive: return self.playlist_result( self._get_interactive_entries(video_id, cid, metainfo, headers=headers), **metainfo, duration=traverse_obj(initial_state, ('videoData', 'duration', {int_or_none})), __post_extractor=self.extract_comments(aid)) else: formats = self.extract_formats(play_info) if not traverse_obj(play_info, ('dash')): # we only have legacy formats and need additional work has_qn = lambda x: x in traverse_obj(formats, (..., 'quality')) for qn in traverse_obj(play_info, ('accept_quality', lambda _, v: not has_qn(v), {int})): formats.extend(traverse_obj( self.extract_formats(self._download_playinfo(video_id, cid, headers=headers, qn=qn)), lambda _, v: not has_qn(v['quality']))) self._check_missing_formats(play_info, formats) flv_formats = traverse_obj(formats, lambda _, v: v['fragments']) if flv_formats and len(flv_formats) < len(formats): # Flv and mp4 are incompatible due to `multi_video` workaround, so drop one if not self._configuration_arg('prefer_multi_flv'): dropped_fmts = ', '.join( f'{f.get("format_note")} ({f.get("format_id")})' for f in flv_formats) formats = traverse_obj(formats, lambda _, v: not v.get('fragments')) if dropped_fmts: self.to_screen( f'Dropping incompatible flv format(s) {dropped_fmts} since mp4 is available. ' 'To extract flv, pass --extractor-args "bilibili:prefer_multi_flv"') else: formats = traverse_obj( # XXX: Filtering by extractor-arg is for testing purposes formats, lambda _, v: v['quality'] == int(self._configuration_arg('prefer_multi_flv')[0]), ) or [max(flv_formats, key=lambda x: x['quality'])] if traverse_obj(formats, (0, 'fragments')): # We have flv formats, which are individual short videos with their own timestamps and metainfo # Binary concatenation corrupts their timestamps, so we need a `multi_video` workaround return { **metainfo, '_type': 'multi_video', 'entries': [{ 'id': f'{metainfo["id"]}_{idx}', 'title': metainfo['title'], 'http_headers': metainfo['http_headers'], 'formats': [{ **fragment, 'format_id': formats[0].get('format_id'), }], 'subtitles': self.extract_subtitles(video_id, cid) if idx == 0 else None, '__post_extractor': self.extract_comments(aid) if idx == 0 else None, } for idx, fragment in enumerate(formats[0]['fragments'])], 'duration': float_or_none(play_info.get('timelength'), scale=1000), } else: return { **metainfo, 'formats': formats, 'duration': float_or_none(play_info.get('timelength'), scale=1000), 'chapters': self._get_chapters(aid, cid), 'subtitles': self.extract_subtitles(video_id, cid), '__post_extractor': self.extract_comments(aid), } class BiliBiliBangumiIE(BilibiliBaseIE): _VALID_URL = r'https?://(?:www\.)?bilibili\.com/bangumi/play/ep(?P\d+)' _TESTS = [{ 'url': 'https://www.bilibili.com/bangumi/play/ep21495/', 'info_dict': { 'id': '21495', 'ext': 'mp4', 'series': '悠久之翼', 'series_id': '774', 'season': '第二季', 'season_id': '1182', 'season_number': 2, 'episode': 'forever/ef', 'episode_id': '21495', 'episode_number': 12, 'title': '12 forever/ef', 'duration': 1420.791, 'timestamp': 1320412200, 'upload_date': '20111104', 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$', }, }, { 'url': 'https://www.bilibili.com/bangumi/play/ep267851', 'info_dict': { 'id': '267851', 'ext': 'mp4', 'series': '鬼灭之刃', 'series_id': '4358', 'season': '立志篇', 'season_id': '26801', 'season_number': 1, 'episode': '残酷', 'episode_id': '267851', 'episode_number': 1, 'title': '1 残酷', 'duration': 1425.256, 'timestamp': 1554566400, 'upload_date': '20190406', 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$', }, 'skip': 'Geo-restricted', }, { 'note': 'a making-of which falls outside main section', 'url': 'https://www.bilibili.com/bangumi/play/ep345120', 'info_dict': { 'id': '345120', 'ext': 'mp4', 'series': '鬼灭之刃', 'series_id': '4358', 'season': '立志篇', 'season_id': '26801', 'season_number': 1, 'episode': '炭治郎篇', 'episode_id': '345120', 'episode_number': 27, 'title': '#1 炭治郎篇', 'duration': 1922.129, 'timestamp': 1602853860, 'upload_date': '20201016', 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$', }, }] def _real_extract(self, url): episode_id = self._match_id(url) headers = self.geo_verification_headers() webpage = self._download_webpage(url, episode_id, headers=headers) if '您所在的地区无法观看本片' in webpage: raise GeoRestrictedError('This video is restricted') elif '正在观看预览,大会员免费看全片' in webpage: self.raise_login_required('This video is for premium members only') headers['Referer'] = url play_info = self._download_json( 'https://api.bilibili.com/pgc/player/web/v2/playurl', episode_id, 'Extracting episode', query={'fnval': '4048', 'ep_id': episode_id}, headers=headers) premium_only = play_info.get('code') == -10403 play_info = traverse_obj(play_info, ('result', 'video_info', {dict})) or {} formats = self.extract_formats(play_info) if not formats and (premium_only or '成为大会员抢先看' in webpage or '开通大会员观看' in webpage): self.raise_login_required('This video is for premium members only') bangumi_info = self._download_json( 'https://api.bilibili.com/pgc/view/web/season', episode_id, 'Get episode details', query={'ep_id': episode_id}, headers=headers)['result'] episode_number, episode_info = next(( (idx, ep) for idx, ep in enumerate(traverse_obj( bangumi_info, (('episodes', ('section', ..., 'episodes')), ..., {dict})), 1) if str_or_none(ep.get('id')) == episode_id), (1, {})) season_id = bangumi_info.get('season_id') season_number, season_title = season_id and next(( (idx + 1, e.get('season_title')) for idx, e in enumerate( traverse_obj(bangumi_info, ('seasons', ...))) if e.get('season_id') == season_id ), (None, None)) aid = episode_info.get('aid') return { 'id': episode_id, 'formats': formats, **traverse_obj(bangumi_info, { 'series': ('series', 'series_title', {str}), 'series_id': ('series', 'series_id', {str_or_none}), 'thumbnail': ('square_cover', {url_or_none}), }), **traverse_obj(episode_info, { 'episode': ('long_title', {str}), 'episode_number': ('title', {int_or_none}, {lambda x: x or episode_number}), 'timestamp': ('pub_time', {int_or_none}), 'title': {lambda v: v and join_nonempty('title', 'long_title', delim=' ', from_dict=v)}, }), 'episode_id': episode_id, 'season': str_or_none(season_title), 'season_id': str_or_none(season_id), 'season_number': season_number, 'duration': float_or_none(play_info.get('timelength'), scale=1000), 'subtitles': self.extract_subtitles(episode_id, episode_info.get('cid'), aid=aid), '__post_extractor': self.extract_comments(aid), 'http_headers': {'Referer': url}, } class BiliBiliBangumiMediaIE(BilibiliBaseIE): _VALID_URL = r'https?://(?:www\.)?bilibili\.com/bangumi/media/md(?P\d+)' _TESTS = [{ 'url': 'https://www.bilibili.com/bangumi/media/md24097891', 'info_dict': { 'id': '24097891', 'title': 'CAROLE & TUESDAY', 'description': 'md5:42417ad33d1eaa1c93bfd2dd1626b829', }, 'playlist_mincount': 25, }, { 'url': 'https://www.bilibili.com/bangumi/media/md1565/', 'info_dict': { 'id': '1565', 'title': '攻壳机动队 S.A.C. 2nd GIG', 'description': 'md5:46cac00bafd645b97f4d6df616fc576d', }, 'playlist_count': 26, 'playlist': [{ 'info_dict': { 'id': '68540', 'ext': 'mp4', 'series': '攻壳机动队', 'series_id': '1077', 'season': '第二季', 'season_id': '1565', 'season_number': 2, 'episode': '再启动 REEMBODY', 'episode_id': '68540', 'episode_number': 1, 'title': '1 再启动 REEMBODY', 'duration': 1525.777, 'timestamp': 1425074413, 'upload_date': '20150227', 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$', }, }], }] def _real_extract(self, url): media_id = self._match_id(url) webpage = self._download_webpage(url, media_id) initial_state = self._search_json( r'window\.__INITIAL_STATE__\s*=', webpage, 'initial_state', media_id) ss_id = initial_state['mediaInfo']['season_id'] return self.playlist_result( self._get_episodes_from_season(ss_id, url), media_id, **traverse_obj(initial_state, ('mediaInfo', { 'title': ('title', {str}), 'description': ('evaluate', {str}), }))) class BiliBiliBangumiSeasonIE(BilibiliBaseIE): _VALID_URL = r'(?x)https?://(?:www\.)?bilibili\.com/bangumi/play/ss(?P\d+)' _TESTS = [{ 'url': 'https://www.bilibili.com/bangumi/play/ss26801', 'info_dict': { 'id': '26801', 'title': '鬼灭之刃', 'description': 'md5:e2cc9848b6f69be6db79fc2a82d9661b', }, 'playlist_mincount': 26, }, { 'url': 'https://www.bilibili.com/bangumi/play/ss2251', 'info_dict': { 'id': '2251', 'title': '玲音', 'description': 'md5:1fd40e3df4c08d4d9d89a6a34844bdc4', }, 'playlist_count': 13, 'playlist': [{ 'info_dict': { 'id': '50188', 'ext': 'mp4', 'series': '玲音', 'series_id': '1526', 'season': 'TV', 'season_id': '2251', 'season_number': 1, 'episode': 'WEIRD', 'episode_id': '50188', 'episode_number': 1, 'title': '1 WEIRD', 'duration': 1436.992, 'timestamp': 1343185080, 'upload_date': '20120725', 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$', }, }], }] def _real_extract(self, url): ss_id = self._match_id(url) webpage = self._download_webpage(url, ss_id) metainfo = traverse_obj( self._search_json(r']+type="application/ld\+json"[^>]*>', webpage, 'info', ss_id), ('itemListElement', ..., { 'title': ('name', {str}), 'description': ('description', {str}), }), get_all=False) return self.playlist_result(self._get_episodes_from_season(ss_id, url), ss_id, **metainfo) class BilibiliCheeseBaseIE(BilibiliBaseIE): def _extract_episode(self, season_info, ep_id): episode_info = traverse_obj(season_info, ( 'episodes', lambda _, v: v['id'] == int(ep_id)), get_all=False) aid, cid = episode_info['aid'], episode_info['cid'] if traverse_obj(episode_info, 'ep_status') == -1: raise ExtractorError('This course episode is not yet available.', expected=True) if not traverse_obj(episode_info, 'playable'): self.raise_login_required('You need to purchase the course to download this episode') play_info = self._download_json( 'https://api.bilibili.com/pugv/player/web/playurl', ep_id, query={'avid': aid, 'cid': cid, 'ep_id': ep_id, 'fnval': 16, 'fourk': 1}, headers=self._HEADERS, note='Downloading playinfo')['data'] return { 'id': str_or_none(ep_id), 'episode_id': str_or_none(ep_id), 'formats': self.extract_formats(play_info), 'extractor_key': BilibiliCheeseIE.ie_key(), 'extractor': BilibiliCheeseIE.IE_NAME, 'webpage_url': f'https://www.bilibili.com/cheese/play/ep{ep_id}', **traverse_obj(episode_info, { 'episode': ('title', {str}), 'title': {lambda v: v and join_nonempty('index', 'title', delim=' - ', from_dict=v)}, 'alt_title': ('subtitle', {str}), 'duration': ('duration', {int_or_none}), 'episode_number': ('index', {int_or_none}), 'thumbnail': ('cover', {url_or_none}), 'timestamp': ('release_date', {int_or_none}), 'view_count': ('play', {int_or_none}), }), **traverse_obj(season_info, { 'uploader': ('up_info', 'uname', {str}), 'uploader_id': ('up_info', 'mid', {str_or_none}), }), 'subtitles': self.extract_subtitles(ep_id, cid, aid=aid), '__post_extractor': self.extract_comments(aid), 'http_headers': self._HEADERS, } def _download_season_info(self, query_key, video_id): return self._download_json( f'https://api.bilibili.com/pugv/view/web/season?{query_key}={video_id}', video_id, headers=self._HEADERS, note='Downloading season info')['data'] class BilibiliCheeseIE(BilibiliCheeseBaseIE): _VALID_URL = r'https?://(?:www\.)?bilibili\.com/cheese/play/ep(?P\d+)' _TESTS = [{ 'url': 'https://www.bilibili.com/cheese/play/ep229832', 'info_dict': { 'id': '229832', 'ext': 'mp4', 'title': '1 - 课程先导片', 'alt_title': '视频课 · 3分41秒', 'uploader': '马督工', 'uploader_id': '316568752', 'episode': '课程先导片', 'episode_id': '229832', 'episode_number': 1, 'duration': 221, 'timestamp': 1695549606, 'upload_date': '20230924', 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$', 'view_count': int, }, }] def _real_extract(self, url): ep_id = self._match_id(url) return self._extract_episode(self._download_season_info('ep_id', ep_id), ep_id) class BilibiliCheeseSeasonIE(BilibiliCheeseBaseIE): _VALID_URL = r'https?://(?:www\.)?bilibili\.com/cheese/play/ss(?P\d+)' _TESTS = [{ 'url': 'https://www.bilibili.com/cheese/play/ss5918', 'info_dict': { 'id': '5918', 'title': '【限时五折】新闻系学不到:马督工教你做自媒体', 'description': '帮普通人建立世界模型,降低人与人的沟通门槛', }, 'playlist': [{ 'info_dict': { 'id': '229832', 'ext': 'mp4', 'title': '1 - 课程先导片', 'alt_title': '视频课 · 3分41秒', 'uploader': '马督工', 'uploader_id': '316568752', 'episode': '课程先导片', 'episode_id': '229832', 'episode_number': 1, 'duration': 221, 'timestamp': 1695549606, 'upload_date': '20230924', 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$', 'view_count': int, }, }], 'params': {'playlist_items': '1'}, }, { 'url': 'https://www.bilibili.com/cheese/play/ss5918', 'info_dict': { 'id': '5918', 'title': '【限时五折】新闻系学不到:马督工教你做自媒体', 'description': '帮普通人建立世界模型,降低人与人的沟通门槛', }, 'playlist_mincount': 5, 'skip': 'paid video in list', }] def _get_cheese_entries(self, season_info): for ep_id in traverse_obj(season_info, ('episodes', lambda _, v: v['episode_can_view'], 'id')): yield self._extract_episode(season_info, ep_id) def _real_extract(self, url): season_id = self._match_id(url) season_info = self._download_season_info('season_id', season_id) return self.playlist_result( self._get_cheese_entries(season_info), season_id, **traverse_obj(season_info, { 'title': ('title', {str}), 'description': ('subtitle', {str}), })) class BilibiliSpaceBaseIE(BilibiliBaseIE): def _extract_playlist(self, fetch_page, get_metadata, get_entries): first_page = fetch_page(0) metadata = get_metadata(first_page) paged_list = InAdvancePagedList( lambda idx: get_entries(fetch_page(idx) if idx else first_page), metadata['page_count'], metadata['page_size']) return metadata, paged_list class BilibiliSpaceVideoIE(BilibiliSpaceBaseIE): _VALID_URL = r'https?://space\.bilibili\.com/(?P\d+)(?P