[ie/youtube] Add po_token, visitor_data, data_sync_id extractor args (#10648)

Authored by:  seproDev, coletdjnz, bashonly
This commit is contained in:
sepro 2024-09-13 12:51:58 +02:00 committed by GitHub
parent d1c4d88b2d
commit 3a3bd00037
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 182 additions and 78 deletions

View File

@ -1777,6 +1777,9 @@ The following extractors use this feature:
* `innertube_host`: Innertube API host to use for all API requests; e.g. `studio.youtube.com`, `youtubei.googleapis.com`. Note that cookies exported from one subdomain will not work on others * `innertube_host`: Innertube API host to use for all API requests; e.g. `studio.youtube.com`, `youtubei.googleapis.com`. Note that cookies exported from one subdomain will not work on others
* `innertube_key`: Innertube API key to use for all API requests. By default, no API key is used * `innertube_key`: Innertube API key to use for all API requests. By default, no API key is used
* `raise_incomplete_data`: `Incomplete Data Received` raises an error instead of reporting a warning * `raise_incomplete_data`: `Incomplete Data Received` raises an error instead of reporting a warning
* `data_sync_id`: Overrides the account Data Sync ID used in Innertube API requests. This may be needed if you are using an account with `youtube:player_skip=webpage,configs` or `youtubetab:skip=webpage`
* `visitor_data`: Overrides the Visitor Data used in Innertube API requests. This should be used with `player_skip=webpage,configs` and without cookies. Note: this may have adverse effects if used improperly. If a session from a browser is wanted, you should pass cookies instead (which contain the Visitor ID)
* `po_token`: Proof of Origin (PO) Token(s) to use for requesting video playback. Comma seperated list of PO Tokens in the format `CLIENT+PO_TOKEN`, e.g. `youtube:po_token=web+XXX,android+YYY`
#### youtubetab (YouTube playlists, channels, feeds, etc.) #### youtubetab (YouTube playlists, channels, feeds, etc.)
* `skip`: One or more of `webpage` (skip initial webpage download), `authcheck` (allow the download of playlists requiring authentication when no initial webpage is downloaded. This may cause unwanted behavior, see [#1122](https://github.com/yt-dlp/yt-dlp/pull/1122) for more details) * `skip`: One or more of `webpage` (skip initial webpage download), `authcheck` (allow the download of playlists requiring authentication when no initial webpage is downloaded. This may cause unwanted behavior, see [#1122](https://github.com/yt-dlp/yt-dlp/pull/1122) for more details)

View File

@ -69,6 +69,8 @@ from ..utils import (
) )
STREAMING_DATA_CLIENT_NAME = '__yt_dlp_client' STREAMING_DATA_CLIENT_NAME = '__yt_dlp_client'
STREAMING_DATA_PO_TOKEN = '__yt_dlp_po_token'
# any clients starting with _ cannot be explicitly requested by the user # any clients starting with _ cannot be explicitly requested by the user
INNERTUBE_CLIENTS = { INNERTUBE_CLIENTS = {
'web': { 'web': {
@ -79,6 +81,7 @@ INNERTUBE_CLIENTS = {
}, },
}, },
'INNERTUBE_CONTEXT_CLIENT_NAME': 1, 'INNERTUBE_CONTEXT_CLIENT_NAME': 1,
'REQUIRE_PO_TOKEN': True,
}, },
# Safari UA returns pre-merged video+audio 144p/240p/360p/720p/1080p HLS formats # Safari UA returns pre-merged video+audio 144p/240p/360p/720p/1080p HLS formats
'web_safari': { 'web_safari': {
@ -90,6 +93,7 @@ INNERTUBE_CLIENTS = {
}, },
}, },
'INNERTUBE_CONTEXT_CLIENT_NAME': 1, 'INNERTUBE_CONTEXT_CLIENT_NAME': 1,
'REQUIRE_PO_TOKEN': True,
}, },
'web_embedded': { 'web_embedded': {
'INNERTUBE_CONTEXT': { 'INNERTUBE_CONTEXT': {
@ -132,6 +136,7 @@ INNERTUBE_CLIENTS = {
}, },
'INNERTUBE_CONTEXT_CLIENT_NAME': 3, 'INNERTUBE_CONTEXT_CLIENT_NAME': 3,
'REQUIRE_JS_PLAYER': False, 'REQUIRE_JS_PLAYER': False,
'REQUIRE_PO_TOKEN': True,
}, },
'android_music': { 'android_music': {
'INNERTUBE_CONTEXT': { 'INNERTUBE_CONTEXT': {
@ -146,6 +151,7 @@ INNERTUBE_CLIENTS = {
}, },
'INNERTUBE_CONTEXT_CLIENT_NAME': 21, 'INNERTUBE_CONTEXT_CLIENT_NAME': 21,
'REQUIRE_JS_PLAYER': False, 'REQUIRE_JS_PLAYER': False,
'REQUIRE_PO_TOKEN': True,
}, },
'android_creator': { 'android_creator': {
'INNERTUBE_CONTEXT': { 'INNERTUBE_CONTEXT': {
@ -160,6 +166,7 @@ INNERTUBE_CLIENTS = {
}, },
'INNERTUBE_CONTEXT_CLIENT_NAME': 14, 'INNERTUBE_CONTEXT_CLIENT_NAME': 14,
'REQUIRE_JS_PLAYER': False, 'REQUIRE_JS_PLAYER': False,
'REQUIRE_PO_TOKEN': True,
}, },
# YouTube Kids videos aren't returned on this client for some reason # YouTube Kids videos aren't returned on this client for some reason
'android_vr': { 'android_vr': {
@ -323,6 +330,7 @@ def build_innertube_clients():
for client, ytcfg in tuple(INNERTUBE_CLIENTS.items()): for client, ytcfg in tuple(INNERTUBE_CLIENTS.items()):
ytcfg.setdefault('INNERTUBE_HOST', 'www.youtube.com') ytcfg.setdefault('INNERTUBE_HOST', 'www.youtube.com')
ytcfg.setdefault('REQUIRE_JS_PLAYER', True) ytcfg.setdefault('REQUIRE_JS_PLAYER', True)
ytcfg.setdefault('REQUIRE_PO_TOKEN', False)
ytcfg.setdefault('PLAYER_PARAMS', None) ytcfg.setdefault('PLAYER_PARAMS', None)
ytcfg['INNERTUBE_CONTEXT']['client'].setdefault('hl', 'en') ytcfg['INNERTUBE_CONTEXT']['client'].setdefault('hl', 'en')
@ -688,31 +696,46 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
r'\bID_TOKEN["\']\s*:\s*["\'](.+?)["\']', webpage, r'\bID_TOKEN["\']\s*:\s*["\'](.+?)["\']', webpage,
'identity token', default=None, fatal=False) 'identity token', default=None, fatal=False)
@staticmethod def _data_sync_id_to_delegated_session_id(self, data_sync_id):
def _extract_account_syncid(*args): if not data_sync_id:
""" return
Extract syncId required to download private playlists of secondary channels
@params response and/or ytcfg
"""
for data in args:
# ytcfg includes channel_syncid if on secondary channel
delegated_sid = try_get(data, lambda x: x['DELEGATED_SESSION_ID'], str)
if delegated_sid:
return delegated_sid
sync_ids = (try_get(
data, (lambda x: x['responseContext']['mainAppWebResponseContext']['datasyncId'],
lambda x: x['DATASYNC_ID']), str) or '').split('||')
if len(sync_ids) >= 2 and sync_ids[1]:
# datasyncid is of the form "channel_syncid||user_syncid" for secondary channel # datasyncid is of the form "channel_syncid||user_syncid" for secondary channel
# and just "user_syncid||" for primary channel. We only want the channel_syncid # and just "user_syncid||" for primary channel. We only want the channel_syncid
return sync_ids[0] channel_syncid, _, user_syncid = data_sync_id.partition('||')
if user_syncid:
return channel_syncid
@staticmethod def _extract_account_syncid(self, *args):
def _extract_visitor_data(*args): """
Extract current session ID required to download private playlists of secondary channels
@params response and/or ytcfg
"""
# ytcfg includes channel_syncid if on secondary channel
if delegated_sid := traverse_obj(args, (..., 'DELEGATED_SESSION_ID', {str}, any)):
return delegated_sid
data_sync_id = self._extract_data_sync_id(*args)
return self._data_sync_id_to_delegated_session_id(data_sync_id)
def _extract_data_sync_id(self, *args):
"""
Extract current account dataSyncId.
In the format DELEGATED_SESSION_ID||USER_SESSION_ID or USER_SESSION_ID||
@params response and/or ytcfg
"""
if data_sync_id := self._configuration_arg('data_sync_id', [None], ie_key=YoutubeIE, casesense=True)[0]:
return data_sync_id
return traverse_obj(
args, (..., ('DATASYNC_ID', ('responseContext', 'mainAppWebResponseContext', 'datasyncId')), {str}, any))
def _extract_visitor_data(self, *args):
""" """
Extracts visitorData from an API response or ytcfg Extracts visitorData from an API response or ytcfg
Appears to be used to track session state Appears to be used to track session state
""" """
if visitor_data := self._configuration_arg('visitor_data', [None], ie_key=YoutubeIE, casesense=True)[0]:
return visitor_data
return get_first( return get_first(
args, [('VISITOR_DATA', ('INNERTUBE_CONTEXT', 'client', 'visitorData'), ('responseContext', 'visitorData'))], args, [('VISITOR_DATA', ('INNERTUBE_CONTEXT', 'client', 'visitorData'), ('responseContext', 'visitorData'))],
expected_type=str) expected_type=str)
@ -1334,11 +1357,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'401': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'av01.0.12M.08'}, '401': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'av01.0.12M.08'},
} }
_SUBTITLE_FORMATS = ('json3', 'srv1', 'srv2', 'srv3', 'ttml', 'vtt') _SUBTITLE_FORMATS = ('json3', 'srv1', 'srv2', 'srv3', 'ttml', 'vtt')
_POTOKEN_EXPERIMENTS = ('51217476', '51217102')
_BROKEN_CLIENTS = {
short_client_name(client): client
for client in ('android', 'android_creator', 'android_music')
}
_DEFAULT_CLIENTS = ('ios', 'web_creator') _DEFAULT_CLIENTS = ('ios', 'web_creator')
_GEO_BYPASS = False _GEO_BYPASS = False
@ -3701,6 +3719,54 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
**cls._get_checkok_params(), **cls._get_checkok_params(),
} }
def _get_config_po_token(self, client):
po_token_strs = self._configuration_arg('po_token', [], ie_key=YoutubeIE, casesense=True)
for token_str in po_token_strs:
po_token_client, sep, po_token = token_str.partition('+')
if not sep:
self.report_warning(
f'Invalid po_token configuration format. Expected "client+po_token", got "{token_str}"', only_once=True)
continue
if po_token_client == client:
return po_token
def fetch_po_token(self, client='web', visitor_data=None, data_sync_id=None, player_url=None, **kwargs):
# PO Token is bound to visitor_data / Visitor ID when logged out. Must have visitor_data for it to function.
if not visitor_data and not self.is_authenticated and player_url:
self.report_warning(
f'Unable to fetch PO Token for {client} client: Missing required Visitor Data. '
f'You may need to pass Visitor Data with --extractor-args "youtube:visitor_data=XXX"')
return
config_po_token = self._get_config_po_token(client)
if config_po_token:
# PO token is bound to data_sync_id / account Session ID when logged in. However, for the config po_token,
# if using first channel in an account then we don't need the data_sync_id anymore...
if not data_sync_id and self.is_authenticated and player_url:
self.report_warning(
f'Got a PO Token for {client} client, but missing Data Sync ID for account. Formats may not work.'
f'You may need to pass a Data Sync ID with --extractor-args "youtube:data_sync_id=XXX"')
return config_po_token
# Require PO Token if logged in for external fetching
if not data_sync_id and self.is_authenticated and player_url:
self.report_warning(
f'Unable to fetch PO Token for {client} client: Missing required Data Sync ID for account. '
f'You may need to pass a Data Sync ID with --extractor-args "youtube:data_sync_id=XXX"')
return
return self._fetch_po_token(
client=client,
visitor_data=visitor_data,
data_sync_id=data_sync_id,
player_url=player_url,
**kwargs,
)
def _fetch_po_token(self, client, visitor_data=None, data_sync_id=None, player_url=None, **kwargs):
"""External PO Token fetch stub"""
@staticmethod @staticmethod
def _is_agegated(player_response): def _is_agegated(player_response):
if traverse_obj(player_response, ('playabilityStatus', 'desktopLegacyAgeGateReason')): if traverse_obj(player_response, ('playabilityStatus', 'desktopLegacyAgeGateReason')):
@ -3717,13 +3783,17 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
def _is_unplayable(player_response): def _is_unplayable(player_response):
return traverse_obj(player_response, ('playabilityStatus', 'status')) == 'UNPLAYABLE' return traverse_obj(player_response, ('playabilityStatus', 'status')) == 'UNPLAYABLE'
def _extract_player_response(self, client, video_id, master_ytcfg, player_ytcfg, player_url, initial_pr, smuggled_data): def _extract_player_response(self, client, video_id, master_ytcfg, player_ytcfg, player_url, initial_pr, visitor_data, data_sync_id, po_token):
session_index = self._extract_session_index(player_ytcfg, master_ytcfg)
syncid = self._extract_account_syncid(player_ytcfg, master_ytcfg, initial_pr)
sts = self._extract_signature_timestamp(video_id, player_url, master_ytcfg, fatal=False) if player_url else None
headers = self.generate_api_headers( headers = self.generate_api_headers(
ytcfg=player_ytcfg, account_syncid=syncid, session_index=session_index, default_client=client) ytcfg=player_ytcfg,
default_client=client,
visitor_data=visitor_data,
session_index=self._extract_session_index(master_ytcfg, player_ytcfg),
account_syncid=(
self._data_sync_id_to_delegated_session_id(data_sync_id)
or self._extract_account_syncid(master_ytcfg, initial_pr, player_ytcfg)
),
)
yt_query = { yt_query = {
'videoId': video_id, 'videoId': video_id,
@ -3734,6 +3804,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
if player_params := self._configuration_arg('player_params', [default_pp], casesense=True)[0]: if player_params := self._configuration_arg('player_params', [default_pp], casesense=True)[0]:
yt_query['params'] = player_params yt_query['params'] = player_params
if po_token:
yt_query['serviceIntegrityDimensions'] = {'poToken': po_token}
sts = self._extract_signature_timestamp(video_id, player_url, master_ytcfg, fatal=False) if player_url else None
yt_query.update(self._generate_player_context(sts)) yt_query.update(self._generate_player_context(sts))
return self._extract_response( return self._extract_response(
item_id=video_id, ep='player', query=yt_query, item_id=video_id, ep='player', query=yt_query,
@ -3744,7 +3818,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
def _get_requested_clients(self, url, smuggled_data): def _get_requested_clients(self, url, smuggled_data):
requested_clients = [] requested_clients = []
broken_clients = []
excluded_clients = [] excluded_clients = []
allowed_clients = sorted( allowed_clients = sorted(
(client for client in INNERTUBE_CLIENTS if client[:1] != '_'), (client for client in INNERTUBE_CLIENTS if client[:1] != '_'),
@ -3758,12 +3831,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
excluded_clients.append(client[1:]) excluded_clients.append(client[1:])
elif client not in allowed_clients: elif client not in allowed_clients:
self.report_warning(f'Skipping unsupported client "{client}"') self.report_warning(f'Skipping unsupported client "{client}"')
elif client in self._BROKEN_CLIENTS.values():
broken_clients.append(client)
else: else:
requested_clients.append(client) requested_clients.append(client)
# Force deprioritization of _BROKEN_CLIENTS for format de-duplication
requested_clients.extend(broken_clients)
if not requested_clients: if not requested_clients:
requested_clients.extend(self._DEFAULT_CLIENTS) requested_clients.extend(self._DEFAULT_CLIENTS)
for excluded_client in excluded_clients: for excluded_client in excluded_clients:
@ -3788,19 +3857,14 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
return pr_id return pr_id
def _extract_player_responses(self, clients, video_id, webpage, master_ytcfg, smuggled_data): def _extract_player_responses(self, clients, video_id, webpage, master_ytcfg, smuggled_data):
initial_pr = ignore_initial_response = None initial_pr = None
if webpage: if webpage:
if 'web' in clients:
experiments = traverse_obj(master_ytcfg, (
'WEB_PLAYER_CONTEXT_CONFIGS', ..., 'serializedExperimentIds', {lambda x: x.split(',')}, ...))
if all(x in experiments for x in self._POTOKEN_EXPERIMENTS):
self.report_warning(
'Webpage contains broken formats (poToken experiment detected). Ignoring initial player response')
ignore_initial_response = True
initial_pr = self._search_json( initial_pr = self._search_json(
self._YT_INITIAL_PLAYER_RESPONSE_RE, webpage, 'initial player response', video_id, fatal=False) self._YT_INITIAL_PLAYER_RESPONSE_RE, webpage, 'initial player response', video_id, fatal=False)
prs = [] prs = []
deprioritized_prs = []
if initial_pr and not self._invalid_player_response(initial_pr, video_id): if initial_pr and not self._invalid_player_response(initial_pr, video_id):
# Android player_response does not have microFormats which are needed for # Android player_response does not have microFormats which are needed for
# extraction of some data. So we return the initial_pr with formats # extraction of some data. So we return the initial_pr with formats
@ -3822,14 +3886,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
return return
tried_iframe_fallback = False tried_iframe_fallback = False
player_url = None player_url = visitor_data = data_sync_id = None
skipped_clients = {} skipped_clients = {}
while clients: while clients:
deprioritize_pr = False
client, base_client, variant = _split_innertube_client(clients.pop()) client, base_client, variant = _split_innertube_client(clients.pop())
player_ytcfg = {} player_ytcfg = master_ytcfg if client == 'web' else {}
if client == 'web': if 'configs' not in self._configuration_arg('player_skip') and client != 'web':
player_ytcfg = self._get_default_ytcfg() if ignore_initial_response else master_ytcfg
elif 'configs' not in self._configuration_arg('player_skip'):
player_ytcfg = self._download_ytcfg(client, video_id) or player_ytcfg player_ytcfg = self._download_ytcfg(client, video_id) or player_ytcfg
player_url = player_url or self._extract_player_url(master_ytcfg, player_ytcfg, webpage=webpage) player_url = player_url or self._extract_player_url(master_ytcfg, player_ytcfg, webpage=webpage)
@ -3842,33 +3905,52 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
player_url = self._download_player_url(video_id) player_url = self._download_player_url(video_id)
tried_iframe_fallback = True tried_iframe_fallback = True
pr = initial_pr if client == 'web' and not ignore_initial_response else None visitor_data = visitor_data or self._extract_visitor_data(master_ytcfg, initial_pr, player_ytcfg)
for retry in self.RetryManager(fatal=False): data_sync_id = data_sync_id or self._extract_data_sync_id(master_ytcfg, initial_pr, player_ytcfg)
po_token = self.fetch_po_token(
client=client, visitor_data=visitor_data,
data_sync_id=data_sync_id if self.is_authenticated else None,
player_url=player_url if require_js_player else None,
)
require_po_token = self._get_default_ytcfg(client).get('REQUIRE_PO_TOKEN')
if not po_token and require_po_token:
self.report_warning(
f'No PO Token provided for {client} client, '
f'which is required for working {client} formats. '
f'You can manually pass a PO Token for this client with '
f'--extractor-args "youtube:po_token={client}+XXX"',
only_once=True)
deprioritize_pr = True
pr = initial_pr if client == 'web' else None
try: try:
pr = pr or self._extract_player_response( pr = pr or self._extract_player_response(
client, video_id, player_ytcfg or master_ytcfg, player_ytcfg, client, video_id,
player_url if require_js_player else None, initial_pr, smuggled_data) master_ytcfg=player_ytcfg or master_ytcfg,
player_ytcfg=player_ytcfg,
player_url=player_url,
initial_pr=initial_pr,
visitor_data=visitor_data,
data_sync_id=data_sync_id,
po_token=po_token)
except ExtractorError as e: except ExtractorError as e:
self.report_warning(e) self.report_warning(e)
break
experiments = traverse_obj(pr, (
'responseContext', 'serviceTrackingParams', lambda _, v: v['service'] == 'GFEEDBACK',
'params', lambda _, v: v['key'] == 'e', 'value', {lambda x: x.split(',')}, ...))
if all(x in experiments for x in self._POTOKEN_EXPERIMENTS):
pr = None
retry.error = ExtractorError('API returned broken formats (poToken experiment detected)', expected=True)
if not pr:
continue continue
if pr_id := self._invalid_player_response(pr, video_id): if pr_id := self._invalid_player_response(pr, video_id):
skipped_clients[client] = pr_id skipped_clients[client] = pr_id
elif pr: elif pr:
# Save client name for introspection later # Save client name for introspection later
name = short_client_name(client)
sd = traverse_obj(pr, ('streamingData', {dict})) or {} sd = traverse_obj(pr, ('streamingData', {dict})) or {}
sd[STREAMING_DATA_CLIENT_NAME] = name sd[STREAMING_DATA_CLIENT_NAME] = client
sd[STREAMING_DATA_PO_TOKEN] = po_token
for f in traverse_obj(sd, (('formats', 'adaptiveFormats'), ..., {dict})): for f in traverse_obj(sd, (('formats', 'adaptiveFormats'), ..., {dict})):
f[STREAMING_DATA_CLIENT_NAME] = name f[STREAMING_DATA_CLIENT_NAME] = client
f[STREAMING_DATA_PO_TOKEN] = po_token
if deprioritize_pr:
deprioritized_prs.append(pr)
else:
prs.append(pr) prs.append(pr)
# tv_embedded can work around age-gate and age-verification IF the video is embeddable # tv_embedded can work around age-gate and age-verification IF the video is embeddable
@ -3893,6 +3975,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
# _producer, _testsuite, & _vr variants can also work around age-verification # _producer, _testsuite, & _vr variants can also work around age-verification
append_client('web_creator', 'mediaconnect') append_client('web_creator', 'mediaconnect')
prs.extend(deprioritized_prs)
if skipped_clients: if skipped_clients:
self.report_warning( self.report_warning(
f'Skipping player responses from {"/".join(skipped_clients)} clients ' f'Skipping player responses from {"/".join(skipped_clients)} clients '
@ -4027,13 +4111,17 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
f'{video_id}: Some formats are possibly damaged. They will be deprioritized', only_once=True) f'{video_id}: Some formats are possibly damaged. They will be deprioritized', only_once=True)
client_name = fmt.get(STREAMING_DATA_CLIENT_NAME) client_name = fmt.get(STREAMING_DATA_CLIENT_NAME)
# _BROKEN_CLIENTS return videoplayback URLs that expire after 30 seconds po_token = fmt.get(STREAMING_DATA_PO_TOKEN)
# Ref: https://github.com/yt-dlp/yt-dlp/issues/9554
is_broken = client_name in self._BROKEN_CLIENTS if po_token:
fmt_url = update_url_query(fmt_url, {'pot': po_token})
# Clients that require PO Token return videoplayback URLs that may return 403
is_broken = (not po_token and self._get_default_ytcfg(client_name).get('REQUIRE_PO_TOKEN'))
if is_broken: if is_broken:
self.report_warning( self.report_warning(
f'{video_id}: {self._BROKEN_CLIENTS[client_name]} client formats are broken ' f'{video_id}: {client_name} client formats require a PO Token which was not provided. '
'and may yield HTTP Error 403. They will be deprioritized', only_once=True) 'They will be deprioritized as they may yield HTTP Error 403', only_once=True)
name = fmt.get('qualityLabel') or quality.replace('audio_quality_', '') or '' name = fmt.get('qualityLabel') or quality.replace('audio_quality_', '') or ''
fps = int_or_none(fmt.get('fps')) or 0 fps = int_or_none(fmt.get('fps')) or 0
@ -4109,12 +4197,24 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
elif skip_bad_formats and live_status == 'is_live' and needs_live_processing != 'is_live': elif skip_bad_formats and live_status == 'is_live' and needs_live_processing != 'is_live':
skip_manifests.add('dash') skip_manifests.add('dash')
def process_manifest_format(f, proto, client_name, itag): def process_manifest_format(f, proto, client_name, itag, po_token):
key = (proto, f.get('language')) key = (proto, f.get('language'))
if not all_formats and key in itags[itag]: if not all_formats and key in itags[itag]:
return False return False
itags[itag].add(key) itags[itag].add(key)
if f.get('source_preference') is None:
f['source_preference'] = -1
# Clients that require PO Token return videoplayback URLs that may return 403
# hls does not currently require PO Token
if (not po_token and self._get_default_ytcfg(client_name).get('REQUIRE_PO_TOKEN')) and proto != 'hls':
self.report_warning(
f'{video_id}: {client_name} client {proto} formats require a PO Token which was not provided. '
'They will be deprioritized as they may yield HTTP Error 403', only_once=True)
f['format_note'] = join_nonempty(f.get('format_note'), 'BROKEN', delim=' ')
f['source_preference'] -= 20
if itag and all_formats: if itag and all_formats:
f['format_id'] = f'{itag}-{proto}' f['format_id'] = f'{itag}-{proto}'
elif any(p != proto for p, _ in itags[itag]): elif any(p != proto for p, _ in itags[itag]):
@ -4126,9 +4226,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
f['format_note'] = join_nonempty(f.get('format_note'), '(default)', delim=' ') f['format_note'] = join_nonempty(f.get('format_note'), '(default)', delim=' ')
f['language_preference'] = PREFERRED_LANG_VALUE f['language_preference'] = PREFERRED_LANG_VALUE
if f.get('source_preference') is None:
f['source_preference'] = -1
if itag in ('616', '235'): if itag in ('616', '235'):
f['format_note'] = join_nonempty(f.get('format_note'), 'Premium', delim=' ') f['format_note'] = join_nonempty(f.get('format_note'), 'Premium', delim=' ')
f['source_preference'] += 100 f['source_preference'] += 100
@ -4149,23 +4246,27 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
subtitles = {} subtitles = {}
for sd in streaming_data: for sd in streaming_data:
client_name = sd.get(STREAMING_DATA_CLIENT_NAME) client_name = sd.get(STREAMING_DATA_CLIENT_NAME)
po_token = sd.get(STREAMING_DATA_PO_TOKEN)
hls_manifest_url = 'hls' not in skip_manifests and sd.get('hlsManifestUrl') hls_manifest_url = 'hls' not in skip_manifests and sd.get('hlsManifestUrl')
if hls_manifest_url: if hls_manifest_url:
if po_token:
hls_manifest_url = hls_manifest_url.rstrip('/') + f'/pot/{po_token}'
fmts, subs = self._extract_m3u8_formats_and_subtitles( fmts, subs = self._extract_m3u8_formats_and_subtitles(
hls_manifest_url, video_id, 'mp4', fatal=False, live=live_status == 'is_live') hls_manifest_url, video_id, 'mp4', fatal=False, live=live_status == 'is_live')
subtitles = self._merge_subtitles(subs, subtitles) subtitles = self._merge_subtitles(subs, subtitles)
for f in fmts: for f in fmts:
if process_manifest_format(f, 'hls', client_name, self._search_regex( if process_manifest_format(f, 'hls', client_name, self._search_regex(
r'/itag/(\d+)', f['url'], 'itag', default=None)): r'/itag/(\d+)', f['url'], 'itag', default=None), po_token):
yield f yield f
dash_manifest_url = 'dash' not in skip_manifests and sd.get('dashManifestUrl') dash_manifest_url = 'dash' not in skip_manifests and sd.get('dashManifestUrl')
if dash_manifest_url: if dash_manifest_url:
if po_token:
dash_manifest_url = dash_manifest_url.rstrip('/') + f'/pot/{po_token}'
formats, subs = self._extract_mpd_formats_and_subtitles(dash_manifest_url, video_id, fatal=False) formats, subs = self._extract_mpd_formats_and_subtitles(dash_manifest_url, video_id, fatal=False)
subtitles = self._merge_subtitles(subs, subtitles) # Prioritize HLS subs over DASH subtitles = self._merge_subtitles(subs, subtitles) # Prioritize HLS subs over DASH
for f in formats: for f in formats:
if process_manifest_format(f, 'dash', client_name, f['format_id']): if process_manifest_format(f, 'dash', client_name, f['format_id'], po_token):
f['filesize'] = int_or_none(self._search_regex( f['filesize'] = int_or_none(self._search_regex(
r'/clen/(\d+)', f.get('fragment_base_url') or f['url'], 'file size', default=None)) r'/clen/(\d+)', f.get('fragment_base_url') or f['url'], 'file size', default=None))
if needs_live_processing: if needs_live_processing: