[WatchESPN] Support free videos and BAM_DTC (#4118)

Authored by: ischmidt20
[extractor/BiliIntl] Fix metadata extraction
2024-11-15 13:43:04 +00:00 · 2022-06-19 20:06:37 -07:00 · 2022-06-20 03:05:46 +05:30
5 changed files with 28 additions and 25 deletions
--- a/yt_dlp/extractor/bilibili.py
+++ b/yt_dlp/extractor/bilibili.py
@@ -947,12 +947,11 @@ class BiliIntlIE(BiliIntlBaseIE):
        video_id = ep_id or aid
        webpage = self._download_webpage(url, video_id)
        # Bstation layout
-        initial_data = self._parse_json(self._search_regex(
-            r'window\.__INITIAL_(?:DATA|STATE)__\s*=\s*({.+?});', webpage,
-            'preload state', default='{}'), video_id, fatal=False) or {}
-        video_data = (
-            traverse_obj(initial_data, ('OgvVideo', 'epDetail'), expected_type=dict)
-            or traverse_obj(initial_data, ('UgcVideo', 'videoData'), expected_type=dict) or {})
+        initial_data = (
+            self._search_json(r'window\.__INITIAL_(?:DATA|STATE)__\s*=', webpage, 'preload state', video_id, default={})
+            or self._search_nuxt_data(webpage, video_id, '__initialState', fatal=False, traverse=None))
+        video_data = traverse_obj(
+            initial_data, ('OgvVideo', 'epDetail'), ('UgcVideo', 'videoData'), ('ugc', 'archive'), expected_type=dict)

        if season_id and not video_data:
            # Non-Bstation layout, read through episode list
@@ -960,7 +959,7 @@ class BiliIntlIE(BiliIntlBaseIE):
            video_data = traverse_obj(season_json,
                                      ('sections', ..., 'episodes', lambda _, v: str(v['episode_id']) == ep_id),
                                      expected_type=dict, get_all=False)
-        return self._extract_video_info(video_data, ep_id=ep_id, aid=aid)
+        return self._extract_video_info(video_data or {}, ep_id=ep_id, aid=aid)


 class BiliIntlSeriesIE(BiliIntlBaseIE):
--- a/yt_dlp/extractor/common.py
+++ b/yt_dlp/extractor/common.py
@@ -1588,15 +1588,13 @@ class InfoExtractor:
                webpage, 'next.js data', fatal=fatal, **kw),
            video_id, transform_source=transform_source, fatal=fatal)

-    def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__', return_full_data=False):
-        ''' Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function. '''
-        # not all website do this, but it can be changed
-        # https://stackoverflow.com/questions/67463109/how-to-change-or-hide-nuxt-and-nuxt-keyword-in-page-source
+    def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__', *, fatal=True, traverse=('data', 0)):
+        """Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function"""
        rectx = re.escape(context_name)
+        FUNCTION_RE = r'\(function\((?P<arg_keys>.*?)\){return\s+(?P<js>{.*?})\s*;?\s*}\((?P<arg_vals>.*?)\)'
        js, arg_keys, arg_vals = self._search_regex(
-            (r'<script>window\.%s=\(function\((?P<arg_keys>.*?)\)\{return\s(?P<js>\{.*?\})\}\((?P<arg_vals>.+?)\)\);?</script>' % rectx,
-             r'%s\(.*?\(function\((?P<arg_keys>.*?)\)\{return\s(?P<js>\{.*?\})\}\((?P<arg_vals>.*?)\)' % rectx),
-            webpage, context_name, group=['js', 'arg_keys', 'arg_vals'])
+            (rf'<script>\s*window\.{rectx}={FUNCTION_RE}\s*\)\s*;?\s*</script>', rf'{rectx}\(.*?{FUNCTION_RE}'),
+            webpage, context_name, group=('js', 'arg_keys', 'arg_vals'), fatal=fatal)

        args = dict(zip(arg_keys.split(','), arg_vals.split(',')))

@@ -1604,10 +1602,8 @@ class InfoExtractor:
            if val in ('undefined', 'void 0'):
                args[key] = 'null'

-        ret = self._parse_json(js_to_json(js, args), video_id)
-        if return_full_data:
-            return ret
-        return ret['data'][0]
+        ret = self._parse_json(js, video_id, transform_source=functools.partial(js_to_json, vars=args), fatal=fatal)
+        return traverse_obj(ret, traverse) or {}

    @staticmethod
    def _hidden_inputs(html):
--- a/yt_dlp/extractor/espn.py
+++ b/yt_dlp/extractor/espn.py
@@ -322,7 +322,7 @@ class WatchESPNIE(AdobePassIE):
            video_id)['playbackState']

        # ESPN+ subscription required, through cookies
-        if video_data.get('sourceId') == 'ESPN_DTC':
+        if 'DTC' in video_data.get('sourceId'):
            cookie = self._get_cookies(url).get('ESPN-ONESITE.WEB-PROD.token')
            if not cookie:
                self.raise_login_required(method='cookies')
@@ -366,6 +366,13 @@ class WatchESPNIE(AdobePassIE):
                })
            m3u8_url, headers = playback['stream']['complete'][0]['url'], {'authorization': token}

+        # No login required
+        elif video_data.get('sourceId') == 'ESPN_FREE':
+            asset = self._download_json(
+                f'https://watch.auth.api.espn.com/video/auth/media/{video_id}/asset?apikey=uiqlbgzdwuru14v627vdusswb',
+                video_id)
+            m3u8_url, headers = asset['stream'], {}
+
        # TV Provider required
        else:
            resource = self._get_mvpd_resource('ESPN', video_data['name'], video_id, None)
--- a/yt_dlp/extractor/fourzerostudio.py
+++ b/yt_dlp/extractor/fourzerostudio.py
@@ -1,8 +1,5 @@
 from .common import InfoExtractor
-from ..utils import (
-    traverse_obj,
-    unified_timestamp,
-)
+from ..utils import traverse_obj, unified_timestamp


 class FourZeroStudioArchiveIE(InfoExtractor):
@@ -25,7 +22,7 @@ class FourZeroStudioArchiveIE(InfoExtractor):
    def _real_extract(self, url):
        video_id, uploader_id = self._match_valid_url(url).group('id', 'uploader_id')
        webpage = self._download_webpage(url, video_id)
-        nuxt_data = self._search_nuxt_data(webpage, video_id, return_full_data=True)
+        nuxt_data = self._search_nuxt_data(webpage, video_id, traverse=None)

        pcb = traverse_obj(nuxt_data, ('ssrRefs', lambda _, v: v['__typename'] == 'PublicCreatorBroadcast'), get_all=False)
        uploader_internal_id = traverse_obj(nuxt_data, (
@@ -82,7 +79,7 @@ class FourZeroStudioClipIE(InfoExtractor):
    def _real_extract(self, url):
        video_id, uploader_id = self._match_valid_url(url).group('id', 'uploader_id')
        webpage = self._download_webpage(url, video_id)
-        nuxt_data = self._search_nuxt_data(webpage, video_id, return_full_data=True)
+        nuxt_data = self._search_nuxt_data(webpage, video_id, traverse=None)

        clip_info = traverse_obj(nuxt_data, ('ssrRefs', lambda _, v: v['__typename'] == 'PublicCreatorArchivedClip'), get_all=False)

--- a/yt_dlp/utils.py
+++ b/yt_dlp/utils.py
@@ -3216,7 +3216,11 @@ def js_to_json(code, vars={}):

        return '"%s"' % v

+    def create_map(mobj):
+        return json.dumps(dict(json.loads(js_to_json(mobj.group(1) or '[]', vars=vars))))
+
    code = re.sub(r'new Date\((".+")\)', r'\g<1>', code)
+    code = re.sub(r'new Map\((\[.*?\])?\)', create_map, code)

    return re.sub(r'''(?sx)
        "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
Author	SHA1	Message	Date
ischmidt20	0a4fb0d3fe	[WatchESPN] Support free videos and BAM_DTC (#4118) Authored by: ischmidt20	2022-06-19 20:06:37 -07:00
pukkandan	8072ef2bbd	[extractor/BiliIntl] Fix metadata extraction Closes #4116	2022-06-20 03:05:46 +05:30