Merge remote-tracking branch 'origin/master' into c13cl
commit 6314563ce5
@ -1268,7 +1268,7 @@ The field names themselves (the part inside the parenthesis) can also have some
1. **Object traversal**: The dictionaries and lists available in metadata can be traversed by using a dot `.` separator; e.g. `%(tags.0)s`, `%(subtitles.en.-1.ext)s`. You can do Python slicing with colon `:`; E.g. `%(id.3:7:-1)s`, `%(formats.:.format_id)s`. Curly braces `{}` can be used to build dictionaries with only specific keys; e.g. `%(formats.:.{format_id,height})#j`. An empty field name `%()s` refers to the entire infodict; e.g. `%(.{id,title})s`. Note that all the fields that become available using this method are not listed below. Use `-j` to see such fields
1. **Addition**: Addition and subtraction of numeric fields can be done using `+` and `-` respectively. E.g. `%(playlist_index+10)03d`, `%(n_entries+1-playlist_index)d`
1. **Arithmetic**: Simple arithmetic can be done on numeric fields using `+`, `-` and `*`. E.g. `%(playlist_index+10)03d`, `%(n_entries+1-playlist_index)d`
1. **Date/time Formatting**: Date/time fields can be formatted according to [strftime formatting](https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes) by specifying it separated from the field name using a `>`. E.g. `%(duration>%H-%M-%S)s`, `%(upload_date>%Y-%m-%d)s`, `%(epoch-3600>%H-%M-%S)s`
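The template features above (object traversal, arithmetic on numeric fields, strftime-style date formatting) can be tried against a hand-written info dict. A minimal sketch, assuming yt-dlp is installed; the sample metadata values are made up, and `evaluate_outtmpl` is the internal helper that the output-template tests in the next hunk also exercise:

```python
from yt_dlp import YoutubeDL

# Made-up metadata standing in for a real extraction result
info = {
    'id': 'abc123',
    'title': 'Example video',
    'upload_date': '20231118',
    'tags': ['music', 'live'],
    'playlist_index': 3,
    'n_entries': 10,
    'formats': [{'format_id': 'hls-1080'}, {'format_id': 'hls-720'}],
}

ydl = YoutubeDL()
print(ydl.evaluate_outtmpl('%(tags.0)s', info))                      # music
print(ydl.evaluate_outtmpl('%(formats.:.format_id)j', info))         # ["hls-1080", "hls-720"]
print(ydl.evaluate_outtmpl('%(playlist_index+10)03d', info))         # 013
print(ydl.evaluate_outtmpl('%(n_entries+1-playlist_index)d', info))  # 8
print(ydl.evaluate_outtmpl('%(upload_date>%Y-%m-%d)s', info))        # 2023-11-18
```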
@ -797,6 +797,7 @@ class TestYoutubeDL(unittest.TestCase):
test('%(title|%)s %(title|%%)s', '% %%')
test('%(id+1-height+3)05d', '00158')
test('%(width+100)05d', 'NA')
test('%(filesize*8)d', '8192')
test('%(formats.0) 15s', ('% 15s' % FORMATS[0], None))
test('%(formats.0)r', (repr(FORMATS[0]), None))
test('%(height.0)03d', '001')
@ -2317,23 +2317,6 @@ Line 1
self.assertEqual(traverse_obj({}, (0, slice(1)), traverse_string=True), [],
msg='branching should result in list if `traverse_string`')
# Test is_user_input behavior
_IS_USER_INPUT_DATA = {'range8': list(range(8))}
self.assertEqual(traverse_obj(_IS_USER_INPUT_DATA, ('range8', '3'),
is_user_input=True), 3,
msg='allow for string indexing if `is_user_input`')
self.assertCountEqual(traverse_obj(_IS_USER_INPUT_DATA, ('range8', '3:'),
is_user_input=True), tuple(range(8))[3:],
msg='allow for string slice if `is_user_input`')
self.assertCountEqual(traverse_obj(_IS_USER_INPUT_DATA, ('range8', ':4:2'),
is_user_input=True), tuple(range(8))[:4:2],
msg='allow step in string slice if `is_user_input`')
self.assertCountEqual(traverse_obj(_IS_USER_INPUT_DATA, ('range8', ':'),
is_user_input=True), range(8),
msg='`:` should be treated as `...` if `is_user_input`')
with self.assertRaises(TypeError, msg='too many params should result in error'):
traverse_obj(_IS_USER_INPUT_DATA, ('range8', ':::'), is_user_input=True)
# Test re.Match as input obj
mobj = re.fullmatch(r'0(12)(?P<group>3)(4)?', '0123')
self.assertEqual(traverse_obj(mobj, ...), [x for x in mobj.groups() if x is not None],
@ -1179,6 +1179,7 @@ class YoutubeDL:
MATH_FUNCTIONS = {
'+': float.__add__,
'-': float.__sub__,
'*': float.__mul__,
}
# Field is of the form key1.key2...
# where keys (except first) can be string, int, slice or "{field, ...}"
@ -1200,6 +1201,15 @@ class YoutubeDL:
(?:\|(?P<default>.*?))?
)$''')
def _from_user_input(field):
if field == ':':
return ...
elif ':' in field:
return slice(*map(int_or_none, field.split(':')))
elif int_or_none(field) is not None:
return int(field)
return field
def _traverse_infodict(fields):
fields = [f for x in re.split(r'\.({.+?})\.?', fields)
for f in ([x] if x.startswith('{') else x.split('.'))]
@ -1209,11 +1219,12 @@ class YoutubeDL:
for i, f in enumerate(fields):
if not f.startswith('{'):
fields[i] = _from_user_input(f)
continue
assert f.endswith('}'), f'No closing brace for {f} in {fields}'
fields[i] = {k: k.split('.') for k in f[1:-1].split(',')}
fields[i] = {k: list(map(_from_user_input, k.split('.'))) for k in f[1:-1].split(',')}
return traverse_obj(info_dict, fields, is_user_input=True, traverse_string=True)
return traverse_obj(info_dict, fields, traverse_string=True)
def get_value(mdict):
# Object traversal
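The new `_from_user_input` helper is what keeps the `.`-separated field syntax working now that `traverse_obj` no longer converts user strings itself: each string segment of a field becomes a concrete `int`, `slice`, `...` or dict key before traversal. A standalone restatement with a few example conversions (the wrapper name is illustrative; only the body mirrors the diff):

```python
from yt_dlp.utils import int_or_none  # yt-dlp's "int() or None" utility

def from_user_input(field):
    if field == ':':
        return ...                                         # ':' selects every element
    elif ':' in field:
        return slice(*map(int_or_none, field.split(':')))  # e.g. '3:7:-1' -> slice(3, 7, -1)
    elif int_or_none(field) is not None:
        return int(field)                                   # plain integer index
    return field                                            # anything else is a dict key

for raw in (':', '3:7:-1', '0', 'format_id'):
    print(raw, '->', from_user_input(raw))
# : -> Ellipsis
# 3:7:-1 -> slice(3, 7, -1)
# 0 -> 0
# format_id -> format_id
```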
@ -1,12 +1,14 @@
from .common import InfoExtractor
from .dailymotion import DailymotionIE
from ..utils import (
determine_ext,
ExtractorError,
determine_ext,
format_field,
int_or_none,
join_nonempty,
parse_iso8601,
parse_qs,
)
from .dailymotion import DailymotionIE
class FranceTVBaseInfoExtractor(InfoExtractor):
@ -82,6 +84,8 @@ class FranceTVIE(InfoExtractor):
videos = []
title = None
subtitle = None
episode_number = None
season_number = None
image = None
duration = None
timestamp = None
@ -112,7 +116,9 @@ class FranceTVIE(InfoExtractor):
if meta:
if title is None:
title = meta.get('title')
# XXX: what is meta['pre_title']?
# meta['pre_title'] contains season and episode number for series in format "S<ID> E<ID>"
season_number, episode_number = self._search_regex(
r'S(\d+)\s*E(\d+)', meta.get('pre_title'), 'episode info', group=(1, 2), default=(None, None))
if subtitle is None:
subtitle = meta.get('additional_title')
if image is None:
@ -191,19 +197,19 @@ class FranceTVIE(InfoExtractor):
} for sheet in spritesheets]
})
if subtitle:
title += ' - %s' % subtitle
title = title.strip()
return {
'id': video_id,
'title': title,
'title': join_nonempty(title, subtitle, delim=' - ').strip(),
'thumbnail': image,
'duration': duration,
'timestamp': timestamp,
'is_live': is_live,
'formats': formats,
'subtitles': subtitles,
'episode': subtitle if episode_number else None,
'series': title if episode_number else None,
'episode_number': int_or_none(episode_number),
'season_number': int_or_none(season_number),
}
def _real_extract(self, url):
@ -230,14 +236,31 @@ class FranceTVSiteIE(FranceTVBaseInfoExtractor):
'id': 'ec217ecc-0733-48cf-ac06-af1347b849d1',
'ext': 'mp4',
'title': '13h15, le dimanche... - Les mystères de Jésus',
'description': 'md5:75efe8d4c0a8205e5904498ffe1e1a42',
'timestamp': 1502623500,
'duration': 2580,
'thumbnail': r're:^https?://.*\.jpg$',
'upload_date': '20170813',
},
'params': {
'skip_download': True,
},
'add_ie': [FranceTVIE.ie_key()],
}, {
'url': 'https://www.france.tv/enfants/six-huit-ans/foot2rue/saison-1/3066387-duel-au-vieux-port.html',
'info_dict': {
'id': 'a9050959-eedd-4b4a-9b0d-de6eeaa73e44',
'ext': 'mp4',
'title': 'Foot2Rue - Duel au vieux port',
'episode': 'Duel au vieux port',
'series': 'Foot2Rue',
'episode_number': 1,
'season_number': 1,
'timestamp': 1642761360,
'upload_date': '20220121',
'season': 'Season 1',
'thumbnail': r're:^https?://.*\.jpg$',
'duration': 1441,
},
}, {
# france3
'url': 'https://www.france.tv/france-3/des-chiffres-et-des-lettres/139063-emission-du-mardi-9-mai-2017.html',
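For context, a small sketch of how the new metadata ends up in the second test case above: the season/episode numbers come from the `S<ID> E<ID>` pattern in `meta['pre_title']`, and the title is now assembled with `join_nonempty` instead of manual concatenation. Sample values are assumed; the helpers are yt-dlp's real utilities:

```python
import re
from yt_dlp.utils import int_or_none, join_nonempty

# Assumed sample of the API metadata handled above
meta = {'title': 'Foot2Rue', 'additional_title': 'Duel au vieux port', 'pre_title': 'S1 E1'}

mobj = re.search(r'S(\d+)\s*E(\d+)', meta.get('pre_title') or '')
season_number, episode_number = mobj.groups() if mobj else (None, None)

title, subtitle = meta['title'], meta['additional_title']
print(join_nonempty(title, subtitle, delim=' - ').strip())      # Foot2Rue - Duel au vieux port
print(int_or_none(season_number), int_or_none(episode_number))  # 1 1
```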
|
@ -3,8 +3,11 @@ import re
|
||||
from .common import InfoExtractor
|
||||
from ..utils import (
|
||||
clean_html,
|
||||
filter_dict,
|
||||
parse_qs,
|
||||
remove_end,
|
||||
traverse_obj,
|
||||
update_url_query,
|
||||
urljoin,
|
||||
)
|
||||
|
||||
@ -108,7 +111,9 @@ class MediaStreamIE(MediaStreamBaseIE):
|
||||
|
||||
for message in [
|
||||
'Debido a tu ubicación no puedes ver el contenido',
|
||||
'You are not allowed to watch this video: Geo Fencing Restriction'
|
||||
'You are not allowed to watch this video: Geo Fencing Restriction',
|
||||
'Este contenido no está disponible en tu zona geográfica.',
|
||||
'El contenido sólo está disponible dentro de',
|
||||
]:
|
||||
if message in webpage:
|
||||
self.raise_geo_restricted()
|
||||
@ -118,7 +123,16 @@ class MediaStreamIE(MediaStreamBaseIE):
|
||||
formats, subtitles = [], {}
|
||||
for video_format in player_config['src']:
|
||||
if video_format == 'hls':
|
||||
fmts, subs = self._extract_m3u8_formats_and_subtitles(player_config['src'][video_format], video_id)
|
||||
params = {
|
||||
'at': 'web-app',
|
||||
'access_token': traverse_obj(parse_qs(url), ('access_token', 0)),
|
||||
}
|
||||
for name, key in (('MDSTRMUID', 'uid'), ('MDSTRMSID', 'sid'), ('MDSTRMPID', 'pid'), ('VERSION', 'av')):
|
||||
params[key] = self._search_regex(
|
||||
rf'window\.{name}\s*=\s*["\']([^"\']+)["\'];', webpage, key, default=None)
|
||||
|
||||
fmts, subs = self._extract_m3u8_formats_and_subtitles(
|
||||
update_url_query(player_config['src'][video_format], filter_dict(params)), video_id)
|
||||
formats.extend(fmts)
|
||||
self._merge_subtitles(subs, target=subtitles)
|
||||
elif video_format == 'mpd':
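The hunk above no longer requests the bare HLS manifest: it first collects a few `window.*` globals from the page plus any `access_token` from the original URL, drops the ones that were not found, and appends the rest as query parameters. A rough sketch of that URL construction with yt-dlp's helpers, using made-up values:

```python
from yt_dlp.utils import filter_dict, update_url_query

# Made-up values standing in for the page globals scraped above
params = {
    'at': 'web-app',
    'access_token': None,   # no access_token in the page URL in this example
    'uid': 'abc',           # window.MDSTRMUID
    'sid': 'def',           # window.MDSTRMSID
    'pid': None,            # window.MDSTRMPID not present -> dropped by filter_dict
    'av': '1.2.3',          # window.VERSION
}

print(update_url_query('https://mdstrm.com/video/xyz.m3u8', filter_dict(params)))
# https://mdstrm.com/video/xyz.m3u8?at=web-app&uid=abc&sid=def&av=1.2.3
```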
@ -3,7 +3,6 @@ import re
import uuid
from .common import InfoExtractor
from ..networking import HEADRequest
from ..utils import (
ExtractorError,
OnDemandPagedList,
@ -84,15 +83,17 @@ class OnDemandKoreaIE(InfoExtractor):
def try_geo_bypass(url):
return traverse_obj(url, ({parse_qs}, 'stream_url', 0, {url_or_none})) or url
def try_upgrade_quality(url):
mod_url = re.sub(r'_720(p?)\.m3u8', r'_1080\1.m3u8', url)
return mod_url if mod_url != url and self._request_webpage(
HEADRequest(mod_url), video_id, note='Checking for higher quality format',
errnote='No higher quality format found', fatal=False) else url
formats = []
for m3u8_url in traverse_obj(data, (('sources', 'manifest'), ..., 'url', {url_or_none}, {try_geo_bypass})):
formats.extend(self._extract_m3u8_formats(try_upgrade_quality(m3u8_url), video_id, fatal=False))
mod_url = re.sub(r'_720(p?)\.m3u8', r'_1080\1.m3u8', m3u8_url)
if mod_url != m3u8_url:
mod_format = self._extract_m3u8_formats(
mod_url, video_id, note='Checking for higher quality format',
errnote='No higher quality format found', fatal=False)
if mod_format:
formats.extend(mod_format)
continue
formats.extend(self._extract_m3u8_formats(m3u8_url, video_id, fatal=False))
subtitles = {}
for track in traverse_obj(data, ('text_tracks', lambda _, v: url_or_none(v['url']))):
@ -4,7 +4,14 @@ from urllib.parse import unquote
from .common import InfoExtractor
from ..compat import functools
from ..utils import ExtractorError, make_archive_id, urljoin
from ..utils import (
ExtractorError,
float_or_none,
int_or_none,
make_archive_id,
mimetype2ext,
urljoin,
)
from ..utils.traversal import traverse_obj
@ -26,6 +33,7 @@ class Pr0grammIE(InfoExtractor):
'dislike_count': int,
'age_limit': 0,
'thumbnail': r're:^https://thumb\.pr0gramm\.com/.*\.jpg',
'_old_archive_ids': ['pr0grammstatic 5466437'],
},
}, {
# Tags require account
@ -43,6 +51,7 @@ class Pr0grammIE(InfoExtractor):
'dislike_count': int,
'age_limit': 0,
'thumbnail': r're:^https://thumb\.pr0gramm\.com/.*\.jpg',
'_old_archive_ids': ['pr0grammstatic 3052805'],
},
}, {
# Requires verified account
@ -60,6 +69,7 @@ class Pr0grammIE(InfoExtractor):
'dislike_count': int,
'age_limit': 18,
'thumbnail': r're:^https://thumb\.pr0gramm\.com/.*\.jpg',
'_old_archive_ids': ['pr0grammstatic 5848332'],
},
}, {
'url': 'https://pr0gramm.com/static/5466437',
@ -110,37 +120,61 @@ class Pr0grammIE(InfoExtractor):
return data
@staticmethod
def _create_source_url(path):
return urljoin('https://img.pr0gramm.com', path)
def _real_extract(self, url):
video_id = self._match_id(url)
video_info = traverse_obj(
self._call_api('get', video_id, {'id': video_id, 'flags': self._maximum_flags}),
('items', 0, {dict}))
source = urljoin('https://img.pr0gramm.com', video_info.get('image'))
source = video_info.get('image')
if not source or not source.endswith('mp4'):
self.raise_no_formats('Could not extract a video', expected=bool(source), video_id=video_id)
tags = None
if self._is_logged_in:
metadata = self._call_api('info', video_id, {'itemId': video_id})
metadata = self._call_api('info', video_id, {'itemId': video_id}, note='Downloading tags')
tags = traverse_obj(metadata, ('tags', ..., 'tag', {str}))
# Sorted by "confidence", higher confidence = earlier in list
confidences = traverse_obj(metadata, ('tags', ..., 'confidence', ({int}, {float})))
if confidences:
tags = [tag for _, tag in sorted(zip(confidences, tags), reverse=True)]
formats = traverse_obj(video_info, ('variants', ..., {
'format_id': ('name', {str}),
'url': ('path', {self._create_source_url}),
'ext': ('mimeType', {mimetype2ext}),
'vcodec': ('codec', {str}),
'width': ('width', {int_or_none}),
'height': ('height', {int_or_none}),
'bitrate': ('bitRate', {float_or_none}),
'filesize': ('fileSize', {int_or_none}),
})) if video_info.get('variants') else [{
'ext': 'mp4',
'format_id': 'source',
**traverse_obj(video_info, {
'url': ('image', {self._create_source_url}),
'width': ('width', {int_or_none}),
'height': ('height', {int_or_none}),
}),
}]
subtitles = {}
for subtitle in traverse_obj(video_info, ('subtitles', lambda _, v: v['language'])):
subtitles.setdefault(subtitle['language'], []).append(traverse_obj(subtitle, {
'url': ('path', {self._create_source_url}),
'note': ('label', {str}),
}))
return {
'id': video_id,
'title': f'pr0gramm-{video_id} by {video_info.get("user")}',
'formats': [{
'url': source,
'ext': 'mp4',
**traverse_obj(video_info, {
'width': ('width', {int}),
'height': ('height', {int}),
}),
}],
'tags': tags,
'formats': formats,
'subtitles': subtitles,
'age_limit': 18 if traverse_obj(video_info, ('flags', {0b110.__and__})) else 0,
'_old_archive_ids': [make_archive_id('Pr0grammStatic', video_id)],
**traverse_obj(video_info, {
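Two things in this hunk benefit from a concrete illustration: the tag list is re-ordered by the per-tag `confidence` values, and the new `variants`-based format extraction uses `traverse_obj`'s dict-mapping form, where each target key traverses its own sub-path, `{type}` filters and `{callable}` transforms. A reduced sketch with made-up API data (the `url`/`_create_source_url` mapping is omitted here):

```python
from yt_dlp.utils import int_or_none, mimetype2ext, traverse_obj

# Made-up stand-in for the API response fields used above
video_info = {'variants': [
    {'name': 'vp9', 'path': '/2023/11/clip.webm', 'mimeType': 'video/webm',
     'codec': 'vp9', 'width': 1280, 'height': 720},
]}

formats = traverse_obj(video_info, ('variants', ..., {
    'format_id': ('name', {str}),          # {str} keeps the value only if it is a str
    'ext': ('mimeType', {mimetype2ext}),   # {callable} transforms the value
    'vcodec': ('codec', {str}),
    'width': ('width', {int_or_none}),
    'height': ('height', {int_or_none}),
}))
print(formats)
# [{'format_id': 'vp9', 'ext': 'webm', 'vcodec': 'vp9', 'width': 1280, 'height': 720}]

# Tag ordering: higher confidence first, as in the hunk above
tags, confidences = ['b', 'a'], [0.2, 0.9]
print([tag for _, tag in sorted(zip(confidences, tags), reverse=True)])  # ['a', 'b']
```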
@ -6469,6 +6469,9 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor):
def _has_tab(self, tabs, tab_id):
return any(self._extract_tab_id_and_name(tab)[0] == tab_id for tab in tabs)
def _empty_playlist(self, item_id, data):
return self.playlist_result([], item_id, **self._extract_metadata_from_tabs(item_id, data))
@YoutubeTabBaseInfoExtractor.passthrough_smuggled_data
def _real_extract(self, url, smuggled_data):
item_id = self._match_id(url)
@ -6534,6 +6537,10 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor):
selected_tab_id, selected_tab_name = self._extract_tab_id_and_name(selected_tab, url) # NB: Name may be translated
self.write_debug(f'Selected tab: {selected_tab_id!r} ({selected_tab_name}), Requested tab: {original_tab_id!r}')
# /about is no longer a tab
if original_tab_id == 'about':
return self._empty_playlist(item_id, data)
if not original_tab_id and selected_tab_name:
self.to_screen('Downloading all uploads of the channel. '
'To download only the videos in a specific tab, pass the tab\'s URL')
@ -6546,7 +6553,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor):
if not extra_tabs and selected_tab_id != 'videos':
# Channel does not have streams, shorts or videos tabs
if item_id[:2] != 'UC':
raise ExtractorError('This channel has no uploads', expected=True)
return self._empty_playlist(item_id, data)
# Topic channels don't have /videos. Use the equivalent playlist instead
pl_id = f'UU{item_id[2:]}'
@ -6554,7 +6561,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor):
try:
data, ytcfg = self._extract_data(pl_url, pl_id, ytcfg=ytcfg, fatal=True, webpage_fatal=True)
except ExtractorError:
raise ExtractorError('This channel has no uploads', expected=True)
return self._empty_playlist(item_id, data)
else:
item_id, url = pl_id, pl_url
self.to_screen(
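The topic-channel fallback above relies on the fixed relationship between a channel's `UC...` browse ID and its `UU...` uploads playlist; when that playlist also cannot be fetched, the new `_empty_playlist` result is returned instead of raising. A tiny sketch of the ID mapping (IDs are placeholders):

```python
# Placeholder IDs: a "UC..." channel ID maps to its "UU..." uploads playlist
def uploads_playlist_id(channel_id: str) -> str:
    assert channel_id.startswith('UC'), 'only channel browse IDs have an uploads playlist'
    return f'UU{channel_id[2:]}'

channel_id = 'UCxxxxxxxxxxxxxxxxxxxxxx'
print(uploads_playlist_id(channel_id))
# UUxxxxxxxxxxxxxxxxxxxxxx -> https://www.youtube.com/playlist?list=UUxxxxxxxxxxxxxxxxxxxxxx
```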
@ -8,7 +8,7 @@ from ._utils import (
IDENTITY,
NO_DEFAULT,
LazyList,
int_or_none,
deprecation_warning,
is_iterable_like,
try_call,
variadic,
@ -17,7 +17,7 @@ from ._utils import (
def traverse_obj(
obj, *paths, default=NO_DEFAULT, expected_type=None, get_all=True,
casesense=True, is_user_input=False, traverse_string=False):
casesense=True, is_user_input=NO_DEFAULT, traverse_string=False):
"""
Safely traverse nested `dict`s and `Iterable`s
@ -63,10 +63,8 @@ def traverse_obj(
@param get_all If `False`, return the first matching result, otherwise all matching ones.
@param casesense If `False`, consider string dictionary keys as case insensitive.
The following are only meant to be used by YoutubeDL.prepare_outtmpl and are not part of the API
`traverse_string` is only meant to be used by YoutubeDL.prepare_outtmpl and is not part of the API
@param is_user_input Whether the keys are generated from user input.
If `True` strings get converted to `int`/`slice` if needed.
@param traverse_string Whether to traverse into objects as strings.
If `True`, any non-compatible object will first be
converted into a string and then traversed into.
@ -80,6 +78,9 @@ def traverse_obj(
If no `default` is given and the last path branches, a `list` of results
is always returned. If a path ends on a `dict` that result will always be a `dict`.
"""
if is_user_input is not NO_DEFAULT:
deprecation_warning('The is_user_input parameter is deprecated and no longer works')
casefold = lambda k: k.casefold() if isinstance(k, str) else k
if isinstance(expected_type, type):
@ -195,14 +196,6 @@ def traverse_obj(
key = None
for last, key in lazy_last(variadic(path, (str, bytes, dict, set))):
if is_user_input and isinstance(key, str):
if key == ':':
key = ...
elif ':' in key:
key = slice(*map(int_or_none, key.split(':')))
elif int_or_none(key) is not None:
key = int(key)
if not casesense and isinstance(key, str):
key = key.casefold()
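With `is_user_input` reduced to a deprecation warning, callers that previously relied on string keys (as in the tests removed earlier in this commit) must pass real `int`/`slice`/`...` keys themselves. A small before/after sketch using the same data shape as those tests:

```python
from yt_dlp.utils import traverse_obj

data = {'range8': list(range(8))}

# previously: traverse_obj(data, ('range8', '3'), is_user_input=True)
print(traverse_obj(data, ('range8', 3)))               # 3
# previously: traverse_obj(data, ('range8', '3:'), is_user_input=True)
print(traverse_obj(data, ('range8', slice(3, None))))  # [3, 4, 5, 6, 7]
# previously: traverse_obj(data, ('range8', ':'), is_user_input=True)
print(traverse_obj(data, ('range8', ...)))             # [0, 1, 2, 3, 4, 5, 6, 7]
```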