[Biqle] Fix extractor (#2731)

Closes #193 Authored by: Bricio
[youtube] De-prioritize potentially damaged formats
2024-11-14 21:23:05 +00:00 · 2022-02-18 08:02:14 -08:00 · 2022-02-18 19:41:37 +05:30 · 2022-02-18 18:14:50 +05:30 · 2022-02-18 18:03:21 +05:30 · 2022-02-18 18:03:20 +05:30
5 changed files with 85 additions and 66 deletions
--- a/yt_dlp/YoutubeDL.py
+++ b/yt_dlp/YoutubeDL.py
@@ -2663,12 +2663,15 @@ class YoutubeDL(object):
            # given in subtitleslangs. See https://github.com/yt-dlp/yt-dlp/issues/1041
            requested_langs = []
            for lang_re in self.params.get('subtitleslangs'):
-                if lang_re == 'all':
-                    requested_langs.extend(all_sub_langs)
-                    continue
                discard = lang_re[0] == '-'
                if discard:
                    lang_re = lang_re[1:]
+                if lang_re == 'all':
+                    if discard:
+                        requested_langs = []
+                    else:
+                        requested_langs.extend(all_sub_langs)
+                    continue
                current_langs = filter(re.compile(lang_re + '$').match, all_sub_langs)
                if discard:
                    for lang in current_langs:
@@ -2732,6 +2735,7 @@ class YoutubeDL(object):
            filename = self.evaluate_outtmpl(file_tmpl, info_dict)
            tmpl = format_tmpl(tmpl)
            self.to_screen(f'[info] Writing {tmpl!r} to: {filename}')
+            if self._ensure_dir_exists(filename):
                with io.open(filename, 'a', encoding='utf-8') as f:
                    f.write(self.evaluate_outtmpl(tmpl, info_copy) + '\n')

@@ -2904,9 +2908,11 @@ class YoutubeDL(object):

        # Write internet shortcut files
        def _write_link_file(link_type):
-            if 'webpage_url' not in info_dict:
-                self.report_error('Cannot write internet shortcut file because the "webpage_url" field is missing in the media information')
-                return False
+            url = try_get(info_dict['webpage_url'], iri_to_uri)
+            if not url:
+                self.report_warning(
+                    f'Cannot write internet shortcut file because the actual URL of "{info_dict["webpage_url"]}" is unknown')
+                return True
            linkfn = replace_extension(self.prepare_filename(info_dict, 'link'), link_type, info_dict.get('ext'))
            if not self._ensure_dir_exists(encodeFilename(linkfn)):
                return False
@@ -2917,7 +2923,7 @@ class YoutubeDL(object):
                self.to_screen(f'[info] Writing internet shortcut (.{link_type}) to: {linkfn}')
                with io.open(encodeFilename(to_high_limit_path(linkfn)), 'w', encoding='utf-8',
                             newline='\r\n' if link_type == 'url' else '\n') as linkfile:
-                    template_vars = {'url': iri_to_uri(info_dict['webpage_url'])}
+                    template_vars = {'url': url}
                    if link_type == 'desktop':
                        template_vars['filename'] = linkfn[:-(len(link_type) + 1)]
                    linkfile.write(LINK_TEMPLATES[link_type] % template_vars)
--- a/yt_dlp/downloader/external.py
+++ b/yt_dlp/downloader/external.py
@@ -253,7 +253,7 @@ class Aria2cFD(ExternalFD):
    def _make_cmd(self, tmpfilename, info_dict):
        cmd = [self.exe, '-c',
               '--console-log-level=warn', '--summary-interval=0', '--download-result=hide',
-               '--file-allocation=none', '-x16', '-j16', '-s16']
+               '--http-accept-gzip=true', '--file-allocation=none', '-x16', '-j16', '-s16']
        if 'fragments' in info_dict:
            cmd += ['--allow-overwrite=true', '--allow-piece-length-change=true']
        else:
--- a/yt_dlp/extractor/abc.py
+++ b/yt_dlp/extractor/abc.py
@@ -213,7 +213,7 @@ class ABCIViewIE(InfoExtractor):
                'hdnea': token,
            })

-        for sd in ('720', 'sd', 'sd-low'):
+        for sd in ('1080', '720', 'sd', 'sd-low'):
            sd_url = try_get(
                stream, lambda x: x['streams']['hls'][sd], compat_str)
            if not sd_url:
--- a/yt_dlp/extractor/biqle.py
+++ b/yt_dlp/extractor/biqle.py
@@ -3,27 +3,28 @@ from __future__ import unicode_literals

 from .common import InfoExtractor
 from .vk import VKIE
-from ..compat import (
-    compat_b64decode,
-    compat_urllib_parse_unquote,
+from ..compat import compat_b64decode
+from ..utils import (
+    int_or_none,
+    js_to_json,
+    traverse_obj,
+    unified_timestamp,
 )
-from ..utils import int_or_none


 class BIQLEIE(InfoExtractor):
    _VALID_URL = r'https?://(?:www\.)?biqle\.(?:com|org|ru)/watch/(?P<id>-?\d+_\d+)'
    _TESTS = [{
-        # Youtube embed
-        'url': 'https://biqle.ru/watch/-115995369_456239081',
-        'md5': '97af5a06ee4c29bbf9c001bdb1cf5c06',
+        'url': 'https://biqle.ru/watch/-2000421746_85421746',
+        'md5': 'ae6ef4f04d19ac84e4658046d02c151c',
        'info_dict': {
-            'id': '8v4f-avW-VI',
+            'id': '-2000421746_85421746',
            'ext': 'mp4',
-            'title': "PASSE-PARTOUT - L'ete c'est fait pour jouer",
-            'description': 'Passe-Partout',
-            'uploader_id': 'mrsimpsonstef3',
-            'uploader': 'Phanolito',
-            'upload_date': '20120822',
+            'title': 'Forsaken By Hope Studio Clip',
+            'description': 'Forsaken By Hope Studio Clip — Смотреть онлайн',
+            'upload_date': '19700101',
+            'thumbnail': r're:https://[^/]+/impf/7vN3ACwSTgChP96OdOfzFjUCzFR6ZglDQgWsIw/KPaACiVJJxM\.jpg\?size=800x450&quality=96&keep_aspect_ratio=1&background=000000&sign=b48ea459c4d33dbcba5e26d63574b1cb&type=video_thumb',
+            'timestamp': 0,
        },
    }, {
        'url': 'http://biqle.org/watch/-44781847_168547604',
@@ -32,50 +33,59 @@ class BIQLEIE(InfoExtractor):
            'id': '-44781847_168547604',
            'ext': 'mp4',
            'title': 'Ребенок в шоке от автоматической мойки',
+            'description': 'Ребенок в шоке от автоматической мойки — Смотреть онлайн',
            'timestamp': 1396633454,
-            'uploader': 'Dmitry Kotov',
            'upload_date': '20140404',
-            'uploader_id': '47850140',
+            'thumbnail': r're:https://[^/]+/c535507/u190034692/video/l_b84df002\.jpg',
        },
    }]

    def _real_extract(self, url):
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)
-        embed_url = self._proto_relative_url(self._search_regex(
-            r'<iframe.+?src="((?:https?:)?//(?:daxab\.com|dxb\.to|[^/]+/player)/[^"]+)".*?></iframe>',
-            webpage, 'embed url'))
+
+        title = self._html_search_meta('name', webpage, 'Title', fatal=False)
+        timestamp = unified_timestamp(self._html_search_meta('uploadDate', webpage, 'Upload Date', default=None))
+        description = self._html_search_meta('description', webpage, 'Description', default=None)
+
+        global_embed_url = self._search_regex(
+            r'<script[^<]+?window.globEmbedUrl\s*=\s*\'((?:https?:)?//(?:daxab\.com|dxb\.to|[^/]+/player)/[^\']+)\'',
+            webpage, 'global Embed url')
+        hash = self._search_regex(
+            r'<script id="data-embed-video[^<]+?hash: "([^"]+)"[^<]*</script>', webpage, 'Hash')
+
+        embed_url = global_embed_url + hash
+
        if VKIE.suitable(embed_url):
            return self.url_result(embed_url, VKIE.ie_key(), video_id)

        embed_page = self._download_webpage(
-            embed_url, video_id, headers={'Referer': url})
-        video_ext = self._get_cookies(embed_url).get('video_ext')
-        if video_ext:
-            video_ext = compat_urllib_parse_unquote(video_ext.value)
-        if not video_ext:
-            video_ext = compat_b64decode(self._search_regex(
-                r'video_ext\s*:\s*[\'"]([A-Za-z0-9+/=]+)',
-                embed_page, 'video_ext')).decode()
-        video_id, sig, _, access_token = video_ext.split(':')
+            embed_url, video_id, 'Downloading embed webpage', headers={'Referer': url})
+
+        glob_params = self._parse_json(self._search_regex(
+            r'<script id="globParams">[^<]*window.globParams = ([^;]+);[^<]+</script>',
+            embed_page, 'Global Parameters'), video_id, transform_source=js_to_json)
+        host_name = compat_b64decode(glob_params['server'][::-1]).decode()
+
        item = self._download_json(
-            'https://api.vk.com/method/video.get', video_id,
-            headers={'User-Agent': 'okhttp/3.4.1'}, query={
-                'access_token': access_token,
-                'sig': sig,
-                'v': 5.44,
+            f'https://{host_name}/method/video.get/{video_id}', video_id,
+            headers={'Referer': url}, query={
+                'token': glob_params['video']['access_token'],
                'videos': video_id,
+                'ckey': glob_params['c_key'],
+                'credentials': glob_params['video']['credentials'],
            })['response']['items'][0]
-        title = item['title']

        formats = []
        for f_id, f_url in item.get('files', {}).items():
            if f_id == 'external':
                return self.url_result(f_url)
            ext, height = f_id.split('_')
+            height_extra_key = traverse_obj(glob_params, ('video', 'partial', 'quality', height))
+            if height_extra_key:
                formats.append({
-                'format_id': height + 'p',
-                'url': f_url,
+                    'format_id': f'{height}p',
+                    'url': f'https://{host_name}/{f_url[8:]}&videos={video_id}&extra_key={height_extra_key}',
                    'height': int_or_none(height),
                    'ext': ext,
                })
@@ -96,10 +106,9 @@ class BIQLEIE(InfoExtractor):
            'title': title,
            'formats': formats,
            'comment_count': int_or_none(item.get('comments')),
-            'description': item.get('description'),
+            'description': description,
            'duration': int_or_none(item.get('duration')),
            'thumbnails': thumbnails,
-            'timestamp': int_or_none(item.get('date')),
-            'uploader': item.get('owner_id'),
+            'timestamp': timestamp,
            'view_count': int_or_none(item.get('views')),
        }
--- a/yt_dlp/extractor/youtube.py
+++ b/yt_dlp/extractor/youtube.py
@@ -225,28 +225,28 @@ INNERTUBE_CLIENTS = {


 def build_innertube_clients():
-    third_party = {
+    THIRD_PARTY = {
        'embedUrl': 'https://google.com',  # Can be any valid URL
    }
-    base_clients = ('android', 'web', 'ios', 'mweb')
-    priority = qualities(base_clients[::-1])
+    BASE_CLIENTS = ('android', 'web', 'ios', 'mweb')
+    priority = qualities(BASE_CLIENTS[::-1])

    for client, ytcfg in tuple(INNERTUBE_CLIENTS.items()):
        ytcfg.setdefault('INNERTUBE_API_KEY', 'AIzaSyDCU8hByM-4DrUqRUYnGn-3llEO78bcxq8')
        ytcfg.setdefault('INNERTUBE_HOST', 'www.youtube.com')
        ytcfg.setdefault('REQUIRE_JS_PLAYER', True)
        ytcfg['INNERTUBE_CONTEXT']['client'].setdefault('hl', 'en')
-        ytcfg['priority'] = 10 * priority(client.split('_', 1)[0])

-        if client in base_clients:
-            INNERTUBE_CLIENTS[f'{client}_agegate'] = agegate_ytcfg = copy.deepcopy(ytcfg)
+        base_client, *variant = client.split('_')
+        ytcfg['priority'] = 10 * priority(base_client)
+
+        if variant == ['embedded']:
+            ytcfg['INNERTUBE_CONTEXT']['thirdParty'] = THIRD_PARTY
+            INNERTUBE_CLIENTS[f'{base_client}_agegate'] = agegate_ytcfg = copy.deepcopy(ytcfg)
            agegate_ytcfg['INNERTUBE_CONTEXT']['client']['clientScreen'] = 'EMBED'
-            agegate_ytcfg['INNERTUBE_CONTEXT']['thirdParty'] = third_party
            agegate_ytcfg['priority'] -= 1
-        elif client.endswith('_embedded'):
-            ytcfg['INNERTUBE_CONTEXT']['thirdParty'] = third_party
            ytcfg['priority'] -= 2
-        else:
+        elif variant:
            ytcfg['priority'] -= 3


@@ -2936,6 +2936,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
            'small', 'medium', 'large', 'hd720', 'hd1080', 'hd1440', 'hd2160', 'hd2880', 'highres'
        ])
        streaming_formats = traverse_obj(streaming_data, (..., ('formats', 'adaptiveFormats'), ...), default=[])
+        approx_duration = max(traverse_obj(streaming_formats, (..., 'approxDurationMs'), expected_type=float_or_none) or [0]) or None

        for fmt in streaming_formats:
            if fmt.get('targetDurationSec') or fmt.get('drmFamilies'):
@@ -2995,12 +2996,14 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                itags[itag] = 'https'
                stream_ids.append(stream_id)

-            tbr = float_or_none(
-                fmt.get('averageBitrate') or fmt.get('bitrate'), 1000)
+            tbr = float_or_none(fmt.get('averageBitrate') or fmt.get('bitrate'), 1000)
            language_preference = (
                10 if audio_track.get('audioIsDefault') and 10
                else -10 if 'descriptive' in (audio_track.get('displayName') or '').lower() and -10
                else -1)
+            # Some formats may have much smaller duration than others (possibly damaged during encoding)
+            # Eg: 2-nOtRESiUc Ref: https://github.com/yt-dlp/yt-dlp/issues/2823
+            is_damaged = try_get(fmt, lambda x: float(x['approxDurationMs']) < approx_duration - 10000)
            dct = {
                'asr': int_or_none(fmt.get('audioSampleRate')),
                'filesize': int_or_none(fmt.get('contentLength')),
@@ -3009,7 +3012,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                    '%s%s' % (audio_track.get('displayName') or '',
                              ' (default)' if language_preference > 0 else ''),
                    fmt.get('qualityLabel') or quality.replace('audio_quality_', ''),
-                    throttled and 'THROTTLED', delim=', '),
+                    throttled and 'THROTTLED', is_damaged and 'DAMAGED', delim=', '),
                'source_preference': -10 if throttled else -1,
                'fps': int_or_none(fmt.get('fps')) or None,
                'height': height,
@@ -3020,6 +3023,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                'language': join_nonempty(audio_track.get('id', '').split('.')[0],
                                          'desc' if language_preference < -1 else ''),
                'language_preference': language_preference,
+                'preference': -10 if is_damaged else None,
            }
            mime_mobj = re.match(
                r'((?:[^/]+)/(?:[^;]+))(?:;\s*codecs="([^"]+)")?', fmt.get('mimeType') or '')
Author	SHA1	Message	Date
Bricio	5625e6073f	[Biqle] Fix extractor (#2731) Closes #193 Authored by: Bricio	2022-02-18 08:02:14 -08:00
pukkandan	0ad92dfb18	[youtube] De-prioritize potentially damaged formats Closes #2823	2022-02-18 19:41:37 +05:30
pukkandan	60f3e99592	Tolerate failure to `--write-link` due to unknown URL Closes #2724	2022-02-18 18:14:50 +05:30
pukkandan	8d93e69d67	Create necessary directories for `--print-to-file` Closes #2721	2022-02-18 18:03:21 +05:30
pukkandan	3aa915400d	Fix `-all` for `--sub-langs` Closes #2703	2022-02-18 18:03:20 +05:30
pukkandan	dcd55f766d	[aria2c] Add `--http-accept-gzip=true` Closes #1936, #1236	2022-02-18 18:03:20 +05:30
pukkandan	2e4cacd038	[youtube] Fix intermittent failure of embed-based age-gate bypass	2022-02-18 18:03:13 +05:30
Ronnnny	c15c316b21	[abc] Support 1080p (#2819) Authored by: Ronnnny	2022-02-18 00:25:47 -08:00