[extractor/generic] Remove HEAD request

Do not load system certificates when certifi is used
This causes `CERTIFICATE_VERIFY_FAILED` if there is an expired/bad certificate in the system store. Partially reverts 8a82af3511. Related: #4145
2024-11-15 13:43:04 +00:00 · 2022-07-07 12:09:30 +05:30 · 2022-07-07 11:29:49 +05:30 · 2022-07-07 11:28:56 +05:30
5 changed files with 42 additions and 60 deletions
--- a/README.md
+++ b/README.md
@@ -146,8 +146,8 @@ Some of yt-dlp's default options are different from that of youtube-dl and youtu
 * Thumbnail embedding in `mp4` is done with mutagen if possible. Use `--compat-options embed-thumbnail-atomicparsley` to force the use of AtomicParsley instead
 * Some private fields such as filenames are removed by default from the infojson. Use `--no-clean-infojson` or `--compat-options no-clean-infojson` to revert this
 * When `--embed-subs` and `--write-subs` are used together, the subtitles are written to disk and also embedded in the media file. You can use just `--embed-subs` to embed the subs and automatically delete the separate file. See [#630 (comment)](https://github.com/yt-dlp/yt-dlp/issues/630#issuecomment-893659460) for more info. `--compat-options no-keep-subs` can be used to revert this
-* `certifi` will be used for SSL root certificates, if installed. If you want to use only system certificates, use `--compat-options no-certifi`
-* youtube-dl tries to remove some superfluous punctuations from filenames. While this can sometimes be helpful, it is often undesirable. So yt-dlp tries to keep the fields in the filenames as close to their original values as possible. You can use `--compat-options filename-sanitization` to revert to youtube-dl's behavior
+* `certifi` will be used for SSL root certificates, if installed. If you want to use system certificates (e.g. self-signed), use `--compat-options no-certifi`
+* youtube-dl tries to remove some superfluous punctuations from filenames. While this can sometimes be helpful, it is often undesirable. So yt-dlp tries to keep the fields in the filenames as close to their original values as possible. You can use `--compat-options filename-sanitization` to revert to youtube-dl's behavior

 For ease of use, a few more compat options are available:

--- a/yt_dlp/YoutubeDL.py
+++ b/yt_dlp/YoutubeDL.py
@@ -2377,13 +2377,18 @@ class YoutubeDL:
            self.report_warning('"duration" field is negative, there is an error in extractor')

        chapters = info_dict.get('chapters') or []
+        if chapters and chapters[0].get('start_time'):
+            chapters.insert(0, {'start_time': 0})
+
        dummy_chapter = {'end_time': 0, 'start_time': info_dict.get('duration')}
-        for prev, current, next_ in zip(
-                (dummy_chapter, *chapters), chapters, (*chapters[1:], dummy_chapter)):
+        for idx, (prev, current, next_) in enumerate(zip(
+                (dummy_chapter, *chapters), chapters, (*chapters[1:], dummy_chapter)), 1):
            if current.get('start_time') is None:
                current['start_time'] = prev.get('end_time')
            if not current.get('end_time'):
                current['end_time'] = next_.get('start_time')
+            if not current.get('title'):
+                current['title'] = f'<Untitled Chapter {idx}>'

        if 'playlist' not in info_dict:
            # It isn't part of a playlist
--- a/yt_dlp/extractor/generic.py
+++ b/yt_dlp/extractor/generic.py
@@ -111,7 +111,6 @@ from ..compat import compat_etree_fromstring
 from ..utils import (
    KNOWN_EXTENSIONS,
    ExtractorError,
-    HEADRequest,
    UnsupportedError,
    determine_ext,
    dict_get,
@@ -124,7 +123,6 @@ from ..utils import (
    orderedSet,
    parse_duration,
    parse_resolution,
-    sanitized_Request,
    smuggle_url,
    str_or_none,
    try_call,
@@ -2807,26 +2805,6 @@ class GenericIE(InfoExtractor):
        else:
            video_id = self._generic_id(url)

-        self.to_screen('%s: Requesting header' % video_id)
-
-        head_req = HEADRequest(url)
-        head_response = self._request_webpage(
-            head_req, video_id,
-            note=False, errnote='Could not send HEAD request to %s' % url,
-            fatal=False)
-
-        if head_response is not False:
-            # Check for redirect
-            new_url = head_response.geturl()
-            if url != new_url:
-                self.report_following_redirect(new_url)
-                if force_videoid:
-                    new_url = smuggle_url(
-                        new_url, {'force_videoid': force_videoid})
-                return self.url_result(new_url)
-
-        def request_webpage():
-            request = sanitized_Request(url)
        # Some webservers may serve compressed content of rather big size (e.g. gzipped flac)
        # making it impossible to download only chunk of the file (yet we need only 512kB to
        # test whether it's HTML or not). According to yt-dlp default Accept-Encoding
@@ -2834,22 +2812,23 @@ class GenericIE(InfoExtractor):
        # Therefore for extraction pass we have to override Accept-Encoding to any in order
        # to accept raw bytes and being able to download only a chunk.
        # It may probably better to solve this by checking Content-Type for application/octet-stream
-            # after HEAD request finishes, but not sure if we can rely on this.
-            request.add_header('Accept-Encoding', '*')
-            return self._request_webpage(request, video_id)
-
-        full_response = None
-        if head_response is False:
-            head_response = full_response = request_webpage()
+        # after a HEAD request, but not sure if we can rely on this.
+        full_response = self._request_webpage(url, video_id, headers={'Accept-Encoding': '*'})
+        new_url = full_response.geturl()
+        if url != new_url:
+            self.report_following_redirect(new_url)
+            if force_videoid:
+                new_url = smuggle_url(new_url, {'force_videoid': force_videoid})
+            return self.url_result(new_url)

        info_dict = {
            'id': video_id,
            'title': self._generic_title(url),
-            'timestamp': unified_timestamp(head_response.headers.get('Last-Modified'))
+            'timestamp': unified_timestamp(full_response.headers.get('Last-Modified'))
        }

        # Check for direct link to a video
-        content_type = head_response.headers.get('Content-Type', '').lower()
+        content_type = full_response.headers.get('Content-Type', '').lower()
        m = re.match(r'^(?P<type>audio|video|application(?=/(?:ogg$|(?:vnd\.apple\.|x-)?mpegurl)))/(?P<format_id>[^;\s]+)', content_type)
        if m:
            self.report_detected('direct video link')
@@ -2878,7 +2857,6 @@ class GenericIE(InfoExtractor):
            self.report_warning(
                '%s on generic information extractor.' % ('Forcing' if force else 'Falling back'))

-        full_response = full_response or request_webpage()
        first_bytes = full_response.read(512)

        # Is it an M3U playlist?
@@ -4103,7 +4081,7 @@ class GenericIE(InfoExtractor):
                webpage)
            if not found:
                # Look also in Refresh HTTP header
-                refresh_header = head_response.headers.get('Refresh')
+                refresh_header = full_response.headers.get('Refresh')
                if refresh_header:
                    found = re.search(REDIRECT_REGEX, refresh_header)
            if found:
--- a/yt_dlp/extractor/youtube.py
+++ b/yt_dlp/extractor/youtube.py
@@ -2764,17 +2764,15 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
        if not strict:
            chapter_list.sort(key=lambda c: c['start_time'] or 0)

-        chapters = [{'start_time': 0, 'title': '<Untitled>'}]
+        chapters = [{'start_time': 0}]
        for idx, chapter in enumerate(chapter_list):
-            if chapter['start_time'] is None or not chapter['title']:
+            if chapter['start_time'] is None:
                self.report_warning(f'Incomplete chapter {idx}')
            elif chapters[-1]['start_time'] <= chapter['start_time'] <= duration:
-                chapters[-1]['end_time'] = chapter['start_time']
                chapters.append(chapter)
            else:
                self.report_warning(f'Invalid start time for chapter "{chapter["title"]}"')
-        chapters[-1]['end_time'] = duration
-        return chapters if len(chapters) > 1 and chapters[1]['start_time'] else chapters[1:]
+        return chapters[1:]

    def _extract_comment(self, comment_renderer, parent=None):
        comment_id = comment_renderer.get('commentId')
--- a/yt_dlp/utils.py
+++ b/yt_dlp/utils.py
@@ -950,6 +950,7 @@ def make_HTTPS_handler(params, **kwargs):
    if opts_check_certificate:
        if has_certifi and 'no-certifi' not in params.get('compat_opts', []):
            context.load_verify_locations(cafile=certifi.where())
+        else:
            try:
                context.load_default_certs()
                # Work around the issue in load_default_certs when there are bad certificates. See:
Author	SHA1	Message	Date
pukkandan	6154438178	[extractor/generic] Remove HEAD request	2022-07-07 12:09:30 +05:30
pukkandan	168bbc4f38	Do not load system certificates when `certifi` is used This causes `CERTIFICATE_VERIFY_FAILED` if there is an expired/bad certificate in the system store Partially reverts `8a82af3511` Related: #4145	2022-07-07 11:29:49 +05:30
pukkandan	a3976e0760	Improve chapter sanitization	2022-07-07 11:28:56 +05:30