From b4e0d75848e9447cee2cd3646ce54d4744a7ff56 Mon Sep 17 00:00:00 2001 From: pukkandan Date: Thu, 22 Jun 2023 04:54:39 +0530 Subject: [PATCH] Improve `--download-sections` * Support negative time-ranges * Add `*from-url` to obey time-ranges in URL Closes #7248 --- README.md | 14 +++++----- yt_dlp/YoutubeDL.py | 6 +++-- yt_dlp/__init__.py | 59 +++++++++++++++++++++++++++++------------- yt_dlp/options.py | 5 ++-- yt_dlp/utils/_utils.py | 22 +++++++++++++--- 5 files changed, 74 insertions(+), 32 deletions(-) diff --git a/README.md b/README.md index d10832103..8db2d4f06 100644 --- a/README.md +++ b/README.md @@ -610,12 +610,14 @@ If you fork the project on GitHub, you can run your fork's [build workflow](.git --no-hls-use-mpegts Do not use the mpegts container for HLS videos. This is default when not downloading live streams - --download-sections REGEX Download only chapters whose title matches - the given regular expression. Time ranges - prefixed by a "*" can also be used in place - of chapters to download the specified range. - Needs ffmpeg. This option can be used - multiple times to download multiple + --download-sections REGEX Download only chapters that match the + regular expression. A "*" prefix denotes + time-range instead of chapter. Negative + timestamps are calculated from the end. + "*from-url" can be used to download between + the "start_time" and "end_time" extracted + from the URL. Needs ffmpeg. This option can + be used multiple times to download multiple sections, e.g. 
--download-sections "*10:15-inf" --download-sections "intro" --downloader [PROTO:]NAME Name or path of the external downloader to diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index 79b7d47b0..6dade0b2a 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -2806,11 +2806,13 @@ class YoutubeDL: new_info.update(fmt) offset, duration = info_dict.get('section_start') or 0, info_dict.get('duration') or float('inf') end_time = offset + min(chapter.get('end_time', duration), duration) + # duration may not be accurate. So allow deviations <1sec + if end_time == float('inf') or end_time > offset + duration + 1: + end_time = None if chapter or offset: new_info.update({ 'section_start': offset + chapter.get('start_time', 0), - # duration may not be accurate. So allow deviations <1sec - 'section_end': end_time if end_time <= offset + duration + 1 else None, + 'section_end': end_time, 'section_title': chapter.get('title'), 'section_number': chapter.get('index'), }) diff --git a/yt_dlp/__init__.py b/yt_dlp/__init__.py index 46edd88d3..b81277a57 100644 --- a/yt_dlp/__init__.py +++ b/yt_dlp/__init__.py @@ -320,26 +320,49 @@ def validate_options(opts): opts.skip_download = None del opts.outtmpl['default'] - def parse_chapters(name, value): - chapters, ranges = [], [] + def parse_chapters(name, value, advanced=False): parse_timestamp = lambda x: float('inf') if x in ('inf', 'infinite') else parse_duration(x) - for regex in value or []: - if regex.startswith('*'): - for range_ in map(str.strip, regex[1:].split(',')): - mobj = range_ != '-' and re.fullmatch(r'([^-]+)?\s*-\s*([^-]+)?', range_) - dur = mobj and (parse_timestamp(mobj.group(1) or '0'), parse_timestamp(mobj.group(2) or 'inf')) - if None in (dur or [None]): - raise ValueError(f'invalid {name} time range "{regex}". 
Must be of the form "*start-end"') - ranges.append(dur) - continue - try: - chapters.append(re.compile(regex)) - except re.error as err: - raise ValueError(f'invalid {name} regex "{regex}" - {err}') - return chapters, ranges + TIMESTAMP_RE = r'''(?x)(?: + (?P<start_sign>-?)(?P<start>[^-]+) + )?\s*-\s*(?: + (?P<end_sign>-?)(?P<end>[^-]+) + )?''' - opts.remove_chapters, opts.remove_ranges = parse_chapters('--remove-chapters', opts.remove_chapters) - opts.download_ranges = download_range_func(*parse_chapters('--download-sections', opts.download_ranges)) + chapters, ranges, from_url = [], [], False + for regex in value or []: + if advanced and regex == '*from-url': + from_url = True + continue + elif not regex.startswith('*'): + try: + chapters.append(re.compile(regex)) + except re.error as err: + raise ValueError(f'invalid {name} regex "{regex}" - {err}') + continue + + for range_ in map(str.strip, regex[1:].split(',')): + mobj = range_ != '-' and re.fullmatch(TIMESTAMP_RE, range_) + dur = mobj and [parse_timestamp(mobj.group('start') or '0'), parse_timestamp(mobj.group('end') or 'inf')] + signs = mobj and (mobj.group('start_sign'), mobj.group('end_sign')) + + err = None + if None in (dur or [None]): + err = 'Must be of the form "*start-end"' + elif not advanced and any(signs): + err = 'Negative timestamps are not allowed' + else: + dur[0] *= -1 if signs[0] else 1 + dur[1] *= -1 if signs[1] else 1 + if dur[1] == float('-inf'): + err = '"-inf" is not a valid end' + if err: + raise ValueError(f'invalid {name} time range "{regex}". 
{err}') + ranges.append(dur) + + return chapters, ranges, from_url + + opts.remove_chapters, opts.remove_ranges, _ = parse_chapters('--remove-chapters', opts.remove_chapters) + opts.download_ranges = download_range_func(*parse_chapters('--download-sections', opts.download_ranges, True)) # Cookies from browser if opts.cookiesfrombrowser: diff --git a/yt_dlp/options.py b/yt_dlp/options.py index 9d6dbec9f..163809706 100644 --- a/yt_dlp/options.py +++ b/yt_dlp/options.py @@ -1012,8 +1012,9 @@ def create_parser(): '--download-sections', metavar='REGEX', dest='download_ranges', action='append', help=( - 'Download only chapters whose title matches the given regular expression. ' - 'Time ranges prefixed by a "*" can also be used in place of chapters to download the specified range. ' + 'Download only chapters that match the regular expression. ' + 'A "*" prefix denotes time-range instead of chapter. Negative timestamps are calculated from the end. ' + '"*from-url" can be used to download between the "start_time" and "end_time" extracted from the URL. ' 'Needs ffmpeg. This option can be used multiple times to download multiple sections, ' 'e.g. 
--download-sections "*10:15-inf" --download-sections "intro"')) downloader.add_option( diff --git a/yt_dlp/utils/_utils.py b/yt_dlp/utils/_utils.py index bc1bc9116..56acadd73 100644 --- a/yt_dlp/utils/_utils.py +++ b/yt_dlp/utils/_utils.py @@ -3753,11 +3753,11 @@ def match_filter_func(filters, breaking_filters=None): class download_range_func: - def __init__(self, chapters, ranges): - self.chapters, self.ranges = chapters, ranges + def __init__(self, chapters, ranges, from_info=False): + self.chapters, self.ranges, self.from_info = chapters, ranges, from_info def __call__(self, info_dict, ydl): - if not self.ranges and not self.chapters: + if not any((self.ranges, self.chapters, self.from_info)): yield {} warning = ('There are no chapters matching the regex' if info_dict.get('chapters') @@ -3770,7 +3770,21 @@ class download_range_func: if self.chapters and warning: ydl.to_screen(f'[info] {info_dict["id"]}: {warning}') - yield from ({'start_time': start, 'end_time': end} for start, end in self.ranges or []) + for start, end in self.ranges or []: + yield { + 'start_time': self._handle_negative_timestamp(start, info_dict), + 'end_time': self._handle_negative_timestamp(end, info_dict), + } + + if self.from_info and (info_dict.get('start_time') or info_dict.get('end_time')): + yield { + 'start_time': info_dict.get('start_time'), + 'end_time': info_dict.get('end_time'), + } + + @staticmethod + def _handle_negative_timestamp(time, info): + return max(info['duration'] + time, 0) if info.get('duration') and time < 0 else time def __eq__(self, other): return (isinstance(other, download_range_func)