Compare commits

...

11 Commits

Author SHA1 Message Date
Felix S
47b8bf207b
[go,viu] Extract subtitles from the m3u8 manifest (#3219)
Authored by: fstirlitz
2022-03-27 02:35:14 -07:00
Tim Schindler
4628a3aa75
[ITProTV] Add extractor (#3196)
Authored by: aaearon
2022-03-27 02:00:38 -07:00
mehq
5b4bb715e6
[BanBye] Add extractor (#3177)
Closes #3175
Authored by: mehq
2022-03-27 01:57:05 -07:00
pukkandan
1235d333ab
[youtube] Fix auto-translated automatic captions
d49669acad only covered ASR

Closes #2956
2022-03-27 14:06:26 +05:30
pukkandan
18e4940825
[youtube] Add extractor-arg to skip auto-translated subs 2022-03-27 14:04:20 +05:30
pukkandan
c0b6e5c74d
Show warning when all media formats have DRM
Related: #1379
2022-03-27 11:39:35 +05:30
shirt
727029c508
[youtube] Detect DRM better
Authored by: shirt-dev
2022-03-27 11:27:27 +05:30
pukkandan
5c3895fff1
[outtmpl] Limit changes during sanitization
Closes #2761
2022-03-27 11:18:35 +05:30
coletdev
fd2ad7cb24
[youtube:tab] Return shorts url if video is a short (#3168)
Allows filtering out shorts from feeds with `--match-filter`
Closes #3165
Authored-by: coletdjnz
2022-03-27 05:20:25 +00:00
pukkandan
4a3175fc4c
[VideoConvertor] Ensure all streams are copied
Closes #3200
2022-03-27 09:28:58 +05:30
pukkandan
5cf34021f5
[Concat] Ensure final directory exists
Fixes https://github.com/yt-dlp/yt-dlp/issues/3181#issuecomment-1079622589
2022-03-27 04:52:11 +05:30
12 changed files with 367 additions and 36 deletions

View File

@ -144,6 +144,7 @@ Some of yt-dlp's default options are different from that of youtube-dl and youtu
* Some private fields such as filenames are removed by default from the infojson. Use `--no-clean-infojson` or `--compat-options no-clean-infojson` to revert this
* When `--embed-subs` and `--write-subs` are used together, the subtitles are written to disk and also embedded in the media file. You can use just `--embed-subs` to embed the subs and automatically delete the separate file. See [#630 (comment)](https://github.com/yt-dlp/yt-dlp/issues/630#issuecomment-893659460) for more info. `--compat-options no-keep-subs` can be used to revert this
* `certifi` will be used for SSL root certificates, if installed. If you want to use system certificates (e.g. self-signed), use `--compat-options no-certifi`
* youtube-dl tries to remove some superfluous punctuations from filenames. While this can sometimes be helpfull, it is often undesirable. So yt-dlp tries to keep the fields in the filenames as close to their original values as possible. You can use `--compat-options filename-sanitization` to revert to youtube-dl's behavior
For ease of use, a few more compat options are available:
* `--compat-options all`: Use all compat options
@ -1655,7 +1656,7 @@ Some extractors accept additional arguments which can be passed using `--extract
The following extractors use this feature:
#### youtube
* `skip`: `hls` or `dash` (or both) to skip download of the respective manifests
* `skip`: One or more of `hls`, `dash` or `translated_subs` to skip extraction of the m3u8 manifests, dash manifests and auto-translated subtitles respectively
* `player_client`: Clients to extract video data from. The main clients are `web`, `android`, `ios`, `mweb`. These also have `_music`, `_embedded`, `_agegate`, and `_creator` variants (Eg: `web_embedded`) (`mweb` has only `_agegate`). By default, `android,web` is used, but the agegate and creator variants are added as required for age-gated videos. Similarly the music variants are added for `music.youtube.com` urls. You can also use `all` to use all the clients, and `default` for the default clients.
* `player_skip`: Skip some network requests that are generally needed for robust extraction. One or more of `configs` (skip client configs), `webpage` (skip initial webpage), `js` (skip js player). While these options can help reduce the number of requests needed or avoid some rate-limiting, they could cause some issues. See [#860](https://github.com/yt-dlp/yt-dlp/pull/860) for more details
* `include_live_dash`: Include live dash formats even without `--live-from-start` (These formats don't download properly)

View File

@ -160,10 +160,12 @@ class TestUtil(unittest.TestCase):
sanitize_filename('New World record at 0:12:34'),
'New World record at 0_12_34')
self.assertEqual(sanitize_filename('--gasdgf'), '_-gasdgf')
self.assertEqual(sanitize_filename('--gasdgf'), '--gasdgf')
self.assertEqual(sanitize_filename('--gasdgf', is_id=True), '--gasdgf')
self.assertEqual(sanitize_filename('.gasdgf'), 'gasdgf')
self.assertEqual(sanitize_filename('--gasdgf', is_id=False), '_-gasdgf')
self.assertEqual(sanitize_filename('.gasdgf'), '.gasdgf')
self.assertEqual(sanitize_filename('.gasdgf', is_id=True), '.gasdgf')
self.assertEqual(sanitize_filename('.gasdgf', is_id=False), 'gasdgf')
forbidden = '"\0\\/'
for fc in forbidden:

View File

@ -87,6 +87,7 @@ from .utils import (
MaxDownloadsReached,
merge_headers,
network_exceptions,
NO_DEFAULT,
number_of_digits,
orderedSet,
OUTTMPL_TYPES,
@ -1150,8 +1151,10 @@ class YoutubeDL(object):
na = self.params.get('outtmpl_na_placeholder', 'NA')
def filename_sanitizer(key, value, restricted=self.params.get('restrictfilenames')):
return sanitize_filename(str(value), restricted=restricted,
is_id=re.search(r'(^|[_.])id(\.|$)', key))
return sanitize_filename(str(value), restricted=restricted, is_id=(
bool(re.search(r'(^|[_.])id(\.|$)', key))
if 'filename-sanitization' in self.params.get('compat_opts', [])
else NO_DEFAULT))
sanitizer = sanitize if callable(sanitize) else filename_sanitizer
sanitize = bool(sanitize)
@ -2456,6 +2459,11 @@ class YoutubeDL(object):
info_dict['__has_drm'] = any(f.get('has_drm') for f in formats)
if not self.params.get('allow_unplayable_formats'):
formats = [f for f in formats if not f.get('has_drm')]
if info_dict['__has_drm'] and all(
f.get('acodec') == f.get('vcodec') == 'none' for f in formats):
self.report_warning(
'This video is DRM protected and only images are available for download. '
'Use --list-formats to see them')
get_from_start = not info_dict.get('is_live') or bool(self.params.get('live_from_start'))
if not get_from_start:
@ -2628,8 +2636,9 @@ class YoutubeDL(object):
if not formats_to_download:
if not self.params.get('ignore_no_formats_error'):
raise ExtractorError('Requested format is not available', expected=True,
video_id=info_dict['id'], ie=info_dict['extractor'])
raise ExtractorError(
'Requested format is not available. Use --list-formats for a list of available formats',
expected=True, video_id=info_dict['id'], ie=info_dict['extractor'])
self.report_warning('Requested format is not available')
# Process what we can, even without any available formats.
formats_to_download = [{}]

153
yt_dlp/extractor/banbye.py Normal file
View File

@ -0,0 +1,153 @@
# coding: utf-8
from __future__ import unicode_literals
import math
from .common import InfoExtractor
from ..compat import (
compat_urllib_parse_urlparse,
compat_parse_qs,
)
from ..utils import (
format_field,
InAdvancePagedList,
traverse_obj,
unified_timestamp,
)
class BanByeBaseIE(InfoExtractor):
_API_BASE = 'https://api.banbye.com'
_CDN_BASE = 'https://cdn.banbye.com'
_VIDEO_BASE = 'https://banbye.com/watch'
@staticmethod
def _extract_playlist_id(url, param='playlist'):
return compat_parse_qs(
compat_urllib_parse_urlparse(url).query).get(param, [None])[0]
def _extract_playlist(self, playlist_id):
data = self._download_json(f'{self._API_BASE}/playlists/{playlist_id}', playlist_id)
return self.playlist_result([
self.url_result(f'{self._VIDEO_BASE}/{video_id}', BanByeIE)
for video_id in data['videoIds']], playlist_id, data.get('name'))
class BanByeIE(BanByeBaseIE):
_VALID_URL = r'https?://(?:www\.)?banbye.com/(?:en/)?watch/(?P<id>\w+)'
_TESTS = [{
'url': 'https://banbye.com/watch/v_ytfmvkVYLE8T',
'md5': '2f4ea15c5ca259a73d909b2cfd558eb5',
'info_dict': {
'id': 'v_ytfmvkVYLE8T',
'ext': 'mp4',
'title': 'md5:5ec098f88a0d796f987648de6322ba0f',
'description': 'md5:4d94836e73396bc18ef1fa0f43e5a63a',
'uploader': 'wRealu24',
'channel_id': 'ch_wrealu24',
'channel_url': 'https://banbye.com/channel/ch_wrealu24',
'timestamp': 1647604800,
'upload_date': '20220318',
'duration': 1931,
'thumbnail': r're:https?://.*\.webp',
'tags': 'count:5',
'like_count': int,
'dislike_count': int,
'view_count': int,
'comment_count': int,
},
}, {
'url': 'https://banbye.com/watch/v_2JjQtqjKUE_F?playlistId=p_Ld82N6gBw_OJ',
'info_dict': {
'title': 'Krzysztof Karoń',
'id': 'p_Ld82N6gBw_OJ',
},
'playlist_count': 9,
}]
def _real_extract(self, url):
video_id = self._match_id(url)
playlist_id = self._extract_playlist_id(url, 'playlistId')
if self._yes_playlist(playlist_id, video_id):
return self._extract_playlist(playlist_id)
data = self._download_json(f'{self._API_BASE}/videos/{video_id}', video_id)
thumbnails = [{
'id': f'{quality}p',
'url': f'{self._CDN_BASE}/video/{video_id}/{quality}.webp',
} for quality in [48, 96, 144, 240, 512, 1080]]
formats = [{
'format_id': f'http-{quality}p',
'quality': quality,
'url': f'{self._CDN_BASE}/video/{video_id}/{quality}.mp4',
} for quality in data['quality']]
self._sort_formats(formats)
return {
'id': video_id,
'title': data.get('title'),
'description': data.get('desc'),
'uploader': traverse_obj(data, ('channel', 'name')),
'channel_id': data.get('channelId'),
'channel_url': format_field(data, 'channelId', 'https://banbye.com/channel/%s'),
'timestamp': unified_timestamp(data.get('publishedAt')),
'duration': data.get('duration'),
'tags': data.get('tags'),
'formats': formats,
'thumbnails': thumbnails,
'like_count': data.get('likes'),
'dislike_count': data.get('dislikes'),
'view_count': data.get('views'),
'comment_count': data.get('commentCount'),
}
class BanByeChannelIE(BanByeBaseIE):
_VALID_URL = r'https?://(?:www\.)?banbye.com/(?:en/)?channel/(?P<id>\w+)'
_TESTS = [{
'url': 'https://banbye.com/channel/ch_wrealu24',
'info_dict': {
'title': 'wRealu24',
'id': 'ch_wrealu24',
'description': 'md5:da54e48416b74dfdde20a04867c0c2f6',
},
'playlist_mincount': 791,
}, {
'url': 'https://banbye.com/channel/ch_wrealu24?playlist=p_Ld82N6gBw_OJ',
'info_dict': {
'title': 'Krzysztof Karoń',
'id': 'p_Ld82N6gBw_OJ',
},
'playlist_count': 9,
}]
_PAGE_SIZE = 100
def _real_extract(self, url):
channel_id = self._match_id(url)
playlist_id = self._extract_playlist_id(url)
if playlist_id:
return self._extract_playlist(playlist_id)
def page_func(page_num):
data = self._download_json(f'{self._API_BASE}/videos', channel_id, query={
'channelId': channel_id,
'sort': 'new',
'limit': self._PAGE_SIZE,
'offset': page_num * self._PAGE_SIZE,
}, note=f'Downloading page {page_num+1}')
return [
self.url_result(f"{self._VIDEO_BASE}/{video['_id']}", BanByeIE)
for video in data['items']
]
channel_data = self._download_json(f'{self._API_BASE}/channels/{channel_id}', channel_id)
entries = InAdvancePagedList(
page_func,
math.ceil(channel_data['videoCount'] / self._PAGE_SIZE),
self._PAGE_SIZE)
return self.playlist_result(
entries, channel_id, channel_data.get('name'), channel_data.get('description'))

View File

@ -122,6 +122,10 @@ from .awaan import (
)
from .azmedien import AZMedienIE
from .baidu import BaiduVideoIE
from .banbye import (
BanByeIE,
BanByeChannelIE,
)
from .bandaichannel import BandaiChannelIE
from .bandcamp import (
BandcampIE,
@ -674,6 +678,12 @@ from .iqiyi import (
IqIE,
IqAlbumIE
)
from .itprotv import (
ITProTVIE,
ITProTVCourseIE
)
from .itv import (
ITVIE,
ITVBTCCIE,

View File

@ -217,6 +217,7 @@ class GoIE(AdobePassIE):
title = video_data['title']
formats = []
subtitles = {}
for asset in video_data.get('assets', {}).get('asset', []):
asset_url = asset.get('value')
if not asset_url:
@ -256,8 +257,10 @@ class GoIE(AdobePassIE):
error_message = ', '.join([error['message'] for error in errors])
raise ExtractorError('%s said: %s' % (self.IE_NAME, error_message), expected=True)
asset_url += '?' + entitlement['uplynkData']['sessionKey']
formats.extend(self._extract_m3u8_formats(
asset_url, video_id, 'mp4', m3u8_id=format_id or 'hls', fatal=False))
fmts, subs = self._extract_m3u8_formats_and_subtitles(
asset_url, video_id, 'mp4', m3u8_id=format_id or 'hls', fatal=False)
formats.extend(fmts)
self._merge_subtitles(subs, target=subtitles)
else:
f = {
'format_id': format_id,
@ -281,7 +284,6 @@ class GoIE(AdobePassIE):
formats.append(f)
self._sort_formats(formats)
subtitles = {}
for cc in video_data.get('closedcaption', {}).get('src', []):
cc_url = cc.get('value')
if not cc_url:

141
yt_dlp/extractor/itprotv.py Normal file
View File

@ -0,0 +1,141 @@
# coding: utf-8
import re
from .common import InfoExtractor
from ..utils import (
int_or_none,
str_or_none,
traverse_obj,
urljoin
)
class ITProTVBaseIE(InfoExtractor):
_ENDPOINTS = {
'course': 'course?url={}&brand=00002560-0000-3fa9-0000-1d61000035f3',
'episode': 'brand/00002560-0000-3fa9-0000-1d61000035f3/episode?url={}'
}
def _call_api(self, ep, item_id, webpage):
return self._download_json(
f'https://api.itpro.tv/api/urza/v3/consumer-web/{self._ENDPOINTS[ep].format(item_id)}',
item_id, note=f'Fetching {ep} data API',
headers={'Authorization': f'Bearer {self._fetch_jwt(webpage)}'})[ep]
def _fetch_jwt(self, webpage):
return self._search_regex(r'{"passedToken":"([\w-]+\.[\w-]+\.[\w-]+)",', webpage, 'jwt')
def _check_if_logged_in(self, webpage):
if re.match(r'{\s*member\s*:\s*null', webpage):
self.raise_login_required()
class ITProTVIE(ITProTVBaseIE):
_VALID_URL = r'https://app.itpro.tv/course/(?P<course>[\w-]+)/(?P<id>[\w-]+)'
_TESTS = [{
'url': 'https://app.itpro.tv/course/guided-tour/introductionitprotv',
'md5': 'bca4a28c2667fd1a63052e71a94bb88c',
'info_dict': {
'id': 'introductionitprotv',
'ext': 'mp4',
'title': 'An Introduction to ITProTV 101',
'thumbnail': 'https://itprotv-image-bucket.s3.amazonaws.com/getting-started/itprotv-101-introduction-PGM.11_39_56_02.Still001.png',
'description': 'md5:b175c2c3061ce35a4dd33865b2c1da4e',
'duration': 269,
'series': 'ITProTV 101',
'series_id': 'guided-tour',
'availability': 'needs_auth',
'chapter': 'ITProTV 101',
'chapter_number': 1,
'chapter_id': '5dbb3de426b46c0010b5d1b6'
},
},
{
'url': 'https://app.itpro.tv/course/beyond-tech/job-interview-tips',
'md5': '101a299b98c47ccf4c67f9f0951defa8',
'info_dict': {
'id': 'job-interview-tips',
'ext': 'mp4',
'title': 'Job Interview Tips',
'thumbnail': 'https://s3.amazonaws.com:443/production-itprotv-thumbnails/2f370bf5-294d-4bbe-ab80-c0b5781630ea.png',
'description': 'md5:30d8ba483febdf89ec85623aad3c3cb6',
'duration': 267,
'series': 'Beyond Tech',
'series_id': 'beyond-tech',
'availability': 'needs_auth',
'chapter': 'Job Development',
'chapter_number': 2,
'chapter_id': '5f7c78d424330c000edf04d9'
},
}]
def _real_extract(self, url):
episode_id, course_name = self._match_valid_url(url).group('id', 'course')
webpage = self._download_webpage(url, episode_id)
self._check_if_logged_in(webpage)
course = self._call_api('course', course_name, webpage)
episode = self._call_api('episode', episode_id, webpage)
chapter_number, chapter = next((
(i, topic) for i, topic in enumerate(course.get('topics') or [], 1)
if traverse_obj(topic, 'id') == episode.get('topic')), {})
return {
'id': episode_id,
'title': episode.get('title'),
'description': episode.get('description'),
'thumbnail': episode.get('thumbnail'),
'formats': [
{'url': episode[f'jwVideo{h}Embed'], 'height': h}
for h in (320, 480, 720, 1080) if episode.get(f'jwVideo{h}Embed')
],
'duration': int_or_none(episode.get('length')),
'series': course.get('name'),
'series_id': course.get('url'),
'chapter': str_or_none(chapter.get('title')),
'chapter_number': chapter_number,
'chapter_id': str_or_none(chapter.get('id')),
'subtitles': {
'en': [{'ext': 'vtt', 'data': episode['enCaptionData']}]
} if episode.get('enCaptionData') else None,
}
class ITProTVCourseIE(ITProTVBaseIE):
_VALID_URL = r'https?://app.itpro.tv/course/(?P<id>[\w-]+)/?(?:$|[#?])'
_TESTS = [
{
'url': 'https://app.itpro.tv/course/guided-tour',
'info_dict': {
'id': 'guided-tour',
'description': 'md5:b175c2c3061ce35a4dd33865b2c1da4e',
'title': 'ITProTV 101',
},
'playlist_count': 6
},
{
'url': 'https://app.itpro.tv/course/beyond-tech',
'info_dict': {
'id': 'beyond-tech',
'description': 'md5:44cd99855e7f81a15ce1269bd0621fed',
'title': 'Beyond Tech'
},
'playlist_count': 15
},
]
def _real_extract(self, url):
course_id = self._match_id(url)
webpage = self._download_webpage(url, course_id)
self._check_if_logged_in(webpage)
course = self._call_api('course', course_id, webpage)
entries = [self.url_result(
urljoin(url, f'{course_id}/{episode["url"]}'), ITProTVIE,
episode['url'], episode.get('title'), url_transparent=True)
for episode in course['episodes']]
return self.playlist_result(
entries, course_id, course.get('name'), course.get('description'))

View File

@ -88,10 +88,9 @@ class ViuIE(ViuBaseIE):
# r'(/hlsc_)[a-z]+(\d+\.m3u8)',
# r'\1whe\2', video_data['href'])
m3u8_url = video_data['href']
formats = self._extract_m3u8_formats(m3u8_url, video_id, 'mp4')
formats, subtitles = self._extract_m3u8_formats_and_subtitles(m3u8_url, video_id, 'mp4')
self._sort_formats(formats)
subtitles = {}
for key, value in video_data.items():
mobj = re.match(r'^subtitle_(?P<lang>[^_]+)_(?P<ext>(vtt|srt))', key)
if not mobj:

View File

@ -818,12 +818,17 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
renderer, ('thumbnailOverlays', ..., 'thumbnailOverlayTimeStatusRenderer', 'style'), get_all=False, expected_type=str)
badges = self._extract_badges(renderer)
thumbnails = self._extract_thumbnails(renderer, 'thumbnail')
navigation_url = urljoin('https://www.youtube.com/', traverse_obj(
renderer, ('navigationEndpoint', 'commandMetadata', 'webCommandMetadata', 'url'), expected_type=str))
url = f'https://www.youtube.com/watch?v={video_id}'
if overlay_style == 'SHORTS' or (navigation_url and '/shorts/' in navigation_url):
url = f'https://www.youtube.com/shorts/{video_id}'
return {
'_type': 'url',
'ie_key': YoutubeIE.ie_key(),
'id': video_id,
'url': f'https://www.youtube.com/watch?v={video_id}',
'url': url,
'title': title,
'description': description,
'duration': duration,
@ -3018,7 +3023,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
streaming_formats = traverse_obj(streaming_data, (..., ('formats', 'adaptiveFormats'), ...), default=[])
for fmt in streaming_formats:
if fmt.get('targetDurationSec') or fmt.get('drmFamilies'):
if fmt.get('targetDurationSec'):
continue
itag = str_or_none(fmt.get('itag'))
@ -3100,6 +3105,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'fps': int_or_none(fmt.get('fps')) or None,
'height': height,
'quality': q(quality),
'has_drm': bool(fmt.get('drmFamilies')),
'tbr': tbr,
'url': fmt_url,
'width': int_or_none(fmt.get('width')),
@ -3473,6 +3479,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
subtitles, automatic_captions = {}, {}
for lang_code, caption_track in captions.items():
base_url = caption_track.get('baseUrl')
orig_lang = parse_qs(base_url).get('lang', [None])[-1]
if not base_url:
continue
lang_name = self._get_text(caption_track, 'name', max_runs=1)
@ -3486,19 +3493,20 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
for trans_code, trans_name in translation_languages.items():
if not trans_code:
continue
orig_trans_code = trans_code
if caption_track.get('kind') != 'asr':
if 'translated_subs' in self._configuration_arg('skip'):
continue
trans_code += f'-{lang_code}'
trans_name += format_field(lang_name, template=' from %s')
# Add an "-orig" label to the original language so that it can be distinguished.
# The subs are returned without "-orig" as well for compatibility
if lang_code == f'a-{trans_code}':
if lang_code == f'a-{orig_trans_code}':
process_language(
automatic_captions, base_url, f'{trans_code}-orig', f'{trans_name} (Original)', {})
# Setting tlang=lang returns damaged subtitles.
# Not using lang_code == f'a-{trans_code}' here for future-proofing
orig_lang = parse_qs(base_url).get('lang', [None])[-1]
process_language(automatic_captions, base_url, trans_code, trans_name,
{} if orig_lang == trans_code else {'tlang': trans_code})
{} if orig_lang == orig_trans_code else {'tlang': trans_code})
info['automatic_captions'] = automatic_captions
info['subtitles'] = subtitles

View File

@ -338,7 +338,7 @@ def create_parser():
action='callback', callback=_set_from_options_callback,
callback_kwargs={
'allowed_values': {
'filename', 'format-sort', 'abort-on-error', 'format-spec', 'no-playlist-metafiles',
'filename', 'filename-sanitization', 'format-sort', 'abort-on-error', 'format-spec', 'no-playlist-metafiles',
'multistreams', 'no-live-chat', 'playlist-index', 'list-formats', 'no-direct-merge',
'no-youtube-channel-redirect', 'no-youtube-unavailable-videos', 'no-attach-info-json', 'embed-metadata',
'embed-thumbnail-atomicparsley', 'seperate-video-versions', 'no-clean-infojson', 'no-keep-subs', 'no-certifi',

View File

@ -553,9 +553,9 @@ class FFmpegVideoConvertorPP(FFmpegPostProcessor):
@staticmethod
def _options(target_ext):
yield from FFmpegPostProcessor.stream_copy_opts(False)
if target_ext == 'avi':
return ['-c:v', 'libxvid', '-vtag', 'XVID']
return []
yield from ('-c:v', 'libxvid', '-vtag', 'XVID')
@PostProcessor._restrict_to(images=False)
def run(self, info):
@ -1129,6 +1129,8 @@ class FFmpegConcatPP(FFmpegPostProcessor):
super().__init__(downloader)
def concat_files(self, in_files, out_file):
if not self._downloader._ensure_dir_exists(out_file):
return
if len(in_files) == 1:
if os.path.realpath(in_files[0]) != os.path.realpath(out_file):
self.to_screen(f'Moving "{in_files[0]}" to "{out_file}"')

View File

@ -705,36 +705,40 @@ def timeconvert(timestr):
return timestamp
def sanitize_filename(s, restricted=False, is_id=False):
def sanitize_filename(s, restricted=False, is_id=NO_DEFAULT):
"""Sanitizes a string so it could be used as part of a filename.
If restricted is set, use a stricter subset of allowed characters.
Set is_id if this is not an arbitrary string, but an ID that should be kept
if possible.
@param restricted Use a stricter subset of allowed characters
@param is_id Whether this is an ID that should be kept unchanged if possible.
If unset, yt-dlp's new sanitization rules are in effect
"""
if s == '':
return ''
def replace_insane(char):
if restricted and char in ACCENT_CHARS:
return ACCENT_CHARS[char]
elif not restricted and char == '\n':
return ' '
return '\0 '
elif char == '?' or ord(char) < 32 or ord(char) == 127:
return ''
elif char == '"':
return '' if restricted else '\''
elif char == ':':
return '_-' if restricted else ' -'
return '\0_\0-' if restricted else '\0 \0-'
elif char in '\\/|*<>':
return '_'
if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
return '_'
if restricted and ord(char) > 127:
return '_'
return '\0_'
if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace() or ord(char) > 127):
return '\0_'
return char
if s == '':
return ''
# Handle timestamps
s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s) # Handle timestamps
result = ''.join(map(replace_insane, s))
if is_id is NO_DEFAULT:
result = re.sub('(\0.)(?:(?=\\1)..)+', r'\1', result) # Remove repeated substitute chars
STRIP_RE = '(?:\0.|[ _-])*'
result = re.sub(f'^\0.{STRIP_RE}|{STRIP_RE}\0.$', '', result) # Remove substitute chars from start/end
result = result.replace('\0', '') or '_'
if not is_id:
while '__' in result:
result = result.replace('__', '_')