Compare commits

...

4 Commits

Author SHA1 Message Date
krichbanana
ca5300c7ed
[youtube] Add :ytnotifications extractor (#3347)
Authored by: krichbanana
2022-04-09 12:55:24 -07:00
pukkandan
97ec5bc550
[cookies] Report progress when importing cookies 2022-04-10 01:21:35 +05:30
pukkandan
a25bca9f89
[youtube, cleanup] Minor refactoring
Authored by: coletdjnz, pukkandan
2022-04-10 01:21:34 +05:30
pukkandan
f894294636
[EmbedThumbnail] Do not remove id3v1 tags 2022-04-10 01:21:34 +05:30
7 changed files with 235 additions and 72 deletions

View File

@ -79,7 +79,7 @@ yt-dlp is a [youtube-dl](https://github.com/ytdl-org/youtube-dl) fork based on t
* **Merged with animelover1984/youtube-dl**: You get most of the features and improvements from [animelover1984/youtube-dl](https://github.com/animelover1984/youtube-dl) including `--write-comments`, `BiliBiliSearch`, `BilibiliChannel`, Embedding thumbnail in mp4/ogg/opus, playlist infojson etc. Note that the NicoNico livestreams are not available. See [#31](https://github.com/yt-dlp/yt-dlp/pull/31) for details.
* **Youtube improvements**:
* All Feeds (`:ytfav`, `:ytwatchlater`, `:ytsubs`, `:ythistory`, `:ytrec`) and private playlists supports downloading multiple pages of content
* All Feeds (`:ytfav`, `:ytwatchlater`, `:ytsubs`, `:ythistory`, `:ytrec`, `:ytnotif`) and private playlists support downloading multiple pages of content
* Search (`ytsearch:`, `ytsearchdate:`), search URLs and in-channel search work
* Mixes support downloading multiple pages of content
* Some (but not all) age-gated content can be downloaded without cookies

View File

@ -643,6 +643,11 @@ class YoutubeDL(object):
else:
raise
if auto_init:
if auto_init != 'no_verbose_header':
self.print_debug_header()
self.add_default_info_extractors()
if (sys.platform != 'win32'
and sys.getfilesystemencoding() in ['ascii', 'ANSI_X3.4-1968']
and not self.params.get('restrictfilenames', False)):
@ -664,13 +669,6 @@ class YoutubeDL(object):
# Set http_headers defaults according to std_headers
self.params['http_headers'] = merge_headers(std_headers, self.params.get('http_headers', {}))
self._setup_opener()
if auto_init:
if auto_init != 'no_verbose_header':
self.print_debug_header()
self.add_default_info_extractors()
hooks = {
'post_hooks': self.add_post_hook,
'progress_hooks': self.add_progress_hook,
@ -687,6 +685,7 @@ class YoutubeDL(object):
get_postprocessor(pp_def.pop('key'))(self, **compat_kwargs(pp_def)),
when=when)
self._setup_opener()
register_socks_protocols()
def preload_download_archive(fn):
@ -3698,6 +3697,7 @@ class YoutubeDL(object):
delim=', ') or 'none'
write_debug('Optional libraries: %s' % lib_str)
self._setup_opener()
proxy_map = {}
for handler in self._opener.handlers:
if hasattr(handler, 'proxies'):
@ -3717,6 +3717,8 @@ class YoutubeDL(object):
latest_version)
def _setup_opener(self):
if hasattr(self, '_opener'):
return
timeout_val = self.params.get('socket_timeout')
self._socket_timeout = 20 if timeout_val is None else float(timeout_val)

View File

@ -20,6 +20,7 @@ from .compat import (
compat_b64decode,
compat_cookiejar_Cookie,
)
from .minicurses import MultilinePrinter, QuietMultilinePrinter
from .utils import (
error_to_str,
expand_path,
@ -73,6 +74,32 @@ class YDLLogger:
if self._ydl:
self._ydl.report_error(message)
def progress_bar(self):
    """Return a context manager with a print method. (Optional)

    Returns None when no progress output should be shown: there is no
    YoutubeDL instance, the 'noprogress' or 'logger' params are set, or the
    error stream is not a terminal (or cannot be queried).
    Otherwise returns a MultilinePrinter whose ``print(message)`` rewrites
    line 0 with a '[Cookies] ' prefix.
    """
    # Do not print to files/pipes, loggers, or when --no-progress is used
    if not self._ydl or self._ydl.params.get('noprogress') or self._ydl.params.get('logger'):
        return None
    file = self._ydl._out_files['error']
    try:
        if not file.isatty():
            return None
    except Exception:
        # A stream that cannot report whether it is a tty is treated as
        # non-interactive. Only Exception is caught so that
        # KeyboardInterrupt/SystemExit still propagate to the caller.
        return None
    printer = MultilinePrinter(file, preserve_output=False)
    printer.print = lambda message: printer.print_at_line(f'[Cookies] {message}', 0)
    return printer
def _create_progress_bar(logger):
if hasattr(logger, 'progress_bar'):
printer = logger.progress_bar()
if printer:
return printer
printer = QuietMultilinePrinter()
printer.print = lambda _: None
return printer
def load_cookies(cookie_file, browser_specification, ydl):
cookie_jars = []
@ -115,7 +142,7 @@ def _extract_firefox_cookies(profile, logger):
else:
search_root = os.path.join(_firefox_browser_dir(), profile)
cookie_database_path = _find_most_recently_used_file(search_root, 'cookies.sqlite')
cookie_database_path = _find_most_recently_used_file(search_root, 'cookies.sqlite', logger)
if cookie_database_path is None:
raise FileNotFoundError('could not find firefox cookies database in {}'.format(search_root))
logger.debug('Extracting cookies from: "{}"'.format(cookie_database_path))
@ -126,13 +153,17 @@ def _extract_firefox_cookies(profile, logger):
cursor = _open_database_copy(cookie_database_path, tmpdir)
cursor.execute('SELECT host, name, value, path, expiry, isSecure FROM moz_cookies')
jar = YoutubeDLCookieJar()
for host, name, value, path, expiry, is_secure in cursor.fetchall():
cookie = compat_cookiejar_Cookie(
version=0, name=name, value=value, port=None, port_specified=False,
domain=host, domain_specified=bool(host), domain_initial_dot=host.startswith('.'),
path=path, path_specified=bool(path), secure=is_secure, expires=expiry, discard=False,
comment=None, comment_url=None, rest={})
jar.set_cookie(cookie)
with _create_progress_bar(logger) as progress_bar:
table = cursor.fetchall()
total_cookie_count = len(table)
for i, (host, name, value, path, expiry, is_secure) in enumerate(table):
progress_bar.print(f'Loading cookie {i: 6d}/{total_cookie_count: 6d}')
cookie = compat_cookiejar_Cookie(
version=0, name=name, value=value, port=None, port_specified=False,
domain=host, domain_specified=bool(host), domain_initial_dot=host.startswith('.'),
path=path, path_specified=bool(path), secure=is_secure, expires=expiry, discard=False,
comment=None, comment_url=None, rest={})
jar.set_cookie(cookie)
logger.info('Extracted {} cookies from firefox'.format(len(jar)))
return jar
finally:
@ -232,7 +263,7 @@ def _extract_chrome_cookies(browser_name, profile, keyring, logger):
logger.error('{} does not support profiles'.format(browser_name))
search_root = config['browser_dir']
cookie_database_path = _find_most_recently_used_file(search_root, 'Cookies')
cookie_database_path = _find_most_recently_used_file(search_root, 'Cookies', logger)
if cookie_database_path is None:
raise FileNotFoundError('could not find {} cookies database in "{}"'.format(browser_name, search_root))
logger.debug('Extracting cookies from: "{}"'.format(cookie_database_path))
@ -251,26 +282,18 @@ def _extract_chrome_cookies(browser_name, profile, keyring, logger):
jar = YoutubeDLCookieJar()
failed_cookies = 0
unencrypted_cookies = 0
for host_key, name, value, encrypted_value, path, expires_utc, is_secure in cursor.fetchall():
host_key = host_key.decode('utf-8')
name = name.decode('utf-8')
value = value.decode('utf-8')
path = path.decode('utf-8')
if not value and encrypted_value:
value = decryptor.decrypt(encrypted_value)
if value is None:
with _create_progress_bar(logger) as progress_bar:
table = cursor.fetchall()
total_cookie_count = len(table)
for i, line in enumerate(table):
progress_bar.print(f'Loading cookie {i: 6d}/{total_cookie_count: 6d}')
is_encrypted, cookie = _process_chrome_cookie(decryptor, *line)
if not cookie:
failed_cookies += 1
continue
else:
unencrypted_cookies += 1
cookie = compat_cookiejar_Cookie(
version=0, name=name, value=value, port=None, port_specified=False,
domain=host_key, domain_specified=bool(host_key), domain_initial_dot=host_key.startswith('.'),
path=path, path_specified=bool(path), secure=is_secure, expires=expires_utc, discard=False,
comment=None, comment_url=None, rest={})
jar.set_cookie(cookie)
elif not is_encrypted:
unencrypted_cookies += 1
jar.set_cookie(cookie)
if failed_cookies > 0:
failed_message = ' ({} could not be decrypted)'.format(failed_cookies)
else:
@ -285,6 +308,25 @@ def _extract_chrome_cookies(browser_name, profile, keyring, logger):
cursor.connection.close()
def _process_chrome_cookie(decryptor, host_key, name, value, encrypted_value, path, expires_utc, is_secure):
host_key = host_key.decode('utf-8')
name = name.decode('utf-8')
value = value.decode('utf-8')
path = path.decode('utf-8')
is_encrypted = not value and encrypted_value
if is_encrypted:
value = decryptor.decrypt(encrypted_value)
if value is None:
return is_encrypted, None
return is_encrypted, compat_cookiejar_Cookie(
version=0, name=name, value=value, port=None, port_specified=False,
domain=host_key, domain_specified=bool(host_key), domain_initial_dot=host_key.startswith('.'),
path=path, path_specified=bool(path), secure=is_secure, expires=expires_utc, discard=False,
comment=None, comment_url=None, rest={})
class ChromeCookieDecryptor:
"""
Overview:
@ -547,10 +589,12 @@ def _parse_safari_cookies_page(data, jar, logger):
p.skip_to(record_offsets[0], 'unknown page header field')
for record_offset in record_offsets:
p.skip_to(record_offset, 'space between records')
record_length = _parse_safari_cookies_record(data[record_offset:], jar, logger)
p.read_bytes(record_length)
with _create_progress_bar(logger) as progress_bar:
for i, record_offset in enumerate(record_offsets):
progress_bar.print(f'Loading cookie {i: 6d}/{number_of_cookies: 6d}')
p.skip_to(record_offset, 'space between records')
record_length = _parse_safari_cookies_record(data[record_offset:], jar, logger)
p.read_bytes(record_length)
p.skip_to_end('space in between pages')
@ -830,10 +874,11 @@ def _get_mac_keyring_password(browser_keyring_name, logger):
def _get_windows_v10_key(browser_root, logger):
path = _find_most_recently_used_file(browser_root, 'Local State')
path = _find_most_recently_used_file(browser_root, 'Local State', logger)
if path is None:
logger.error('could not find local state file')
return None
logger.debug(f'Found local state file at "{path}"')
with open(path, 'r', encoding='utf8') as f:
data = json.load(f)
try:
@ -925,13 +970,16 @@ def _get_column_names(cursor, table_name):
return [row[1].decode('utf-8') for row in table_info]
def _find_most_recently_used_file(root, filename, logger):
    """Return the most recently modified file called *filename* below *root*.

    Every file visited by os.walk is counted and the running count is
    reported through the logger's progress bar. Returns None when no file
    with that exact name exists.
    """
    # if there are multiple browser profiles, take the most recently used one
    matches = []
    searched = 0
    with _create_progress_bar(logger) as progress_bar:
        for folder, _, names in os.walk(root):
            for name in names:
                searched += 1
                progress_bar.print(f'Searching for "{filename}": {searched: 6d} files searched')
                if name == filename:
                    matches.append(os.path.join(folder, name))
    if not matches:
        return None
    return max(matches, key=lambda candidate: os.lstat(candidate).st_mtime)

View File

@ -2100,6 +2100,7 @@ from .youtube import (
YoutubeIE,
YoutubeClipIE,
YoutubeFavouritesIE,
YoutubeNotificationsIE,
YoutubeHistoryIE,
YoutubeTabIE,
YoutubeLivestreamEmbedIE,

View File

@ -384,6 +384,9 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
def _real_initialize(self):
self._initialize_pref()
self._initialize_consent()
self._check_login_required()
def _check_login_required(self):
if (self._LOGIN_REQUIRED
and self.get_param('cookiefile') is None
and self.get_param('cookiesfrombrowser') is None):
@ -563,6 +566,18 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
headers['X-Origin'] = origin
return {h: v for h, v in headers.items() if v is not None}
def _download_ytcfg(self, client, video_id):
url = {
'web': 'https://www.youtube.com',
'web_music': 'https://music.youtube.com',
'web_embedded': f'https://www.youtube.com/embed/{video_id}?html5=1'
}.get(client)
if not url:
return {}
webpage = self._download_webpage(
url, video_id, fatal=False, note=f'Downloading {client.replace("_", " ").strip()} client config')
return self.extract_ytcfg(video_id, webpage) or {}
@staticmethod
def _build_api_continuation_query(continuation, ctp=None):
query = {
@ -728,6 +743,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
return None
def _extract_time_text(self, renderer, *path_list):
"""@returns (timestamp, time_text)"""
text = self._get_text(renderer, *path_list) or ''
dt = self.extract_relative_time(text)
timestamp = None
@ -2959,16 +2975,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
return orderedSet(requested_clients)
def _extract_player_ytcfg(self, client, video_id):
url = {
'web_music': 'https://music.youtube.com',
'web_embedded': f'https://www.youtube.com/embed/{video_id}?html5=1'
}.get(client)
if not url:
return {}
webpage = self._download_webpage(url, video_id, fatal=False, note='Downloading %s config' % client.replace('_', ' ').strip())
return self.extract_ytcfg(video_id, webpage) or {}
def _extract_player_responses(self, clients, video_id, webpage, master_ytcfg):
initial_pr = None
if webpage:
@ -3005,8 +3011,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
while clients:
client, base_client, variant = _split_innertube_client(clients.pop())
player_ytcfg = master_ytcfg if client == 'web' else {}
if 'configs' not in self._configuration_arg('player_skip'):
player_ytcfg = self._extract_player_ytcfg(client, video_id) or player_ytcfg
if 'configs' not in self._configuration_arg('player_skip') and client != 'web':
player_ytcfg = self._download_ytcfg(client, video_id) or player_ytcfg
player_url = player_url or self._extract_player_url(master_ytcfg, player_ytcfg, webpage=webpage)
require_js_player = self._get_default_ytcfg(client).get('REQUIRE_JS_PLAYER')
@ -4347,6 +4353,10 @@ class YoutubeTabBaseInfoExtractor(YoutubeBaseInfoExtractor):
check_get_keys='contents', fatal=False, ytcfg=ytcfg,
note='Downloading API JSON with unavailable videos')
@property
def skip_webpage(self):
    """True when 'webpage' is listed in the YoutubeTab 'skip' extractor argument."""
    skipped_steps = self._configuration_arg('skip', ie_key=YoutubeTabIE.ie_key())
    return 'webpage' in skipped_steps
def _extract_webpage(self, url, item_id, fatal=True):
retries = self.get_param('extractor_retries', 3)
count = -1
@ -4393,9 +4403,21 @@ class YoutubeTabBaseInfoExtractor(YoutubeBaseInfoExtractor):
return webpage, data
def _report_playlist_authcheck(self, ytcfg, fatal=True):
"""Use if failed to extract ytcfg (and data) from initial webpage"""
if not ytcfg and self.is_authenticated:
msg = 'Playlists that require authentication may not extract correctly without a successful webpage download'
if 'authcheck' not in self._configuration_arg('skip', ie_key=YoutubeTabIE.ie_key()) and fatal:
raise ExtractorError(
f'{msg}. If you are not downloading private content, or '
'your cookies are only for the first account and channel,'
' pass "--extractor-args youtubetab:skip=authcheck" to skip this check',
expected=True)
self.report_warning(msg, only_once=True)
def _extract_data(self, url, item_id, ytcfg=None, fatal=True, webpage_fatal=False, default_client='web'):
data = None
if 'webpage' not in self._configuration_arg('skip'):
if not self.skip_webpage:
webpage, data = self._extract_webpage(url, item_id, fatal=webpage_fatal)
ytcfg = ytcfg or self.extract_ytcfg(item_id, webpage)
# Reject webpage data if redirected to home page without explicitly requesting
@ -4409,14 +4431,7 @@ class YoutubeTabBaseInfoExtractor(YoutubeBaseInfoExtractor):
raise ExtractorError(msg, expected=True)
self.report_warning(msg, only_once=True)
if not data:
if not ytcfg and self.is_authenticated:
msg = 'Playlists that require authentication may not extract correctly without a successful webpage download.'
if 'authcheck' not in self._configuration_arg('skip') and fatal:
raise ExtractorError(
msg + ' If you are not downloading private content, or your cookies are only for the first account and channel,'
' pass "--extractor-args youtubetab:skip=authcheck" to skip this check',
expected=True)
self.report_warning(msg, only_once=True)
self._report_playlist_authcheck(ytcfg, fatal=fatal)
data = self._extract_tab_endpoint(url, item_id, ytcfg, fatal=fatal, default_client=default_client)
return data, ytcfg
@ -4454,14 +4469,20 @@ class YoutubeTabBaseInfoExtractor(YoutubeBaseInfoExtractor):
('contents', 'tabbedSearchResultsRenderer', 'tabs', 0, 'tabRenderer', 'content', 'sectionListRenderer', 'contents'),
('continuationContents', ),
)
display_id = f'query "{query}"'
check_get_keys = tuple(set(keys[0] for keys in content_keys))
ytcfg = self._download_ytcfg(default_client, display_id) if not self.skip_webpage else {}
self._report_playlist_authcheck(ytcfg, fatal=False)
continuation_list = [None]
search = None
for page_num in itertools.count(1):
data.update(continuation_list[0] or {})
headers = self.generate_api_headers(
ytcfg=ytcfg, visitor_data=self._extract_visitor_data(search), default_client=default_client)
search = self._extract_response(
item_id='query "%s" page %s' % (query, page_num), ep='search', query=data,
default_client=default_client, check_get_keys=check_get_keys)
item_id=f'{display_id} page {page_num}', ep='search', query=data,
default_client=default_client, check_get_keys=check_get_keys, ytcfg=ytcfg, headers=headers)
slr_contents = traverse_obj(search, *content_keys)
yield from self._extract_entries({'contents': list(variadic(slr_contents))}, continuation_list)
if not continuation_list[0]:
@ -5505,6 +5526,95 @@ class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
ie=YoutubeTabIE.ie_key())
class YoutubeNotificationsIE(YoutubeTabBaseInfoExtractor):
    """Playlist of the entries in the YouTube notification menu (":ytnotif" keyword)."""
    IE_NAME = 'youtube:notif'
    IE_DESC = 'YouTube notifications; ":ytnotif" keyword (requires cookies)'
    _VALID_URL = r':ytnotif(?:ication)?s?'
    _LOGIN_REQUIRED = True
    _TESTS = [{
        'url': ':ytnotif',
        'only_matching': True,
    }, {
        'url': ':ytnotifications',
        'only_matching': True,
    }]

    def _extract_notification_menu(self, response, continuation_list):
        """Yield a result for every usable notification in *response*.

        continuation_list[0] is reset to None and then set to the page's
        ``continuationItemRenderer`` (if any) so the caller can request the
        next page.
        """
        notification_list = traverse_obj(
            response,
            ('actions', 0, 'openPopupAction', 'popup', 'multiPageMenuRenderer', 'sections', 0, 'multiPageMenuNotificationSectionRenderer', 'items'),
            ('actions', 0, 'appendContinuationItemsAction', 'continuationItems'),
            expected_type=list) or []
        continuation_list[0] = None
        for item in notification_list:
            entry = self._extract_notification_renderer(item.get('notificationRenderer'))
            if entry:
                yield entry
            continuation = item.get('continuationItemRenderer')
            if continuation:
                continuation_list[0] = continuation

    def _extract_notification_renderer(self, notification):
        """Convert a ``notificationRenderer`` into a url-type result.

        Video notifications point at the watch page (handled by YoutubeIE);
        anything else must be a community post, which is pointed at the
        channel's community tab (handled by YoutubeTabIE).

        @returns a result dict, or None if *notification* is empty or is
                 neither a video nor a community post
        """
        if not notification:
            # The caller passes item.get('notificationRenderer'), which is
            # None for items that have no such renderer
            return None

        video_id = traverse_obj(
            notification, ('navigationEndpoint', 'watchEndpoint', 'videoId'), expected_type=str)
        channel_id = None
        if video_id:
            url = f'https://www.youtube.com/watch?v={video_id}'
        else:
            browse_ep = traverse_obj(
                notification, ('navigationEndpoint', 'browseEndpoint'), expected_type=dict)
            channel_id = traverse_obj(browse_ep, 'browseId', expected_type=str)
            # Fall back to '' so the regex is never run against None
            post_id = self._search_regex(
                r'/post/(.+)', traverse_obj(browse_ep, 'canonicalBaseUrl', expected_type=str) or '',
                'post id', default=None)
            if not channel_id or not post_id:
                return None
            # The direct /post url redirects to this in the browser
            url = f'https://www.youtube.com/channel/{channel_id}/community?lb={post_id}'

        channel = traverse_obj(
            notification, ('contextualMenu', 'menuRenderer', 'items', 1, 'menuServiceItemRenderer', 'text', 'runs', 1, 'text'),
            expected_type=str)
        # The channel name is optional: re.escape(None) would raise TypeError,
        # so an unknown channel simply contributes nothing to the pattern
        title = self._search_regex(
            rf'{re.escape(channel or "")} [^:]+: (.+)', self._get_text(notification, 'shortMessage') or '',
            'video title', default=None)
        if title:
            title = title.replace('\xad', '')  # remove soft hyphens

        # The date is derived from the notification's relative time text, so
        # it is only filled in when "approximate_date" is requested
        upload_date = (strftime_or_none(self._extract_time_text(notification, 'sentTimeText')[0], '%Y%m%d')
                       if self._configuration_arg('approximate_date', ie_key=YoutubeTabIE.ie_key())
                       else None)
        return {
            '_type': 'url',
            'url': url,
            'ie_key': (YoutubeIE if video_id else YoutubeTabIE).ie_key(),
            'video_id': video_id,
            'title': title,
            'channel_id': channel_id,
            'channel': channel,
            'thumbnails': self._extract_thumbnails(notification, 'videoThumbnail'),
            'upload_date': upload_date,
        }

    def _notification_menu_entries(self, ytcfg):
        """Yield results from every page of the notification menu.

        Requests pages one after another, passing the ctoken of the previous
        page's continuation, until a page has no continuation.
        """
        continuation_list = [None]
        response = None
        for page in itertools.count(1):
            ctoken = traverse_obj(
                continuation_list, (0, 'continuationEndpoint', 'getNotificationMenuEndpoint', 'ctoken'), expected_type=str)
            response = self._extract_response(
                item_id=f'page {page}', query={'ctoken': ctoken} if ctoken else {}, ytcfg=ytcfg,
                ep='notification/get_notification_menu', check_get_keys='actions',
                headers=self.generate_api_headers(ytcfg=ytcfg, visitor_data=self._extract_visitor_data(response)))
            yield from self._extract_notification_menu(response, continuation_list)
            if not continuation_list[0]:
                break

    def _real_extract(self, url):
        """Return the notifications as a playlist with id and title 'notifications'."""
        display_id = 'notifications'
        ytcfg = self._download_ytcfg('web', display_id) if not self.skip_webpage else {}
        self._report_playlist_authcheck(ytcfg)
        return self.playlist_result(self._notification_menu_entries(ytcfg), display_id, display_id)
class YoutubeSearchIE(YoutubeTabBaseInfoExtractor, SearchInfoExtractor):
IE_DESC = 'YouTube search'
IE_NAME = 'youtube:search'
@ -5634,7 +5744,9 @@ class YoutubeFeedsInfoExtractor(InfoExtractor):
Subclasses must define the _FEED_NAME property.
"""
_LOGIN_REQUIRED = True
_TESTS = []
def _real_initialize(self):
    """Apply YoutubeBaseInfoExtractor's login check to this extractor."""
    # Called through the class because this extractor derives from
    # InfoExtractor rather than from YoutubeBaseInfoExtractor
    YoutubeBaseInfoExtractor._check_login_required(self)
@property
def IE_NAME(self):

View File

@ -178,4 +178,4 @@ class MultilinePrinter(MultilinePrinterBase):
*text, CONTROL_SEQUENCES['ERASE_LINE'],
f'{CONTROL_SEQUENCES["UP"]}{CONTROL_SEQUENCES["ERASE_LINE"]}' * self.maximum)
else:
self.write(*text, ' ' * self._lastlength)
self.write('\r', ' ' * self._lastlength, '\r')

View File

@ -101,7 +101,7 @@ class EmbedThumbnailPP(FFmpegPostProcessor):
success = True
if info['ext'] == 'mp3':
options = [
'-c', 'copy', '-map', '0:0', '-map', '1:0', '-id3v2_version', '3',
'-c', 'copy', '-map', '0:0', '-map', '1:0', '-write_id3v1', '1', '-id3v2_version', '3',
'-metadata:s:v', 'title="Album cover"', '-metadata:s:v', 'comment="Cover (front)"']
self._report_run('ffmpeg', filename)