From a8769f672b58135dc681b87dd0bdd8073c847bf0 Mon Sep 17 00:00:00 2001 From: grqx_wsl <173253225+grqx@users.noreply.github.com> Date: Sun, 13 Oct 2024 12:46:03 +1300 Subject: [PATCH 01/26] [ie/boomplay] add extractors --- yt_dlp/extractor/_extractors.py | 7 + yt_dlp/extractor/boomplay.py | 283 ++++++++++++++++++++++++++++++++ 2 files changed, 290 insertions(+) create mode 100644 yt_dlp/extractor/boomplay.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 4b1f4c316d..1abca1ed93 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -280,6 +280,13 @@ from .blogger import BloggerIE from .bloomberg import BloombergIE from .bokecc import BokeCCIE from .bongacams import BongaCamsIE +from .boomplay import ( + BoomPlayEpisodeIE, + BoomPlayMusicIE, + BoomPlayPlaylistIE, + BoomPlayPodcastIE, + BoomPlayVideoIE, +) from .boosty import BoostyIE from .bostonglobe import BostonGlobeIE from .box import BoxIE diff --git a/yt_dlp/extractor/boomplay.py b/yt_dlp/extractor/boomplay.py new file mode 100644 index 0000000000..dba8a1c9ca --- /dev/null +++ b/yt_dlp/extractor/boomplay.py @@ -0,0 +1,283 @@ +import base64 +import functools +import json +import re + +from .common import InfoExtractor +from ..aes import aes_cbc_decrypt_bytes, aes_cbc_encrypt_bytes, unpad_pkcs7 +from ..utils import ( + ExtractorError, + clean_html, + get_element_by_attribute, + get_element_by_class, + get_elements_by_attribute, + int_or_none, + merge_dicts, + parse_duration, + strip_or_none, + unified_strdate, + url_or_none, + urlencode_postdata, +) +from ..utils.traversal import traverse_obj + + +class BoomPlayBaseIE(InfoExtractor): + # Calculated from const values, see lhx.AESUtils.encrypt, see public.js + # Note that the real key/iv differs from `lhx.AESUtils.key`/`lhx.AESUtils.iv` + _KEY = b'boomplayVr3xopAM' + _IV = b'boomplay8xIsKTn9' + + def _get_playurl(self, item_id, item_type): + resp = self._download_json( + 'https://www.boomplay.com/getResourceAddr', item_id, + note='Downloading play URL', errnote='Failed to download play URL', + data=urlencode_postdata({ + 'param': base64.b64encode(aes_cbc_encrypt_bytes(json.dumps({ + 'itemID': item_id, + 'itemType': item_type, + }).encode(), self._KEY, self._IV)).decode(), + }), headers={ + 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8', + }) + if not (source := resp.get('source')) and resp.get('code'): + raise ExtractorError(resp.get('desc') or 'Please solve the captcha') + return unpad_pkcs7( + aes_cbc_decrypt_bytes(base64.b64decode(source), self._KEY, self._IV)).decode() + + def _extract_formats(self, _id, item_type='MUSIC', **kwargs): + if url := url_or_none(self._get_playurl(_id, item_type)): + return [{ + 'format_id': '0', + 'vcodec': 'none' if item_type == 'MUSIC' else None, + 'url': url, + 'http_headers': { + 'Origin': 'https://www.boomplay.com', + 'Referer': 'https://www.boomplay.com', + 'X-Boomplay-Ref': 'Boomplay_WEBV1', + }, + **kwargs, + }] + else: + self.raise_no_formats('No formats found') + + def _extract_page_metadata(self, webpage, _id): + metadata_div = get_element_by_attribute( + 'class', r'[^\'"]*(?<=[\'"\s])summary(?=[\'"\s])[^\'"]*', webpage, + tag='div', escape_value=False) or '' + metadata_entries = re.findall(r'(?s)(?P.*?)', metadata_div) or [] + description = get_element_by_attribute( + 'class', r'[^\'"]*(?<=[\'"\s])description_content(?=[\'"\s])[^\'"]*', webpage, + tag='span', escape_value=False) or 'Listen and download music for free on Boomplay!' + description = clean_html(description.strip()) + if description == 'Listen and download music for free on Boomplay!': + description = None + + details_section = get_element_by_attribute( + 'class', r'[^\'"]*(?<=[\'"\s])songDetailInfo(?=[\'"\s])[^\'"]*', webpage, + tag='section', escape_value=False) or '' + metadata_entries.extend(re.findall(r'(?s)
  • (?P.*?)
  • ', details_section) or []) + page_metadata = { + 'id': _id, + 'title': self._html_search_regex(r'

    ([^<]+)

    ', metadata_div, 'title', default=''), + 'thumbnail': self._html_search_meta(['og:image', 'twitter:image'], + webpage, 'thumbnail', default=''), + 'like_count': int_or_none(get_element_by_class('btn_favorite', metadata_div)), + 'repost_count': int_or_none(get_element_by_class('btn_share', metadata_div)), + 'comment_count': int_or_none(get_element_by_class('btn_comment', metadata_div)), + 'duration': parse_duration(get_element_by_class('btn_duration', metadata_div)), + 'upload_date': unified_strdate(strip_or_none(get_element_by_class('btn_pubDate', metadata_div))), + 'description': description, + } + for metadata_entry in metadata_entries: + if ':' not in metadata_entry: + continue + k, v = clean_html(metadata_entry).split(':', 2) + v = v.strip() + if 'artist' in k.lower(): + page_metadata['artists'] = [v] + elif 'album' in k.lower(): + page_metadata['album'] = v + elif 'genre' in k.lower(): + page_metadata['genres'] = [v] + elif 'year of release' in k.lower(): + page_metadata['release_year'] = int_or_none(v) + return page_metadata + + extract = lambda self, url: self.write_debug(json.dumps(a := super().extract(url), indent=2)) or a # rm + + +class BoomPlayMusicIE(BoomPlayBaseIE): + _VALID_URL = r'https?://(?:www\.)?boomplay\.com/songs/(?P\d+)' + _TEST = { + 'url': 'https://www.boomplay.com/songs/165481965', + 'md5': 'c5fb4f23e6aae98064230ef3c39c2178', + 'info_dict': { + 'title': 'Rise of the Fallen Heroes', + 'ext': 'mp3', + 'id': '165481965', + 'artists': ['fatbunny'], + 'thumbnail': 'https://source.boomplaymusic.com/group10/M00/04/29/375ecda38f6f48179a93c72ab909118f_464_464.jpg', + 'channel_url': 'https://www.boomplay.com/artists/52723101', + 'duration': 125.0, + 'release_year': 2024, + 'comment_count': int, + 'like_count': int, + 'repost_count': int, + 'album': 'Legendary Battle', + 'genres': ['Metal'], + }, + } + + def _real_extract(self, url): + song_id = self._match_id(url) + webpage = self._download_webpage(url, song_id) + ld_json_meta = next(self._yield_json_ld(webpage, song_id)) + + return merge_dicts( + self._extract_page_metadata(webpage, song_id), + traverse_obj(ld_json_meta, { + 'title': 'name', + 'thumbnail': 'image', + 'channel_url': ('byArtist', 0, '@id'), + 'artists': ('byArtist', ..., 'name'), + 'duration': ('duration', {parse_duration}), + }), { + 'formats': self._extract_formats(song_id, 'MUSIC'), + }) + + +class BoomPlayVideoIE(BoomPlayBaseIE): + _VALID_URL = r'https?://(?:www\.)?boomplay\.com/video/(?P\d+)' + _TEST = { + 'url': 'https://www.boomplay.com/video/1154892', + 'md5': 'd9b67ad333d2292a82922062d065352d', + 'info_dict': { + 'id': '1154892', + 'ext': 'mp4', + 'title': 'Autumn blues', + 'thumbnail': 'https://source.boomplaymusic.com/group10/M00/10/10/2171dee9e1f8452e84021560729edb88.jpg', + 'upload_date': '20241010', + 'timestamp': 1728599214, + 'view_count': int, + 'duration': 177.0, + 'description': 'Autumn blues by Lugo', + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + return merge_dicts( + self._extract_page_metadata(webpage, video_id), + self._search_json_ld(webpage, video_id), { + 'formats': self._extract_formats(video_id, 'VIDEO', ext='mp4'), + }) + + +class BoomPlayEpisodeIE(BoomPlayBaseIE): + _VALID_URL = r'https?://(?:www\.)?boomplay\.com/episode/(?P\d+)' + _TEST = { + 'url': 'https://www.boomplay.com/episode/7132706', + 'md5': 'f26e236b764baa53d7a2cbb7e9ce6dc4', + 'info_dict': { + 'id': '7132706', + 'ext': 'mp3', + 'title': 'Letting Go', + 'repost_count': int, + 'thumbnail': 'https://source.boomplaymusic.com/group10/M00/05/06/fc535eaa25714b43a47185a9831887a5_320_320.jpg', + 'comment_count': int, + 'duration': 921.0, + 'upload_date': '20240506', + 'description': 'md5:5ec684b281fa0f9e4c31b3ee20c5e57a', + }, + } + + def _real_extract(self, url): + ep_id = self._match_id(url) + webpage = self._download_webpage(url, ep_id) + return merge_dicts( + self._extract_page_metadata(webpage, ep_id), { + 'title': self._og_search_title(webpage, fatal=True).rsplit('|', 2)[0].strip(), + 'description': self._html_search_meta( + ['description', 'og:description', 'twitter:description'], webpage), + 'formats': self._extract_formats(ep_id, 'EPISODE', vcodec='none'), + }) + + +class BoomPlayPodcastIE(BoomPlayBaseIE): + _VALID_URL = r'https?://(?:www\.)?boomplay\.com/podcasts/(?P\d+)' + _TEST = { + 'url': 'https://www.boomplay.com/podcasts/5372', + 'playlist_count': 200, + 'info_dict': { + 'id': '5372', + 'title': 'TED Talks Daily', + 'description': 'md5:541182e787ce8fd578c835534c907077', + 'thumbnail': 'https://source.boomplaymusic.com/group10/M00/12/22/6f9cf97ad6f846a0a7882c98dfcf4f8c_320_320.jpg', + 'repost_count': int, + 'comment_count': int, + }, + } + + def _real_extract(self, url): + _id = self._match_id(url) + webpage = self._download_webpage(url, _id) + song_list = get_elements_by_attribute( + 'class', r'[^\'"]*(?<=[\'"\s])morePart_musics(?=[\'"\s])[^\'"]*', webpage, + tag='ol', escape_value=False)[0] + song_list = traverse_obj(re.finditer( + r'''(?x) + <(?Pli) + (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)? + \sdata-id\s*=\s*(?P<_q>['"]?)(?:(?P\d+))(?P=_q)''', + song_list), + (..., 'id', { + lambda x: self.url_result( + f'https://www.boomplay.com/episode/{x}', BoomPlayEpisodeIE, x), + })) + return self.playlist_result( + song_list, _id, + playlist_title=self._og_search_title(webpage, fatal=True).rsplit('|', 2)[0].strip(), + playlist_description=self._og_search_description(webpage, default=''), + **self._extract_page_metadata(webpage, _id)) + + +class BoomPlayPlaylistIE(BoomPlayBaseIE): + _VALID_URL = r'https?://(?:www\.)?boomplay\.com/(?:playlists|artists|albums)/(?P\d+)' + _TESTS = [{ + 'url': 'https://www.boomplay.com/playlists/33792494', + 'info_dict': { + 'id': '33792494', + 'title': 'Daily Trending Indonesia', + 'thumbnail': 'https://source.boomplaymusic.com/group10/M00/08/19/d05d431ee616412caeacd7f78f4f68f5_320_320.jpeg', + 'repost_count': int, + 'comment_count': int, + 'description': 'md5:7ebdffc5137c77acb62acb3c89248445', + }, + 'playlist_count': 10, + }, { + 'url': 'https://www.boomplay.com/artists/52723101', + 'only_matching': True, + }, { + 'url': 'https://www.boomplay.com/albums/89611238?from=home#google_vignette', + 'only_matching': True, + }] + + def _real_extract(self, url): + playlist_id = self._match_id(url) + webpage = self._download_webpage(url, playlist_id) + json_ld_metadata = next(self._yield_json_ld(webpage, playlist_id)) + # schema `MusicGroup` not supported by self._json_ld() + + return self.playlist_result(**merge_dicts( + self._extract_page_metadata(webpage, playlist_id), + traverse_obj(json_ld_metadata, { + 'entries': ('track', ..., 'url', { + functools.partial(self.url_result, ie=BoomPlayMusicIE), + }), + 'playlist_title': 'name', + 'thumbnail': 'image', + 'artists': ('byArtist', ..., 'name'), + 'channel_url': ('byArtist', 0, '@id'), + }))) From 6d2de79b7a419e822ed3e2f1308fde121b413094 Mon Sep 17 00:00:00 2001 From: grqx_wsl <173253225+grqx@users.noreply.github.com> Date: Sun, 13 Oct 2024 23:07:33 +1300 Subject: [PATCH 02/26] BoomPlayGenericPlaylistIE, BoomPlaySearchIE --- yt_dlp/extractor/_extractors.py | 2 + yt_dlp/extractor/boomplay.py | 152 ++++++++++++++++++++++++++++++-- 2 files changed, 146 insertions(+), 8 deletions(-) diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 1abca1ed93..5208639e3a 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -282,9 +282,11 @@ from .bokecc import BokeCCIE from .bongacams import BongaCamsIE from .boomplay import ( BoomPlayEpisodeIE, + BoomPlayGenericPlaylistIE, BoomPlayMusicIE, BoomPlayPlaylistIE, BoomPlayPodcastIE, + BoomPlaySearchIE, BoomPlayVideoIE, ) from .boosty import BoostyIE diff --git a/yt_dlp/extractor/boomplay.py b/yt_dlp/extractor/boomplay.py index dba8a1c9ca..692f4d98b4 100644 --- a/yt_dlp/extractor/boomplay.py +++ b/yt_dlp/extractor/boomplay.py @@ -2,22 +2,29 @@ import base64 import functools import json import re +import urllib.parse -from .common import InfoExtractor +from .common import InfoExtractor, SearchInfoExtractor from ..aes import aes_cbc_decrypt_bytes, aes_cbc_encrypt_bytes, unpad_pkcs7 from ..utils import ( ExtractorError, clean_html, + extract_attributes, get_element_by_attribute, get_element_by_class, get_elements_by_attribute, int_or_none, + join_nonempty, merge_dicts, + orderedSet, + parse_count, parse_duration, strip_or_none, unified_strdate, url_or_none, urlencode_postdata, + urljoin, + variadic, ) from ..utils.traversal import traverse_obj @@ -27,6 +34,14 @@ class BoomPlayBaseIE(InfoExtractor): # Note that the real key/iv differs from `lhx.AESUtils.key`/`lhx.AESUtils.iv` _KEY = b'boomplayVr3xopAM' _IV = b'boomplay8xIsKTn9' + _BASE = 'https://www.boomplay.com' + _MEDIA_TYPES = ('songs', 'video', 'episode', 'podcasts', 'playlists', 'artists', 'albums') + + @classmethod + def _urljoin(cls, path): + if not hasattr(path, 'startswith') or path.startswith('javascript:'): + return None + return url_or_none(urljoin(base=cls._BASE, path=path)) def _get_playurl(self, item_id, item_type): resp = self._download_json( @@ -49,7 +64,6 @@ class BoomPlayBaseIE(InfoExtractor): if url := url_or_none(self._get_playurl(_id, item_type)): return [{ 'format_id': '0', - 'vcodec': 'none' if item_type == 'MUSIC' else None, 'url': url, 'http_headers': { 'Origin': 'https://www.boomplay.com', @@ -79,12 +93,12 @@ class BoomPlayBaseIE(InfoExtractor): metadata_entries.extend(re.findall(r'(?s)
  • (?P.*?)
  • ', details_section) or []) page_metadata = { 'id': _id, - 'title': self._html_search_regex(r'

    ([^<]+)

    ', metadata_div, 'title', default=''), + 'title': self._html_search_regex(r'

    ([^<]+)

    ', metadata_div, 'title', default=None), 'thumbnail': self._html_search_meta(['og:image', 'twitter:image'], webpage, 'thumbnail', default=''), - 'like_count': int_or_none(get_element_by_class('btn_favorite', metadata_div)), - 'repost_count': int_or_none(get_element_by_class('btn_share', metadata_div)), - 'comment_count': int_or_none(get_element_by_class('btn_comment', metadata_div)), + 'like_count': parse_count(get_element_by_class('btn_favorite', metadata_div)), + 'repost_count': parse_count(get_element_by_class('btn_share', metadata_div)), + 'comment_count': parse_count(get_element_by_class('btn_comment', metadata_div)), 'duration': parse_duration(get_element_by_class('btn_duration', metadata_div)), 'upload_date': unified_strdate(strip_or_none(get_element_by_class('btn_pubDate', metadata_div))), 'description': description, @@ -104,7 +118,55 @@ class BoomPlayBaseIE(InfoExtractor): page_metadata['release_year'] = int_or_none(v) return page_metadata - extract = lambda self, url: self.write_debug(json.dumps(a := super().extract(url), indent=2)) or a # rm + def _extract_suitable_links(self, webpage, media_types): + if not media_types: + media_types = self._MEDIA_TYPES + media_types = list(variadic(media_types)) + + for idx, v in enumerate(media_types): + media_types[idx] = re.escape(v) if v in self._MEDIA_TYPES else '' + media_types = join_nonempty(*media_types, delim='|') + return orderedSet(traverse_obj(re.finditer( + rf'''(?x) + "']|"[^"]*"|'[^']*')*)? + (?<=\s)href\s*=\s*(?P<_q>['"]) + (?: + (?!javascript:)(?P/(?:{media_types})/\d+?) + ) + (?P=_q) + (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)? + ''', webpage), (..., 'link', {self._urljoin}, {self.url_result}))) + + def _extract_playlist_entries(self, webpage, media_types, warn=True): + song_list = strip_or_none( + get_element_by_attribute( + 'class', r'[^\'"]*(?<=[\'"\s])morePart_musics(?=[\'"\s])[^\'"]*', webpage, + tag='ol', escape_value=False) + or get_element_by_attribute( + 'class', r'[^\'"]*(?<=[\'"\s])morePart(?=[\'"\s])[^\'"]*', webpage, + tag='ol', escape_value=False) + or '') + + entries = traverse_obj(re.finditer( + r'''(?x) + "']|"[^"]*"|'[^']*')*)? + (?<=\s)class\s*=\s*(?P<_q>['"]) + (?: + [^\'"]*(?<=[\'"\s])songName(?=[\'"\s])[^\'"]* + ) + (?P=_q) + (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)? + > + ''', song_list), + (..., 0, {extract_attributes}, 'href', {self._urljoin}, {self.url_result})) + if not entries: + if warn: + self.report_warning('Failed to extract playlist entries, finding suitable links instead!') + return self._extract_suitable_links(webpage, media_types) + + return entries class BoomPlayMusicIE(BoomPlayBaseIE): @@ -143,7 +205,7 @@ class BoomPlayMusicIE(BoomPlayBaseIE): 'artists': ('byArtist', ..., 'name'), 'duration': ('duration', {parse_duration}), }), { - 'formats': self._extract_formats(song_id, 'MUSIC'), + 'formats': self._extract_formats(song_id, 'MUSIC', vcodec='none'), }) @@ -217,6 +279,7 @@ class BoomPlayPodcastIE(BoomPlayBaseIE): 'thumbnail': 'https://source.boomplaymusic.com/group10/M00/12/22/6f9cf97ad6f846a0a7882c98dfcf4f8c_320_320.jpg', 'repost_count': int, 'comment_count': int, + 'like_count': int, }, } @@ -253,6 +316,7 @@ class BoomPlayPlaylistIE(BoomPlayBaseIE): 'thumbnail': 'https://source.boomplaymusic.com/group10/M00/08/19/d05d431ee616412caeacd7f78f4f68f5_320_320.jpeg', 'repost_count': int, 'comment_count': int, + 'like_count': int, 'description': 'md5:7ebdffc5137c77acb62acb3c89248445', }, 'playlist_count': 10, @@ -281,3 +345,75 @@ class BoomPlayPlaylistIE(BoomPlayBaseIE): 'artists': ('byArtist', ..., 'name'), 'channel_url': ('byArtist', 0, '@id'), }))) + + +class BoomPlayGenericPlaylistIE(BoomPlayBaseIE): + _VALID_URL = r'https?://(?:www\.)?boomplay\.com/.+' + _TESTS = [{ + 'url': 'https://www.boomplay.com/search/default/Rise%20of%20the%20Fallen%20Heroes', + 'md5': 'c5fb4f23e6aae98064230ef3c39c2178', + 'info_dict': { + 'id': '165481965', + 'ext': 'mp3', + 'title': 'Rise of the Fallen Heroes', + 'duration': 125.0, + 'genres': ['Metal'], + 'artists': ['fatbunny'], + 'thumbnail': 'https://source.boomplaymusic.com/group10/M00/04/29/375ecda38f6f48179a93c72ab909118f_464_464.jpg', + 'channel_url': 'https://www.boomplay.com/artists/52723101', + 'comment_count': int, + 'repost_count': int, + 'album': 'Legendary Battle', + 'release_year': 2024, + 'like_count': int, + }, + }, { + 'url': 'https://www.boomplay.com/search/video/%20Autumn%20blues', + 'md5': 'd9b67ad333d2292a82922062d065352d', + 'info_dict': { + 'id': '1154892', + 'title': 'Autumn blues', + 'ext': 'mp4', + 'timestamp': 1728599214, + 'view_count': int, + 'thumbnail': 'https://source.boomplaymusic.com/group10/M00/10/10/2171dee9e1f8452e84021560729edb88.jpg', + 'description': 'Autumn blues by Lugo', + 'upload_date': '20241010', + 'duration': 177.0, + }, + 'expected_warnings': ['Failed to extract playlist entries, finding suitable links instead!'], + 'params': {'playlist_items': '1'}, + }] + + @classmethod + def suitable(cls, url): + if not any(ie.suitable(url) for ie in ( + BoomPlayEpisodeIE, + BoomPlayMusicIE, + BoomPlayPlaylistIE, + BoomPlayPodcastIE, + BoomPlayVideoIE, + )): + return super().suitable(url) + return False + + def _real_extract(self, url): + _id = self._generic_id(url) + webpage = self._download_webpage(url, _id) + # TODO: pass media types based on search types + return self.playlist_result( + self._extract_playlist_entries(webpage, self._MEDIA_TYPES), + **self._extract_page_metadata(webpage, _id)) + + +class BoomPlaySearchIE(SearchInfoExtractor): + _SEARCH_KEY = 'boomplaysearch' + _RETURN_TYPE = 'url' + _TEST = { + 'url': 'boomplaysearch:rise of the fallen heroes', + 'only_matching': True, + } + + def _search_results(self, query): + yield self.url_result( + f'https://www.boomplay.com/search/default/{urllib.parse.quote(query)}') From 5b962d70de1ab995a9081b9cabdd44448bb06cd4 Mon Sep 17 00:00:00 2001 From: grqx_wsl <173253225+grqx@users.noreply.github.com> Date: Mon, 14 Oct 2024 23:49:07 +1300 Subject: [PATCH 03/26] improve metadata extraction, add extractor for search pages - pass tests&code formatting Co-authored-by: dirkf Co-authored-by: grqx_wsl <173253225+grqx@users.noreply.github.com> --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/boomplay.py | 204 ++++++++++++++++++++------------ 2 files changed, 129 insertions(+), 76 deletions(-) diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 5208639e3a..399c8429a9 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -287,6 +287,7 @@ from .boomplay import ( BoomPlayPlaylistIE, BoomPlayPodcastIE, BoomPlaySearchIE, + BoomPlaySearchPageIE, BoomPlayVideoIE, ) from .boosty import BoostyIE diff --git a/yt_dlp/extractor/boomplay.py b/yt_dlp/extractor/boomplay.py index 692f4d98b4..3fa6030c85 100644 --- a/yt_dlp/extractor/boomplay.py +++ b/yt_dlp/extractor/boomplay.py @@ -8,11 +8,10 @@ from .common import InfoExtractor, SearchInfoExtractor from ..aes import aes_cbc_decrypt_bytes, aes_cbc_encrypt_bytes, unpad_pkcs7 from ..utils import ( ExtractorError, + classproperty, clean_html, extract_attributes, - get_element_by_attribute, - get_element_by_class, - get_elements_by_attribute, + get_elements_text_and_html_by_attribute, int_or_none, join_nonempty, merge_dicts, @@ -30,12 +29,40 @@ from ..utils.traversal import traverse_obj class BoomPlayBaseIE(InfoExtractor): - # Calculated from const values, see lhx.AESUtils.encrypt, see public.js + # Calculated from const values, see lhx.AESUtils.encrypt in public.js # Note that the real key/iv differs from `lhx.AESUtils.key`/`lhx.AESUtils.iv` _KEY = b'boomplayVr3xopAM' _IV = b'boomplay8xIsKTn9' _BASE = 'https://www.boomplay.com' _MEDIA_TYPES = ('songs', 'video', 'episode', 'podcasts', 'playlists', 'artists', 'albums') + _GEO_COUNTRIES = ['NG'] + + @staticmethod + def __yield_elements_text_and_html_by_class_and_tag(class_, tag, html): + """ + Yields content of all element matching `tag .class_` in html + class_ must be re escaped + """ + # get_elements_text_and_html_by_attribute returns a generator + return get_elements_text_and_html_by_attribute( + 'class', rf'''[^'"]*(?<=['"\s]){class_}(?=['"\s])[^'"]*''', html, + tag=tag, escape_value=False) + + @classmethod + def __yield_elements_by_class_and_tag(cls, *args, **kwargs): + return (content for content, _ in cls.__yield_elements_text_and_html_by_class_and_tag(*args, **kwargs)) + + @classmethod + def __yield_elements_html_by_class_and_tag(cls, *args, **kwargs): + return (whole for _, whole in cls.__yield_elements_text_and_html_by_class_and_tag(*args, **kwargs)) + + @classmethod + def _get_elements_by_class_and_tag(cls, class_, tag, html): + return list(cls.__yield_elements_by_class_and_tag(class_, tag, html)) + + @classmethod + def _get_element_by_class_and_tag(cls, class_, tag, html): + return next(cls.__yield_elements_by_class_and_tag(class_, tag, html), None) @classmethod def _urljoin(cls, path): @@ -55,10 +82,15 @@ class BoomPlayBaseIE(InfoExtractor): }), headers={ 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8', }) - if not (source := resp.get('source')) and resp.get('code'): - raise ExtractorError(resp.get('desc') or 'Please solve the captcha') - return unpad_pkcs7( - aes_cbc_decrypt_bytes(base64.b64decode(source), self._KEY, self._IV)).decode() + if not (source := resp.get('source')) and (code := resp.get('code')): + if 'unavailable in your country' in (desc := resp.get('desc')) or '': + # since NG must have failed ... + self.raise_geo_restricted(countries=['GH', 'KE', 'TZ', 'CM', 'CI']) + else: + raise ExtractorError(desc or f'Failed to get play url, code: {code}') + return unpad_pkcs7(aes_cbc_decrypt_bytes( + base64.b64decode(source), + self._KEY, self._IV)).decode() def _extract_formats(self, _id, item_type='MUSIC', **kwargs): if url := url_or_none(self._get_playurl(_id, item_type)): @@ -75,38 +107,35 @@ class BoomPlayBaseIE(InfoExtractor): else: self.raise_no_formats('No formats found') - def _extract_page_metadata(self, webpage, _id): - metadata_div = get_element_by_attribute( - 'class', r'[^\'"]*(?<=[\'"\s])summary(?=[\'"\s])[^\'"]*', webpage, - tag='div', escape_value=False) or '' - metadata_entries = re.findall(r'(?s)(?P.*?)', metadata_div) or [] - description = get_element_by_attribute( - 'class', r'[^\'"]*(?<=[\'"\s])description_content(?=[\'"\s])[^\'"]*', webpage, - tag='span', escape_value=False) or 'Listen and download music for free on Boomplay!' + def _extract_page_metadata(self, webpage, _id, playlist=False): + metadata_div = self._get_element_by_class_and_tag('summary', 'div', webpage) or '' + metadata_entries = re.findall(r'(?si)(?P.*?)', metadata_div) or [] + description = ( + self._get_element_by_class_and_tag('description_content', 'span', webpage) + or 'Listen and download music for free on Boomplay!') description = clean_html(description.strip()) if description == 'Listen and download music for free on Boomplay!': description = None - details_section = get_element_by_attribute( - 'class', r'[^\'"]*(?<=[\'"\s])songDetailInfo(?=[\'"\s])[^\'"]*', webpage, - tag='section', escape_value=False) or '' - metadata_entries.extend(re.findall(r'(?s)
  • (?P.*?)
  • ', details_section) or []) + details_section = self._get_element_by_class_and_tag('songDetailInfo', 'section', webpage) or '' + metadata_entries.extend(re.findall(r'(?si)
  • (?P.*?)
  • ', details_section) or []) page_metadata = { 'id': _id, - 'title': self._html_search_regex(r'

    ([^<]+)

    ', metadata_div, 'title', default=None), + 'title': self._html_search_regex(r']*>([^<]+)', webpage, 'title', default=None), 'thumbnail': self._html_search_meta(['og:image', 'twitter:image'], webpage, 'thumbnail', default=''), - 'like_count': parse_count(get_element_by_class('btn_favorite', metadata_div)), - 'repost_count': parse_count(get_element_by_class('btn_share', metadata_div)), - 'comment_count': parse_count(get_element_by_class('btn_comment', metadata_div)), - 'duration': parse_duration(get_element_by_class('btn_duration', metadata_div)), - 'upload_date': unified_strdate(strip_or_none(get_element_by_class('btn_pubDate', metadata_div))), + 'like_count': parse_count(self._get_element_by_class_and_tag('btn_favorite', 'button', metadata_div)), + 'repost_count': parse_count(self._get_element_by_class_and_tag('btn_share', 'button', metadata_div)), + 'comment_count': parse_count(self._get_element_by_class_and_tag('btn_comment', 'button', metadata_div)), + 'duration': parse_duration(self._get_element_by_class_and_tag('btn_duration', 'button', metadata_div)), + 'upload_date': unified_strdate(strip_or_none( + self._get_element_by_class_and_tag('btn_pubDate', 'button', metadata_div))), 'description': description, } for metadata_entry in metadata_entries: if ':' not in metadata_entry: continue - k, v = clean_html(metadata_entry).split(':', 2) + k, v = clean_html(metadata_entry).split(':', 1) v = v.strip() if 'artist' in k.lower(): page_metadata['artists'] = [v] @@ -118,8 +147,8 @@ class BoomPlayBaseIE(InfoExtractor): page_metadata['release_year'] = int_or_none(v) return page_metadata - def _extract_suitable_links(self, webpage, media_types): - if not media_types: + def _extract_suitable_links(self, webpage, media_types=None): + if media_types is None: media_types = self._MEDIA_TYPES media_types = list(variadic(media_types)) @@ -132,35 +161,21 @@ class BoomPlayBaseIE(InfoExtractor): (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)? (?<=\s)href\s*=\s*(?P<_q>['"]) (?: - (?!javascript:)(?P/(?:{media_types})/\d+?) + (?!javascript:)(?P/(?:{media_types})/\d+/?[\-a-zA-Z=?&#:;@]*) ) (?P=_q) (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)? - ''', webpage), (..., 'link', {self._urljoin}, {self.url_result}))) + >''', webpage), (..., 'link', {self._urljoin}, {self.url_result}))) def _extract_playlist_entries(self, webpage, media_types, warn=True): song_list = strip_or_none( - get_element_by_attribute( - 'class', r'[^\'"]*(?<=[\'"\s])morePart_musics(?=[\'"\s])[^\'"]*', webpage, - tag='ol', escape_value=False) - or get_element_by_attribute( - 'class', r'[^\'"]*(?<=[\'"\s])morePart(?=[\'"\s])[^\'"]*', webpage, - tag='ol', escape_value=False) + self._get_element_by_class_and_tag('morePart_musics', 'ol', webpage) + or self._get_element_by_class_and_tag('morePart', 'ol', webpage) or '') - entries = traverse_obj(re.finditer( - r'''(?x) -
    "']|"[^"]*"|'[^']*')*)? - (?<=\s)class\s*=\s*(?P<_q>['"]) - (?: - [^\'"]*(?<=[\'"\s])songName(?=[\'"\s])[^\'"]* - ) - (?P=_q) - (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)? - > - ''', song_list), - (..., 0, {extract_attributes}, 'href', {self._urljoin}, {self.url_result})) + entries = traverse_obj(self.__yield_elements_html_by_class_and_tag( + 'songName', 'a', song_list), + (..., {extract_attributes}, 'href', {self._urljoin}, {self.url_result})) if not entries: if warn: self.report_warning('Failed to extract playlist entries, finding suitable links instead!') @@ -195,7 +210,8 @@ class BoomPlayMusicIE(BoomPlayBaseIE): song_id = self._match_id(url) webpage = self._download_webpage(url, song_id) ld_json_meta = next(self._yield_json_ld(webpage, song_id)) - + # TODO: extract comments(and lyrics? they don't have timestamps) + # example: https://www.boomplay.com/songs/96352673?from=home return merge_dicts( self._extract_page_metadata(webpage, song_id), traverse_obj(ld_json_meta, { @@ -286,14 +302,17 @@ class BoomPlayPodcastIE(BoomPlayBaseIE): def _real_extract(self, url): _id = self._match_id(url) webpage = self._download_webpage(url, _id) - song_list = get_elements_by_attribute( - 'class', r'[^\'"]*(?<=[\'"\s])morePart_musics(?=[\'"\s])[^\'"]*', webpage, - tag='ol', escape_value=False)[0] + song_list = self._get_element_by_class_and_tag('morePart_musics', 'ol', webpage) song_list = traverse_obj(re.finditer( r'''(?x) - <(?Pli) - (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)? - \sdata-id\s*=\s*(?P<_q>['"]?)(?:(?P\d+))(?P=_q)''', +
  • "']|"[^"]*"|'[^']*')*)? + \sdata-id\s*=\s* + (?P<_q>['"]?) + (?P\d+) + (?P=_q) + (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)? + >''', song_list), (..., 'id', { lambda x: self.url_result( @@ -350,7 +369,47 @@ class BoomPlayPlaylistIE(BoomPlayBaseIE): class BoomPlayGenericPlaylistIE(BoomPlayBaseIE): _VALID_URL = r'https?://(?:www\.)?boomplay\.com/.+' _TESTS = [{ - 'url': 'https://www.boomplay.com/search/default/Rise%20of%20the%20Fallen%20Heroes', + 'url': 'https://www.boomplay.com/new-songs', + 'playlist_mincount': 20, + 'info_dict': { + 'id': 'new-songs', + 'title': 'New Songs', + 'thumbnail': 'http://www.boomplay.com/pc/img/og_default_v3.jpg', + }, + }, { + 'url': 'https://www.boomplay.com/trending-songs', + 'playlist_mincount': 20, + 'info_dict': { + 'id': 'trending-songs', + 'title': 'Trending Songs', + 'thumbnail': 'http://www.boomplay.com/pc/img/og_default_v3.jpg', + }, + }] + + @classmethod + def suitable(cls, url): + if super().suitable(url): + return not any(ie.suitable(url) for ie in ( + BoomPlayEpisodeIE, + BoomPlayMusicIE, + BoomPlayPlaylistIE, + BoomPlayPodcastIE, + BoomPlaySearchPageIE, + BoomPlayVideoIE, + )) + return False + + def _real_extract(self, url): + _id = self._generic_id(url) + webpage = self._download_webpage(url, _id) + return self.playlist_result( + self._extract_playlist_entries(webpage, self._MEDIA_TYPES), + **self._extract_page_metadata(webpage, _id)) + + +class BoomPlaySearchPageIE(BoomPlayBaseIE): + _TESTS = [{ + 'url': 'https://www.boomplay.com/search/default/%20Rise%20of%20the%20Falletesn%20Heroes%20fatbunny', 'md5': 'c5fb4f23e6aae98064230ef3c39c2178', 'info_dict': { 'id': '165481965', @@ -381,29 +440,21 @@ class BoomPlayGenericPlaylistIE(BoomPlayBaseIE): 'upload_date': '20241010', 'duration': 177.0, }, - 'expected_warnings': ['Failed to extract playlist entries, finding suitable links instead!'], 'params': {'playlist_items': '1'}, }] - @classmethod - def suitable(cls, url): - if not any(ie.suitable(url) for ie in ( - BoomPlayEpisodeIE, - BoomPlayMusicIE, - BoomPlayPlaylistIE, - BoomPlayPodcastIE, - BoomPlayVideoIE, - )): - return super().suitable(url) - return False + @classproperty + def _VALID_URL(cls): + return rf'https?://(?:www\.)?boomplay\.com/search/(?P{"|".join(cls._MEDIA_TYPES)})/(?P[^?&#/]+)' def _real_extract(self, url): - _id = self._generic_id(url) - webpage = self._download_webpage(url, _id) - # TODO: pass media types based on search types + media_type, query = self._match_valid_url(url).group('media_type', 'query') + if media_type == 'default': + media_type = 'songs' + webpage = self._download_webpage(url, query) return self.playlist_result( - self._extract_playlist_entries(webpage, self._MEDIA_TYPES), - **self._extract_page_metadata(webpage, _id)) + self._extract_playlist_entries(webpage, media_type, warn=media_type == 'songs'), + **self._extract_page_metadata(webpage, query)) class BoomPlaySearchIE(SearchInfoExtractor): @@ -416,4 +467,5 @@ class BoomPlaySearchIE(SearchInfoExtractor): def _search_results(self, query): yield self.url_result( - f'https://www.boomplay.com/search/default/{urllib.parse.quote(query)}') + f'https://www.boomplay.com/search/default/{urllib.parse.quote(query)}', + BoomPlaySearchPageIE) From 16d68723dc80b9f21113a144c0c97626ae298d43 Mon Sep 17 00:00:00 2001 From: N/Ame <173015200+grqz@users.noreply.github.com> Date: Tue, 15 Oct 2024 13:23:54 +1300 Subject: [PATCH 04/26] Update yt_dlp/extractor/boomplay.py --- yt_dlp/extractor/boomplay.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/yt_dlp/extractor/boomplay.py b/yt_dlp/extractor/boomplay.py index 3fa6030c85..2f2eb2f100 100644 --- a/yt_dlp/extractor/boomplay.py +++ b/yt_dlp/extractor/boomplay.py @@ -388,16 +388,14 @@ class BoomPlayGenericPlaylistIE(BoomPlayBaseIE): @classmethod def suitable(cls, url): - if super().suitable(url): - return not any(ie.suitable(url) for ie in ( + return False if any(ie.suitable(url) for ie in ( BoomPlayEpisodeIE, BoomPlayMusicIE, BoomPlayPlaylistIE, BoomPlayPodcastIE, BoomPlaySearchPageIE, BoomPlayVideoIE, - )) - return False + )) else super().suitable(url) def _real_extract(self, url): _id = self._generic_id(url) From 445531c5a063fd18feb185adcf274716c62cb4e6 Mon Sep 17 00:00:00 2001 From: N/Ame <173015200+grqz@users.noreply.github.com> Date: Tue, 15 Oct 2024 13:27:09 +1300 Subject: [PATCH 05/26] Update yt_dlp/extractor/boomplay.py --- yt_dlp/extractor/boomplay.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/yt_dlp/extractor/boomplay.py b/yt_dlp/extractor/boomplay.py index 2f2eb2f100..439a7ec746 100644 --- a/yt_dlp/extractor/boomplay.py +++ b/yt_dlp/extractor/boomplay.py @@ -388,14 +388,14 @@ class BoomPlayGenericPlaylistIE(BoomPlayBaseIE): @classmethod def suitable(cls, url): - return False if any(ie.suitable(url) for ie in ( - BoomPlayEpisodeIE, - BoomPlayMusicIE, - BoomPlayPlaylistIE, - BoomPlayPodcastIE, - BoomPlaySearchPageIE, - BoomPlayVideoIE, - )) else super().suitable(url) + return False if any(ie.suitable(url) for ie in ( + BoomPlayEpisodeIE, + BoomPlayMusicIE, + BoomPlayPlaylistIE, + BoomPlayPodcastIE, + BoomPlaySearchPageIE, + BoomPlayVideoIE, + )) else super().suitable(url) def _real_extract(self, url): _id = self._generic_id(url) From 5b1b5bb1b6bdc8964364bcd8e808ddbd76da2976 Mon Sep 17 00:00:00 2001 From: grqx_wsl <173253225+grqx@users.noreply.github.com> Date: Tue, 15 Oct 2024 14:53:36 +1300 Subject: [PATCH 06/26] updxate _VALID_URL --- yt_dlp/extractor/boomplay.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yt_dlp/extractor/boomplay.py b/yt_dlp/extractor/boomplay.py index 3fa6030c85..f6173c139e 100644 --- a/yt_dlp/extractor/boomplay.py +++ b/yt_dlp/extractor/boomplay.py @@ -445,7 +445,7 @@ class BoomPlaySearchPageIE(BoomPlayBaseIE): @classproperty def _VALID_URL(cls): - return rf'https?://(?:www\.)?boomplay\.com/search/(?P{"|".join(cls._MEDIA_TYPES)})/(?P[^?&#/]+)' + return r'https?://(?:www\.)?boomplay\.com/search/(?Pdefault|video|episode|podcasts|playlists|artists|albums)/(?P[^?&#/]+)' def _real_extract(self, url): media_type, query = self._match_valid_url(url).group('media_type', 'query') From 6beca5eb570b77545270a05f0fdc17bac444d73d Mon Sep 17 00:00:00 2001 From: grqx_wsl <173253225+grqx@users.noreply.github.com> Date: Tue, 15 Oct 2024 14:56:37 +1300 Subject: [PATCH 07/26] revert --- yt_dlp/extractor/boomplay.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/yt_dlp/extractor/boomplay.py b/yt_dlp/extractor/boomplay.py index c50cf275ef..f6173c139e 100644 --- a/yt_dlp/extractor/boomplay.py +++ b/yt_dlp/extractor/boomplay.py @@ -388,14 +388,16 @@ class BoomPlayGenericPlaylistIE(BoomPlayBaseIE): @classmethod def suitable(cls, url): - return False if any(ie.suitable(url) for ie in ( - BoomPlayEpisodeIE, - BoomPlayMusicIE, - BoomPlayPlaylistIE, - BoomPlayPodcastIE, - BoomPlaySearchPageIE, - BoomPlayVideoIE, - )) else super().suitable(url) + if super().suitable(url): + return not any(ie.suitable(url) for ie in ( + BoomPlayEpisodeIE, + BoomPlayMusicIE, + BoomPlayPlaylistIE, + BoomPlayPodcastIE, + BoomPlaySearchPageIE, + BoomPlayVideoIE, + )) + return False def _real_extract(self, url): _id = self._generic_id(url) From bbb121c2afa141956dccebd9c34d9eab9ff658ab Mon Sep 17 00:00:00 2001 From: grqx_wsl <173253225+grqx@users.noreply.github.com> Date: Wed, 16 Oct 2024 23:47:36 +1300 Subject: [PATCH 08/26] Correct extractor name: `BoomPlay`==>`Boomplay` --- yt_dlp/extractor/_extractors.py | 16 +++++++-------- yt_dlp/extractor/boomplay.py | 36 ++++++++++++++++----------------- 2 files changed, 26 insertions(+), 26 deletions(-) diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 399c8429a9..f36560e395 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -281,14 +281,14 @@ from .bloomberg import BloombergIE from .bokecc import BokeCCIE from .bongacams import BongaCamsIE from .boomplay import ( - BoomPlayEpisodeIE, - BoomPlayGenericPlaylistIE, - BoomPlayMusicIE, - BoomPlayPlaylistIE, - BoomPlayPodcastIE, - BoomPlaySearchIE, - BoomPlaySearchPageIE, - BoomPlayVideoIE, + BoomplayEpisodeIE, + BoomplayGenericPlaylistIE, + BoomplayMusicIE, + BoomplayPlaylistIE, + BoomplayPodcastIE, + BoomplaySearchIE, + BoomplaySearchPageIE, + BoomplayVideoIE, ) from .boosty import BoostyIE from .bostonglobe import BostonGlobeIE diff --git a/yt_dlp/extractor/boomplay.py b/yt_dlp/extractor/boomplay.py index f6173c139e..0c4bd3681b 100644 --- a/yt_dlp/extractor/boomplay.py +++ b/yt_dlp/extractor/boomplay.py @@ -28,7 +28,7 @@ from ..utils import ( from ..utils.traversal import traverse_obj -class BoomPlayBaseIE(InfoExtractor): +class BoomplayBaseIE(InfoExtractor): # Calculated from const values, see lhx.AESUtils.encrypt in public.js # Note that the real key/iv differs from `lhx.AESUtils.key`/`lhx.AESUtils.iv` _KEY = b'boomplayVr3xopAM' @@ -184,7 +184,7 @@ class BoomPlayBaseIE(InfoExtractor): return entries -class BoomPlayMusicIE(BoomPlayBaseIE): +class BoomplayMusicIE(BoomplayBaseIE): _VALID_URL = r'https?://(?:www\.)?boomplay\.com/songs/(?P\d+)' _TEST = { 'url': 'https://www.boomplay.com/songs/165481965', @@ -225,7 +225,7 @@ class BoomPlayMusicIE(BoomPlayBaseIE): }) -class BoomPlayVideoIE(BoomPlayBaseIE): +class BoomplayVideoIE(BoomplayBaseIE): _VALID_URL = r'https?://(?:www\.)?boomplay\.com/video/(?P\d+)' _TEST = { 'url': 'https://www.boomplay.com/video/1154892', @@ -253,7 +253,7 @@ class BoomPlayVideoIE(BoomPlayBaseIE): }) -class BoomPlayEpisodeIE(BoomPlayBaseIE): +class BoomplayEpisodeIE(BoomplayBaseIE): _VALID_URL = r'https?://(?:www\.)?boomplay\.com/episode/(?P\d+)' _TEST = { 'url': 'https://www.boomplay.com/episode/7132706', @@ -283,7 +283,7 @@ class BoomPlayEpisodeIE(BoomPlayBaseIE): }) -class BoomPlayPodcastIE(BoomPlayBaseIE): +class BoomplayPodcastIE(BoomplayBaseIE): _VALID_URL = r'https?://(?:www\.)?boomplay\.com/podcasts/(?P\d+)' _TEST = { 'url': 'https://www.boomplay.com/podcasts/5372', @@ -316,7 +316,7 @@ class BoomPlayPodcastIE(BoomPlayBaseIE): song_list), (..., 'id', { lambda x: self.url_result( - f'https://www.boomplay.com/episode/{x}', BoomPlayEpisodeIE, x), + f'https://www.boomplay.com/episode/{x}', BoomplayEpisodeIE, x), })) return self.playlist_result( song_list, _id, @@ -325,7 +325,7 @@ class BoomPlayPodcastIE(BoomPlayBaseIE): **self._extract_page_metadata(webpage, _id)) -class BoomPlayPlaylistIE(BoomPlayBaseIE): +class BoomplayPlaylistIE(BoomplayBaseIE): _VALID_URL = r'https?://(?:www\.)?boomplay\.com/(?:playlists|artists|albums)/(?P\d+)' _TESTS = [{ 'url': 'https://www.boomplay.com/playlists/33792494', @@ -357,7 +357,7 @@ class BoomPlayPlaylistIE(BoomPlayBaseIE): self._extract_page_metadata(webpage, playlist_id), traverse_obj(json_ld_metadata, { 'entries': ('track', ..., 'url', { - functools.partial(self.url_result, ie=BoomPlayMusicIE), + functools.partial(self.url_result, ie=BoomplayMusicIE), }), 'playlist_title': 'name', 'thumbnail': 'image', @@ -366,7 +366,7 @@ class BoomPlayPlaylistIE(BoomPlayBaseIE): }))) -class BoomPlayGenericPlaylistIE(BoomPlayBaseIE): +class BoomplayGenericPlaylistIE(BoomplayBaseIE): _VALID_URL = r'https?://(?:www\.)?boomplay\.com/.+' _TESTS = [{ 'url': 'https://www.boomplay.com/new-songs', @@ -390,12 +390,12 @@ class BoomPlayGenericPlaylistIE(BoomPlayBaseIE): def suitable(cls, url): if super().suitable(url): return not any(ie.suitable(url) for ie in ( - BoomPlayEpisodeIE, - BoomPlayMusicIE, - BoomPlayPlaylistIE, - BoomPlayPodcastIE, - BoomPlaySearchPageIE, - BoomPlayVideoIE, + BoomplayEpisodeIE, + BoomplayMusicIE, + BoomplayPlaylistIE, + BoomplayPodcastIE, + BoomplaySearchPageIE, + BoomplayVideoIE, )) return False @@ -407,7 +407,7 @@ class BoomPlayGenericPlaylistIE(BoomPlayBaseIE): **self._extract_page_metadata(webpage, _id)) -class BoomPlaySearchPageIE(BoomPlayBaseIE): +class BoomplaySearchPageIE(BoomplayBaseIE): _TESTS = [{ 'url': 'https://www.boomplay.com/search/default/%20Rise%20of%20the%20Falletesn%20Heroes%20fatbunny', 'md5': 'c5fb4f23e6aae98064230ef3c39c2178', @@ -457,7 +457,7 @@ class BoomPlaySearchPageIE(BoomPlayBaseIE): **self._extract_page_metadata(webpage, query)) -class BoomPlaySearchIE(SearchInfoExtractor): +class BoomplaySearchIE(SearchInfoExtractor): _SEARCH_KEY = 'boomplaysearch' _RETURN_TYPE = 'url' _TEST = { @@ -468,4 +468,4 @@ class BoomPlaySearchIE(SearchInfoExtractor): def _search_results(self, query): yield self.url_result( f'https://www.boomplay.com/search/default/{urllib.parse.quote(query)}', - BoomPlaySearchPageIE) + BoomplaySearchPageIE) From cee1c763e49c8aa73e80998e5589c1ee5785dfb9 Mon Sep 17 00:00:00 2001 From: grqx_wsl <173253225+grqx@users.noreply.github.com> Date: Wed, 16 Oct 2024 23:48:40 +1300 Subject: [PATCH 09/26] fix the docstring of `BoomplayBaseIE.__yield_elements_text_and_html_by_class_and_tag` --- yt_dlp/extractor/boomplay.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yt_dlp/extractor/boomplay.py b/yt_dlp/extractor/boomplay.py index 0c4bd3681b..7a73955372 100644 --- a/yt_dlp/extractor/boomplay.py +++ b/yt_dlp/extractor/boomplay.py @@ -40,7 +40,7 @@ class BoomplayBaseIE(InfoExtractor): @staticmethod def __yield_elements_text_and_html_by_class_and_tag(class_, tag, html): """ - Yields content of all element matching `tag .class_` in html + Yields content of all element matching `tag.class_` in html class_ must be re escaped """ # get_elements_text_and_html_by_attribute returns a generator From 28a11630105d875518ffbf9967b9032bff0c495e Mon Sep 17 00:00:00 2001 From: grqx_wsl <173253225+grqx@users.noreply.github.com> Date: Fri, 18 Oct 2024 13:28:49 +1300 Subject: [PATCH 10/26] consistency: BoomplaySearchPageIE => BoomplaySearchURLIE --- yt_dlp/extractor/_extractors.py | 2 +- yt_dlp/extractor/boomplay.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index f36560e395..24fa798efa 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -287,7 +287,7 @@ from .boomplay import ( BoomplayPlaylistIE, BoomplayPodcastIE, BoomplaySearchIE, - BoomplaySearchPageIE, + BoomplaySearchURLIE, BoomplayVideoIE, ) from .boosty import BoostyIE diff --git a/yt_dlp/extractor/boomplay.py b/yt_dlp/extractor/boomplay.py index 7a73955372..24ba2dffa9 100644 --- a/yt_dlp/extractor/boomplay.py +++ b/yt_dlp/extractor/boomplay.py @@ -394,7 +394,7 @@ class BoomplayGenericPlaylistIE(BoomplayBaseIE): BoomplayMusicIE, BoomplayPlaylistIE, BoomplayPodcastIE, - BoomplaySearchPageIE, + BoomplaySearchURLIE, BoomplayVideoIE, )) return False @@ -407,7 +407,7 @@ class BoomplayGenericPlaylistIE(BoomplayBaseIE): **self._extract_page_metadata(webpage, _id)) -class BoomplaySearchPageIE(BoomplayBaseIE): +class BoomplaySearchURLIE(BoomplayBaseIE): _TESTS = [{ 'url': 'https://www.boomplay.com/search/default/%20Rise%20of%20the%20Falletesn%20Heroes%20fatbunny', 'md5': 'c5fb4f23e6aae98064230ef3c39c2178', @@ -468,4 +468,4 @@ class BoomplaySearchIE(SearchInfoExtractor): def _search_results(self, query): yield self.url_result( f'https://www.boomplay.com/search/default/{urllib.parse.quote(query)}', - BoomplaySearchPageIE) + BoomplaySearchURLIE) From 38383ea31323ea8ccb859897bafb6cd16fbaa34b Mon Sep 17 00:00:00 2001 From: N/Ame <173015200+grqz@users.noreply.github.com> Date: Fri, 18 Oct 2024 13:34:07 +1300 Subject: [PATCH 11/26] use `re.sub` instead in description extraction Co-authored-by: dirkf --- yt_dlp/extractor/boomplay.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/yt_dlp/extractor/boomplay.py b/yt_dlp/extractor/boomplay.py index 24ba2dffa9..1254e50fa7 100644 --- a/yt_dlp/extractor/boomplay.py +++ b/yt_dlp/extractor/boomplay.py @@ -110,12 +110,10 @@ class BoomplayBaseIE(InfoExtractor): def _extract_page_metadata(self, webpage, _id, playlist=False): metadata_div = self._get_element_by_class_and_tag('summary', 'div', webpage) or '' metadata_entries = re.findall(r'(?si)(?P.*?)', metadata_div) or [] - description = ( - self._get_element_by_class_and_tag('description_content', 'span', webpage) - or 'Listen and download music for free on Boomplay!') - description = clean_html(description.strip()) - if description == 'Listen and download music for free on Boomplay!': - description = None + description = re.sub( + '(?i)Listen and download music for free on Boomplay!', '', + clean_html(self._get_element_by_class_and_tag( + 'description_content', 'span', webpage)) or '') or None details_section = self._get_element_by_class_and_tag('songDetailInfo', 'section', webpage) or '' metadata_entries.extend(re.findall(r'(?si)
  • (?P.*?)
  • ', details_section) or []) From a886439396d42b77b5928a801ed57f7d9727bbec Mon Sep 17 00:00:00 2001 From: grqx_wsl <173253225+grqx@users.noreply.github.com> Date: Fri, 18 Oct 2024 13:32:12 +1300 Subject: [PATCH 12/26] `_id` -> `item_id` Co-authored-by: dirkf --- yt_dlp/extractor/boomplay.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/yt_dlp/extractor/boomplay.py b/yt_dlp/extractor/boomplay.py index 24ba2dffa9..70c4485c53 100644 --- a/yt_dlp/extractor/boomplay.py +++ b/yt_dlp/extractor/boomplay.py @@ -92,8 +92,8 @@ class BoomplayBaseIE(InfoExtractor): base64.b64decode(source), self._KEY, self._IV)).decode() - def _extract_formats(self, _id, item_type='MUSIC', **kwargs): - if url := url_or_none(self._get_playurl(_id, item_type)): + def _extract_formats(self, item_id, item_type='MUSIC', **kwargs): + if url := url_or_none(self._get_playurl(item_id, item_type)): return [{ 'format_id': '0', 'url': url, @@ -107,7 +107,7 @@ class BoomplayBaseIE(InfoExtractor): else: self.raise_no_formats('No formats found') - def _extract_page_metadata(self, webpage, _id, playlist=False): + def _extract_page_metadata(self, webpage, item_id, playlist=False): metadata_div = self._get_element_by_class_and_tag('summary', 'div', webpage) or '' metadata_entries = re.findall(r'(?si)(?P.*?)', metadata_div) or [] description = ( @@ -120,7 +120,7 @@ class BoomplayBaseIE(InfoExtractor): details_section = self._get_element_by_class_and_tag('songDetailInfo', 'section', webpage) or '' metadata_entries.extend(re.findall(r'(?si)
  • (?P.*?)
  • ', details_section) or []) page_metadata = { - 'id': _id, + 'id': item_id, 'title': self._html_search_regex(r']*>([^<]+)', webpage, 'title', default=None), 'thumbnail': self._html_search_meta(['og:image', 'twitter:image'], webpage, 'thumbnail', default=''), @@ -300,8 +300,8 @@ class BoomplayPodcastIE(BoomplayBaseIE): } def _real_extract(self, url): - _id = self._match_id(url) - webpage = self._download_webpage(url, _id) + playlist_id = self._match_id(url) + webpage = self._download_webpage(url, playlist_id) song_list = self._get_element_by_class_and_tag('morePart_musics', 'ol', webpage) song_list = traverse_obj(re.finditer( r'''(?x) @@ -319,10 +319,10 @@ class BoomplayPodcastIE(BoomplayBaseIE): f'https://www.boomplay.com/episode/{x}', BoomplayEpisodeIE, x), })) return self.playlist_result( - song_list, _id, + song_list, playlist_id, playlist_title=self._og_search_title(webpage, fatal=True).rsplit('|', 2)[0].strip(), playlist_description=self._og_search_description(webpage, default=''), - **self._extract_page_metadata(webpage, _id)) + **self._extract_page_metadata(webpage, playlist_id)) class BoomplayPlaylistIE(BoomplayBaseIE): @@ -400,11 +400,11 @@ class BoomplayGenericPlaylistIE(BoomplayBaseIE): return False def _real_extract(self, url): - _id = self._generic_id(url) - webpage = self._download_webpage(url, _id) + playlist_id = self._generic_id(url) + webpage = self._download_webpage(url, playlist_id) return self.playlist_result( self._extract_playlist_entries(webpage, self._MEDIA_TYPES), - **self._extract_page_metadata(webpage, _id)) + **self._extract_page_metadata(webpage, playlist_id)) class BoomplaySearchURLIE(BoomplayBaseIE): From aa34d34596940a2a0b1a0e3e750da412d337bda4 Mon Sep 17 00:00:00 2001 From: grqx_wsl <173253225+grqx@users.noreply.github.com> Date: Wed, 23 Oct 2024 22:53:24 +1300 Subject: [PATCH 13/26] `_TEST` -> `_TESTS` --- yt_dlp/extractor/bilibili.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/yt_dlp/extractor/bilibili.py b/yt_dlp/extractor/bilibili.py index 62f68fbc6d..ab98a80beb 100644 --- a/yt_dlp/extractor/bilibili.py +++ b/yt_dlp/extractor/bilibili.py @@ -1727,7 +1727,7 @@ class BilibiliAudioBaseIE(InfoExtractor): class BilibiliAudioIE(BilibiliAudioBaseIE): _VALID_URL = r'https?://(?:www\.)?bilibili\.com/audio/au(?P\d+)' - _TEST = { + _TESTS = [{ 'url': 'https://www.bilibili.com/audio/au1003142', 'md5': 'fec4987014ec94ef9e666d4d158ad03b', 'info_dict': { @@ -1749,7 +1749,7 @@ class BilibiliAudioIE(BilibiliAudioBaseIE): 'uploader': 'tsukimi-つきみぐー', 'view_count': int, }, - } + }] def _real_extract(self, url): au_id = self._match_id(url) @@ -1797,7 +1797,7 @@ class BilibiliAudioIE(BilibiliAudioBaseIE): class BilibiliAudioAlbumIE(BilibiliAudioBaseIE): _VALID_URL = r'https?://(?:www\.)?bilibili\.com/audio/am(?P\d+)' - _TEST = { + _TESTS = [{ 'url': 'https://www.bilibili.com/audio/am10624', 'info_dict': { 'id': '10624', @@ -1805,7 +1805,7 @@ class BilibiliAudioAlbumIE(BilibiliAudioBaseIE): 'description': '每天11:00更新,为你推送最新音乐', }, 'playlist_count': 19, - } + }] def _real_extract(self, url): am_id = self._match_id(url) From 0f9b09842e3f91fda7be6a8c269c446d6f012ce8 Mon Sep 17 00:00:00 2001 From: grqx_wsl <173253225+grqx@users.noreply.github.com> Date: Wed, 23 Oct 2024 23:52:44 +1300 Subject: [PATCH 14/26] remove `playlist` argument from `BoomplayBaseIE._extract_page_metadata` Will consider `require_title` later if moving title extraction here --- yt_dlp/extractor/boomplay.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yt_dlp/extractor/boomplay.py b/yt_dlp/extractor/boomplay.py index 27953575b5..50826e99e7 100644 --- a/yt_dlp/extractor/boomplay.py +++ b/yt_dlp/extractor/boomplay.py @@ -107,7 +107,7 @@ class BoomplayBaseIE(InfoExtractor): else: self.raise_no_formats('No formats found') - def _extract_page_metadata(self, webpage, item_id, playlist=False): + def _extract_page_metadata(self, webpage, item_id): metadata_div = self._get_element_by_class_and_tag('summary', 'div', webpage) or '' metadata_entries = re.findall(r'(?si)(?P.*?)', metadata_div) or [] description = re.sub( From 8a1daf41ab21f3b2e06eb2c6d12162e60822ab3b Mon Sep 17 00:00:00 2001 From: dirkf Date: Wed, 23 Oct 2024 23:58:57 +1300 Subject: [PATCH 15/26] [ie/BoomplayEpisode] Make title extraction non-fatal Co-authored-by: dirkf --- yt_dlp/extractor/boomplay.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yt_dlp/extractor/boomplay.py b/yt_dlp/extractor/boomplay.py index 50826e99e7..e6ecfe0e7b 100644 --- a/yt_dlp/extractor/boomplay.py +++ b/yt_dlp/extractor/boomplay.py @@ -274,7 +274,7 @@ class BoomplayEpisodeIE(BoomplayBaseIE): webpage = self._download_webpage(url, ep_id) return merge_dicts( self._extract_page_metadata(webpage, ep_id), { - 'title': self._og_search_title(webpage, fatal=True).rsplit('|', 2)[0].strip(), + 'title': self._og_search_title(webpage, default='').rsplit('|', 2)[0].strip() or None, 'description': self._html_search_meta( ['description', 'og:description', 'twitter:description'], webpage), 'formats': self._extract_formats(ep_id, 'EPISODE', vcodec='none'), From 195af478f38d2e83615d7bb69474f1beb050bd75 Mon Sep 17 00:00:00 2001 From: grqx_termux Date: Thu, 24 Oct 2024 15:41:59 +1300 Subject: [PATCH 16/26] Revert "`_TEST` -> `_TESTS`" This reverts commit aa34d34596940a2a0b1a0e3e750da412d337bda4. --- yt_dlp/extractor/bilibili.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/yt_dlp/extractor/bilibili.py b/yt_dlp/extractor/bilibili.py index ab98a80beb..62f68fbc6d 100644 --- a/yt_dlp/extractor/bilibili.py +++ b/yt_dlp/extractor/bilibili.py @@ -1727,7 +1727,7 @@ class BilibiliAudioBaseIE(InfoExtractor): class BilibiliAudioIE(BilibiliAudioBaseIE): _VALID_URL = r'https?://(?:www\.)?bilibili\.com/audio/au(?P\d+)' - _TESTS = [{ + _TEST = { 'url': 'https://www.bilibili.com/audio/au1003142', 'md5': 'fec4987014ec94ef9e666d4d158ad03b', 'info_dict': { @@ -1749,7 +1749,7 @@ class BilibiliAudioIE(BilibiliAudioBaseIE): 'uploader': 'tsukimi-つきみぐー', 'view_count': int, }, - }] + } def _real_extract(self, url): au_id = self._match_id(url) @@ -1797,7 +1797,7 @@ class BilibiliAudioIE(BilibiliAudioBaseIE): class BilibiliAudioAlbumIE(BilibiliAudioBaseIE): _VALID_URL = r'https?://(?:www\.)?bilibili\.com/audio/am(?P\d+)' - _TESTS = [{ + _TEST = { 'url': 'https://www.bilibili.com/audio/am10624', 'info_dict': { 'id': '10624', @@ -1805,7 +1805,7 @@ class BilibiliAudioAlbumIE(BilibiliAudioBaseIE): 'description': '每天11:00更新,为你推送最新音乐', }, 'playlist_count': 19, - }] + } def _real_extract(self, url): am_id = self._match_id(url) From 60b763c50f75021c6efc78563c126247d6089663 Mon Sep 17 00:00:00 2001 From: grqx_termux Date: Thu, 24 Oct 2024 15:49:16 +1300 Subject: [PATCH 17/26] `_TEST` -> `_TESTS` actually meant this. working on 2 branches simultanously can lead to results like this... --- yt_dlp/extractor/boomplay.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/yt_dlp/extractor/boomplay.py b/yt_dlp/extractor/boomplay.py index e6ecfe0e7b..7349e03b49 100644 --- a/yt_dlp/extractor/boomplay.py +++ b/yt_dlp/extractor/boomplay.py @@ -184,7 +184,7 @@ class BoomplayBaseIE(InfoExtractor): class BoomplayMusicIE(BoomplayBaseIE): _VALID_URL = r'https?://(?:www\.)?boomplay\.com/songs/(?P\d+)' - _TEST = { + _TESTS = [{ 'url': 'https://www.boomplay.com/songs/165481965', 'md5': 'c5fb4f23e6aae98064230ef3c39c2178', 'info_dict': { @@ -202,7 +202,7 @@ class BoomplayMusicIE(BoomplayBaseIE): 'album': 'Legendary Battle', 'genres': ['Metal'], }, - } + }] def _real_extract(self, url): song_id = self._match_id(url) @@ -225,7 +225,7 @@ class BoomplayMusicIE(BoomplayBaseIE): class BoomplayVideoIE(BoomplayBaseIE): _VALID_URL = r'https?://(?:www\.)?boomplay\.com/video/(?P\d+)' - _TEST = { + _TESTS = [{ 'url': 'https://www.boomplay.com/video/1154892', 'md5': 'd9b67ad333d2292a82922062d065352d', 'info_dict': { @@ -239,7 +239,7 @@ class BoomplayVideoIE(BoomplayBaseIE): 'duration': 177.0, 'description': 'Autumn blues by Lugo', }, - } + }] def _real_extract(self, url): video_id = self._match_id(url) @@ -253,7 +253,7 @@ class BoomplayVideoIE(BoomplayBaseIE): class BoomplayEpisodeIE(BoomplayBaseIE): _VALID_URL = r'https?://(?:www\.)?boomplay\.com/episode/(?P\d+)' - _TEST = { + _TESTS = [{ 'url': 'https://www.boomplay.com/episode/7132706', 'md5': 'f26e236b764baa53d7a2cbb7e9ce6dc4', 'info_dict': { @@ -267,7 +267,7 @@ class BoomplayEpisodeIE(BoomplayBaseIE): 'upload_date': '20240506', 'description': 'md5:5ec684b281fa0f9e4c31b3ee20c5e57a', }, - } + }] def _real_extract(self, url): ep_id = self._match_id(url) @@ -283,7 +283,7 @@ class BoomplayEpisodeIE(BoomplayBaseIE): class BoomplayPodcastIE(BoomplayBaseIE): _VALID_URL = r'https?://(?:www\.)?boomplay\.com/podcasts/(?P\d+)' - _TEST = { + _TESTS = [{ 'url': 'https://www.boomplay.com/podcasts/5372', 'playlist_count': 200, 'info_dict': { @@ -295,7 +295,7 @@ class BoomplayPodcastIE(BoomplayBaseIE): 'comment_count': int, 'like_count': int, }, - } + }] def _real_extract(self, url): playlist_id = self._match_id(url) @@ -458,10 +458,10 @@ class BoomplaySearchURLIE(BoomplayBaseIE): class BoomplaySearchIE(SearchInfoExtractor): _SEARCH_KEY = 'boomplaysearch' _RETURN_TYPE = 'url' - _TEST = { + _TESTS = [{ 'url': 'boomplaysearch:rise of the fallen heroes', 'only_matching': True, - } + }] def _search_results(self, query): yield self.url_result( From 0e344b806f1cc2ebc8274b040c4ca9d86ac81e2e Mon Sep 17 00:00:00 2001 From: grqx_wsl <173253225+grqx@users.noreply.github.com> Date: Sat, 2 Nov 2024 02:11:49 +1300 Subject: [PATCH 18/26] [ie/boomplaypodcast]extract full description --- yt_dlp/extractor/boomplay.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/yt_dlp/extractor/boomplay.py b/yt_dlp/extractor/boomplay.py index 7349e03b49..3672ba7e4c 100644 --- a/yt_dlp/extractor/boomplay.py +++ b/yt_dlp/extractor/boomplay.py @@ -289,7 +289,7 @@ class BoomplayPodcastIE(BoomplayBaseIE): 'info_dict': { 'id': '5372', 'title': 'TED Talks Daily', - 'description': 'md5:541182e787ce8fd578c835534c907077', + 'description': r're:(?s)Every weekday, TED Talks Daily brings you the latest talks .{328} learn something new\.$', 'thumbnail': 'https://source.boomplaymusic.com/group10/M00/12/22/6f9cf97ad6f846a0a7882c98dfcf4f8c_320_320.jpg', 'repost_count': int, 'comment_count': int, @@ -319,7 +319,6 @@ class BoomplayPodcastIE(BoomplayBaseIE): return self.playlist_result( song_list, playlist_id, playlist_title=self._og_search_title(webpage, fatal=True).rsplit('|', 2)[0].strip(), - playlist_description=self._og_search_description(webpage, default=''), **self._extract_page_metadata(webpage, playlist_id)) From 8ef229428206db0b5a936408a5a37f3c6cb8d667 Mon Sep 17 00:00:00 2001 From: grqx_wsl <173253225+grqx@users.noreply.github.com> Date: Sat, 2 Nov 2024 02:18:16 +1300 Subject: [PATCH 19/26] case insensitive tag matching --- yt_dlp/extractor/boomplay.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yt_dlp/extractor/boomplay.py b/yt_dlp/extractor/boomplay.py index 3672ba7e4c..80745c65de 100644 --- a/yt_dlp/extractor/boomplay.py +++ b/yt_dlp/extractor/boomplay.py @@ -119,7 +119,7 @@ class BoomplayBaseIE(InfoExtractor): metadata_entries.extend(re.findall(r'(?si)
  • (?P.*?)
  • ', details_section) or []) page_metadata = { 'id': item_id, - 'title': self._html_search_regex(r']*>([^<]+)', webpage, 'title', default=None), + 'title': self._html_search_regex(r'(?i)]*>([^<]+)', webpage, 'title', default=None), 'thumbnail': self._html_search_meta(['og:image', 'twitter:image'], webpage, 'thumbnail', default=''), 'like_count': parse_count(self._get_element_by_class_and_tag('btn_favorite', 'button', metadata_div)), From 9a6f9843c00b0492dbb7cc622c00690f7fe4cce8 Mon Sep 17 00:00:00 2001 From: grqx_wsl <173253225+grqx@users.noreply.github.com> Date: Mon, 4 Nov 2024 14:09:42 +1300 Subject: [PATCH 20/26] use _extract_from_webpage and _extract_embed_urls - `_extract_playlist_entries` is now a `classmethod` - case insensitive html tag matching Co-authored-by: dirkf --- yt_dlp/extractor/boomplay.py | 75 ++++++++++++++++++++++-------------- 1 file changed, 46 insertions(+), 29 deletions(-) diff --git a/yt_dlp/extractor/boomplay.py b/yt_dlp/extractor/boomplay.py index 80745c65de..706e8ad836 100644 --- a/yt_dlp/extractor/boomplay.py +++ b/yt_dlp/extractor/boomplay.py @@ -15,11 +15,12 @@ from ..utils import ( int_or_none, join_nonempty, merge_dicts, - orderedSet, parse_count, parse_duration, + smuggle_url, strip_or_none, unified_strdate, + unsmuggle_url, url_or_none, urlencode_postdata, urljoin, @@ -45,7 +46,7 @@ class BoomplayBaseIE(InfoExtractor): """ # get_elements_text_and_html_by_attribute returns a generator return get_elements_text_and_html_by_attribute( - 'class', rf'''[^'"]*(?<=['"\s]){class_}(?=['"\s])[^'"]*''', html, + attribute='class', value=rf'''[^'"]*(?<=['"\s]){class_}(?=['"\s])[^'"]*''', html=html, tag=tag, escape_value=False) @classmethod @@ -111,7 +112,7 @@ class BoomplayBaseIE(InfoExtractor): metadata_div = self._get_element_by_class_and_tag('summary', 'div', webpage) or '' metadata_entries = re.findall(r'(?si)(?P.*?)', metadata_div) or [] description = re.sub( - '(?i)Listen and download music for free on Boomplay!', '', + r'(?i)Listen and download music for free on Boomplay!', '', clean_html(self._get_element_by_class_and_tag( 'description_content', 'span', webpage)) or '') or None @@ -145,39 +146,55 @@ class BoomplayBaseIE(InfoExtractor): page_metadata['release_year'] = int_or_none(v) return page_metadata - def _extract_suitable_links(self, webpage, media_types=None): - if media_types is None: - media_types = self._MEDIA_TYPES - media_types = list(variadic(media_types)) + @classmethod + def _extract_from_webpage(cls, url, webpage, **kwargs): + if kwargs: + url = smuggle_url(url, kwargs) + return super()._extract_from_webpage(url, webpage) - for idx, v in enumerate(media_types): - media_types[idx] = re.escape(v) if v in self._MEDIA_TYPES else '' - media_types = join_nonempty(*media_types, delim='|') - return orderedSet(traverse_obj(re.finditer( - rf'''(?x) -
    "']|"[^"]*"|'[^']*')*)? - (?<=\s)href\s*=\s*(?P<_q>['"]) - (?: - (?!javascript:)(?P/(?:{media_types})/\d+/?[\-a-zA-Z=?&#:;@]*) - ) - (?P=_q) - (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)? - >''', webpage), (..., 'link', {self._urljoin}, {self.url_result}))) + @classmethod + def _extract_embed_urls(cls, url, webpage): + url, smuggled_data = unsmuggle_url(url) + media_types = variadic(smuggled_data.get('media_types', cls._MEDIA_TYPES)) + media_types = join_nonempty(*( + re.escape(v)for v in media_types if v in cls._MEDIA_TYPES), + delim='|') - def _extract_playlist_entries(self, webpage, media_types, warn=True): + for mobj in re.finditer( + rf'''(?ix) + "']|"[^"]*"|'[^']*')*)? + (?<=\s)href\s*=\s*(?P<_q>['"]) + (?: + (?!javascript:)(?P/(?:{media_types})/\d+/?[\-a-zA-Z=?&#:;@]*) + ) + (?P=_q) + (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)? + >''', webpage): + if url := cls._urljoin(mobj.group('href')): + yield url + + @classmethod + def _extract_playlist_entries(cls, webpage, media_types, warn=True): song_list = strip_or_none( - self._get_element_by_class_and_tag('morePart_musics', 'ol', webpage) - or self._get_element_by_class_and_tag('morePart', 'ol', webpage) + cls._get_element_by_class_and_tag('morePart_musics', 'ol', webpage) + or cls._get_element_by_class_and_tag('morePart', 'ol', webpage) or '') - entries = traverse_obj(self.__yield_elements_html_by_class_and_tag( + entries = traverse_obj(cls.__yield_elements_html_by_class_and_tag( 'songName', 'a', song_list), - (..., {extract_attributes}, 'href', {self._urljoin}, {self.url_result})) + (..., {extract_attributes}, 'href', {cls._urljoin}, {cls.url_result})) if not entries: if warn: - self.report_warning('Failed to extract playlist entries, finding suitable links instead!') - return self._extract_suitable_links(webpage, media_types) + cls.report_warning('Failed to extract playlist entries, finding suitable links instead!') + + def strip_ie(entry): + # All our IEs have a _VALID_URL and set a key: don't use it + entry.pop('ie_key', None) + return entry + + return (strip_ie(result) for result in + cls._extract_from_webpage(cls._BASE, webpage, media_types=media_types)) return entries @@ -302,7 +319,7 @@ class BoomplayPodcastIE(BoomplayBaseIE): webpage = self._download_webpage(url, playlist_id) song_list = self._get_element_by_class_and_tag('morePart_musics', 'ol', webpage) song_list = traverse_obj(re.finditer( - r'''(?x) + r'''(?ix)
  • "']|"[^"]*"|'[^']*')*)? \sdata-id\s*=\s* From 901e78af62180c469d67a023a14b85c3754ac69b Mon Sep 17 00:00:00 2001 From: grqx_wsl <173253225+grqx@users.noreply.github.com> Date: Mon, 4 Nov 2024 14:19:52 +1300 Subject: [PATCH 21/26] improve regex --- yt_dlp/extractor/boomplay.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/yt_dlp/extractor/boomplay.py b/yt_dlp/extractor/boomplay.py index 706e8ad836..f19a77aa40 100644 --- a/yt_dlp/extractor/boomplay.py +++ b/yt_dlp/extractor/boomplay.py @@ -165,9 +165,7 @@ class BoomplayBaseIE(InfoExtractor): "']|"[^"]*"|'[^']*')*)? (?<=\s)href\s*=\s*(?P<_q>['"]) - (?: - (?!javascript:)(?P/(?:{media_types})/\d+/?[\-a-zA-Z=?&#:;@]*) - ) + (?!javascript:)(?P/(?:{media_types})/\d+/?[\-\w=?&#:;@]*) (?P=_q) (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)? >''', webpage): From d69a1be537610d15ff7d1fc6f1a38f83d0264da6 Mon Sep 17 00:00:00 2001 From: grqx_wsl <173253225+grqx@users.noreply.github.com> Date: Mon, 4 Nov 2024 23:17:26 +1300 Subject: [PATCH 22/26] _urljoin(): let url_or_none sanitize the url; more classmethods --- yt_dlp/extractor/boomplay.py | 43 ++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 21 deletions(-) diff --git a/yt_dlp/extractor/boomplay.py b/yt_dlp/extractor/boomplay.py index f19a77aa40..0bc7a28641 100644 --- a/yt_dlp/extractor/boomplay.py +++ b/yt_dlp/extractor/boomplay.py @@ -67,34 +67,34 @@ class BoomplayBaseIE(InfoExtractor): @classmethod def _urljoin(cls, path): - if not hasattr(path, 'startswith') or path.startswith('javascript:'): - return None return url_or_none(urljoin(base=cls._BASE, path=path)) - def _get_playurl(self, item_id, item_type): - resp = self._download_json( + @classmethod + def _get_playurl(cls, item_id, item_type): + resp = cls._download_json( 'https://www.boomplay.com/getResourceAddr', item_id, note='Downloading play URL', errnote='Failed to download play URL', data=urlencode_postdata({ 'param': base64.b64encode(aes_cbc_encrypt_bytes(json.dumps({ 'itemID': item_id, 'itemType': item_type, - }).encode(), self._KEY, self._IV)).decode(), + }).encode(), cls._KEY, cls._IV)).decode(), }), headers={ 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8', }) if not (source := resp.get('source')) and (code := resp.get('code')): if 'unavailable in your country' in (desc := resp.get('desc')) or '': # since NG must have failed ... - self.raise_geo_restricted(countries=['GH', 'KE', 'TZ', 'CM', 'CI']) + cls.raise_geo_restricted(countries=['GH', 'KE', 'TZ', 'CM', 'CI']) else: raise ExtractorError(desc or f'Failed to get play url, code: {code}') return unpad_pkcs7(aes_cbc_decrypt_bytes( base64.b64decode(source), - self._KEY, self._IV)).decode() + cls._KEY, cls._IV)).decode() - def _extract_formats(self, item_id, item_type='MUSIC', **kwargs): - if url := url_or_none(self._get_playurl(item_id, item_type)): + @classmethod + def _extract_formats(cls, item_id, item_type='MUSIC', **kwargs): + if url := url_or_none(cls._get_playurl(item_id, item_type)): return [{ 'format_id': '0', 'url': url, @@ -106,29 +106,30 @@ class BoomplayBaseIE(InfoExtractor): **kwargs, }] else: - self.raise_no_formats('No formats found') + cls.raise_no_formats('No formats found') - def _extract_page_metadata(self, webpage, item_id): - metadata_div = self._get_element_by_class_and_tag('summary', 'div', webpage) or '' + @classmethod + def _extract_page_metadata(cls, webpage, item_id): + metadata_div = cls._get_element_by_class_and_tag('summary', 'div', webpage) or '' metadata_entries = re.findall(r'(?si)(?P.*?)', metadata_div) or [] description = re.sub( r'(?i)Listen and download music for free on Boomplay!', '', - clean_html(self._get_element_by_class_and_tag( + clean_html(cls._get_element_by_class_and_tag( 'description_content', 'span', webpage)) or '') or None - details_section = self._get_element_by_class_and_tag('songDetailInfo', 'section', webpage) or '' + details_section = cls._get_element_by_class_and_tag('songDetailInfo', 'section', webpage) or '' metadata_entries.extend(re.findall(r'(?si)
  • (?P.*?)
  • ', details_section) or []) page_metadata = { 'id': item_id, - 'title': self._html_search_regex(r'(?i)]*>([^<]+)', webpage, 'title', default=None), - 'thumbnail': self._html_search_meta(['og:image', 'twitter:image'], + 'title': cls._html_search_regex(r'(?i)]*>([^<]+)', webpage, 'title', default=None), + 'thumbnail': cls._html_search_meta(['og:image', 'twitter:image'], webpage, 'thumbnail', default=''), - 'like_count': parse_count(self._get_element_by_class_and_tag('btn_favorite', 'button', metadata_div)), - 'repost_count': parse_count(self._get_element_by_class_and_tag('btn_share', 'button', metadata_div)), - 'comment_count': parse_count(self._get_element_by_class_and_tag('btn_comment', 'button', metadata_div)), - 'duration': parse_duration(self._get_element_by_class_and_tag('btn_duration', 'button', metadata_div)), + 'like_count': parse_count(cls._get_element_by_class_and_tag('btn_favorite', 'button', metadata_div)), + 'repost_count': parse_count(cls._get_element_by_class_and_tag('btn_share', 'button', metadata_div)), + 'comment_count': parse_count(cls._get_element_by_class_and_tag('btn_comment', 'button', metadata_div)), + 'duration': parse_duration(cls._get_element_by_class_and_tag('btn_duration', 'button', metadata_div)), 'upload_date': unified_strdate(strip_or_none( - self._get_element_by_class_and_tag('btn_pubDate', 'button', metadata_div))), + cls._get_element_by_class_and_tag('btn_pubDate', 'button', metadata_div))), 'description': description, } for metadata_entry in metadata_entries: From eacad11a5a69c0705625773438e923d1440d8a7c Mon Sep 17 00:00:00 2001 From: grqx_wsl <173253225+grqx@users.noreply.github.com> Date: Tue, 5 Nov 2024 00:18:14 +1300 Subject: [PATCH 23/26] code formatting --- yt_dlp/extractor/boomplay.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yt_dlp/extractor/boomplay.py b/yt_dlp/extractor/boomplay.py index 0bc7a28641..c5bbcbff2c 100644 --- a/yt_dlp/extractor/boomplay.py +++ b/yt_dlp/extractor/boomplay.py @@ -123,7 +123,7 @@ class BoomplayBaseIE(InfoExtractor): 'id': item_id, 'title': cls._html_search_regex(r'(?i)]*>([^<]+)', webpage, 'title', default=None), 'thumbnail': cls._html_search_meta(['og:image', 'twitter:image'], - webpage, 'thumbnail', default=''), + webpage, 'thumbnail', default=''), 'like_count': parse_count(cls._get_element_by_class_and_tag('btn_favorite', 'button', metadata_div)), 'repost_count': parse_count(cls._get_element_by_class_and_tag('btn_share', 'button', metadata_div)), 'comment_count': parse_count(cls._get_element_by_class_and_tag('btn_comment', 'button', metadata_div)), From c58ee488a9a005d6c8e79ad925a1e9afc88b90b7 Mon Sep 17 00:00:00 2001 From: grqx_wsl <173253225+grqx@users.noreply.github.com> Date: Tue, 5 Nov 2024 13:31:53 +1300 Subject: [PATCH 24/26] simplify BoomplayGenericPlaylistIE.suitable --- yt_dlp/extractor/boomplay.py | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/yt_dlp/extractor/boomplay.py b/yt_dlp/extractor/boomplay.py index c5bbcbff2c..bb5ba0f7dd 100644 --- a/yt_dlp/extractor/boomplay.py +++ b/yt_dlp/extractor/boomplay.py @@ -401,16 +401,14 @@ class BoomplayGenericPlaylistIE(BoomplayBaseIE): @classmethod def suitable(cls, url): - if super().suitable(url): - return not any(ie.suitable(url) for ie in ( - BoomplayEpisodeIE, - BoomplayMusicIE, - BoomplayPlaylistIE, - BoomplayPodcastIE, - BoomplaySearchURLIE, - BoomplayVideoIE, - )) - return False + return super().suitable(url) and all(not ie.suitable(url) for ie in ( + BoomplayEpisodeIE, + BoomplayMusicIE, + BoomplayPlaylistIE, + BoomplayPodcastIE, + BoomplaySearchURLIE, + BoomplayVideoIE, + )) def _real_extract(self, url): playlist_id = self._generic_id(url) From bd857a06a049183bc681d86347f8a2323c68c404 Mon Sep 17 00:00:00 2001 From: grqx_wsl <173253225+grqx@users.noreply.github.com> Date: Wed, 6 Nov 2024 00:10:40 +1300 Subject: [PATCH 25/26] fix: do not use classmethod; fix title in the base extractor --- yt_dlp/extractor/boomplay.py | 80 +++++++++++++++++++++++++----------- 1 file changed, 55 insertions(+), 25 deletions(-) diff --git a/yt_dlp/extractor/boomplay.py b/yt_dlp/extractor/boomplay.py index bb5ba0f7dd..d25f376798 100644 --- a/yt_dlp/extractor/boomplay.py +++ b/yt_dlp/extractor/boomplay.py @@ -69,32 +69,30 @@ class BoomplayBaseIE(InfoExtractor): def _urljoin(cls, path): return url_or_none(urljoin(base=cls._BASE, path=path)) - @classmethod - def _get_playurl(cls, item_id, item_type): - resp = cls._download_json( + def _get_playurl(self, item_id, item_type): + resp = self._download_json( 'https://www.boomplay.com/getResourceAddr', item_id, note='Downloading play URL', errnote='Failed to download play URL', data=urlencode_postdata({ 'param': base64.b64encode(aes_cbc_encrypt_bytes(json.dumps({ 'itemID': item_id, 'itemType': item_type, - }).encode(), cls._KEY, cls._IV)).decode(), + }).encode(), self._KEY, self._IV)).decode(), }), headers={ 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8', }) if not (source := resp.get('source')) and (code := resp.get('code')): if 'unavailable in your country' in (desc := resp.get('desc')) or '': # since NG must have failed ... - cls.raise_geo_restricted(countries=['GH', 'KE', 'TZ', 'CM', 'CI']) + self.raise_geo_restricted(countries=['GH', 'KE', 'TZ', 'CM', 'CI']) else: raise ExtractorError(desc or f'Failed to get play url, code: {code}') return unpad_pkcs7(aes_cbc_decrypt_bytes( base64.b64decode(source), - cls._KEY, cls._IV)).decode() + self._KEY, self._IV)).decode() - @classmethod - def _extract_formats(cls, item_id, item_type='MUSIC', **kwargs): - if url := url_or_none(cls._get_playurl(item_id, item_type)): + def _extract_formats(self, item_id, item_type='MUSIC', **kwargs): + if url := url_or_none(self._get_playurl(item_id, item_type)): return [{ 'format_id': '0', 'url': url, @@ -106,30 +104,29 @@ class BoomplayBaseIE(InfoExtractor): **kwargs, }] else: - cls.raise_no_formats('No formats found') + self.raise_no_formats('No formats found') - @classmethod - def _extract_page_metadata(cls, webpage, item_id): - metadata_div = cls._get_element_by_class_and_tag('summary', 'div', webpage) or '' + def _extract_page_metadata(self, webpage, item_id): + metadata_div = self._get_element_by_class_and_tag('summary', 'div', webpage) or '' metadata_entries = re.findall(r'(?si)(?P.*?)', metadata_div) or [] description = re.sub( r'(?i)Listen and download music for free on Boomplay!', '', - clean_html(cls._get_element_by_class_and_tag( + clean_html(self._get_element_by_class_and_tag( 'description_content', 'span', webpage)) or '') or None - details_section = cls._get_element_by_class_and_tag('songDetailInfo', 'section', webpage) or '' + details_section = self._get_element_by_class_and_tag('songDetailInfo', 'section', webpage) or '' metadata_entries.extend(re.findall(r'(?si)
  • (?P.*?)
  • ', details_section) or []) page_metadata = { 'id': item_id, - 'title': cls._html_search_regex(r'(?i)]*>([^<]+)', webpage, 'title', default=None), - 'thumbnail': cls._html_search_meta(['og:image', 'twitter:image'], - webpage, 'thumbnail', default=''), - 'like_count': parse_count(cls._get_element_by_class_and_tag('btn_favorite', 'button', metadata_div)), - 'repost_count': parse_count(cls._get_element_by_class_and_tag('btn_share', 'button', metadata_div)), - 'comment_count': parse_count(cls._get_element_by_class_and_tag('btn_comment', 'button', metadata_div)), - 'duration': parse_duration(cls._get_element_by_class_and_tag('btn_duration', 'button', metadata_div)), + **self._extract_title_from_webpage(webpage), + 'thumbnail': self._html_search_meta(['og:image', 'twitter:image'], + webpage, 'thumbnail', default=None), + 'like_count': parse_count(self._get_element_by_class_and_tag('btn_favorite', 'button', metadata_div)), + 'repost_count': parse_count(self._get_element_by_class_and_tag('btn_share', 'button', metadata_div)), + 'comment_count': parse_count(self._get_element_by_class_and_tag('btn_comment', 'button', metadata_div)), + 'duration': parse_duration(self._get_element_by_class_and_tag('btn_duration', 'button', metadata_div)), 'upload_date': unified_strdate(strip_or_none( - cls._get_element_by_class_and_tag('btn_pubDate', 'button', metadata_div))), + self._get_element_by_class_and_tag('btn_pubDate', 'button', metadata_div))), 'description': description, } for metadata_entry in metadata_entries: @@ -147,6 +144,40 @@ class BoomplayBaseIE(InfoExtractor): page_metadata['release_year'] = int_or_none(v) return page_metadata + def _extract_title_from_webpage(self, webpage): + if h1_title := self._html_search_regex(r'(?i)]*>([^<]+)', webpage, 'title', default=None): + return {'title': h1_title} + else: + return self._fix_title( + self._html_search_meta(['og:title', 'twitter:title'], webpage, 'title', default=None) + or self._html_search_regex(r'(?i)]*>([^<]+)', webpage, 'title', default=None)) + + @staticmethod + def _fix_title(title): + """ + fix various types of titles(og:title, twitter:title, title tag in html head): + """ + if not title: + return {} + + title_patterns = ( + r'^(?P(?P<artist>.+)) Songs MP3 Download, New Songs \& Albums \| Boomplay$', # artists + r'^(?P<artist>.+?) - (?P<title>.+) MP3\ Download \& Lyrics \| Boomplay$', # music + r'^Download (?P<artist>.+) album songs: (?P<title>.+?) \| Boomplay Music$', # album + r'^Search:(?P<title>.+) \| Boomplay Music$', # search url + r'^(?P<title>.+) \| Podcast \| Boomplay$', # podcast, episode + r'^(?P<title>.+) \| Boomplay(?: Music)?$', # video, playlist, generic playlists + ) + + for pattern in title_patterns: + if match := re.search(pattern, title): + return { + 'title': match.group('title'), + 'artists': [match.group('artist')] if 'artist' in match.groupdict() else None, + } + + return {'title': title} + @classmethod def _extract_from_webpage(cls, url, webpage, **kwargs): if kwargs: @@ -166,7 +197,7 @@ class BoomplayBaseIE(InfoExtractor): <a (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)? (?<=\s)href\s*=\s*(?P<_q>['"]) - (?!javascript:)(?P<href>/(?:{media_types})/\d+/?[\-\w=?&#:;@]*) + (?P<href>/(?:{media_types})/\d+/?[\-\w=?&#:;@]*) (?P=_q) (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)? >''', webpage): @@ -290,7 +321,6 @@ class BoomplayEpisodeIE(BoomplayBaseIE): webpage = self._download_webpage(url, ep_id) return merge_dicts( self._extract_page_metadata(webpage, ep_id), { - 'title': self._og_search_title(webpage, default='').rsplit('|', 2)[0].strip() or None, 'description': self._html_search_meta( ['description', 'og:description', 'twitter:description'], webpage), 'formats': self._extract_formats(ep_id, 'EPISODE', vcodec='none'), From c59ce7d6a6a5be85b240e177dd60573a0fab1318 Mon Sep 17 00:00:00 2001 From: grqx_wsl <173253225+grqx@users.noreply.github.com> Date: Wed, 6 Nov 2024 01:04:36 +1300 Subject: [PATCH 26/26] [ie/boomplaypodcast] use the base extractor's method to extract title --- yt_dlp/extractor/boomplay.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/yt_dlp/extractor/boomplay.py b/yt_dlp/extractor/boomplay.py index d25f376798..71e76eafa5 100644 --- a/yt_dlp/extractor/boomplay.py +++ b/yt_dlp/extractor/boomplay.py @@ -155,7 +155,7 @@ class BoomplayBaseIE(InfoExtractor): @staticmethod def _fix_title(title): """ - fix various types of titles(og:title, twitter:title, title tag in html head): + fix various types of titles(og:title, twitter:title, title tag in html head) """ if not title: return {} @@ -364,7 +364,6 @@ class BoomplayPodcastIE(BoomplayBaseIE): })) return self.playlist_result( song_list, playlist_id, - playlist_title=self._og_search_title(webpage, fatal=True).rsplit('|', 2)[0].strip(), **self._extract_page_metadata(webpage, playlist_id))