diff --git a/yt_dlp/extractor/extractors.py b/yt_dlp/extractor/extractors.py index a36622f130..5c58e2ba41 100644 --- a/yt_dlp/extractor/extractors.py +++ b/yt_dlp/extractor/extractors.py @@ -1605,6 +1605,10 @@ from .vodlocker import VodlockerIE from .vodpl import VODPlIE from .vodplatform import VODPlatformIE from .voicerepublic import VoiceRepublicIE +from .voicy import ( + VoicyIE, + VoicyChannelIE, +) from .voot import ( VootIE, VootSeriesIE, diff --git a/yt_dlp/extractor/voicy.py b/yt_dlp/extractor/voicy.py new file mode 100644 index 0000000000..ae29c3de2d --- /dev/null +++ b/yt_dlp/extractor/voicy.py @@ -0,0 +1,148 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + ExtractorError, + smuggle_url, + traverse_obj, + unsmuggle_url, + unified_strdate, +) + +import re +import itertools + + +class VoicyBaseIE(InfoExtractor): + def _extract_from_playlist_data(self, value): + voice_id = compat_str(value.get('PlaylistId')) + upload_date = unified_strdate(value.get('Published'), False) + items = [self._extract_single_article(voice_data) for voice_data in value['VoiceData']] + return { + '_type': 'multi_video', + 'entries': items, + 'id': voice_id, + 'title': compat_str(value.get('PlaylistName')), + 'uploader': value.get('SpeakerName'), + 'uploader_id': compat_str(value.get('SpeakerId')), + 'channel': value.get('ChannelName'), + 'channel_id': compat_str(value.get('ChannelId')), + 'upload_date': upload_date, + } + + def _extract_single_article(self, entry): + formats = [{ + 'url': entry['VoiceHlsFile'], + 'format_id': 'hls', + 'ext': 'm4a', + 'acodec': 'aac', + 'vcodec': 'none', + 'protocol': 'm3u8_native', + }, { + 'url': entry['VoiceFile'], + 'format_id': 'mp3', + 'ext': 'mp3', + 'acodec': 'mp3', + 'vcodec': 'none', + }] + self._sort_formats(formats) + return { + 'id': compat_str(entry.get('ArticleId')), + 'title': entry.get('ArticleTitle'), + 'description': entry.get('MediaName'), + 'formats': formats, + } + + def _call_api(self, url, video_id, **kwargs): + response = self._download_json(url, video_id, **kwargs) + if response.get('Status') != 0: + message = traverse_obj(response, ('Value', 'Error', 'Message'), expected_type=compat_str) + if not message: + message = 'There was a error in the response: %d' % response.get('Status') + raise ExtractorError(message, expected=False) + return response.get('Value') + + +class VoicyIE(VoicyBaseIE): + IE_NAME = 'voicy' + _VALID_URL = r'https?://voicy\.jp/channel/(?P\d+)/(?P\d+)' + ARTICLE_LIST_API_URL = 'https://vmw.api.voicy.jp/articles_list?channel_id=%s&pid=%s' + _TESTS = [{ + 'url': 'https://voicy.jp/channel/1253/122754', + 'info_dict': { + 'id': '122754', + 'title': '1/21(木)声日記:ついに原稿終わった!!', + 'uploader': 'ちょまど@ ITエンジニアなオタク', + 'uploader_id': '7339', + }, + 'playlist_mincount': 9, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + assert mobj + voice_id = mobj.group('id') + channel_id = mobj.group('channel_id') + url, article_list = unsmuggle_url(url) + if not article_list: + article_list = self._call_api(self.ARTICLE_LIST_API_URL % (channel_id, voice_id), voice_id) + return self._extract_from_playlist_data(article_list) + + +class VoicyChannelIE(VoicyBaseIE): + IE_NAME = 'voicy:channel' + _VALID_URL = r'https?://voicy\.jp/channel/(?P\d+)' + PROGRAM_LIST_API_URL = 'https://vmw.api.voicy.jp/program_list/all?channel_id=%s&limit=20&public_type=3%s' + _TESTS = [{ + 'url': 'https://voicy.jp/channel/1253/', + 'info_dict': { + 'id': '7339', + 'title': 'ゆるふわ日常ラジオ #ちょまラジ', + 'uploader': 'ちょまど@ ITエンジニアなオタク', + 'uploader_id': '7339', + }, + 'playlist_mincount': 54, + }] + + @classmethod + def suitable(cls, url): + return not VoicyIE.suitable(url) and super(VoicyChannelIE, cls).suitable(url) + + def _entries(self, channel_id): + pager = '' + for count in itertools.count(1): + article_list = self._call_api(self.PROGRAM_LIST_API_URL % (channel_id, pager), channel_id, note='Paging #%d' % count) + playlist_data = article_list.get('PlaylistData') + if not playlist_data: + break + yield from playlist_data + last = playlist_data[-1] + pager = '&pid=%d&p_date=%s&play_count=%s' % (last['PlaylistId'], last['Published'], last['PlayCount']) + + def _real_extract(self, url): + channel_id = self._match_id(url) + articles = self._entries(channel_id) + + first_article = next(articles, None) + title = traverse_obj(first_article, ('ChannelName', ), expected_type=compat_str) + speaker_name = traverse_obj(first_article, ('SpeakerName', ), expected_type=compat_str) + if not title and speaker_name: + title = 'Uploads from %s' % speaker_name + if not title: + title = 'Uploads from channel ID %s' % channel_id + + articles = itertools.chain([first_article], articles) if first_article else articles + + playlist = ( + self.url_result(smuggle_url('https://voicy.jp/channel/%s/%d' % (channel_id, value['PlaylistId']), value), VoicyIE.ie_key()) + for value in articles) + return { + '_type': 'playlist', + 'entries': playlist, + 'id': channel_id, + 'title': title, + 'channel': speaker_name, + 'channel_id': channel_id, + }