diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py
index ea1d0a2dfb..672eb95962 100644
--- a/yt_dlp/extractor/_extractors.py
+++ b/yt_dlp/extractor/_extractors.py
@@ -831,6 +831,7 @@ from .jwplatform import JWPlatformIE
 from .kakao import KakaoIE
 from .kaltura import KalturaIE
 from .kanal2 import Kanal2IE
+from .kankanews import KankaNewsIE
 from .karaoketv import KaraoketvIE
 from .karrierevideos import KarriereVideosIE
 from .keezmovies import KeezMoviesIE
diff --git a/yt_dlp/extractor/kankanews.py b/yt_dlp/extractor/kankanews.py
new file mode 100644
index 0000000000..46e239bd6c
--- /dev/null
+++ b/yt_dlp/extractor/kankanews.py
@@ -0,0 +1,55 @@
+import time
+import random
+import string
+import hashlib
+import urllib.parse
+
+from .common import InfoExtractor
+
+
+class KankaNewsIE(InfoExtractor):
+    # Extractor for kankanews.com article pages. The named group ``id`` is
+    # the value that ``_match_id`` pulls out of the URL.
+    _VALID_URL = r'https?://(?:www\.)?kankanews\.com/a/\d+\-\d+\-\d+/(?P<id>\d+)\.shtml'
+    _TESTS = [{
+        'url': 'https://www.kankanews.com/a/2022-11-08/00310276054.shtml?appid=1088227',
+        'md5': '05e126513c74b1258d657452a6f4eef9',
+        'info_dict': {
+            'id': '4485057',
+            'url': 'http://mediaplay.kksmg.com/2022/11/08/h264_450k_mp4_1a388ad771e0e4cc28b0da44d245054e_ncm.mp4',
+            'ext': 'mp4',
+            'title': '视频|第23个中国记者节,我们在进博切蛋糕',
+            'thumbnail': r're:^https?://.*\.jpg*',
+        }
+    }]
+
+    def _real_extract(self, url):
+        # The id taken from the URL is only used while fetching the page; the
+        # video id that is returned is the ``omsid`` value found in the page.
+        display_id = self._match_id(url)
+        webpage = self._download_webpage(url, display_id)
+        video_id = self._search_regex(r'omsid\s*=\s*"(\d+)"', webpage, 'video id')
+
+        # Query parameters for the getvideo API call.
+        params = {
+            'nonce': ''.join(random.choices(string.ascii_lowercase + string.digits, k=8)),
+            'omsid': video_id,
+            'platform': 'pc',
+            'timestamp': int(time.time()),
+            'version': '1.0',
+        }
+        # sign = md5 hex digest of the md5 hex digest of the urlencoded
+        # parameters (in the insertion order above) followed by a fixed suffix.
+        params['sign'] = hashlib.md5((hashlib.md5((
+            urllib.parse.urlencode(params) + '&28c8edde3d61a0411511d3b1866f0636'
+        ).encode()).hexdigest()).encode()).hexdigest()
+
+        meta = self._download_json('https://api-app.kankanews.com/kankan/pc/getvideo',
+                                   video_id, query=params)['result']['video']
+
+        return {
+            'id': video_id,
+            'url': meta['videourl'],
+            'title': self._search_regex(r'g\.title\s*=\s*"([^"]+)"', webpage, 'title'),
+            'thumbnail': meta.get('titlepic'),
+        }