[lsm] Add extractor

This commit is contained in:
Caesim 2023-11-23 18:01:20 +02:00
parent a0b19d319a
commit 13e215ed50
3 changed files with 309 additions and 0 deletions

View File

@ -386,6 +386,7 @@ from .clipsyndicate import ClipsyndicateIE
from .closertotruth import CloserToTruthIE from .closertotruth import CloserToTruthIE
from .cloudflarestream import CloudflareStreamIE from .cloudflarestream import CloudflareStreamIE
from .cloudy import CloudyIE from .cloudy import CloudyIE
from .cloudycdn import CloudyCDNIE
from .clubic import ClubicIE from .clubic import ClubicIE
from .clyp import ClypIE from .clyp import ClypIE
from .cmt import CMTIE from .cmt import CMTIE
@ -1039,6 +1040,12 @@ from .lrt import (
LRTVODIE, LRTVODIE,
LRTStreamIE LRTStreamIE
) )
from .lsm import (
LSMLREmbedIE,
LSMLTVEmbedIE,
LSMLTVIE,
LSMReplayIE
)
from .lumni import ( from .lumni import (
LumniIE LumniIE
) )

View File

@ -0,0 +1,60 @@
from .common import InfoExtractor
from ..utils import (
int_or_none,
parse_iso8601,
traverse_obj,
url_or_none,
)
class CloudyCDNIE(InfoExtractor):
    """Extractor for media embedded through ``embed.cloudycdn.services``.

    The site id and media id are taken from the embed URL and used to query
    the ``player.cloudycdn.services`` JSON endpoint, whose ``source.sources``
    entries are treated as HLS manifests.
    """
    # Neither id may contain '/', '?' or '#', so a trailing slash, query
    # string or fragment never leaks into the media id.
    _VALID_URL = r'https?://embed\.cloudycdn\.services/(?P<site_id>[^/?#]+)/media/(?P<id>[^/?#]+)'
    _TESTS = [{
        'url': 'https://embed.cloudycdn.services/ltv/media/46k_d23-6000-105?',
        'md5': '64f72a360ca530d5ed89c77646c9eee5',
        'info_dict': {
            'id': '46k_d23-6000-105',
            'ext': 'mp4',
            'timestamp': 1700589151,
            'duration': 1442,
            'upload_date': '20231121',
            'title': 'D23-6000-105_cetstud',
            'thumbnail': 'https://store.cloudycdn.services/tmsp00060/assets/media/660858/placeholder1700589200.jpg',
        }
    }, {
        'url': 'https://embed.cloudycdn.services/izm/media/26e_lv-8-5-1',
        'md5': '798828a479151e2444d8dcfbec76e482',
        'info_dict': {
            'id': '26e_lv-8-5-1',
            'ext': 'mp4',
            'title': 'LV-8-5-1',
            'timestamp': 1669767167,
            'thumbnail': 'https://store.cloudycdn.services/tmsp00120/assets/media/488306/placeholder1679423604.jpg',
            'duration': 1205,
            'upload_date': '20221130',
        }
    }]

    def _real_extract(self, url):
        """Download the player JSON for *url* and build the info dict.

        Returns a dict with ``id``, ``title``, ``formats``, ``subtitles``,
        ``duration``, ``timestamp`` and ``thumbnail``; optional fields are
        ``None`` when absent from the player JSON.
        """
        site_id, video_id = self._match_valid_url(url).group('site_id', 'id')

        # Passing ``data`` makes this a POST request carrying the referer.
        data = self._download_json(
            f'https://player.cloudycdn.services/player/{site_id}/media/{video_id}/',
            video_id, data=b'referer=https://embed.cloudycdn.services/')

        formats, subtitles = [], {}
        # Only follow entries that actually carry a usable URL; a single
        # broken manifest must not abort extraction of the remaining ones.
        for m3u8_url in traverse_obj(data, ('source', 'sources', ..., 'src', {url_or_none})):
            fmts, subs = self._extract_m3u8_formats_and_subtitles(m3u8_url, video_id, fatal=False)
            formats.extend(fmts)
            self._merge_subtitles(subs, target=subtitles)

        return {
            'id': video_id,
            'title': traverse_obj(data, 'name'),
            'formats': formats,
            'subtitles': subtitles,
            'duration': traverse_obj(data, ('duration', {int_or_none})),
            'timestamp': traverse_obj(data, ('upload_date', {parse_iso8601})),
            'thumbnail': traverse_obj(data, ('source', 'poster', {url_or_none})),
        }

242
yt_dlp/extractor/lsm.py Normal file
View File

@ -0,0 +1,242 @@
import re
import urllib.parse

from .common import InfoExtractor
from ..utils import (
    ExtractorError,
    int_or_none,
    js_to_json,
    parse_iso8601,
    traverse_obj,
    url_or_none,
)
class LSMBaseIE(InfoExtractor):
    """Base class holding the page-cleanup helper used by LSM extractors."""

    def fix_nuxt_data(self, webpage):
        """Return *webpage* with ``Object.create(null[,{...}])`` calls unwrapped.

        Every match is replaced by the object literal given as the second
        argument when there is one, and by the literal ``null`` otherwise.
        """
        def _unwrap(match):
            # Group 1 is the optional ``{...}`` second argument.
            return match.group(1) or 'null'

        pattern = r'Object\.create\(null(?:,(\{.+\}))?\)'
        return re.sub(pattern, _unwrap, webpage)
class LSMLREmbedIE(InfoExtractor):
    """Extractor for the Latvijas Radio embed player pages.

    The embed page calls ``LR.audio.Player(...)`` with two JavaScript object
    literals (player options and media description); both are parsed and the
    first ``audio`` item of the media description is returned.
    """
    _VALID_URL = r'https?://(?:(?:latvijasradio|lr1|lr2|klasika|lr4|naba|radioteatris)\.lsm|pieci)\.lv/.*/embed.*[?&]id=(?P<id>\d+)'
    _TESTS = [{
        'url': 'https://latvijasradio.lsm.lv/lv/embed/?theme=black&size=16x9&showCaptions=0&id=183522',
        'md5': '719b33875cd1429846eeeaeec6df2830',
        'info_dict': {
            'id': '183522',
            'ext': 'mp3',
            'duration': 1823,
            'title': '#138 Nepilnīgā kompensējamo zāļu sistēma pat mēnešiem dzenā pacientus pa aptiekām',
            'thumbnail': 'https://pic.latvijasradio.lv/public/assets/media/9/d/gallery_fd4675ac.jpg',
        }
    }, {
        'url': 'https://lr1.lsm.lv/lv/embed/?id=183522&show=0&theme=white&size=16x9',
        'only_matching': True,
    }, {
        'url': 'https://lr2.lsm.lv/lv/embed/?id=182126&show=0&theme=white&size=16x9',
        'only_matching': True,
    }, {
        'url': 'https://klasika.lsm.lv/lv/embed/?id=110806&show=0&theme=white&size=16x9',
        'only_matching': True,
    }, {
        'url': 'https://lr4.lsm.lv/lv/embed/?id=184282&show=0&theme=white&size=16x9',
        'only_matching': True,
    }, {
        'url': 'https://pieci.lv/lv/embed/?id=168896&show=0&theme=white&size=16x9',
        'only_matching': True,
    }, {
        'url': 'https://naba.lsm.lv/lv/embed/?id=182901&show=0&theme=white&size=16x9',
        'only_matching': True,
    }, {
        'url': 'https://radioteatris.lsm.lv/lv/embed/?id=176439&show=0&theme=white&size=16x9',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Parse the ``LR.audio.Player`` call on the embed page into an info dict.

        Returns a dict with ``id``, ``title``, ``formats``, ``duration`` and
        ``thumbnail``.
        """
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

        # Plain regex search: the captured text is JavaScript, so it must not
        # go through HTML cleaning (tag stripping / entity unescaping) before
        # it is handed to the JSON parser.
        player_data, media_data = self._search_regex(
            r'LR\.audio\.Player\s*\([^{]*(?P<player>\{.*?\}),(?P<media>\{.*\})\);',
            webpage, 'player json', group=('player', 'media'))

        # The player options only contribute the thumbnail, so a parse
        # failure there should not abort extraction.
        player_json = self._parse_json(
            player_data, video_id, transform_source=js_to_json, fatal=False) or {}
        media_json = self._parse_json(media_data, video_id, transform_source=js_to_json)

        formats = []
        for source_url in traverse_obj(media_json, ('audio', 0, 'sources', ..., 'file', {url_or_none})):
            # Check the URL path so manifests with a query string are still
            # recognised as HLS.
            if urllib.parse.urlparse(source_url).path.endswith('.m3u8'):
                formats.extend(self._extract_m3u8_formats(source_url, video_id, fatal=False))
            else:
                formats.append({'url': source_url})

        return {
            'id': video_id,
            'title': traverse_obj(media_json, ('audio', 0, 'title')),
            'formats': formats,
            'duration': traverse_obj(media_json, ('audio', 0, 'duration', {int_or_none})),
            'thumbnail': traverse_obj(player_json, ('poster', {url_or_none})),
        }
class LSMLTVEmbedIE(InfoExtractor):
    """Extractor for ``ltv.lsm.lv/embed`` pages.

    The page exposes a ``window.ltvEmbedPayload`` JSON object whose
    ``source.name`` selects the backing player; extraction is delegated to
    the CloudyCDN or Youtube extractor accordingly.
    """
    _VALID_URL = r'https?://ltv\.lsm\.lv/embed.*[?&]c=(?P<id>[^&]+)'
    _TESTS = [{
        'url': 'https://ltv.lsm.lv/embed?c=eyJpdiI6IjQzbHVUeHAyaDJiamFjcjdSUUFKdnc9PSIsInZhbHVlIjoiMHl3SnJNRmd2TmFIdnZwOGtGUUpzODFzUEZ4SVVsN2xoRjliSW9vckUyMWZIWG8vbWVzaFFkY0lhNmRjbjRpaCIsIm1hYyI6ImMzNjdhMzFhNTFhZmY1ZmE0NWI5YmFjZGI1YmJiNGEyNjgzNDM4MjUzMWEwM2FmMDMyZDMwYWM1MDFjZmM5MGIiLCJ0YWciOiIifQ==',
        'md5': '64f72a360ca530d5ed89c77646c9eee5',
        'info_dict': {
            'id': '46k_d23-6000-105',
            'ext': 'mp4',
            'timestamp': 1700589151,
            'duration': 1442,
            'upload_date': '20231121',
            'title': 'D23-6000-105_cetstud',
            'thumbnail': 'https://store.cloudycdn.services/tmsp00060/assets/media/660858/placeholder1700589200.jpg',
        }
    }, {
        'url': 'https://ltv.lsm.lv/embed?enablesdkjs=1&c=eyJpdiI6IncwVzZmUFk2MU12enVWK1I3SUcwQ1E9PSIsInZhbHVlIjoid3FhV29vamc3T2sxL1RaRmJ5Rm1GTXozU0o2dVczdUtLK0cwZEZJMDQ2a3ZIRG5DK2pneGlnbktBQy9uazVleHN6VXhxdWIweWNvcHRDSnlISlNYOHlVZ1lpcTUrcWZSTUZPQW14TVdkMW9aOUtRWVNDcFF4eWpHNGcrT0VZbUNFQStKQk91cGpndW9FVjJIa0lpbkh3PT0iLCJtYWMiOiIyZGI1NDJlMWRlM2QyMGNhOGEwYTM2MmNlN2JlOGRhY2QyYjdkMmEzN2RlOTEzYTVkNzI1ODlhZDlhZjU4MjQ2IiwidGFnIjoiIn0=',
        'md5': 'a1711e190fe680fdb68fd8413b378e87',
        'info_dict': {
            'id': 'wUnFArIPDSY',
            'ext': 'mp4',
            'uploader': 'LTV_16plus',
            'release_date': '20220514',
            'channel_url': 'https://www.youtube.com/channel/UCNMrnafwXD2XKeeQOyfkFCw',
            'view_count': int,
            'availability': 'public',
            'thumbnail': 'https://i.ytimg.com/vi/wUnFArIPDSY/maxresdefault.jpg',
            'release_timestamp': 1652544074,
            'title': 'EIROVĪZIJA SALĀTOS',
            'live_status': 'was_live',
            'uploader_id': '@LTV16plus',
            'comment_count': int,
            'channel_id': 'UCNMrnafwXD2XKeeQOyfkFCw',
            'channel_follower_count': int,
            'categories': ['Entertainment'],
            'duration': 5269,
            'upload_date': '20220514',
            'age_limit': 0,
            'channel': 'LTV_16plus',
            'playable_in_embed': True,
            'tags': [],
            'uploader_url': 'https://www.youtube.com/@LTV16plus',
            'like_count': int,
            'description': 'md5:7ff0c42ba971e3c13e4b8a2ff03b70b5',
        }
    }]

    def _real_extract(self, url):
        """Resolve the embed payload to a ``url`` result for the real player.

        Raises:
            ExtractorError: if the embed type is not ``telia`` or ``youtube``,
                or if the payload carries no URL/id for the selected player.
        """
        video_id = urllib.parse.unquote(self._match_id(url))
        webpage = self._download_webpage(url, video_id)
        data = self._search_json(
            r'window\.ltvEmbedPayload\s*=', webpage, 'embed json', video_id)

        embed_type = traverse_obj(data, ('source', 'name'))
        if embed_type == 'telia':
            ie_key = 'CloudyCDN'
            embed_url = traverse_obj(data, ('source', 'embed_url', {url_or_none}))
        elif embed_type == 'youtube':
            ie_key = 'Youtube'
            # For YouTube the payload carries an id rather than a URL.
            embed_url = traverse_obj(data, ('source', 'id'))
        else:
            raise ExtractorError(f'Unsupported embed type {embed_type!r}')

        # Fail here with a clear message rather than returning a result
        # without a usable 'url'.
        if not embed_url:
            raise ExtractorError(f'Unable to find {embed_type} embed URL')

        return {
            '_type': 'url',
            'ie_key': ie_key,
            'url': embed_url,
            'id': video_id,
            'title': traverse_obj(data, ('parentInfo', 'title')),
            'duration': traverse_obj(data, ('parentInfo', 'duration', {int_or_none})),
            'thumbnail': traverse_obj(data, ('source', 'poster', {url_or_none})),
        }
class LSMLTVIE(LSMBaseIE):
    """Extractor for ``ltv.lsm.lv`` article (``raksts``) pages.

    Reads the article's Nuxt data, picks the embed id of its video and hands
    off to ``LSMLTVEmbed`` while supplying the article's own metadata.
    """
    _VALID_URL = r'https?://ltv\.lsm\.lv/.*/raksts/.*\.id(?P<id>\d+)'
    _TESTS = [{
        'url': 'https://ltv.lsm.lv/lv/raksts/21.11.2023-4-studija-zolitudes-tragedija-un-incupes-stacija.id311130',
        'md5': '64f72a360ca530d5ed89c77646c9eee5',
        'info_dict': {
            'id': '46k_d23-6000-105',
            'ext': 'mp4',
            'timestamp': 1700586300,
            'duration': 1442,
            'upload_date': '20231121',
            'title': '4. studija. Zolitūdes traģēdija un Inčupes stacija',
            'thumbnail': 'https://ltv.lsm.lv/storage/media/8/7/large/5/1f9604e1.jpg',
        }
    }]

    def _real_extract(self, url):
        """Return a ``url_transparent`` result pointing at the article's embed.

        Raises:
            ExtractorError: if the article data contains no video embed id.
        """
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)
        data = self._search_nuxt_data(self.fix_nuxt_data(webpage), video_id)

        embed_id = traverse_obj(data, ('article', 'videoMediaItem', 'video', 'embed_id'))
        # Without this guard a missing id would be formatted into the URL as
        # the literal text "None".
        if not embed_id:
            raise ExtractorError('Unable to find video embed id')

        return {
            '_type': 'url_transparent',
            'ie_key': 'LSMLTVEmbed',
            'url': f'https://ltv.lsm.lv/embed?c={embed_id}',
            'id': video_id,
            'title': traverse_obj(data, ('article', 'title')),
            'timestamp': traverse_obj(data, ('article', 'aired_at', {parse_iso8601})),
            'thumbnail': traverse_obj(data, ('article', 'thumbnail', {url_or_none})),
        }
class LSMReplayIE(LSMBaseIE):
    """Extractor for ``replay.lsm.lv`` recording pages.

    Reads the page's ``__REPLAY__`` Nuxt data and delegates to whichever
    extractor handles the playback service URL found there, while supplying
    the media item's own metadata.
    """
    _VALID_URL = r'https?://replay\.lsm\.lv/.*/(?:ieraksts|statja)/[^/]+/(?P<id>\d+)'
    _TESTS = [{
        'url': 'https://replay.lsm.lv/lv/ieraksts/ltv/311130/4-studija-zolitudes-tragedija-un-incupes-stacija',
        'md5': '64f72a360ca530d5ed89c77646c9eee5',
        'info_dict': {
            'id': '46k_d23-6000-105',
            'ext': 'mp4',
            'timestamp': 1700586300,
            'description': 'md5:0f1b14798cc39e1ae578bd0eb268f759',
            'duration': 1442,
            'upload_date': '20231121',
            'title': '4. studija. Zolitūdes traģēdija un Inčupes stacija',
            'thumbnail': 'https://ltv.lsm.lv/storage/media/8/7/large/5/1f9604e1.jpg',
        }
    }, {
        'url': 'https://replay.lsm.lv/lv/ieraksts/lr/183522/138-nepilniga-kompensejamo-zalu-sistema-pat-menesiem-dzena-pacientus-pa-aptiekam',
        'md5': '719b33875cd1429846eeeaeec6df2830',
        'info_dict': {
            'id': '183522',
            'ext': 'mp3',
            'duration': 1823,
            'title': '#138 Nepilnīgā kompensējamo zāļu sistēma pat mēnešiem dzenā pacientus pa aptiekām',
            'thumbnail': 'https://pic.latvijasradio.lv/public/assets/media/9/d/large_fd4675ac.jpg',
            'upload_date': '20231102',
            'timestamp': 1698921060,
            'description': 'md5:7bac3b2dd41e44325032943251c357b1',
        }
    }, {
        'url': 'https://replay.lsm.lv/ru/statja/ltv/311130/4-studija-zolitudes-tragedija-un-incupes-stacija',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Return a ``url_transparent`` result for the page's playback service.

        Raises:
            ExtractorError: if the page data contains no playback service URL.
        """
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)
        data = self._search_nuxt_data(
            self.fix_nuxt_data(webpage), video_id, context_name='__REPLAY__')

        playback_url = traverse_obj(data, ('playback', 'service', 'url', {url_or_none}))
        # A url_transparent result is useless without a target URL, so fail
        # here with a clear message.
        if not playback_url:
            raise ExtractorError('Unable to find playback service URL')

        return {
            '_type': 'url_transparent',
            'url': playback_url,
            'id': video_id,
            'title': traverse_obj(data, ('mediaItem', 'title')),
            # First available of 'lead' and 'body'.
            'description': traverse_obj(data, ('mediaItem', ('lead', 'body')), get_all=False),
            'duration': traverse_obj(data, ('mediaItem', 'duration', {int_or_none})),
            'timestamp': traverse_obj(data, ('mediaItem', 'aired_at', {parse_iso8601})),
            'thumbnail': traverse_obj(data, ('mediaItem', 'largeThumbnail', {url_or_none})),
        }