[rudovideo] improvements to rudovideo extractor:

* implementing PR suggestions
* removing c13cl extractor in favor of RudoVideo with _EMBED_REGEX
* supporting VODs and Podcasts from rudo.video
* supporting embeded youtube
* adding tests
This commit is contained in:
Nicolas Dato 2023-12-02 18:05:32 -03:00
parent 9df466edfc
commit cfce954490
3 changed files with 104 additions and 33 deletions

View File

@ -278,7 +278,6 @@ from .businessinsider import BusinessInsiderIE
from .bundesliga import BundesligaIE from .bundesliga import BundesligaIE
from .buzzfeed import BuzzFeedIE from .buzzfeed import BuzzFeedIE
from .byutv import BYUtvIE from .byutv import BYUtvIE
from .c13cl import C13ClIE
from .c56 import C56IE from .c56 import C56IE
from .cableav import CableAVIE from .cableav import CableAVIE
from .callin import CallinIE from .callin import CallinIE
@ -1644,7 +1643,7 @@ from .rumble import (
RumbleIE, RumbleIE,
RumbleChannelIE, RumbleChannelIE,
) )
from .rudovideo import RudoVideoIE from .rudovideo import RudoVideoLiveIE
from .rutube import ( from .rutube import (
RutubeIE, RutubeIE,
RutubeChannelIE, RutubeChannelIE,

View File

@ -1,13 +0,0 @@
from .common import InfoExtractor
class C13ClIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?13\.cl/en-vivo'
def _real_extract(self, url):
display_id = 'C13'
webpage = self._download_webpage(url, display_id)
stream_url = self._search_regex(r'<div\s+.*id=(?:"|\')player(?:"|\')[^>]+><iframe\s+.*src=(?:"|\')([^?]+)', webpage, 'stream_url')
return self.url_result(stream_url, display_id=display_id, url_transparent=True)

View File

@ -1,31 +1,116 @@
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import ExtractorError, traverse_obj from ..utils import ExtractorError, traverse_obj, js_to_json, update_url_query
class RudoVideoIE(InfoExtractor): class RudoVideoLiveIE(InfoExtractor):
_VALID_URL = r'https?://rudo\.video/live/(?P<id>[^/]+)' _VALID_URL = r'https?://rudo\.video/(?P<type>live|vod|podcast)/(?P<id>[^/?]+)'
_EMBED_REGEX = [r'<iframe[^>]+src=[\'"](?P<url>(?:https?:)//rudo\.video/(?:live|vod|podcast)/[^\'"]+)']
_TESTS = [{
'url': 'https://rudo.video/podcast/cz2wrUy8l0o',
'md5': '28ed82b477708dc5e12e072da2449221',
'info_dict': {
'id': 'cz2wrUy8l0o',
'title': 'Diego Cabot',
'ext': 'mp4',
'thumbnail': r're:^(?:https?:)?//.*\.(png|jpg)$',
},
}, {
'url': 'https://rudo.video/podcast/bQkt07',
'md5': '36b22a9863de0f47f00fc7532a32a898',
'info_dict': {
'id': 'bQkt07',
'title': 'Tubular Bells',
'ext': 'mp4',
'thumbnail': r're:^(?:https?:)?//.*\.(png|jpg)$',
},
}, {
'url': 'https://rudo.video/vod/bN5AaJ',
'md5': '01324a329227e2591530ecb4f555c881',
'info_dict': {
'id': 'bN5AaJ',
'title': 'Ucrania 19.03',
'creator': 'La Tercera',
'ext': 'mp4',
'thumbnail': r're:^(?:https?:)?//.*\.(png|jpg)$',
},
}, {
'url': 'https://rudo.video/live/bbtv',
'info_dict': {
'id': 'bbtv',
'ext': 'mp4',
'creator': 'BioBioTV',
'live_status': 'is_live',
'title': r're:^LIVE BBTV\s\d{4}-\d{2}-\d{2}\s\d{2}:\d{2}$',
'thumbnail': r're:^(?:https?:)?//.*\.(png|jpg)$',
},
}, {
'url': 'https://rudo.video/live/c13',
'info_dict': {
'id': 'c13',
'title': 'CANAL13',
'ext': 'mp4',
},
'skip': 'Geo-restricted to Chile',
}, {
'url': 'https://rudo.video/live/t13-13cl',
'info_dict': {
'id': 't13-13cl',
'title': 'T13',
'ext': 'mp4',
},
'skip': 'Geo-restricted to Chile',
}]
def get_title(self, webpage):
title = self._search_regex(r'var\s+titleVideo\s*=\s*[\'"]([^\'"]+)', webpage, 'title', default=None)
if title is None:
title = self._search_regex(r'<meta[^>]+property=[\'"]og:title[\'"]\s+content=[\'"]([^\'"]+)', webpage, 'title', fatal=False)
return title
def get_thumbnail(self, webpage):
thumbnail = self._search_regex(r'var\s+posterIMG\s*=\s*[\'"]([^?\'"]+)', webpage, 'thumbnail', default=None)
if thumbnail is None:
thumbnail = self._search_regex(r'<meta[^>]+property=[\'"]og:image[\'"]\s+content=[\'"]([^\'"]+)', webpage, 'thumbnail', default=None)
return thumbnail
def _real_extract(self, url): def _real_extract(self, url):
video_id = self._match_id(url) video_id = self._match_id(url)
type = self._match_valid_url(url).group('type')
webpage = self._download_webpage(url, video_id) webpage = self._download_webpage(url, video_id)
if "Streaming is not available in your area." in webpage: if 'Streaming is not available in your area.' in webpage:
self.raise_geo_restricted() self.raise_geo_restricted()
stream_url = self._search_regex(r'var\s+streamURL\s*=\s*\'([^?\']+)', webpage, "streamUrl") stream_url = self._search_regex(r'var\s+streamURL\s*=\s*[\'"]([^?\'"]+)', webpage, 'streamUrl', default=None)
token_array_string = self._search_regex(r'<script>var\s+_\$_[a-zA-Z0-9]+\s*=\s*(\[[^]]+\])', webpage, 'token_array', default=None) source_url = self._search_regex(r'<source[^>]+src=[\'"]([^\'"]+)', webpage, 'sourceUrl', default=None)
if token_array_string: youtube_url = self._search_regex(r'file:\s*[\'"]((?:https?:)//(?:www\.)?youtube.com[^\'"]+)', webpage, 'youtubeUrl', default=None)
token_array_string = token_array_string.replace("x", "u00") if stream_url is None:
token_array = self._parse_json(token_array_string, video_id) if source_url is not None:
stream_url = source_url
elif youtube_url is not None:
return self.url_result(youtube_url, display_id=video_id)
else:
raise ExtractorError('Unable to extract stream url')
title = self.get_title(webpage)
thumbnail = self.get_thumbnail(webpage)
is_live = None
if type == 'live':
is_live = True
token_array = self._search_json(r'<script>var\s+_\$_[a-zA-Z0-9]+\s*=', webpage, 'access token array', video_id,
contains_pattern=r'\[(?s:.+)\]', default=None, transform_source=js_to_json)
if token_array:
if len(token_array) != 9: if len(token_array) != 9:
raise ExtractorError('Couldnt get access token array', video_id=video_id) raise ExtractorError('Couldnt get access token array', video_id=video_id)
access_token_webpage = self._download_webpage(token_array[0], video_id) access_token = self._download_json(token_array[0], video_id, note='Downloading access token')
access_token = self._parse_json(access_token_webpage, video_id) stream_url = update_url_query(stream_url, {'auth-token': traverse_obj(access_token, ('data', 'authToken'))})
if "data" not in access_token or token_array[3] not in access_token.get("data"):
raise ExtractorError('Couldnt get access token', video_id=video_id)
query_string = token_array[5] + traverse_obj(access_token, ("data", token_array[3]))
stream_url = f'{stream_url}{query_string}'
return self.url_result( return {
stream_url, 'id': video_id,
display_id=video_id, url_transparent=True) 'title': title,
'formats': self._extract_m3u8_formats(stream_url, video_id, live=True),
'is_live': is_live,
'creator': self._search_regex(r'var\s+videoAuthor\s*=\s*[\'"]([^?\'"]+)', webpage, "videoAuthor", default=None),
'thumbnail': thumbnail,
}