From bfd973ece3369c593b5e82a88cc16de80088a73e Mon Sep 17 00:00:00 2001 From: pukkandan Date: Mon, 1 Aug 2022 06:53:25 +0530 Subject: [PATCH] [extractors] Use new framework for existing embeds (#4307) `Brightcove` is difficult to migrate because its subclasses may depend on the signature of the current functions. So it is left as-is for now. Note: Tests have not been migrated --- yt_dlp/extractor/_extractors.py | 3 +- yt_dlp/extractor/adobetv.py | 1 + yt_dlp/extractor/ant1newsgr.py | 15 +- yt_dlp/extractor/anvato.py | 30 +- yt_dlp/extractor/apa.py | 11 +- yt_dlp/extractor/aparat.py | 1 + yt_dlp/extractor/arcpublishing.py | 4 +- yt_dlp/extractor/arkena.py | 13 +- yt_dlp/extractor/arte.py | 7 +- yt_dlp/extractor/bandcamp.py | 1 + yt_dlp/extractor/bbc.py | 1 + yt_dlp/extractor/bitchute.py | 9 +- yt_dlp/extractor/blogger.py | 8 +- yt_dlp/extractor/buzzfeed.py | 2 +- yt_dlp/extractor/channel9.py | 7 +- yt_dlp/extractor/cinchcast.py | 2 + yt_dlp/extractor/cloudflarestream.py | 10 +- yt_dlp/extractor/common.py | 5 + yt_dlp/extractor/condenast.py | 5 +- yt_dlp/extractor/crooksandliars.py | 2 + yt_dlp/extractor/cspan.py | 2 +- yt_dlp/extractor/dailymail.py | 9 +- yt_dlp/extractor/dailymotion.py | 23 +- yt_dlp/extractor/dbtv.py | 9 +- yt_dlp/extractor/digiteka.py | 11 +- yt_dlp/extractor/drtuber.py | 7 +- yt_dlp/extractor/eagleplatform.py | 34 +- yt_dlp/extractor/embedly.py | 11 + yt_dlp/extractor/ertgr.py | 13 +- yt_dlp/extractor/expressen.py | 10 +- yt_dlp/extractor/facebook.py | 22 +- yt_dlp/extractor/foxnews.py | 6 +- yt_dlp/extractor/francetv.py | 3 +- yt_dlp/extractor/gedidigital.py | 30 +- yt_dlp/extractor/generic.py | 1020 +---------------------- yt_dlp/extractor/gfycat.py | 11 +- yt_dlp/extractor/glomex.py | 12 +- yt_dlp/extractor/googledrive.py | 6 +- yt_dlp/extractor/heise.py | 2 +- yt_dlp/extractor/huffpost.py | 1 + yt_dlp/extractor/indavideo.py | 24 +- yt_dlp/extractor/instagram.py | 24 +- yt_dlp/extractor/ivi.py | 1 + yt_dlp/extractor/joj.py | 11 +- 
yt_dlp/extractor/jwplatform.py | 9 +- yt_dlp/extractor/kaltura.py | 15 +- yt_dlp/extractor/kinja.py | 11 +- yt_dlp/extractor/libsyn.py | 1 + yt_dlp/extractor/limelight.py | 4 +- yt_dlp/extractor/livestream.py | 2 + yt_dlp/extractor/mainstreaming.py | 8 +- yt_dlp/extractor/mangomolo.py | 27 +- yt_dlp/extractor/medialaan.py | 4 +- yt_dlp/extractor/mediaset.py | 6 +- yt_dlp/extractor/mediasite.py | 14 +- yt_dlp/extractor/megaphone.py | 8 +- yt_dlp/extractor/megatvcom.py | 7 +- yt_dlp/extractor/mlb.py | 4 + yt_dlp/extractor/mofosex.py | 9 +- yt_dlp/extractor/mtv.py | 8 +- yt_dlp/extractor/myvi.py | 10 +- yt_dlp/extractor/nbc.py | 9 +- yt_dlp/extractor/nexx.py | 20 +- yt_dlp/extractor/nytimes.py | 1 + yt_dlp/extractor/odnoklassniki.py | 10 +- yt_dlp/extractor/onionstudios.py | 10 +- yt_dlp/extractor/ooyala.py | 24 + yt_dlp/extractor/panopto.py | 8 +- yt_dlp/extractor/peertube.py | 20 +- yt_dlp/extractor/periscope.py | 10 +- yt_dlp/extractor/piksel.py | 9 +- yt_dlp/extractor/pladform.py | 10 +- yt_dlp/extractor/playwire.py | 2 + yt_dlp/extractor/pornhub.py | 7 +- yt_dlp/extractor/rcs.py | 41 +- yt_dlp/extractor/redtube.py | 9 +- yt_dlp/extractor/rtlnl.py | 1 + yt_dlp/extractor/rumble.py | 8 +- yt_dlp/extractor/rutube.py | 8 +- yt_dlp/extractor/rutv.py | 17 +- yt_dlp/extractor/ruutu.py | 2 +- yt_dlp/extractor/sbs.py | 6 + yt_dlp/extractor/senategov.py | 9 +- yt_dlp/extractor/sendtonews.py | 4 +- yt_dlp/extractor/seznamzpravy.py | 12 +- yt_dlp/extractor/sharevideos.py | 6 + yt_dlp/extractor/simplecast.py | 16 +- yt_dlp/extractor/soundcloud.py | 7 +- yt_dlp/extractor/spankwire.py | 7 +- yt_dlp/extractor/sportbox.py | 9 +- yt_dlp/extractor/spotify.py | 7 +- yt_dlp/extractor/springboardplatform.py | 9 +- yt_dlp/extractor/streamable.py | 11 +- yt_dlp/extractor/substack.py | 5 +- yt_dlp/extractor/svt.py | 8 +- yt_dlp/extractor/teachable.py | 14 +- yt_dlp/extractor/ted.py | 6 +- yt_dlp/extractor/theplatform.py | 24 +- yt_dlp/extractor/threeqsdn.py | 16 +- 
yt_dlp/extractor/tiktok.py | 7 +- yt_dlp/extractor/tnaflix.py | 9 +- yt_dlp/extractor/tube8.py | 7 +- yt_dlp/extractor/tunein.py | 7 +- yt_dlp/extractor/tvc.py | 10 +- yt_dlp/extractor/tvigle.py | 1 + yt_dlp/extractor/tvopengr.py | 10 +- yt_dlp/extractor/tvp.py | 7 +- yt_dlp/extractor/twentymin.py | 9 +- yt_dlp/extractor/udn.py | 1 + yt_dlp/extractor/ustream.py | 8 +- yt_dlp/extractor/vbox7.py | 11 +- yt_dlp/extractor/vevo.py | 1 + yt_dlp/extractor/vice.py | 13 +- yt_dlp/extractor/viddler.py | 2 + yt_dlp/extractor/videa.py | 8 +- yt_dlp/extractor/videomore.py | 21 +- yt_dlp/extractor/videopress.py | 9 +- yt_dlp/extractor/viewlift.py | 10 +- yt_dlp/extractor/vimeo.py | 46 +- yt_dlp/extractor/vine.py | 1 + yt_dlp/extractor/viqeo.py | 11 +- yt_dlp/extractor/vk.py | 14 +- yt_dlp/extractor/vodplatform.py | 1 + yt_dlp/extractor/voxmedia.py | 1 + yt_dlp/extractor/vshare.py | 9 +- yt_dlp/extractor/vzaar.py | 9 +- yt_dlp/extractor/washingtonpost.py | 7 +- yt_dlp/extractor/webcaster.py | 16 +- yt_dlp/extractor/wimtv.py | 11 +- yt_dlp/extractor/wistia.py | 31 +- yt_dlp/extractor/xfileshare.py | 10 +- yt_dlp/extractor/xhamster.py | 7 +- yt_dlp/extractor/yahoo.py | 4 +- yt_dlp/extractor/yapfiles.py | 10 +- yt_dlp/extractor/youporn.py | 7 +- yt_dlp/extractor/youtube.py | 64 +- yt_dlp/extractor/zapiks.py | 1 + yt_dlp/extractor/zype.py | 9 +- 138 files changed, 499 insertions(+), 1909 deletions(-) create mode 100644 yt_dlp/extractor/sharevideos.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 221c1598df..5ca92f18b2 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -446,7 +446,7 @@ from .dw import ( DWIE, DWArticleIE, ) -from .eagleplatform import EaglePlatformIE +from .eagleplatform import EaglePlatformIE, ClipYouEmbedIE from .ebaumsworld import EbaumsWorldIE from .echomsk import EchoMskIE from .egghead import ( @@ -1555,6 +1555,7 @@ from .shared import ( SharedIE, VivoIE, ) +from .sharevideos import 
ShareVideosEmbedIE from .shemaroome import ShemarooMeIE from .showroomlive import ShowRoomLiveIE from .simplecast import ( diff --git a/yt_dlp/extractor/adobetv.py b/yt_dlp/extractor/adobetv.py index 941254243f..d8e07b3a17 100644 --- a/yt_dlp/extractor/adobetv.py +++ b/yt_dlp/extractor/adobetv.py @@ -232,6 +232,7 @@ class AdobeTVChannelIE(AdobeTVPlaylistBaseIE): class AdobeTVVideoIE(AdobeTVBaseIE): IE_NAME = 'adobetv:video' _VALID_URL = r'https?://video\.tv\.adobe\.com/v/(?P\d+)' + _EMBED_REGEX = [r']+src=[\'"](?P(?:https?:)?//video\.tv\.adobe\.com/v/\d+[^"]+)[\'"]'] _TEST = { # From https://helpx.adobe.com/acrobat/how-to/new-experience-acrobat-dc.html?set=acrobat--get-started--essential-beginners diff --git a/yt_dlp/extractor/ant1newsgr.py b/yt_dlp/extractor/ant1newsgr.py index cd0f368569..fac476e21a 100644 --- a/yt_dlp/extractor/ant1newsgr.py +++ b/yt_dlp/extractor/ant1newsgr.py @@ -1,4 +1,3 @@ -import re import urllib.parse from .common import InfoExtractor @@ -7,7 +6,6 @@ from ..utils import ( ExtractorError, determine_ext, scale_thumbnails_to_max_format_width, - unescapeHTML, ) @@ -91,7 +89,7 @@ class Ant1NewsGrArticleIE(Ant1NewsGrBaseIE): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) info = self._search_json_ld(webpage, video_id, expected_type='NewsArticle') - embed_urls = list(Ant1NewsGrEmbedIE._extract_urls(webpage)) + embed_urls = list(Ant1NewsGrEmbedIE._extract_embed_urls(url, webpage)) if not embed_urls: raise ExtractorError('no videos found for %s' % video_id, expected=True) return self.playlist_from_matches( @@ -104,6 +102,7 @@ class Ant1NewsGrEmbedIE(Ant1NewsGrBaseIE): IE_DESC = 'ant1news.gr embedded videos' _BASE_PLAYER_URL_RE = r'(?:https?:)?//(?:[a-zA-Z0-9\-]+\.)?(?:antenna|ant1news)\.gr/templates/pages/player' _VALID_URL = rf'{_BASE_PLAYER_URL_RE}\?([^#]+&)?cid=(?P[^#&]+)' + _EMBED_REGEX = [rf']+?src=(?P<_q1>["\'])(?P{_BASE_PLAYER_URL_RE}\?(?:(?!(?P=_q1)).)+)(?P=_q1)'] _API_PATH = '/news/templates/data/jsonPlayer' 
_TESTS = [{ @@ -117,16 +116,6 @@ class Ant1NewsGrEmbedIE(Ant1NewsGrBaseIE): }, }] - @classmethod - def _extract_urls(cls, webpage): - _EMBED_URL_RE = rf'{cls._BASE_PLAYER_URL_RE}\?(?:(?!(?P=_q1)).)+' - _EMBED_RE = rf']+?src=(?P<_q1>["\'])(?P{_EMBED_URL_RE})(?P=_q1)' - for mobj in re.finditer(_EMBED_RE, webpage): - url = unescapeHTML(mobj.group('url')) - if not cls.suitable(url): - continue - yield url - def _real_extract(self, url): video_id = self._match_id(url) diff --git a/yt_dlp/extractor/anvato.py b/yt_dlp/extractor/anvato.py index 09dfffdb09..cb94835693 100644 --- a/yt_dlp/extractor/anvato.py +++ b/yt_dlp/extractor/anvato.py @@ -340,30 +340,16 @@ class AnvatoIE(InfoExtractor): 'subtitles': subtitles, } - @staticmethod - def _extract_urls(ie, webpage, video_id): - entries = [] - for mobj in re.finditer(AnvatoIE._ANVP_RE, webpage): - anvplayer_data = ie._parse_json( - mobj.group('anvp'), video_id, transform_source=unescapeHTML, - fatal=False) - if not anvplayer_data: - continue - video = anvplayer_data.get('video') - if not isinstance(video, compat_str) or not video.isdigit(): - continue - access_key = anvplayer_data.get('accessKey') - if not access_key: - mcp = anvplayer_data.get('mcp') - if mcp: - access_key = AnvatoIE._MCP_TO_ACCESS_KEY_TABLE.get( - mcp.lower()) + @classmethod + def _extract_from_webpage(cls, url, webpage): + for mobj in re.finditer(cls._ANVP_RE, webpage): + anvplayer_data = unescapeHTML(json.loads(mobj.group('anvp'))) or {} + video_id, access_key = anvplayer_data.get('video'), anvplayer_data.get('accessKey') if not access_key: + access_key = cls._MCP_TO_ACCESS_KEY_TABLE.get((anvplayer_data.get('mcp') or '').lower()) + if not (video_id or '').isdigit() or not access_key: continue - entries.append(ie.url_result( - 'anvato:%s:%s' % (access_key, video), ie=AnvatoIE.ie_key(), - video_id=video)) - return entries + yield cls.url_result(f'anvato:{access_key}:{video_id}', AnvatoIE, video_id) def _extract_anvato_videos(self, webpage, video_id): 
anvplayer_data = self._parse_json( diff --git a/yt_dlp/extractor/apa.py b/yt_dlp/extractor/apa.py index 847be6edf7..c9147e855a 100644 --- a/yt_dlp/extractor/apa.py +++ b/yt_dlp/extractor/apa.py @@ -1,5 +1,3 @@ -import re - from .common import InfoExtractor from ..utils import ( determine_ext, @@ -10,6 +8,7 @@ from ..utils import ( class APAIE(InfoExtractor): _VALID_URL = r'(?Phttps?://[^/]+\.apa\.at)/embed/(?P[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})' + _EMBED_REGEX = [r']+\bsrc=(["\'])(?P(?:https?:)?//[^/]+\.apa\.at/embed/[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}.*?)\1'] _TESTS = [{ 'url': 'http://uvp.apa.at/embed/293f6d17-692a-44e3-9fd5-7b178f3a1029', 'md5': '2b12292faeb0a7d930c778c7a5b4759b', @@ -30,14 +29,6 @@ class APAIE(InfoExtractor): 'only_matching': True, }] - @staticmethod - def _extract_urls(webpage): - return [ - mobj.group('url') - for mobj in re.finditer( - r']+\bsrc=(["\'])(?P(?:https?:)?//[^/]+\.apa\.at/embed/[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}.*?)\1', - webpage)] - def _real_extract(self, url): mobj = self._match_valid_url(url) video_id, base_url = mobj.group('id', 'base_url') diff --git a/yt_dlp/extractor/aparat.py b/yt_dlp/extractor/aparat.py index cd6cd1c790..90464556db 100644 --- a/yt_dlp/extractor/aparat.py +++ b/yt_dlp/extractor/aparat.py @@ -10,6 +10,7 @@ from ..utils import ( class AparatIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?aparat\.com/(?:v/|video/video/embed/videohash/)(?P[a-zA-Z0-9]+)' + _EMBED_REGEX = [r'