diff --git a/test/test_all_urls.py b/test/test_all_urls.py index 06099a6795..5f3c77d8e0 100644 --- a/test/test_all_urls.py +++ b/test/test_all_urls.py @@ -72,15 +72,6 @@ class TestAllURLsMatching(unittest.TestCase): self.assertMatch('http://www.youtube.com/results?search_query=making+mustard', ['youtube:search_url']) self.assertMatch('https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video', ['youtube:search_url']) - def test_youtube_extract(self): - assertExtractId = lambda url, id: self.assertEqual(YoutubeIE.extract_id(url), id) - assertExtractId('http://www.youtube.com/watch?&v=BaW_jenozKc', 'BaW_jenozKc') - assertExtractId('https://www.youtube.com/watch?&v=BaW_jenozKc', 'BaW_jenozKc') - assertExtractId('https://www.youtube.com/watch?feature=player_embedded&v=BaW_jenozKc', 'BaW_jenozKc') - assertExtractId('https://www.youtube.com/watch_popup?v=BaW_jenozKc', 'BaW_jenozKc') - assertExtractId('http://www.youtube.com/watch?v=BaW_jenozKcsharePLED17F32AD9753930', 'BaW_jenozKc') - assertExtractId('BaW_jenozKc', 'BaW_jenozKc') - def test_facebook_matching(self): self.assertTrue(FacebookIE.suitable('https://www.facebook.com/Shiniknoh#!/photo.php?v=10153317450565268')) self.assertTrue(FacebookIE.suitable('https://www.facebook.com/cindyweather?fref=ts#!/photo.php?v=10152183998945793')) diff --git a/test/test_execution.py b/test/test_execution.py index 2aea4df1bd..8a0d65bfb1 100644 --- a/test/test_execution.py +++ b/test/test_execution.py @@ -39,6 +39,16 @@ class TestExecution(unittest.TestCase): _, stderr = p.communicate() self.assertFalse(stderr) + def test_lazy_extractors(self): + try: + subprocess.check_call([sys.executable, 'devscripts/make_lazy_extractors.py', 'yt_dlp/extractor/lazy_extractors.py'], cwd=rootDir, stdout=_DEV_NULL) + subprocess.check_call([sys.executable, 'test/test_all_urls.py'], cwd=rootDir, stdout=_DEV_NULL) + finally: + try: + os.remove('yt_dlp/extractor/lazy_extractors.py') + except (IOError, OSError): + pass + if __name__ == '__main__': unittest.main() diff --git a/test/test_youtube_misc.py b/test/test_youtube_misc.py new file mode 100644 index 0000000000..d9bb10d265 --- /dev/null +++ b/test/test_youtube_misc.py @@ -0,0 +1,26 @@ +#!/usr/bin/env python +from __future__ import unicode_literals + +# Allow direct execution +import os +import sys +import unittest +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + + +from yt_dlp.extractor import YoutubeIE + + +class TestYoutubeMisc(unittest.TestCase): + def test_youtube_extract(self): + assertExtractId = lambda url, id: self.assertEqual(YoutubeIE.extract_id(url), id) + assertExtractId('http://www.youtube.com/watch?&v=BaW_jenozKc', 'BaW_jenozKc') + assertExtractId('https://www.youtube.com/watch?&v=BaW_jenozKc', 'BaW_jenozKc') + assertExtractId('https://www.youtube.com/watch?feature=player_embedded&v=BaW_jenozKc', 'BaW_jenozKc') + assertExtractId('https://www.youtube.com/watch_popup?v=BaW_jenozKc', 'BaW_jenozKc') + assertExtractId('http://www.youtube.com/watch?v=BaW_jenozKcsharePLED17F32AD9753930', 'BaW_jenozKc') + assertExtractId('BaW_jenozKc', 'BaW_jenozKc') + + +if __name__ == '__main__': + unittest.main() diff --git a/yt_dlp/extractor/bbc.py b/yt_dlp/extractor/bbc.py index 333796c80b..edc2c697b3 100644 --- a/yt_dlp/extractor/bbc.py +++ b/yt_dlp/extractor/bbc.py @@ -11,6 +11,7 @@ from ..compat import ( compat_etree_Element, compat_HTTPError, compat_parse_qs, + compat_str, compat_urllib_parse_urlparse, compat_urlparse, ) @@ -25,8 +26,10 @@ from ..utils import ( js_to_json, parse_duration, parse_iso8601, + strip_or_none, try_get, unescapeHTML, + unified_timestamp, url_or_none, urlencode_postdata, urljoin, @@ -761,8 +764,17 @@ class BBCIE(BBCCoUkIE): 'only_matching': True, }, { # custom redirection to www.bbc.com + # also, video with window.__INITIAL_DATA__ 'url': 'http://www.bbc.co.uk/news/science-environment-33661876', - 'only_matching': True, + 'info_dict': { + 'id': 'p02xzws1', + 'ext': 'mp4', + 'title': "Pluto may have 'nitrogen glaciers'", + 'description': 'md5:6a95b593f528d7a5f2605221bc56912f', + 'thumbnail': r're:https?://.+/.+\.jpg', + 'timestamp': 1437785037, + 'upload_date': '20150725', + }, }, { # single video article embedded with data-media-vpid 'url': 'http://www.bbc.co.uk/sport/rowing/35908187', @@ -1164,12 +1176,29 @@ class BBCIE(BBCCoUkIE): continue formats, subtitles = self._download_media_selector(item_id) self._sort_formats(formats) + item_desc = None + blocks = try_get(media, lambda x: x['summary']['blocks'], list) + if blocks: + summary = [] + for block in blocks: + text = try_get(block, lambda x: x['model']['text'], compat_str) + if text: + summary.append(text) + if summary: + item_desc = '\n\n'.join(summary) + item_time = None + for meta in try_get(media, lambda x: x['metadata']['items'], list) or []: + if try_get(meta, lambda x: x['label']) == 'Published': + item_time = unified_timestamp(meta.get('timestamp')) + break entries.append({ 'id': item_id, 'title': item_title, 'thumbnail': item.get('holdingImageUrl'), 'formats': formats, 'subtitles': subtitles, + 'timestamp': item_time, + 'description': strip_or_none(item_desc), }) for resp in (initial_data.get('data') or {}).values(): name = resp.get('name') diff --git a/yt_dlp/extractor/go.py b/yt_dlp/extractor/go.py index c31e07a0c8..7dcdc864f9 100644 --- a/yt_dlp/extractor/go.py +++ b/yt_dlp/extractor/go.py @@ -4,12 +4,14 @@ from __future__ import unicode_literals import re from .adobepass import AdobePassIE +from ..compat import compat_str from ..utils import ( int_or_none, determine_ext, parse_age_limit, remove_start, remove_end, + try_get, urlencode_postdata, ExtractorError, ) @@ -118,6 +120,18 @@ class GoIE(AdobePassIE): # m3u8 download 'skip_download': True, }, + }, { + 'url': 'https://abc.com/shows/modern-family/episode-guide/season-01/101-pilot', + 'info_dict': { + 'id': 'VDKA22600213', + 'ext': 'mp4', + 'title': 'Pilot', + 'description': 'md5:74306df917cfc199d76d061d66bebdb4', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, }, { 'url': 'http://abc.go.com/shows/the-catch/episode-guide/season-01/10-the-wedding', 'only_matching': True, @@ -154,18 +168,30 @@ class GoIE(AdobePassIE): brand = site_info.get('brand') if not video_id or not site_info: webpage = self._download_webpage(url, display_id or video_id) - video_id = self._search_regex( - ( - # There may be inner quotes, e.g. data-video-id="'VDKA3609139'" - # from http://freeform.go.com/shows/shadowhunters/episodes/season-2/1-this-guilty-blood - r'data-video-id=["\']*(VDKA\w+)', - # https://github.com/ytdl-org/youtube-dl/pull/25216/files - # The following is based on the pull request on the line above. Changed the ABC.com URL to a show available now. - # https://abc.com/shows/the-rookie/episode-guide/season-02/19-the-q-word - r'\bvideoIdCode["\']\s*:\s*["\'](vdka\w+)', - # Deprecated fallback pattern - r'\b(?:video)?id["\']\s*:\s*["\'](VDKA\w+)' - ), webpage, 'video id', default=video_id) + data = self._parse_json( + self._search_regex( + r'["\']__abc_com__["\']\s*\]\s*=\s*({.+?})\s*;', webpage, + 'data', default='{}'), + display_id or video_id, fatal=False) + # https://abc.com/shows/modern-family/episode-guide/season-01/101-pilot + layout = try_get(data, lambda x: x['page']['content']['video']['layout'], dict) + video_id = None + if layout: + video_id = try_get( + layout, + (lambda x: x['videoid'], lambda x: x['video']['id']), + compat_str) + if not video_id: + video_id = self._search_regex( + ( + # There may be inner quotes, e.g. data-video-id="'VDKA3609139'" + # from http://freeform.go.com/shows/shadowhunters/episodes/season-2/1-this-guilty-blood + r'data-video-id=["\']*(VDKA\w+)', + # page.analytics.videoIdCode + r'\bvideoIdCode["\']\s*:\s*["\']((?:vdka|VDKA)\w+)', + # https://abc.com/shows/the-rookie/episode-guide/season-02/03-the-bet + r'\b(?:video)?id["\']\s*:\s*["\'](VDKA\w+)' + ), webpage, 'video id', default=video_id) if not site_info: brand = self._search_regex( (r'data-brand=\s*["\']\s*(\d+)', diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index eaa2db1da8..edc985d19e 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -77,11 +77,6 @@ class YoutubeBaseInfoExtractor(InfoExtractor): _PLAYLIST_ID_RE = r'(?:(?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)[0-9A-Za-z-_]{10,}|RDMM|WL|LL|LM)' - def _ids_to_results(self, ids): - return [ - self.url_result(vid_id, 'Youtube', video_id=vid_id) - for vid_id in ids] - def _login(self): """ Attempt to log in to YouTube. @@ -1313,6 +1308,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): @classmethod def suitable(cls, url): + # Hack for lazy extractors until more generic solution is implemented + # (see #28780) + from .youtube import parse_qs qs = parse_qs(url) if qs.get('list', [None])[0]: return False @@ -3595,6 +3593,9 @@ class YoutubePlaylistIE(InfoExtractor): def suitable(cls, url): if YoutubeTabIE.suitable(url): return False + # Hack for lazy extractors until more generic solution is implemented + # (see #28780) + from .youtube import parse_qs qs = parse_qs(url) if qs.get('v', [None])[0]: return False