From 47a5cb77344536ca79d81a04904ac9ef9b02050f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 18 Mar 2018 02:46:50 +0700 Subject: [PATCH] Generalize XML manifest processing code and improve XSPF parsing (closes #15794) --- test/test_InfoExtractor.py | 41 ++++++++++++++++++------------- youtube_dl/extractor/common.py | 43 +++++++++++++++++++-------------- youtube_dl/extractor/generic.py | 4 ++- 3 files changed, 52 insertions(+), 36 deletions(-) diff --git a/test/test_InfoExtractor.py b/test/test_InfoExtractor.py index a695ce64b3..4833396a52 100644 --- a/test/test_InfoExtractor.py +++ b/test/test_InfoExtractor.py @@ -698,40 +698,47 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/ _TEST_CASES = [ ( 'foo_xspf', - 'https://example.org/src/', + 'https://example.org/src/foo_xspf.xspf', [{ + 'id': 'foo_xspf', + 'title': 'Pandemonium', 'description': 'Visit http://bigbrother404.bandcamp.com', 'duration': 202.416, - 'formats': [{'url': 'https://example.org/src/cd1/track%201.mp3'}], + 'formats': [{ + 'manifest_url': 'https://example.org/src/foo_xspf.xspf', + 'url': 'https://example.org/src/cd1/track%201.mp3', + }], + }, { 'id': 'foo_xspf', - 'title': 'Pandemonium' - }, - { + 'title': 'Final Cartridge (Nichico Twelve Remix)', 'description': 'Visit http://bigbrother404.bandcamp.com', 'duration': 255.857, - 'formats': [{'url': 'https://example.org/%E3%83%88%E3%83%A9%E3%83%83%E3%82%AF%E3%80%80%EF%BC%92.mp3'}], + 'formats': [{ + 'manifest_url': 'https://example.org/src/foo_xspf.xspf', + 'url': 'https://example.org/%E3%83%88%E3%83%A9%E3%83%83%E3%82%AF%E3%80%80%EF%BC%92.mp3', + }], + }, { 'id': 'foo_xspf', - 'title': 'Final Cartridge (Nichico Twelve Remix)' - }, - { + 'title': 'Rebuilding Nightingale', 'description': 'Visit http://bigbrother404.bandcamp.com', 'duration': 287.915, - 'formats': [ - {'url': 'https://example.org/src/track3.mp3'}, - {'url': 'https://example.com/track3.mp3'} - ], - 'id': 'foo_xspf', - 'title': 'Rebuilding Nightingale' + 'formats': [{ + 'manifest_url': 'https://example.org/src/foo_xspf.xspf', + 'url': 'https://example.org/src/track3.mp3', + }, { + 'manifest_url': 'https://example.org/src/foo_xspf.xspf', + 'url': 'https://example.com/track3.mp3', + }] }] ), ] - for xspf_file, xspf_base_url, expected_entries in _TEST_CASES: + for xspf_file, xspf_url, expected_entries in _TEST_CASES: with io.open('./test/testdata/xspf/%s.xspf' % xspf_file, mode='r', encoding='utf-8') as f: entries = self.ie._parse_xspf( compat_etree_fromstring(f.read().encode('utf-8')), - xspf_file, xspf_base_url) + xspf_file, xspf_url=xspf_url, xspf_base_url=xspf_url) expect_value(self, entries, expected_entries, None) for i in range(len(entries)): expect_dict(self, entries[i], expected_entries[i]) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index a507785097..2e2a02948d 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -1706,22 +1706,24 @@ class InfoExtractor(object): }) return subtitles - def _extract_xspf_playlist(self, playlist_url, playlist_id, fatal=True): + def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True): xspf = self._download_xml( - playlist_url, playlist_id, 'Downloading xpsf playlist', + xspf_url, playlist_id, 'Downloading xpsf playlist', 'Unable to download xspf manifest', fatal=fatal) if xspf is False: return [] - return self._parse_xspf(xspf, playlist_id, base_url(playlist_url)) + return self._parse_xspf( + xspf, playlist_id, xspf_url=xspf_url, + xspf_base_url=base_url(xspf_url)) - def _parse_xspf(self, playlist, playlist_id, playlist_base_url=''): + def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None): NS_MAP = { 'xspf': 'http://xspf.org/ns/0/', 's1': 'http://static.streamone.nl/player/ns/0', } entries = [] - for track in playlist.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)): + for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)): title = xpath_text( track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id) description = xpath_text( @@ -1731,12 +1733,18 @@ class InfoExtractor(object): duration = float_or_none( xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000) - formats = [{ - 'url': urljoin(playlist_base_url, location.text), - 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)), - 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))), - 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))), - } for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP))] + formats = [] + for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)): + format_url = urljoin(xspf_base_url, location.text) + if not format_url: + continue + formats.append({ + 'url': format_url, + 'manifest_url': xspf_url, + 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)), + 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))), + 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))), + }) self._sort_formats(formats) entries.append({ @@ -1750,18 +1758,18 @@ class InfoExtractor(object): return entries def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}): - res = self._download_webpage_handle( + res = self._download_xml_handle( mpd_url, video_id, note=note or 'Downloading MPD manifest', errnote=errnote or 'Failed to download MPD manifest', fatal=fatal) if res is False: return [] - mpd, urlh = res + mpd_doc, urlh = res mpd_base_url = base_url(urlh.geturl()) return self._parse_mpd_formats( - compat_etree_fromstring(mpd.encode('utf-8')), mpd_id, mpd_base_url, + mpd_doc, mpd_id=mpd_id, mpd_base_url=mpd_base_url, formats_dict=formats_dict, mpd_url=mpd_url) def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}, mpd_url=None): @@ -2035,17 +2043,16 @@ class InfoExtractor(object): return formats def _extract_ism_formats(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True): - res = self._download_webpage_handle( + res = self._download_xml_handle( ism_url, video_id, note=note or 'Downloading ISM manifest', errnote=errnote or 'Failed to download ISM manifest', fatal=fatal) if res is False: return [] - ism, urlh = res + ism_doc, urlh = res - return self._parse_ism_formats( - compat_etree_fromstring(ism.encode('utf-8')), urlh.geturl(), ism_id) + return self._parse_ism_formats(ism_doc, urlh.geturl(), ism_id) def _parse_ism_formats(self, ism_doc, ism_url, ism_id=None): """ diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 023ccbc9bf..1cc491b19b 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -2233,7 +2233,9 @@ class GenericIE(InfoExtractor): return smil elif doc.tag == '{http://xspf.org/ns/0/}playlist': return self.playlist_result( - self._parse_xspf(doc, video_id, compat_str(full_response.geturl())), + self._parse_xspf( + doc, video_id, xspf_url=url, + xspf_base_url=compat_str(full_response.geturl())), video_id) elif re.match(r'(?i)^(?:{[^}]+})?MPD$', doc.tag): info_dict['formats'] = self._parse_mpd_formats(