From 5aba6ea4fe6ad227d64a7e8b487d7cd7c3ad1f11 Mon Sep 17 00:00:00 2001 From: Ricardo Garcia Date: Sat, 29 Jan 2011 11:55:20 +0100 Subject: [PATCH 1/7] =?UTF-8?q?Add=20YoutubeUserIE=20(code=20courtesy=20of?= =?UTF-8?q?=20Pawe=C5=82=20Paprota)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- youtube-dl | 79 ++++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 56 insertions(+), 23 deletions(-) diff --git a/youtube-dl b/youtube-dl index 8dd03daf3c..c8c2ea88e3 100755 --- a/youtube-dl +++ b/youtube-dl @@ -5,6 +5,7 @@ # Author: Benjamin Johnson # Author: Vasyl' Vavrychuk # Author: Witold Baryluk +# Author: Paweł Paprota # License: Public domain code import cookielib import ctypes @@ -2159,9 +2160,11 @@ class YoutubePlaylistIE(InfoExtractor): class YoutubeUserIE(InfoExtractor): """Information Extractor for YouTube users.""" - _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/user/(.*)' + _VALID_URL = r'(?:(?:(?:http://)?(?:\w+\.)?youtube.com/user/)|ytuser:)([A-Za-z0-9_-]+)' _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s' - _VIDEO_INDICATOR = r'http://gdata.youtube.com/feeds/api/videos/(.*)' # XXX Fix this. 
+ _GDATA_PAGE_SIZE = 50 + _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d' + _VIDEO_INDICATOR = r'/watch\?v=(.+?)&' _youtube_ie = None def __init__(self, youtube_ie, downloader=None): @@ -2172,9 +2175,10 @@ class YoutubeUserIE(InfoExtractor): def suitable(url): return (re.match(YoutubeUserIE._VALID_URL, url) is not None) - def report_download_page(self, username): + def report_download_page(self, username, start_index): """Report attempt to download user page.""" - self._downloader.to_screen(u'[youtube] user %s: Downloading page ' % (username)) + self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' % + (username, start_index, start_index + self._GDATA_PAGE_SIZE)) def _real_initialize(self): self._youtube_ie.initialize() @@ -2186,34 +2190,63 @@ class YoutubeUserIE(InfoExtractor): self._downloader.trouble(u'ERROR: invalid url: %s' % url) return - # Download user page username = mobj.group(1) + + # Download video ids using YouTube Data API. Result size per + # query is limited (currently to 50 videos) so we need to query + # page by page until there are no video ids - it means we got + # all of them. 
+ video_ids = [] - pagenum = 1 + pagenum = 0 - self.report_download_page(username) - request = urllib2.Request(self._TEMPLATE_URL % (username)) - try: - page = urllib2.urlopen(request).read() - except (urllib2.URLError, httplib.HTTPException, socket.error), err: - self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err)) - return + while True: + start_index = pagenum * self._GDATA_PAGE_SIZE + 1 + self.report_download_page(username, start_index) - # Extract video identifiers - ids_in_page = [] + request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)) - for mobj in re.finditer(self._VIDEO_INDICATOR, page): - if mobj.group(1) not in ids_in_page: - ids_in_page.append(mobj.group(1)) - video_ids.extend(ids_in_page) + try: + page = urllib2.urlopen(request).read() + except (urllib2.URLError, httplib.HTTPException, socket.error), err: + self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err)) + return + # Extract video identifiers + ids_in_page = [] + + for mobj in re.finditer(self._VIDEO_INDICATOR, page): + if mobj.group(1) not in ids_in_page: + ids_in_page.append(mobj.group(1)) + + video_ids.extend(ids_in_page) + + # A little optimization - if current page is not + # "full", ie. does not contain PAGE_SIZE video ids then + # we can assume that this page is the last one - there + # are no more ids on further pages - no need to query + # again. 
+ + if len(ids_in_page) < self._GDATA_PAGE_SIZE: + break + + pagenum += 1 + + all_ids_count = len(video_ids) playliststart = self._downloader.params.get('playliststart', 1) - 1 playlistend = self._downloader.params.get('playlistend', -1) - video_ids = video_ids[playliststart:playlistend] - for id in video_ids: - self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id) - return + if playlistend == -1: + video_ids = video_ids[playliststart:] + else: + video_ids = video_ids[playliststart:playlistend] + + self._downloader.to_screen("[youtube] user %s: Collected %d video ids (downloading %d of them)" % + (username, all_ids_count, len(video_ids))) + + for video_id in video_ids: + self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id) + class DepositFilesIE(InfoExtractor): """Information extractor for depositfiles.com""" From 9e0dd8692ea16d62564c6b05c6fdc3f2e0b2f02d Mon Sep 17 00:00:00 2001 From: Ricardo Garcia Date: Sun, 30 Jan 2011 12:58:01 +0100 Subject: [PATCH 2/7] Bump version number --- LATEST_VERSION | 2 +- youtube-dl | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/LATEST_VERSION b/LATEST_VERSION index a1c4173c8f..4ab2093465 100644 --- a/LATEST_VERSION +++ b/LATEST_VERSION @@ -1 +1 @@ -2010.12.09 +2011.01.30 diff --git a/youtube-dl b/youtube-dl index c8c2ea88e3..e980f41f8b 100755 --- a/youtube-dl +++ b/youtube-dl @@ -2405,7 +2405,7 @@ if __name__ == '__main__': # Parse command line parser = optparse.OptionParser( usage='Usage: %prog [options] url...', - version='2010.12.09', + version='2011.01.30', conflict_handler='resolve', ) From 5776c3295b085b9177b24152e33049a8c4b0c90b Mon Sep 17 00:00:00 2001 From: Ricardo Garcia Date: Sun, 30 Jan 2011 12:59:18 +0100 Subject: [PATCH 3/7] Update User-Agent string --- youtube-dl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube-dl b/youtube-dl index e980f41f8b..4d1e942ff9 100755 --- a/youtube-dl +++ b/youtube-dl @@ -37,7 +37,7 @@ except ImportError: from 
cgi import parse_qs std_headers = { - 'User-Agent': 'Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.12) Gecko/20101028 Firefox/3.6.12', + 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:2.0b10) Gecko/20100101 Firefox/4.0b10' 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'Accept-Encoding': 'gzip, deflate', From 16c73c2e513829197c4af5ee62bde88b2b2272e4 Mon Sep 17 00:00:00 2001 From: Ricardo Garcia Date: Sun, 30 Jan 2011 13:03:15 +0100 Subject: [PATCH 4/7] Fix SyntaxError problem (oops) --- youtube-dl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube-dl b/youtube-dl index 4d1e942ff9..a4c8f24942 100755 --- a/youtube-dl +++ b/youtube-dl @@ -37,7 +37,7 @@ except ImportError: from cgi import parse_qs std_headers = { - 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:2.0b10) Gecko/20100101 Firefox/4.0b10' + 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:2.0b10) Gecko/20100101 Firefox/4.0b10', 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'Accept-Encoding': 'gzip, deflate', From f74e22ae280ac1680251350c4672abfeb2a047fe Mon Sep 17 00:00:00 2001 From: Gergely Imreh Date: Mon, 31 Jan 2011 18:54:47 +0800 Subject: [PATCH 5/7] Enable artist playlists in YoutubePlaylistIE Artist playlist pages have a different format compared to user playlists, thus more format checking is needed to construct the correct URL. From the artist playlist this method downloads all videos listed below the "Videos by [Artist Name]" header, plus usually there's one more video on the side, titled "Youtube Mix for [Artist Name]", which has a link format that currently cannot be distinguished from the other videos in the list. 
--- youtube-dl | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/youtube-dl b/youtube-dl index a4c8f24942..dd875a38ea 100755 --- a/youtube-dl +++ b/youtube-dl @@ -2096,8 +2096,8 @@ class YahooSearchIE(InfoExtractor): class YoutubePlaylistIE(InfoExtractor): """Information Extractor for YouTube playlists.""" - _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists)\?.*?p=|user/.*?/user/|p/)([^&]+).*' - _TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s&gl=US&hl=en' + _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists|artist)\?.*?(p|a)=|user/.*?/user/|p/)([^&]+).*' + _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en' _VIDEO_INDICATOR = r'/watch\?v=(.+?)&' _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*' _youtube_ie = None @@ -2125,13 +2125,19 @@ class YoutubePlaylistIE(InfoExtractor): return # Download playlist pages - playlist_id = mobj.group(1) + # prefix is 'p' as default for playlists but there are other types that need extra care + playlist_prefix = mobj.group(1) + if playlist_prefix == 'a': + playlist_access = 'artist' + else: + playlist_access = 'view_play_list' + playlist_id = mobj.group(2) video_ids = [] pagenum = 1 while True: self.report_download_page(playlist_id, pagenum) - request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum)) + request = urllib2.Request(self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)) try: page = urllib2.urlopen(request).read() except (urllib2.URLError, httplib.HTTPException, socket.error), err: From d119b54df6a02d3985284c36586f6ff7e4cac969 Mon Sep 17 00:00:00 2001 From: Ricardo Garcia Date: Sat, 12 Feb 2011 20:19:20 +0100 Subject: [PATCH 6/7] Support more common YouTube playlist URLs --- youtube-dl | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/youtube-dl b/youtube-dl index dd875a38ea..0f9724637b 100755 --- a/youtube-dl +++ b/youtube-dl @@ -2096,7 
+2096,7 @@ class YahooSearchIE(InfoExtractor): class YoutubePlaylistIE(InfoExtractor): """Information Extractor for YouTube playlists.""" - _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists|artist)\?.*?(p|a)=|user/.*?/user/|p/)([^&]+).*' + _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists|artist)\?.*?(p|a)=|user/.*?/user/|p/|user/.*?#[pg]/c/)([0-9A-Za-z]+)(?:/.*?/([0-9A-Za-z_-]+))?.*' _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en' _VIDEO_INDICATOR = r'/watch\?v=(.+?)&' _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*' @@ -2124,6 +2124,11 @@ class YoutubePlaylistIE(InfoExtractor): self._downloader.trouble(u'ERROR: invalid url: %s' % url) return + # Single video case + if mobj.group(3) is not None: + self._youtube_ie.extract(mobj.group(3)) + return + # Download playlist pages # prefix is 'p' as default for playlists but there are other types that need extra care playlist_prefix = mobj.group(1) From 7cc3c6fd62d82bac36c583a8d1dc6c2f6da8c178 Mon Sep 17 00:00:00 2001 From: Gergely Imreh Date: Sun, 13 Feb 2011 19:02:56 +0800 Subject: [PATCH 7/7] Fix possible missing parameter in playlist url extraction The "playlist_prefix" parameter was missing when parsing playlist urls that match the recently added format, e.g.: http://www.youtube.com/user/stanforduniversity#g/c/9D558D49CA734A02 For these URLs (basically, for every playlist type so far, except the artist list) playlist_prefix has to be equal to "p" for correct extraction. --- youtube-dl | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube-dl b/youtube-dl index 0f9724637b..26af2e5bc5 100755 --- a/youtube-dl +++ b/youtube-dl @@ -2135,6 +2135,7 @@ class YoutubePlaylistIE(InfoExtractor): if playlist_prefix == 'a': playlist_access = 'artist' else: + playlist_prefix = 'p' playlist_access = 'view_play_list' playlist_id = mobj.group(2) video_ids = []