Add proper support for "gzip" and "deflate" encodings

This commit is contained in:
Ricardo Garcia 2011-01-12 20:20:37 +01:00
parent aac3fe0f4a
commit 1987c2325a

View File

@ -8,6 +8,7 @@
import cookielib import cookielib
import ctypes import ctypes
import datetime import datetime
import gzip
import htmlentitydefs import htmlentitydefs
import httplib import httplib
import locale import locale
@ -18,11 +19,13 @@ import os.path
import re import re
import socket import socket
import string import string
import StringIO
import subprocess import subprocess
import sys import sys
import time import time
import urllib import urllib
import urllib2 import urllib2
import zlib
# parse_qs was moved from the cgi module to the urlparse module recently. # parse_qs was moved from the cgi module to the urlparse module recently.
try: try:
@ -161,6 +164,56 @@ class ContentTooShortError(Exception):
self.downloaded = downloaded self.downloaded = downloaded
self.expected = expected self.expected = expected
class YoutubeDLHandler(urllib2.HTTPHandler):
"""Handler for HTTP requests and responses.
This class, when installed with an OpenerDirector, automatically adds
the standard headers to every HTTP request and handles gzipped and
deflated responses from web servers. If compression is to be avoided in
a particular request, the original request in the program code only has
to include the HTTP header "Youtubedl-No-Compression", which will be
removed before making the real request.
Part of this code was copied from:
http://techknack.net/python-urllib2-handlers/
Andrew Rowls, the author of that code, agreed to release it to the
public domain.
"""
@staticmethod
def deflate(data):
try:
return zlib.decompress(data, -zlib.MAX_WBITS)
except zlib.error:
return zlib.decompress(data)
def http_request(self, req):
for h in std_headers:
if h in req.headers:
del req.headers[h]
req.add_header(h, std_headers[h])
if 'Youtubedl-no-compression' in req.headers:
if 'Accept-encoding' in req.headers:
del req.headers['Accept-encoding']
del req.headers['Youtubedl-no-compression']
return req
def http_response(self, req, resp):
old_resp = resp
# gzip
if resp.headers.get('Content-encoding', '') == 'gzip':
gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
resp = urllib2.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
resp.msg = old_resp.msg
# deflate
if resp.headers.get('Content-encoding', '') == 'deflate':
gz = StringIO.StringIO(self.deflate(resp.read()))
resp = urllib2.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
resp.msg = old_resp.msg
return resp
class FileDownloader(object): class FileDownloader(object):
"""File Downloader class. """File Downloader class.
@ -559,8 +612,11 @@ class FileDownloader(object):
tmpfilename = self.temp_name(filename) tmpfilename = self.temp_name(filename)
stream = None stream = None
open_mode = 'wb' open_mode = 'wb'
basic_request = urllib2.Request(url, None, std_headers)
request = urllib2.Request(url, None, std_headers) # Do not include the Accept-Encoding header
headers = {'Youtubedl-no-compression': 'True'}
basic_request = urllib2.Request(url, None, headers)
request = urllib2.Request(url, None, headers)
# Establish possible resume length # Establish possible resume length
if os.path.isfile(tmpfilename): if os.path.isfile(tmpfilename):
@ -822,7 +878,7 @@ class YoutubeIE(InfoExtractor):
return return
# Set language # Set language
request = urllib2.Request(self._LANG_URL, None, std_headers) request = urllib2.Request(self._LANG_URL)
try: try:
self.report_lang() self.report_lang()
urllib2.urlopen(request).read() urllib2.urlopen(request).read()
@ -842,7 +898,7 @@ class YoutubeIE(InfoExtractor):
'username': username, 'username': username,
'password': password, 'password': password,
} }
request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers) request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
try: try:
self.report_login() self.report_login()
login_results = urllib2.urlopen(request).read() login_results = urllib2.urlopen(request).read()
@ -858,7 +914,7 @@ class YoutubeIE(InfoExtractor):
'next_url': '/', 'next_url': '/',
'action_confirm': 'Confirm', 'action_confirm': 'Confirm',
} }
request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers) request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
try: try:
self.report_age_confirmation() self.report_age_confirmation()
age_results = urllib2.urlopen(request).read() age_results = urllib2.urlopen(request).read()
@ -876,7 +932,7 @@ class YoutubeIE(InfoExtractor):
# Get video webpage # Get video webpage
self.report_video_webpage_download(video_id) self.report_video_webpage_download(video_id)
request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id, None, std_headers) request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
try: try:
video_webpage = urllib2.urlopen(request).read() video_webpage = urllib2.urlopen(request).read()
except (urllib2.URLError, httplib.HTTPException, socket.error), err: except (urllib2.URLError, httplib.HTTPException, socket.error), err:
@ -895,7 +951,7 @@ class YoutubeIE(InfoExtractor):
for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']: for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en' video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
% (video_id, el_type)) % (video_id, el_type))
request = urllib2.Request(video_info_url, None, std_headers) request = urllib2.Request(video_info_url)
try: try:
video_info_webpage = urllib2.urlopen(request).read() video_info_webpage = urllib2.urlopen(request).read()
video_info = parse_qs(video_info_webpage) video_info = parse_qs(video_info_webpage)
@ -1055,7 +1111,7 @@ class MetacafeIE(InfoExtractor):
def _real_initialize(self): def _real_initialize(self):
# Retrieve disclaimer # Retrieve disclaimer
request = urllib2.Request(self._DISCLAIMER, None, std_headers) request = urllib2.Request(self._DISCLAIMER)
try: try:
self.report_disclaimer() self.report_disclaimer()
disclaimer = urllib2.urlopen(request).read() disclaimer = urllib2.urlopen(request).read()
@ -1068,7 +1124,7 @@ class MetacafeIE(InfoExtractor):
'filters': '0', 'filters': '0',
'submit': "Continue - I'm over 18", 'submit': "Continue - I'm over 18",
} }
request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form), std_headers) request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
try: try:
self.report_age_confirmation() self.report_age_confirmation()
disclaimer = urllib2.urlopen(request).read() disclaimer = urllib2.urlopen(request).read()
@ -1771,7 +1827,7 @@ class YoutubeSearchIE(InfoExtractor):
while True: while True:
self.report_download_page(query, pagenum) self.report_download_page(query, pagenum)
result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum) result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
request = urllib2.Request(result_url, None, std_headers) request = urllib2.Request(result_url)
try: try:
page = urllib2.urlopen(request).read() page = urllib2.urlopen(request).read()
except (urllib2.URLError, httplib.HTTPException, socket.error), err: except (urllib2.URLError, httplib.HTTPException, socket.error), err:
@ -1862,7 +1918,7 @@ class GoogleSearchIE(InfoExtractor):
while True: while True:
self.report_download_page(query, pagenum) self.report_download_page(query, pagenum)
result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum) result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
request = urllib2.Request(result_url, None, std_headers) request = urllib2.Request(result_url)
try: try:
page = urllib2.urlopen(request).read() page = urllib2.urlopen(request).read()
except (urllib2.URLError, httplib.HTTPException, socket.error), err: except (urllib2.URLError, httplib.HTTPException, socket.error), err:
@ -1953,7 +2009,7 @@ class YahooSearchIE(InfoExtractor):
while True: while True:
self.report_download_page(query, pagenum) self.report_download_page(query, pagenum)
result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum) result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
request = urllib2.Request(result_url, None, std_headers) request = urllib2.Request(result_url)
try: try:
page = urllib2.urlopen(request).read() page = urllib2.urlopen(request).read()
except (urllib2.URLError, httplib.HTTPException, socket.error), err: except (urllib2.URLError, httplib.HTTPException, socket.error), err:
@ -2017,7 +2073,7 @@ class YoutubePlaylistIE(InfoExtractor):
while True: while True:
self.report_download_page(playlist_id, pagenum) self.report_download_page(playlist_id, pagenum)
request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum), None, std_headers) request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum))
try: try:
page = urllib2.urlopen(request).read() page = urllib2.urlopen(request).read()
except (urllib2.URLError, httplib.HTTPException, socket.error), err: except (urllib2.URLError, httplib.HTTPException, socket.error), err:
@ -2079,7 +2135,7 @@ class YoutubeUserIE(InfoExtractor):
pagenum = 1 pagenum = 1
self.report_download_page(username) self.report_download_page(username)
request = urllib2.Request(self._TEMPLATE_URL % (username), None, std_headers) request = urllib2.Request(self._TEMPLATE_URL % (username))
try: try:
page = urllib2.urlopen(request).read() page = urllib2.urlopen(request).read()
except (urllib2.URLError, httplib.HTTPException, socket.error), err: except (urllib2.URLError, httplib.HTTPException, socket.error), err:
@ -2135,7 +2191,7 @@ class DepositFilesIE(InfoExtractor):
# Retrieve file webpage with 'Free download' button pressed # Retrieve file webpage with 'Free download' button pressed
free_download_indication = { 'gateway_result' : '1' } free_download_indication = { 'gateway_result' : '1' }
request = urllib2.Request(url, urllib.urlencode(free_download_indication), std_headers) request = urllib2.Request(url, urllib.urlencode(free_download_indication))
try: try:
self.report_download_webpage(file_id) self.report_download_webpage(file_id)
webpage = urllib2.urlopen(request).read() webpage = urllib2.urlopen(request).read()
@ -2354,8 +2410,7 @@ if __name__ == '__main__':
# General configuration # General configuration
cookie_processor = urllib2.HTTPCookieProcessor(jar) cookie_processor = urllib2.HTTPCookieProcessor(jar)
urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler())) urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler()))
urllib2.install_opener(urllib2.build_opener(cookie_processor))
socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words) socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
# Batch file verification # Batch file verification