[extractor/common] Improve _request_webpage

* Do not ignore data, headers and query for Requests
* Default values for headers and query switched to dicts since these are used by urllib itself
This commit is contained in:
Sergey M․ 2016-03-31 22:58:38 +06:00
parent 15d260ebaa
commit 41d06b0424

View File

@ -22,6 +22,7 @@ from ..compat import (
compat_str, compat_str,
compat_urllib_error, compat_urllib_error,
compat_urllib_parse_urlencode, compat_urllib_parse_urlencode,
compat_urllib_request,
compat_urlparse, compat_urlparse,
) )
from ..downloader.f4m import remove_encrypted_media from ..downloader.f4m import remove_encrypted_media
@ -49,6 +50,7 @@ from ..utils import (
determine_protocol, determine_protocol,
parse_duration, parse_duration,
mimetype2ext, mimetype2ext,
update_Request,
update_url_query, update_url_query,
) )
@ -347,7 +349,7 @@ class InfoExtractor(object):
def IE_NAME(self): def IE_NAME(self):
return compat_str(type(self).__name__[:-2]) return compat_str(type(self).__name__[:-2])
def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers=None, query=None): def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
""" Returns the response handle """ """ Returns the response handle """
if note is None: if note is None:
self.report_download_webpage(video_id) self.report_download_webpage(video_id)
@ -357,11 +359,14 @@ class InfoExtractor(object):
else: else:
self.to_screen('%s: %s' % (video_id, note)) self.to_screen('%s: %s' % (video_id, note))
# data, headers and query params will be ignored for `Request` objects # data, headers and query params are applied to `Request` objects through update_Request
if isinstance(url_or_request, compat_str): if isinstance(url_or_request, compat_urllib_request.Request):
url_or_request = update_Request(
url_or_request, data=data, headers=headers, query=query)
else:
if query: if query:
url_or_request = update_url_query(url_or_request, query) url_or_request = update_url_query(url_or_request, query)
if data or headers: if data or headers:
url_or_request = sanitized_Request(url_or_request, data, headers or {}) url_or_request = sanitized_Request(url_or_request, data, headers)
try: try:
return self._downloader.urlopen(url_or_request) return self._downloader.urlopen(url_or_request)
except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
@ -377,7 +382,7 @@ class InfoExtractor(object):
self._downloader.report_warning(errmsg) self._downloader.report_warning(errmsg)
return False return False
def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers=None, query=None): def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}):
""" Returns a tuple (page content as string, URL handle) """ """ Returns a tuple (page content as string, URL handle) """
# Strip hashes from the URL (#1038) # Strip hashes from the URL (#1038)
if isinstance(url_or_request, (compat_str, str)): if isinstance(url_or_request, (compat_str, str)):
@ -470,7 +475,7 @@ class InfoExtractor(object):
return content return content
def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None, data=None, headers=None, query=None): def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None, data=None, headers={}, query={}):
""" Returns the data of the page as a string """ """ Returns the data of the page as a string """
success = False success = False
try_count = 0 try_count = 0
@ -491,7 +496,7 @@ class InfoExtractor(object):
def _download_xml(self, url_or_request, video_id, def _download_xml(self, url_or_request, video_id,
note='Downloading XML', errnote='Unable to download XML', note='Downloading XML', errnote='Unable to download XML',
transform_source=None, fatal=True, encoding=None, data=None, headers=None, query=None): transform_source=None, fatal=True, encoding=None, data=None, headers={}, query={}):
"""Return the xml as an xml.etree.ElementTree.Element""" """Return the xml as an xml.etree.ElementTree.Element"""
xml_string = self._download_webpage( xml_string = self._download_webpage(
url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding, data=data, headers=headers, query=query) url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding, data=data, headers=headers, query=query)
@ -505,7 +510,7 @@ class InfoExtractor(object):
note='Downloading JSON metadata', note='Downloading JSON metadata',
errnote='Unable to download JSON metadata', errnote='Unable to download JSON metadata',
transform_source=None, transform_source=None,
fatal=True, encoding=None, data=None, headers=None, query=None): fatal=True, encoding=None, data=None, headers={}, query={}):
json_string = self._download_webpage( json_string = self._download_webpage(
url_or_request, video_id, note, errnote, fatal=fatal, url_or_request, video_id, note, errnote, fatal=fatal,
encoding=encoding, data=data, headers=headers, query=query) encoding=encoding, data=data, headers=headers, query=query)