From 40b35b4aa6040ecc3ff7b3c9c8b908249633d86e Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Wed, 28 Nov 2012 02:01:09 +0100 Subject: [PATCH] hack for apparently broken parse_qs in python2 --- youtube_dl/utils.py | 76 ++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 75 insertions(+), 1 deletion(-) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index a5df62bf81..cf78e9dc84 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -49,7 +49,81 @@ except ImportError: # Python 2 try: from urllib.parse import parse_qs as compat_parse_qs except ImportError: # Python 2 - from urlparse import parse_qs as compat_parse_qs + # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib. + # Python 2's version is apparently totally broken + def _unquote(string, encoding='utf-8', errors='replace'): + if string == '': + return string + res = string.split('%') + if len(res) == 1: + return string + if encoding is None: + encoding = 'utf-8' + if errors is None: + errors = 'replace' + # pct_sequence: contiguous sequence of percent-encoded bytes, decoded + pct_sequence = b'' + string = res[0] + for item in res[1:]: + try: + if not item: + raise ValueError + pct_sequence += item[:2].decode('hex') + rest = item[2:] + if not rest: + # This segment was just a single percent-encoded character. + # May be part of a sequence of code units, so delay decoding. + # (Stored in pct_sequence). + continue + except ValueError: + rest = '%' + item + # Encountered non-percent-encoded characters. Flush the current + # pct_sequence. + string += pct_sequence.decode(encoding, errors) + rest + pct_sequence = b'' + if pct_sequence: + # Flush the final pct_sequence + string += pct_sequence.decode(encoding, errors) + return string + + def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False, + encoding='utf-8', errors='replace'): + qs, _coerce_result = qs, unicode + pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')] + r = [] + for name_value in pairs: + if not name_value and not strict_parsing: + continue + nv = name_value.split('=', 1) + if len(nv) != 2: + if strict_parsing: + raise ValueError("bad query field: %r" % (name_value,)) + # Handle case of a control-name with no equal sign + if keep_blank_values: + nv.append('') + else: + continue + if len(nv[1]) or keep_blank_values: + name = nv[0].replace('+', ' ') + name = _unquote(name, encoding=encoding, errors=errors) + name = _coerce_result(name) + value = nv[1].replace('+', ' ') + value = _unquote(value, encoding=encoding, errors=errors) + value = _coerce_result(value) + r.append((name, value)) + return r + + def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False, + encoding='utf-8', errors='replace'): + parsed_result = {} + pairs = _parse_qsl(qs, keep_blank_values, strict_parsing, + encoding=encoding, errors=errors) + for name, value in pairs: + if name in parsed_result: + parsed_result[name].append(value) + else: + parsed_result[name] = [value] + return parsed_result try: compat_str = unicode # Python 2