diff options
| author | Maximilian Hils <git@maximilianhils.com> | 2016-07-16 00:13:58 -0700 | 
|---|---|---|
| committer | GitHub <noreply@github.com> | 2016-07-16 00:13:58 -0700 | 
| commit | b27d59095d799436fed41eaeaba502ecceb40f76 (patch) | |
| tree | 152440c1e22850b81aa115817bee4d661f2435de /netlib/http | |
| parent | 903807292b42b2481a3d72d6dbdc72939fc39b01 (diff) | |
| parent | e6e39ce80f4daaf6a1d6f8d87616409486d358a5 (diff) | |
| download | mitmproxy-b27d59095d799436fed41eaeaba502ecceb40f76.tar.gz mitmproxy-b27d59095d799436fed41eaeaba502ecceb40f76.tar.bz2 mitmproxy-b27d59095d799436fed41eaeaba502ecceb40f76.zip | |
Merge pull request #1306 from mitmproxy/message-body-encoding
Improve Message Body Encoding
Diffstat (limited to 'netlib/http')
| -rw-r--r-- | netlib/http/headers.py | 15 | ||||
| -rw-r--r-- | netlib/http/http1/assemble.py | 4 | ||||
| -rw-r--r-- | netlib/http/message.py | 254 | ||||
| -rw-r--r-- | netlib/http/request.py | 14 | ||||
| -rw-r--r-- | netlib/http/response.py | 5 | 
5 files changed, 213 insertions, 79 deletions
| diff --git a/netlib/http/headers.py b/netlib/http/headers.py index c8cf3e43..36e5060c 100644 --- a/netlib/http/headers.py +++ b/netlib/http/headers.py @@ -2,6 +2,7 @@ from __future__ import absolute_import, print_function, division  import re +import collections  import six  from netlib import multidict  from netlib import strutils @@ -206,10 +207,22 @@ def parse_content_type(c):      ts = parts[0].split("/", 1)      if len(ts) != 2:          return None -    d = {} +    d = collections.OrderedDict()      if len(parts) == 2:          for i in parts[1].split(";"):              clause = i.split("=", 1)              if len(clause) == 2:                  d[clause[0].strip()] = clause[1].strip()      return ts[0].lower(), ts[1].lower(), d + + +def assemble_content_type(type, subtype, parameters): +    if not parameters: +        return "{}/{}".format(type, subtype) +    params = "; ".join( +        "{}={}".format(k, v) +        for k, v in parameters.items() +    ) +    return "{}/{}; {}".format( +        type, subtype, params +    ) diff --git a/netlib/http/http1/assemble.py b/netlib/http/http1/assemble.py index 511328f1..e74732d2 100644 --- a/netlib/http/http1/assemble.py +++ b/netlib/http/http1/assemble.py @@ -5,7 +5,7 @@ from netlib import exceptions  def assemble_request(request): -    if request.content is None: +    if request.data.content is None:          raise exceptions.HttpException("Cannot assemble flow with missing content")      head = assemble_request_head(request)      body = b"".join(assemble_body(request.data.headers, [request.data.content])) @@ -19,7 +19,7 @@ def assemble_request_head(request):  def assemble_response(response): -    if response.content is None: +    if response.data.content is None:          raise exceptions.HttpException("Cannot assemble flow with missing content")      head = assemble_response_head(response)      body = b"".join(assemble_body(response.data.headers, [response.data.content])) diff --git a/netlib/http/message.py b/netlib/http/message.py index b268fec9..34709f0a 100644 --- a/netlib/http/message.py +++ b/netlib/http/message.py @@ -52,7 +52,23 @@ class MessageData(basetypes.Serializable):          return cls(**state) +class CachedDecode(object): +    __slots__ = ["encoded", "encoding", "strict", "decoded"] + +    def __init__(self, object, encoding, strict, decoded): +        self.encoded = object +        self.encoding = encoding +        self.strict = strict +        self.decoded = decoded + +no_cached_decode = CachedDecode(None, None, None, None) + +  class Message(basetypes.Serializable): +    def __init__(self): +        self._content_cache = no_cached_decode  # type: CachedDecode +        self._text_cache = no_cached_decode  # type: CachedDecode +      def __eq__(self, other):          if isinstance(other, Message):              return self.data == other.data @@ -90,22 +106,82 @@ class Message(basetypes.Serializable):          self.data.headers = h      @property -    def content(self): +    def raw_content(self): +        # type: () -> bytes          """          The raw (encoded) HTTP message body -        See also: :py:attr:`text` +        See also: :py:attr:`content`, :py:class:`text`          """          return self.data.content -    @content.setter -    def content(self, content): -        # type: (Optional[bytes]) -> None +    @raw_content.setter +    def raw_content(self, content):          self.data.content = content -        if isinstance(content, six.text_type): -            raise ValueError("Message content must be bytes, not {}".format(type(content).__name__)) -        if isinstance(content, bytes): -            self.headers["content-length"] = str(len(content)) + +    def get_content(self, strict=True): +        # type: (bool) -> bytes +        """ +        The HTTP message body decoded with the content-encoding header (e.g. gzip) + +        Raises: +            ValueError, when the content-encoding is invalid and strict is True. + +        See also: :py:class:`raw_content`, :py:attr:`text` +        """ +        if self.raw_content is None: +            return None +        ce = self.headers.get("content-encoding") +        cached = ( +            self._content_cache.encoded == self.raw_content and +            (self._content_cache.strict or not strict) and +            self._content_cache.encoding == ce +        ) +        if not cached: +            is_strict = True +            if ce: +                try: +                    decoded = encoding.decode(self.raw_content, ce) +                except ValueError: +                    if strict: +                        raise +                    is_strict = False +                    decoded = self.raw_content +            else: +                decoded = self.raw_content +            self._content_cache = CachedDecode(self.raw_content, ce, is_strict, decoded) +        return self._content_cache.decoded + +    def set_content(self, value): +        if value is None: +            self.raw_content = None +            return +        if not isinstance(value, bytes): +            raise TypeError( +                "Message content must be bytes, not {}. " +                "Please use .text if you want to assign a str." +                .format(type(value).__name__) +            ) +        ce = self.headers.get("content-encoding") +        cached = ( +            self._content_cache.decoded == value and +            self._content_cache.encoding == ce and +            self._content_cache.strict +        ) +        if not cached: +            try: +                encoded = encoding.encode(value, ce or "identity") +            except ValueError: +                # So we have an invalid content-encoding? +                # Let's remove it! +                del self.headers["content-encoding"] +                ce = None +                encoded = value +            self._content_cache = CachedDecode(encoded, ce, True, value) +        self.raw_content = self._content_cache.encoded +        self.headers["content-length"] = str(len(self.raw_content)) + +    content = property(get_content, set_content)      @property      def http_version(self): @@ -140,56 +216,108 @@ class Message(basetypes.Serializable):      def timestamp_end(self, timestamp_end):          self.data.timestamp_end = timestamp_end -    @property -    def text(self): -        """ -        The decoded HTTP message body. -        Decoded contents are not cached, so accessing this attribute repeatedly is relatively expensive. +    def _get_content_type_charset(self): +        # type: () -> Optional[str] +        ct = headers.parse_content_type(self.headers.get("content-type", "")) +        if ct: +            return ct[2].get("charset") -        .. note:: -            This is not implemented yet. +    def _guess_encoding(self): +        # type: () -> str +        enc = self._get_content_type_charset() +        if enc: +            return enc -        See also: :py:attr:`content`, :py:class:`decoded` +        if "json" in self.headers.get("content-type", ""): +            return "utf8" +        else: +            # We may also want to check for HTML meta tags here at some point. +            return "latin-1" + +    def get_text(self, strict=True): +        # type: (bool) -> six.text_type          """ -        # This attribute should be called text, because that's what requests does. -        raise NotImplementedError() +        The HTTP message body decoded with both content-encoding header (e.g. gzip) +        and content-type header charset. -    @text.setter -    def text(self, text): -        raise NotImplementedError() +        Raises: +            ValueError, when either content-encoding or charset is invalid and strict is True. -    def decode(self): +        See also: :py:attr:`content`, :py:class:`raw_content` +        """ +        if self.raw_content is None: +            return None +        enc = self._guess_encoding() + +        content = self.get_content(strict) +        cached = ( +            self._text_cache.encoded == content and +            (self._text_cache.strict or not strict) and +            self._text_cache.encoding == enc +        ) +        if not cached: +            is_strict = self._content_cache.strict +            try: +                decoded = encoding.decode(content, enc) +            except ValueError: +                if strict: +                    raise +                is_strict = False +                decoded = self.content.decode("utf8", "replace" if six.PY2 else "surrogateescape") +            self._text_cache = CachedDecode(content, enc, is_strict, decoded) +        return self._text_cache.decoded + +    def set_text(self, text): +        if text is None: +            self.content = None +            return +        enc = self._guess_encoding() + +        cached = ( +            self._text_cache.decoded == text and +            self._text_cache.encoding == enc and +            self._text_cache.strict +        ) +        if not cached: +            try: +                encoded = encoding.encode(text, enc) +            except ValueError: +                # Fall back to UTF-8 and update the content-type header. +                ct = headers.parse_content_type(self.headers.get("content-type", "")) or ("text", "plain", {}) +                ct[2]["charset"] = "utf-8" +                self.headers["content-type"] = headers.assemble_content_type(*ct) +                enc = "utf8" +                encoded = text.encode(enc, "replace" if six.PY2 else "surrogateescape") +            self._text_cache = CachedDecode(encoded, enc, True, text) +        self.content = self._text_cache.encoded + +    text = property(get_text, set_text) + +    def decode(self, strict=True):          """ -            Decodes body based on the current Content-Encoding header, then -            removes the header. If there is no Content-Encoding header, no -            action is taken. +        Decodes body based on the current Content-Encoding header, then +        removes the header. If there is no Content-Encoding header, no +        action is taken. -            Returns: -                True, if decoding succeeded. -                False, otherwise. +        Raises: +            ValueError, when the content-encoding is invalid and strict is True.          """ -        ce = self.headers.get("content-encoding") -        data = encoding.decode(ce, self.content) -        if data is None: -            return False -        self.content = data +        self.raw_content = self.get_content(strict)          self.headers.pop("content-encoding", None) -        return True      def encode(self, e):          """ -            Encodes body with the encoding e, where e is "gzip", "deflate" or "identity". +        Encodes body with the encoding e, where e is "gzip", "deflate" or "identity". +        Any existing content-encodings are overwritten, +        the content is not decoded beforehand. -            Returns: -                True, if decoding succeeded. -                False, otherwise. +        Raises: +            ValueError, when the specified content-encoding is invalid.          """ -        data = encoding.encode(e, self.content) -        if data is None: -            return False -        self.content = data          self.headers["content-encoding"] = e -        return True +        self.content = self.raw_content +        if "content-encoding" not in self.headers: +            raise ValueError("Invalid content encoding {}".format(repr(e)))      def replace(self, pattern, repl, flags=0):          """ @@ -206,10 +334,9 @@ class Message(basetypes.Serializable):              repl = strutils.escaped_str_to_bytes(repl)          replacements = 0          if self.content: -            with decoded(self): -                self.content, replacements = re.subn( -                    pattern, repl, self.content, flags=flags -                ) +            self.content, replacements = re.subn( +                pattern, repl, self.content, flags=flags +            )          replacements += self.headers.replace(pattern, repl, flags)          return replacements @@ -228,29 +355,16 @@ class Message(basetypes.Serializable):  class decoded(object):      """ -    A context manager that decodes a request or response, and then -    re-encodes it with the same encoding after execution of the block. - -    Example: - -    .. code-block:: python - -        with decoded(request): -            request.content = request.content.replace("foo", "bar") +    Deprecated: You can now directly use :py:attr:`content`. +    :py:attr:`raw_content` has the encoded content.      """ -    def __init__(self, message): -        self.message = message -        ce = message.headers.get("content-encoding") -        if ce in encoding.ENCODINGS: -            self.ce = ce -        else: -            self.ce = None +    def __init__(self, message):  # pragma no cover +        warnings.warn("decoded() is deprecated, you can now directly use .content instead. " +                      ".raw_content has the encoded content.", DeprecationWarning) -    def __enter__(self): -        if self.ce: -            self.message.decode() +    def __enter__(self):  # pragma no cover +        pass -    def __exit__(self, type, value, tb): -        if self.ce: -            self.message.encode(self.ce) +    def __exit__(self, type, value, tb):  # pragma no cover +        pass diff --git a/netlib/http/request.py b/netlib/http/request.py index c4c39942..ecaa9b79 100644 --- a/netlib/http/request.py +++ b/netlib/http/request.py @@ -5,7 +5,6 @@ import re  import six  from six.moves import urllib -from netlib import encoding  from netlib import multidict  from netlib import strutils  from netlib.http import multipart @@ -56,6 +55,7 @@ class Request(message.Message):      An HTTP request.      """      def __init__(self, *args, **kwargs): +        super(Request, self).__init__()          self.data = RequestData(*args, **kwargs)      def __repr__(self): @@ -339,7 +339,7 @@ class Request(message.Message):              self.headers["accept-encoding"] = (                  ', '.join(                      e -                    for e in encoding.ENCODINGS +                    for e in {"gzip", "identity", "deflate"}                      if e in accept_encoding                  )              ) @@ -359,7 +359,10 @@ class Request(message.Message):      def _get_urlencoded_form(self):          is_valid_content_type = "application/x-www-form-urlencoded" in self.headers.get("content-type", "").lower()          if is_valid_content_type: -            return tuple(netlib.http.url.decode(self.content)) +            try: +                return tuple(netlib.http.url.decode(self.content)) +            except ValueError: +                pass          return ()      def _set_urlencoded_form(self, value): @@ -388,7 +391,10 @@ class Request(message.Message):      def _get_multipart_form(self):          is_valid_content_type = "multipart/form-data" in self.headers.get("content-type", "").lower()          if is_valid_content_type: -            return multipart.decode(self.headers, self.content) +            try: +                return multipart.decode(self.headers, self.content) +            except ValueError: +                pass          return ()      def _set_multipart_form(self, value): diff --git a/netlib/http/response.py b/netlib/http/response.py index 7cfb55c8..85f54940 100644 --- a/netlib/http/response.py +++ b/netlib/http/response.py @@ -37,13 +37,14 @@ class Response(message.Message):      An HTTP response.      """      def __init__(self, *args, **kwargs): +        super(Response, self).__init__()          self.data = ResponseData(*args, **kwargs)      def __repr__(self): -        if self.content: +        if self.raw_content:              details = "{}, {}".format(                  self.headers.get("content-type", "unknown content type"), -                human.pretty_size(len(self.content)) +                human.pretty_size(len(self.raw_content))              )          else:              details = "no content" | 
