From 6032c4f2352260d32032800a2ff694339e2af6b2 Mon Sep 17 00:00:00 2001
From: Maximilian Hils <git@maximilianhils.com>
Date: Sat, 2 Jul 2016 01:51:47 -0700
Subject: message.content -> .raw_content, implement .text

This PR improves our handling of HTTP message body encodings:

- The unaltered message body is now accessible as `.raw_content`
- The "content-encoding"-decoded content (i.e. gzip removed)
  is now `.content`, as this is what we want in 99% of the cases.
- `.text` now provides the "content-encoding"-decoded and then
  "content-type charset"-decoded message body.
- The decoded values for `.content` and `.text` are cached,
  so that repeated access and `x.text = x.text` is cheap.
- The `decoded()` context manager is now deprecated, as we can now just
  use `.content`. Similarly `HTTPMessage.get_decoded_content()` is
  deprecated.
---
 docs/dev/models.rst                   |   2 -
 mitmproxy/console/common.py           |  37 +++----
 mitmproxy/console/flowview.py         |  33 +++---
 mitmproxy/contentviews.py             |  17 +--
 mitmproxy/dump.py                     |   4 +-
 mitmproxy/filt.py                     |   8 +-
 mitmproxy/flow/master.py              |   8 +-
 mitmproxy/flow/modules.py             |   4 +-
 mitmproxy/models/http.py              |   8 +-
 mitmproxy/protocol/http.py            |   4 +-
 mitmproxy/web/app.py                  |   4 +-
 netlib/encoding.py                    |  97 +++++++++++------
 netlib/http/http1/assemble.py         |   4 +-
 netlib/http/message.py                | 192 +++++++++++++++++++++++-----------
 netlib/http/request.py                |   4 +-
 netlib/http/response.py               |   5 +-
 test/mitmproxy/test_contentview.py    |  22 ----
 test/mitmproxy/test_examples.py       |  10 +-
 test/mitmproxy/test_flow.py           |  18 +---
 test/mitmproxy/test_protocol_http2.py |   6 +-
 test/mitmproxy/tservers.py            |   1 -
 test/netlib/http/test_message.py      | 117 +++++++++++++++------
 test/netlib/test_encoding.py          |  40 +++----
 23 files changed, 377 insertions(+), 268 deletions(-)

diff --git a/docs/dev/models.rst b/docs/dev/models.rst
index 02f36f58..7260f1f7 100644
--- a/docs/dev/models.rst
+++ b/docs/dev/models.rst
@@ -56,8 +56,6 @@ Datastructures
         :special-members:
         :no-undoc-members:
 
-    .. autoclass:: decoded
-
 .. automodule:: netlib.multidict
 
     .. autoclass:: MultiDictView
diff --git a/mitmproxy/console/common.py b/mitmproxy/console/common.py
index b450c19d..b4369c0c 100644
--- a/mitmproxy/console/common.py
+++ b/mitmproxy/console/common.py
@@ -7,7 +7,6 @@ import urwid.util
 
 import netlib
 from mitmproxy import flow
-from mitmproxy import models
 from mitmproxy import utils
 from mitmproxy.console import signals
 from netlib import human
@@ -259,26 +258,24 @@ def copy_flow_format_data(part, scope, flow):
         if scope in ("q", "a"):
             if flow.request.content is None:
                 return None, "Request content is missing"
-            with models.decoded(flow.request):
-                if part == "h":
-                    data += netlib.http.http1.assemble_request(flow.request)
-                elif part == "c":
-                    data += flow.request.content
-                else:
-                    raise ValueError("Unknown part: {}".format(part))
+            if part == "h":
+                data += netlib.http.http1.assemble_request(flow.request)
+            elif part == "c":
+                data += flow.request.content
+            else:
+                raise ValueError("Unknown part: {}".format(part))
         if scope == "a" and flow.request.content and flow.response:
             # Add padding between request and response
             data += "\r\n" * 2
         if scope in ("s", "a") and flow.response:
             if flow.response.content is None:
                 return None, "Response content is missing"
-            with models.decoded(flow.response):
-                if part == "h":
-                    data += netlib.http.http1.assemble_response(flow.response)
-                elif part == "c":
-                    data += flow.response.content
-                else:
-                    raise ValueError("Unknown part: {}".format(part))
+            if part == "h":
+                data += netlib.http.http1.assemble_response(flow.response)
+            elif part == "c":
+                data += flow.response.content
+            else:
+                raise ValueError("Unknown part: {}".format(part))
     return data, False
 
 
@@ -388,12 +385,12 @@ def ask_save_body(part, master, state, flow):
     elif part == "q" and request_has_content:
         ask_save_path(
             "Save request content",
-            flow.request.get_decoded_content()
+            flow.request.content
         )
     elif part == "s" and response_has_content:
         ask_save_path(
             "Save response content",
-            flow.response.get_decoded_content()
+            flow.response.content
         )
     else:
         signals.status_message.send(message="No content to save.")
@@ -418,9 +415,9 @@ def format_flow(f, focus, extended=False, hostheader=False, marked=False):
         marked = marked,
     )
     if f.response:
-        if f.response.content:
-            contentdesc = human.pretty_size(len(f.response.content))
-        elif f.response.content is None:
+        if f.response.raw_content:
+            contentdesc = human.pretty_size(len(f.response.raw_content))
+        elif f.response.raw_content is None:
             contentdesc = "[content missing]"
         else:
             contentdesc = "[no content]"
diff --git a/mitmproxy/console/flowview.py b/mitmproxy/console/flowview.py
index e9b23176..208b0d44 100644
--- a/mitmproxy/console/flowview.py
+++ b/mitmproxy/console/flowview.py
@@ -176,7 +176,7 @@ class FlowView(tabs.Tabs):
             self.show()
 
     def content_view(self, viewmode, message):
-        if message.content is None:
+        if message.raw_content is None:
             msg, body = "", [urwid.Text([("error", "[content missing]")])]
             return msg, body
         else:
@@ -214,6 +214,12 @@ class FlowView(tabs.Tabs):
             )
             description = description.replace("Raw", "Couldn't parse: falling back to Raw")
 
+        if message.content != message.raw_content:
+            description = "[decoded {enc}] {desc}".format(
+                enc=message.headers.get("content-encoding"),
+                desc=description
+            )
+
         # Give hint that you have to tab for the response.
         if description == "No content" and isinstance(message, models.HTTPRequest):
             description = "No request content (press tab to view response)"
@@ -407,15 +413,14 @@ class FlowView(tabs.Tabs):
                 )
             )
         if part == "r":
-            with models.decoded(message):
-                # Fix an issue caused by some editors when editing a
-                # request/response body. Many editors make it hard to save a
-                # file without a terminating newline on the last line. When
-                # editing message bodies, this can cause problems. For now, I
-                # just strip the newlines off the end of the body when we return
-                # from an editor.
-                c = self.master.spawn_editor(message.content or "")
-                message.content = c.rstrip("\n")
+            # Fix an issue caused by some editors when editing a
+            # request/response body. Many editors make it hard to save a
+            # file without a terminating newline on the last line. When
+            # editing message bodies, this can cause problems. For now, I
+            # just strip the newlines off the end of the body when we return
+            # from an editor.
+            c = self.master.spawn_editor(message.content or b"")
+            message.content = c.rstrip(b"\n")
         elif part == "f":
             if not message.urlencoded_form and message.content:
                 signals.status_prompt_onekey.send(
@@ -512,14 +517,10 @@ class FlowView(tabs.Tabs):
         signals.flow_change.send(self, flow = self.flow)
 
     def delete_body(self, t):
-        if t == "m":
-            val = None
-        else:
-            val = None
         if self.tab_offset == TAB_REQ:
-            self.flow.request.content = val
+            self.flow.request.content = None
         else:
-            self.flow.response.content = val
+            self.flow.response.content = None
         signals.flow_change.send(self, flow = self.flow)
 
     def keypress(self, size, key):
diff --git a/mitmproxy/contentviews.py b/mitmproxy/contentviews.py
index de88c9ea..c9ea14ba 100644
--- a/mitmproxy/contentviews.py
+++ b/mitmproxy/contentviews.py
@@ -618,15 +618,6 @@ def get_content_view(viewmode, data, **metadata):
         Raises:
             ContentViewException, if the content view threw an error.
     """
-    msg = []
-
-    headers = metadata.get("headers", {})
-    enc = headers.get("content-encoding")
-    if enc and enc != "identity":
-        decoded = encoding.decode(enc, data)
-        if decoded:
-            data = decoded
-            msg.append("[decoded %s]" % enc)
     try:
         ret = viewmode(data, **metadata)
     # Third-party viewers can fail in unexpected ways...
@@ -637,8 +628,8 @@ def get_content_view(viewmode, data, **metadata):
             sys.exc_info()[2]
         )
     if not ret:
-        ret = get("Raw")(data, **metadata)
-        msg.append("Couldn't parse: falling back to Raw")
+        desc = "Couldn't parse: falling back to Raw"
+        _, content = get("Raw")(data, **metadata)
     else:
-        msg.append(ret[0])
-    return " ".join(msg), safe_to_print(ret[1])
+        desc, content = ret
+    return desc, safe_to_print(content)
diff --git a/mitmproxy/dump.py b/mitmproxy/dump.py
index 6670be9b..ea242bba 100644
--- a/mitmproxy/dump.py
+++ b/mitmproxy/dump.py
@@ -290,10 +290,10 @@ class DumpMaster(flow.FlowMaster):
         code = click.style(str(code), fg=code_color, bold=True, blink=(code == 418))
         reason = click.style(strutils.bytes_to_escaped_str(flow.response.reason), fg=code_color, bold=True)
 
-        if flow.response.content is None:
+        if flow.response.raw_content is None:
             size = "(content missing)"
         else:
-            size = human.pretty_size(len(flow.response.content))
+            size = human.pretty_size(len(flow.response.raw_content))
         size = click.style(size, bold=True)
 
         arrows = click.style("<<", bold=True)
diff --git a/mitmproxy/filt.py b/mitmproxy/filt.py
index b1b72aa7..95bae1ae 100644
--- a/mitmproxy/filt.py
+++ b/mitmproxy/filt.py
@@ -194,10 +194,10 @@ class FBod(_Rex):
 
     def __call__(self, f):
         if f.request and f.request.content:
-            if self.re.search(f.request.get_decoded_content()):
+            if self.re.search(f.request.content):
                 return True
         if f.response and f.response.content:
-            if self.re.search(f.response.get_decoded_content()):
+            if self.re.search(f.response.content):
                 return True
         return False
 
@@ -208,7 +208,7 @@ class FBodRequest(_Rex):
 
     def __call__(self, f):
         if f.request and f.request.content:
-            if self.re.search(f.request.get_decoded_content()):
+            if self.re.search(f.request.content):
                 return True
 
 
@@ -218,7 +218,7 @@ class FBodResponse(_Rex):
 
     def __call__(self, f):
         if f.response and f.response.content:
-            if self.re.search(f.response.get_decoded_content()):
+            if self.re.search(f.response.content):
                 return True
 
 
diff --git a/mitmproxy/flow/master.py b/mitmproxy/flow/master.py
index efb5d013..a4aa9a7e 100644
--- a/mitmproxy/flow/master.py
+++ b/mitmproxy/flow/master.py
@@ -16,7 +16,6 @@ from mitmproxy.flow import modules
 from mitmproxy.onboarding import app
 from mitmproxy.protocol import http_replay
 from mitmproxy.proxy.config import HostMatcher
-from netlib import strutils
 
 
 class FlowMaster(controller.Master):
@@ -348,13 +347,16 @@ class FlowMaster(controller.Master):
             return "Can't replay live request."
         if f.intercepted:
             return "Can't replay while intercepting..."
-        if f.request.content is None:
+        if f.request.raw_content is None:
             return "Can't replay request with missing content..."
         if f.request:
             f.backup()
             f.request.is_replay = True
+
+            # TODO: We should be able to remove this.
             if "Content-Length" in f.request.headers:
-                f.request.headers["Content-Length"] = str(len(f.request.content))
+                f.request.headers["Content-Length"] = str(len(f.request.raw_content))
+
             f.response = None
             f.error = None
             self.process_new_request(f)
diff --git a/mitmproxy/flow/modules.py b/mitmproxy/flow/modules.py
index 2998d259..85dff0f1 100644
--- a/mitmproxy/flow/modules.py
+++ b/mitmproxy/flow/modules.py
@@ -157,7 +157,7 @@ class StreamLargeBodies(object):
         expected_size = http1.expected_http_body_size(
             flow.request, flow.response if not is_request else None
         )
-        if not r.content and not (0 <= expected_size <= self.max_size):
+        if not r.raw_content and not (0 <= expected_size <= self.max_size):
             # r.stream may already be a callable, which we want to preserve.
             r.stream = r.stream or True
 
@@ -251,7 +251,7 @@ class ServerPlaybackState:
                     if p[0] not in self.ignore_payload_params
                 )
             else:
-                key.append(str(r.content))
+                key.append(str(r.raw_content))
 
         if not self.ignore_host:
             key.append(r.host)
diff --git a/mitmproxy/models/http.py b/mitmproxy/models/http.py
index 01f5f1ee..a50808ef 100644
--- a/mitmproxy/models/http.py
+++ b/mitmproxy/models/http.py
@@ -1,9 +1,9 @@
 from __future__ import absolute_import, print_function, division
 
 import cgi
+import warnings
 
 from mitmproxy.models.flow import Flow
-from netlib import encoding
 from netlib import version
 from netlib.http import Headers
 from netlib.http import Request
@@ -20,10 +20,8 @@ class MessageMixin(object):
             header.
             Doesn't change the message iteself or its headers.
         """
-        ce = self.headers.get("content-encoding")
-        if not self.content or ce not in encoding.ENCODINGS:
-            return self.content
-        return encoding.decode(ce, self.content)
+        warnings.warn(".get_decoded_content() is deprecated, please use .content directly instead.", DeprecationWarning)
+        return self.content
 
 
 class HTTPRequest(MessageMixin, Request):
diff --git a/mitmproxy/protocol/http.py b/mitmproxy/protocol/http.py
index 187c17f6..2c70f288 100644
--- a/mitmproxy/protocol/http.py
+++ b/mitmproxy/protocol/http.py
@@ -41,10 +41,10 @@ class _HttpTransmissionLayer(base.Layer):
         yield "this is a generator"  # pragma: no cover
 
     def send_response(self, response):
-        if response.content is None:
+        if response.data.content is None:
             raise netlib.exceptions.HttpException("Cannot assemble flow with missing content")
         self.send_response_headers(response)
-        self.send_response_body(response, [response.content])
+        self.send_response_body(response, [response.data.content])
 
     def send_response_headers(self, response):
         raise NotImplementedError()
diff --git a/mitmproxy/web/app.py b/mitmproxy/web/app.py
index a2798472..50fbaed8 100644
--- a/mitmproxy/web/app.py
+++ b/mitmproxy/web/app.py
@@ -272,7 +272,7 @@ class FlowContent(RequestHandler):
     def get(self, flow_id, message):
         message = getattr(self.flow, message)
 
-        if not message.content:
+        if not message.raw_content:
             raise APIError(400, "No content.")
 
         content_encoding = message.headers.get("Content-Encoding", None)
@@ -295,7 +295,7 @@ class FlowContent(RequestHandler):
         self.set_header("Content-Type", "application/text")
         self.set_header("X-Content-Type-Options", "nosniff")
         self.set_header("X-Frame-Options", "DENY")
-        self.write(message.content)
+        self.write(message.raw_content)
 
 
 class Events(RequestHandler):
diff --git a/netlib/encoding.py b/netlib/encoding.py
index 98502451..8b67b543 100644
--- a/netlib/encoding.py
+++ b/netlib/encoding.py
@@ -1,39 +1,62 @@
 """
-    Utility functions for decoding response bodies.
+Utility functions for decoding response bodies.
 """
 from __future__ import absolute_import
+
+import codecs
 from io import BytesIO
 import gzip
 import zlib
 
+from typing import Union  # noqa
+
 
-ENCODINGS = {"identity", "gzip", "deflate"}
+def decode(obj, encoding, errors='strict'):
+    # type: (Union[str, bytes], str, str) -> Union[str, bytes]
+    """
+    Decode the given input object
 
+    Returns:
+        The decoded value
 
-def decode(e, content):
-    if not isinstance(content, bytes):
-        return None
-    encoding_map = {
-        "identity": identity,
-        "gzip": decode_gzip,
-        "deflate": decode_deflate,
-    }
-    if e not in encoding_map:
-        return None
-    return encoding_map[e](content)
+    Raises:
+        ValueError, if decoding fails.
+    """
+    try:
+        try:
+            return custom_decode[encoding](obj)
+        except KeyError:
+            return codecs.decode(obj, encoding, errors)
+    except Exception as e:
+        raise ValueError("{} when decoding {} with {}".format(
+            type(e).__name__,
+            repr(obj)[:10],
+            repr(encoding),
+        ))
+
+
+def encode(obj, encoding, errors='strict'):
+    # type: (Union[str, bytes], str, str) -> Union[str, bytes]
+    """
+    Encode the given input object
 
+    Returns:
+        The encoded value
 
-def encode(e, content):
-    if not isinstance(content, bytes):
-        return None
-    encoding_map = {
-        "identity": identity,
-        "gzip": encode_gzip,
-        "deflate": encode_deflate,
-    }
-    if e not in encoding_map:
-        return None
-    return encoding_map[e](content)
+    Raises:
+        ValueError, if encoding fails.
+    """
+    try:
+        try:
+            return custom_encode[encoding](obj)
+        except KeyError:
+            return codecs.encode(obj, encoding, errors)
+    except Exception as e:
+        raise ValueError("{} when encoding {} with {}".format(
+            type(e).__name__,
+            repr(obj)[:10],
+            repr(encoding),
+        ))
 
 
 def identity(content):
@@ -46,10 +69,7 @@ def identity(content):
 
 def decode_gzip(content):
     gfile = gzip.GzipFile(fileobj=BytesIO(content))
-    try:
-        return gfile.read()
-    except (IOError, EOFError):
-        return None
+    return gfile.read()
 
 
 def encode_gzip(content):
@@ -70,12 +90,9 @@ def decode_deflate(content):
         http://bugs.python.org/issue5784
     """
     try:
-        try:
-            return zlib.decompress(content)
-        except zlib.error:
-            return zlib.decompress(content, -15)
+        return zlib.decompress(content)
     except zlib.error:
-        return None
+        return zlib.decompress(content, -15)
 
 
 def encode_deflate(content):
@@ -84,4 +101,16 @@ def encode_deflate(content):
     """
     return zlib.compress(content)
 
-__all__ = ["ENCODINGS", "encode", "decode"]
+
+custom_decode = {
+    "identity": identity,
+    "gzip": decode_gzip,
+    "deflate": decode_deflate,
+}
+custom_encode = {
+    "identity": identity,
+    "gzip": encode_gzip,
+    "deflate": encode_deflate,
+}
+
+__all__ = ["encode", "decode"]
diff --git a/netlib/http/http1/assemble.py b/netlib/http/http1/assemble.py
index 511328f1..e74732d2 100644
--- a/netlib/http/http1/assemble.py
+++ b/netlib/http/http1/assemble.py
@@ -5,7 +5,7 @@ from netlib import exceptions
 
 
 def assemble_request(request):
-    if request.content is None:
+    if request.data.content is None:
         raise exceptions.HttpException("Cannot assemble flow with missing content")
     head = assemble_request_head(request)
     body = b"".join(assemble_body(request.data.headers, [request.data.content]))
@@ -19,7 +19,7 @@ def assemble_request_head(request):
 
 
 def assemble_response(response):
-    if response.content is None:
+    if response.data.content is None:
         raise exceptions.HttpException("Cannot assemble flow with missing content")
     head = assemble_response_head(response)
     body = b"".join(assemble_body(response.data.headers, [response.data.content]))
diff --git a/netlib/http/message.py b/netlib/http/message.py
index 0583c246..668198f8 100644
--- a/netlib/http/message.py
+++ b/netlib/http/message.py
@@ -52,7 +52,22 @@ class MessageData(basetypes.Serializable):
         return cls(**state)
 
 
+class CachedDecode(object):
+    __slots__ = ["encoded", "encoding", "decoded"]
+
+    def __init__(self, object, encoding, decoded):
+        self.encoded = object
+        self.encoding = encoding
+        self.decoded = decoded
+
+no_cached_decode = CachedDecode(None, None, None)
+
+
 class Message(basetypes.Serializable):
+    def __init__(self):
+        self._content_cache = no_cached_decode  # type: CachedDecode
+        self._text_cache = no_cached_decode  # type: CachedDecode
+
     def __eq__(self, other):
         if isinstance(other, Message):
             return self.data == other.data
@@ -90,19 +105,65 @@ class Message(basetypes.Serializable):
         self.data.headers = h
 
     @property
-    def content(self):
+    def raw_content(self):
+        # type: () -> bytes
         """
         The raw (encoded) HTTP message body
 
-        See also: :py:attr:`text`
+        See also: :py:attr:`content`, :py:attr:`text`
         """
         return self.data.content
 
-    @content.setter
-    def content(self, content):
+    @raw_content.setter
+    def raw_content(self, content):
         self.data.content = content
-        if isinstance(content, bytes):
-            self.headers["content-length"] = str(len(content))
+
+    @property
+    def content(self):
+        # type: () -> bytes
+        """
+        The HTTP message body decoded with the content-encoding header (e.g. gzip)
+
+        See also: :py:attr:`raw_content`, :py:attr:`text`
+        """
+        ce = self.headers.get("content-encoding")
+        cached = (
+            self._content_cache.encoded == self.raw_content and
+            self._content_cache.encoding == ce
+        )
+        if not cached:
+            try:
+                if not ce:
+                    raise ValueError()
+                decoded = encoding.decode(self.raw_content, ce)
+            except ValueError:
+                decoded = self.raw_content
+            self._content_cache = CachedDecode(self.raw_content, ce, decoded)
+        return self._content_cache.decoded
+
+    @content.setter
+    def content(self, value):
+        ce = self.headers.get("content-encoding")
+        cached = (
+            self._content_cache.decoded == value and
+            self._content_cache.encoding == ce
+        )
+        if not cached:
+            try:
+                if not ce:
+                    raise ValueError()
+                encoded = encoding.encode(value, ce)
+            except ValueError:
+                # Do we have an unknown content-encoding?
+                # If so, we want to remove it.
+                if value and ce:
+                    self.headers.pop("content-encoding", None)
+                    ce = None
+                encoded = value
+            self._content_cache = CachedDecode(encoded, ce, value)
+        self.raw_content = self._content_cache.encoded
+        if isinstance(self.raw_content, bytes):
+            self.headers["content-length"] = str(len(self.raw_content))
 
     @property
     def http_version(self):
@@ -137,56 +198,81 @@ class Message(basetypes.Serializable):
     def timestamp_end(self, timestamp_end):
         self.data.timestamp_end = timestamp_end
 
+    def _get_content_type_charset(self):
+        # type: () -> Optional[str]
+        ct = headers.parse_content_type(self.headers.get("content-type", ""))
+        if ct:
+            return ct[2].get("charset")
+
     @property
     def text(self):
+        # type: () -> six.text_type
         """
-        The decoded HTTP message body.
-        Decoded contents are not cached, so accessing this attribute repeatedly is relatively expensive.
-
-        .. note::
-            This is not implemented yet.
+        The HTTP message body decoded with both content-encoding header (e.g. gzip)
+        and content-type header charset.
 
-        See also: :py:attr:`content`, :py:class:`decoded`
+        See also: :py:attr:`content`, :py:attr:`raw_content`
         """
         # This attribute should be called text, because that's what requests does.
-        raise NotImplementedError()
+        enc = self._get_content_type_charset()
+
+        # We may also want to check for HTML meta tags here at some point.
+
+        cached = (
+            self._text_cache.encoded == self.content and
+            self._text_cache.encoding == enc
+        )
+        if not cached:
+            try:
+                if not enc:
+                    raise ValueError()
+                decoded = encoding.decode(self.content, enc)
+            except ValueError:
+                decoded = self.content.decode("utf8", "replace" if six.PY2 else "surrogateescape")
+            self._text_cache = CachedDecode(self.content, enc, decoded)
+        return self._text_cache.decoded
 
     @text.setter
     def text(self, text):
-        raise NotImplementedError()
+        enc = self._get_content_type_charset()
+        cached = (
+            self._text_cache.decoded == text and
+            self._text_cache.encoding == enc
+        )
+        if not cached:
+            try:
+                if not enc:
+                    raise ValueError()
+                encoded = encoding.encode(text, enc)
+            except ValueError:
+                # Do we have an unknown content-type charset?
+                # If so, we want to replace it with utf8.
+                if text and enc:
+                    self.headers["content-type"] = re.sub(
+                        "charset=[^;]+",
+                        "charset=utf-8",
+                        self.headers["content-type"]
+                    )
+                encoded = text.encode("utf8", "replace" if six.PY2 else "surrogateescape")
+            self._text_cache = CachedDecode(encoded, enc, text)
+        self.content = self._text_cache.encoded
 
     def decode(self):
         """
-            Decodes body based on the current Content-Encoding header, then
-            removes the header. If there is no Content-Encoding header, no
-            action is taken.
-
-            Returns:
-                True, if decoding succeeded.
-                False, otherwise.
+        Decodes body based on the current Content-Encoding header, then
+        removes the header. If there is no Content-Encoding header, no
+        action is taken.
         """
-        ce = self.headers.get("content-encoding")
-        data = encoding.decode(ce, self.content)
-        if data is None:
-            return False
-        self.content = data
+        self.raw_content = self.content
         self.headers.pop("content-encoding", None)
-        return True
 
     def encode(self, e):
         """
-            Encodes body with the encoding e, where e is "gzip", "deflate" or "identity".
-
-            Returns:
-                True, if decoding succeeded.
-                False, otherwise.
+        Encodes body with the encoding e, where e is "gzip", "deflate" or "identity".
         """
-        data = encoding.encode(e, self.content)
-        if data is None:
-            return False
-        self.content = data
+        self.decode()  # remove the current encoding
         self.headers["content-encoding"] = e
-        return True
+        self.content = self.raw_content
 
     def replace(self, pattern, repl, flags=0):
         """
@@ -203,10 +289,9 @@ class Message(basetypes.Serializable):
             repl = strutils.escaped_str_to_bytes(repl)
         replacements = 0
         if self.content:
-            with decoded(self):
-                self.content, replacements = re.subn(
-                    pattern, repl, self.content, flags=flags
-                )
+            self.content, replacements = re.subn(
+                pattern, repl, self.content, flags=flags
+            )
         replacements += self.headers.replace(pattern, repl, flags)
         return replacements
 
@@ -225,29 +310,16 @@ class Message(basetypes.Serializable):
 
 class decoded(object):
     """
-    A context manager that decodes a request or response, and then
-    re-encodes it with the same encoding after execution of the block.
-
-    Example:
-
-    .. code-block:: python
-
-        with decoded(request):
-            request.content = request.content.replace("foo", "bar")
+    Deprecated: You can now directly use :py:attr:`content`.
+    :py:attr:`raw_content` has the encoded content.
     """
 
     def __init__(self, message):
-        self.message = message
-        ce = message.headers.get("content-encoding")
-        if ce in encoding.ENCODINGS:
-            self.ce = ce
-        else:
-            self.ce = None
+        warnings.warn("decoded() is deprecated, you can now directly use .content instead. "
+                      ".raw_content has the encoded content.", DeprecationWarning)
 
     def __enter__(self):
-        if self.ce:
-            self.message.decode()
+        pass
 
     def __exit__(self, type, value, tb):
-        if self.ce:
-            self.message.encode(self.ce)
+        pass
\ No newline at end of file
diff --git a/netlib/http/request.py b/netlib/http/request.py
index d9f4ed00..4ce94549 100644
--- a/netlib/http/request.py
+++ b/netlib/http/request.py
@@ -5,7 +5,6 @@ import re
 import six
 from six.moves import urllib
 
-from netlib import encoding
 from netlib import multidict
 from netlib import strutils
 from netlib.http import multipart
@@ -44,6 +43,7 @@ class Request(message.Message):
     An HTTP request.
     """
     def __init__(self, *args, **kwargs):
+        super(Request, self).__init__()
         self.data = RequestData(*args, **kwargs)
 
     def __repr__(self):
@@ -327,7 +327,7 @@ class Request(message.Message):
             self.headers["accept-encoding"] = (
                 ', '.join(
                     e
-                    for e in encoding.ENCODINGS
+                    for e in {"gzip", "identity", "deflate"}
                     if e in accept_encoding
                 )
             )
diff --git a/netlib/http/response.py b/netlib/http/response.py
index 17d69418..d2273edd 100644
--- a/netlib/http/response.py
+++ b/netlib/http/response.py
@@ -30,13 +30,14 @@ class Response(message.Message):
     An HTTP response.
     """
     def __init__(self, *args, **kwargs):
+        super(Response, self).__init__()
         self.data = ResponseData(*args, **kwargs)
 
     def __repr__(self):
-        if self.content:
+        if self.raw_content:
             details = "{}, {}".format(
                 self.headers.get("content-type", "unknown content type"),
-                human.pretty_size(len(self.content))
+                human.pretty_size(len(self.raw_content))
             )
         else:
             details = "no content"
diff --git a/test/mitmproxy/test_contentview.py b/test/mitmproxy/test_contentview.py
index 52fceeac..4b099d8d 100644
--- a/test/mitmproxy/test_contentview.py
+++ b/test/mitmproxy/test_contentview.py
@@ -209,28 +209,6 @@ Larry
             headers=Headers()
         )
 
-        r = cv.get_content_view(
-            cv.get("Auto"),
-            encoding.encode('gzip', b"[1, 2, 3]"),
-            headers=Headers(
-                content_type="application/json",
-                content_encoding="gzip"
-            )
-        )
-        assert "decoded gzip" in r[0]
-        assert "JSON" in r[0]
-
-        r = cv.get_content_view(
-            cv.get("XML"),
-            encoding.encode('gzip', b"[1, 2, 3]"),
-            headers=Headers(
-                content_type="application/json",
-                content_encoding="gzip"
-            )
-        )
-        assert "decoded gzip" in r[0]
-        assert "Raw" in r[0]
-
     def test_add_cv(self):
         class TestContentView(cv.View):
             name = "test"
diff --git a/test/mitmproxy/test_examples.py b/test/mitmproxy/test_examples.py
index 607d6faf..22d3c425 100644
--- a/test/mitmproxy/test_examples.py
+++ b/test/mitmproxy/test_examples.py
@@ -73,9 +73,9 @@ def test_add_header():
 def test_custom_contentviews():
     with example("custom_contentviews.py") as ex:
         pig = ex.ctx.contentview
-        _, fmt = pig("<html>test!</html>")
-        assert any('esttay!' in val[0][1] for val in fmt)
-        assert not pig("gobbledygook")
+        _, fmt = pig(b"<html>test!</html>")
+        assert any(b'esttay!' in val[0][1] for val in fmt)
+        assert not pig(b"gobbledygook")
 
 
 def test_iframe_injector():
@@ -103,7 +103,7 @@ def test_modify_form():
 
 
 def test_modify_querystring():
-    flow = tutils.tflow(req=netutils.treq(path="/search?q=term"))
+    flow = tutils.tflow(req=netutils.treq(path=b"/search?q=term"))
     with example("modify_querystring.py") as ex:
         ex.run("request", flow)
         assert flow.request.query["mitmproxy"] == "rocks"
@@ -126,7 +126,7 @@ def test_modify_response_body():
 
 
 def test_redirect_requests():
-    flow = tutils.tflow(req=netutils.treq(host="example.org"))
+    flow = tutils.tflow(req=netutils.treq(host=b"example.org"))
     with example("redirect_requests.py") as ex:
         ex.run("request", flow)
         assert flow.request.host == "mitmproxy.org"
diff --git a/test/mitmproxy/test_flow.py b/test/mitmproxy/test_flow.py
index 9eaab9aa..5753e728 100644
--- a/test/mitmproxy/test_flow.py
+++ b/test/mitmproxy/test_flow.py
@@ -518,13 +518,13 @@ class TestFlow(object):
 
         f.replace("foo", "bar")
 
-        assert f.request.content != "abarb"
+        assert f.request.raw_content != "abarb"
         f.request.decode()
-        assert f.request.content == "abarb"
+        assert f.request.raw_content == "abarb"
 
-        assert f.response.content != "abarb"
+        assert f.response.raw_content != "abarb"
         f.response.decode()
-        assert f.response.content == "abarb"
+        assert f.response.raw_content == "abarb"
 
 
 class TestState:
@@ -1102,16 +1102,6 @@ class TestRequest:
         r.constrain_encoding()
         assert "oink" not in r.headers["accept-encoding"]
 
-    def test_get_decoded_content(self):
-        r = HTTPRequest.wrap(netlib.tutils.treq())
-        r.content = None
-        r.headers["content-encoding"] = "identity"
-        assert r.get_decoded_content() is None
-
-        r.content = "falafel"
-        r.encode("gzip")
-        assert r.get_decoded_content() == "falafel"
-
     def test_get_content_type(self):
         resp = HTTPResponse.wrap(netlib.tutils.tresp())
         resp.headers = Headers(content_type="text/plain")
diff --git a/test/mitmproxy/test_protocol_http2.py b/test/mitmproxy/test_protocol_http2.py
index 932c8df2..6e021b2c 100644
--- a/test/mitmproxy/test_protocol_http2.py
+++ b/test/mitmproxy/test_protocol_http2.py
@@ -120,7 +120,7 @@ class _Http2TestBase(object):
         client.wfile.flush()
 
         # read CONNECT response
-        while client.rfile.readline() != "\r\n":
+        while client.rfile.readline() != b"\r\n":
             pass
 
         client.convert_to_ssl(alpn_protos=[b'h2'])
@@ -197,7 +197,7 @@ class TestSimple(_Http2TestBase, _Http2ServerBase):
             (':path', '/'),
             ('ClIeNt-FoO', 'client-bar-1'),
             ('ClIeNt-FoO', 'client-bar-2'),
-        ], body='my request body echoed back to me')
+        ], body=b'my request body echoed back to me')
 
         done = False
         while not done:
@@ -269,7 +269,7 @@ class TestWithBodies(_Http2TestBase, _Http2ServerBase):
                 (':scheme', 'https'),
                 (':path', '/'),
             ],
-            body='foobar with request body',
+            body=b'foobar with request body',
         )
 
         done = False
diff --git a/test/mitmproxy/tservers.py b/test/mitmproxy/tservers.py
index 51f4b4e2..6d8730f5 100644
--- a/test/mitmproxy/tservers.py
+++ b/test/mitmproxy/tservers.py
@@ -11,7 +11,6 @@ import pathod.pathoc
 from mitmproxy import flow, controller
 from mitmproxy.cmdline import APP_HOST, APP_PORT
 
-from netlib import strutils
 
 testapp = flask.Flask(__name__)
 
diff --git a/test/netlib/http/test_message.py b/test/netlib/http/test_message.py
index f5bf7f0c..aecde1ec 100644
--- a/test/netlib/http/test_message.py
+++ b/test/netlib/http/test_message.py
@@ -1,7 +1,8 @@
 # -*- coding: utf-8 -*-
 from __future__ import absolute_import, print_function, division
 
-from netlib.http import decoded
+import six
+
 from netlib.tutils import tresp
 
 
@@ -76,6 +77,9 @@ class TestMessage(object):
         resp.content = b""
         assert resp.data.content == b""
         assert resp.headers["content-length"] == "0"
+        resp.raw_content = b"bar"
+        assert resp.data.content == b"bar"
+        assert resp.headers["content-length"] == "0"
 
     def test_content_basic(self):
         _test_passthrough_attr(tresp(), "content")
@@ -93,61 +97,108 @@ class TestMessage(object):
         _test_decoded_attr(tresp(), "http_version")
 
 
-class TestDecodedDecorator(object):
-
+class TestMessageContentEncoding(object):
     def test_simple(self):
         r = tresp()
-        assert r.content == b"message"
+        assert r.raw_content == b"message"
         assert "content-encoding" not in r.headers
-        assert r.encode("gzip")
+        r.encode("gzip")
 
         assert r.headers["content-encoding"]
-        assert r.content != b"message"
-        with decoded(r):
-            assert "content-encoding" not in r.headers
-            assert r.content == b"message"
-        assert r.headers["content-encoding"]
-        assert r.content != b"message"
+        assert r.raw_content != b"message"
+        assert r.content == b"message"
+        assert r.raw_content != b"message"
 
     def test_modify(self):
         r = tresp()
         assert "content-encoding" not in r.headers
-        assert r.encode("gzip")
-
-        with decoded(r):
-            r.content = b"foo"
+        r.encode("gzip")
 
-        assert r.content != b"foo"
+        r.content = b"foo"
+        assert r.raw_content != b"foo"
         r.decode()
-        assert r.content == b"foo"
+        assert r.raw_content == b"foo"
 
     def test_unknown_ce(self):
         r = tresp()
         r.headers["content-encoding"] = "zopfli"
-        r.content = b"foo"
-        with decoded(r):
-            assert r.headers["content-encoding"]
-            assert r.content == b"foo"
-        assert r.headers["content-encoding"]
+        r.raw_content = b"foo"
         assert r.content == b"foo"
+        assert r.headers["content-encoding"]
 
     def test_cannot_decode(self):
         r = tresp()
-        assert r.encode("gzip")
-        r.content = b"foo"
-        with decoded(r):
-            assert r.headers["content-encoding"]
-            assert r.content == b"foo"
+        r.encode("gzip")
+        r.raw_content = b"foo"
+        assert r.content == b"foo"
         assert r.headers["content-encoding"]
-        assert r.content != b"foo"
         r.decode()
-        assert r.content == b"foo"
+        assert r.raw_content == b"foo"
+        assert "content-encoding" not in r.headers
 
     def test_cannot_encode(self):
         r = tresp()
-        assert r.encode("gzip")
-        with decoded(r):
-            r.content = None
+        r.encode("gzip")
+        r.content = None
+        assert r.headers["content-encoding"]
+        assert r.raw_content is None
 
+        r.headers["content-encoding"] = "zopfli"
+        r.content = b"foo"
         assert "content-encoding" not in r.headers
-        assert r.content is None
+        assert r.raw_content == b"foo"
+
+
+class TestMessageText(object):
+    def test_simple(self):
+        r = tresp(content=b'\xc3\xbc')
+        assert r.raw_content == b"\xc3\xbc"
+        assert r.content == b"\xc3\xbc"
+        assert r.text == u"ü"
+
+        r.encode("gzip")
+        assert r.text == u"ü"
+        r.decode()
+        assert r.text == u"ü"
+
+        r.headers["content-type"] = "text/html; charset=latin1"
+        assert r.content == b"\xc3\xbc"
+        assert r.text == u"Ã¼"
+
+    def test_modify(self):
+        r = tresp()
+
+        r.text = u"ü"
+        assert r.raw_content == b"\xc3\xbc"
+
+        r.headers["content-type"] = "text/html; charset=latin1"
+        r.text = u"ü"
+        assert r.raw_content == b"\xfc"
+        assert r.headers["content-length"] == "1"
+
+    def test_unknown_ce(self):
+        r = tresp()
+        r.headers["content-type"] = "text/html; charset=wtf"
+        r.raw_content = b"foo"
+        assert r.text == u"foo"
+
+    def test_cannot_decode(self):
+        r = tresp()
+        r.raw_content = b"\xFF"  # 0xFF is never valid UTF-8
+        assert r.text == (u'\ufffd' if six.PY2 else '\udcff')  # parens required: "a == b if c else d" parses as "(a == b) if c else d"
+
+    def test_cannot_encode(self):
+        r = tresp()
+        r.content = None
+        assert "content-type" not in r.headers
+        assert r.raw_content is None
+
+        r.headers["content-type"] = "text/html; charset=latin1"
+        r.text = u"☃"
+        assert r.headers["content-type"] == "text/html; charset=utf-8"
+        assert r.raw_content == b'\xe2\x98\x83'
+
+        r.headers["content-type"] = "text/html; charset=latin1"
+        r.text = u'\udcff'
+        assert r.headers["content-type"] == "text/html; charset=utf-8"
+        assert r.raw_content == (b'\xed\xb3\xbf' if six.PY2 else b"\xFF")  # parens required, otherwise PY3 only asserts a truthy literal
diff --git a/test/netlib/test_encoding.py b/test/netlib/test_encoding.py
index 0ff1aad1..de10fc48 100644
--- a/test/netlib/test_encoding.py
+++ b/test/netlib/test_encoding.py
@@ -1,37 +1,39 @@
-from netlib import encoding
+from netlib import encoding, tutils
 
 
 def test_identity():
-    assert b"string" == encoding.decode("identity", b"string")
-    assert b"string" == encoding.encode("identity", b"string")
-    assert not encoding.encode("nonexistent", b"string")
-    assert not encoding.decode("nonexistent encoding", b"string")
+    assert b"string" == encoding.decode(b"string", "identity")
+    assert b"string" == encoding.encode(b"string", "identity")
+    with tutils.raises(ValueError):
+        encoding.encode(b"string", "nonexistent encoding")
 
 
 def test_gzip():
     assert b"string" == encoding.decode(
-        "gzip",
         encoding.encode(
-            "gzip",
-            b"string"
-        )
+            b"string",
+            "gzip"
+        ),
+        "gzip"
     )
-    assert encoding.decode("gzip", b"bogus") is None
+    with tutils.raises(ValueError):
+        encoding.decode(b"bogus", "gzip")
 
 
 def test_deflate():
     assert b"string" == encoding.decode(
-        "deflate",
         encoding.encode(
-            "deflate",
-            b"string"
-        )
+            b"string",
+            "deflate"
+        ),
+        "deflate"
     )
     assert b"string" == encoding.decode(
-        "deflate",
         encoding.encode(
-            "deflate",
-            b"string"
-        )[2:-4]
+            b"string",
+            "deflate"
+        )[2:-4],
+        "deflate"
     )
-    assert encoding.decode("deflate", b"bogus") is None
+    with tutils.raises(ValueError):
+        encoding.decode(b"bogus", "deflate")
-- 
cgit v1.2.3


From dbf7cb1a442e2c0823d853ca310395048496996d Mon Sep 17 00:00:00 2001
From: Maximilian Hils <git@maximilianhils.com>
Date: Sat, 2 Jul 2016 02:01:46 -0700
Subject: update examples: no decoded() anymore :tada:

---
 examples/custom_contentviews.py  |  2 +-
 examples/har_extractor.py        |  2 +-
 examples/iframe_injector.py      | 24 +++++++++++-------------
 examples/modify_response_body.py | 10 ++++------
 examples/redirect_requests.py    |  4 ++--
 examples/sslstrip.py             | 38 ++++++++++++++++++--------------------
 examples/upsidedownternet.py     | 20 +++++++++-----------
 7 files changed, 46 insertions(+), 54 deletions(-)

diff --git a/examples/custom_contentviews.py b/examples/custom_contentviews.py
index 05ebeb69..8a57bf74 100644
--- a/examples/custom_contentviews.py
+++ b/examples/custom_contentviews.py
@@ -20,7 +20,7 @@ class ViewPigLatin(contentviews.View):
             docinfo = d.getroottree().docinfo
 
             def piglify(src):
-                words = string.split(src)
+                words = src.split()
                 ret = ''
                 for word in words:
                     idx = -1
diff --git a/examples/har_extractor.py b/examples/har_extractor.py
index d6b50c21..54aa84d3 100644
--- a/examples/har_extractor.py
+++ b/examples/har_extractor.py
@@ -127,7 +127,7 @@ def response(context, flow):
                             for k, v in flow.request.query or {}]
 
     response_body_size = len(flow.response.content)
-    response_body_decoded_size = len(flow.response.get_decoded_content())
+    response_body_decoded_size = len(flow.response.content)
     response_body_compression = response_body_decoded_size - response_body_size
 
     entry = HAR.entries({
diff --git a/examples/iframe_injector.py b/examples/iframe_injector.py
index 9495da93..5803b4c1 100644
--- a/examples/iframe_injector.py
+++ b/examples/iframe_injector.py
@@ -2,7 +2,6 @@
 # (this script works best with --anticache)
 import sys
 from bs4 import BeautifulSoup
-from mitmproxy.models import decoded
 
 
 def start(context):
@@ -14,15 +13,14 @@ def start(context):
 def response(context, flow):
     if flow.request.host in context.iframe_url:
         return
-    with decoded(flow.response):  # Remove content encoding (gzip, ...)
-        html = BeautifulSoup(flow.response.content, "lxml")
-        if html.body:
-            iframe = html.new_tag(
-                "iframe",
-                src=context.iframe_url,
-                frameborder=0,
-                height=0,
-                width=0)
-            html.body.insert(0, iframe)
-            flow.response.content = str(html)
-            context.log("Iframe inserted.")
+    html = BeautifulSoup(flow.response.content, "lxml")
+    if html.body:
+        iframe = html.new_tag(
+            "iframe",
+            src=context.iframe_url,
+            frameborder=0,
+            height=0,
+            width=0)
+        html.body.insert(0, iframe)
+        flow.response.content = str(html)
+        context.log("Iframe inserted.")
diff --git a/examples/modify_response_body.py b/examples/modify_response_body.py
index 3034892e..03dfeaa4 100644
--- a/examples/modify_response_body.py
+++ b/examples/modify_response_body.py
@@ -2,8 +2,6 @@
 # (this script works best with --anticache)
 import sys
 
-from mitmproxy.models import decoded
-
 
 def start(context):
     if len(sys.argv) != 3:
@@ -14,7 +12,7 @@ def start(context):
 
 
 def response(context, flow):
-    with decoded(flow.response):  # automatically decode gzipped responses.
-        flow.response.content = flow.response.content.replace(
-            context.old,
-            context.new)
+    flow.response.content = flow.response.content.replace(
+        context.old,
+        context.new
+    )
diff --git a/examples/redirect_requests.py b/examples/redirect_requests.py
index d7db3f1c..bb1e6952 100644
--- a/examples/redirect_requests.py
+++ b/examples/redirect_requests.py
@@ -13,9 +13,9 @@ def request(context, flow):
     # Method 1: Answer with a locally generated response
     if flow.request.pretty_host.endswith("example.com"):
         resp = HTTPResponse(
-            "HTTP/1.1", 200, "OK",
+            b"HTTP/1.1", 200, b"OK",
             Headers(Content_Type="text/html"),
-            "helloworld")
+            b"helloworld")
         flow.reply.send(resp)
 
     # Method 2: Redirect the request to a different server
diff --git a/examples/sslstrip.py b/examples/sslstrip.py
index 8dde8e3e..77e91cc9 100644
--- a/examples/sslstrip.py
+++ b/examples/sslstrip.py
@@ -1,4 +1,3 @@
-from netlib.http import decoded
 import re
 from six.moves import urllib
 
@@ -19,22 +18,21 @@ def request(context, flow):
 
 
 def response(context, flow):
-    with decoded(flow.response):
-        flow.request.headers.pop('Strict-Transport-Security', None)
-        flow.request.headers.pop('Public-Key-Pins', None)
-
-        # strip links in response body
-        flow.response.content = flow.response.content.replace('https://', 'http://')
-
-        # strip links in 'Location' header
-        if flow.response.headers.get('Location', '').startswith('https://'):
-            location = flow.response.headers['Location']
-            hostname = urllib.parse.urlparse(location).hostname
-            if hostname:
-                context.secure_hosts.add(hostname)
-            flow.response.headers['Location'] = location.replace('https://', 'http://', 1)
-
-        # strip secure flag from 'Set-Cookie' headers
-        cookies = flow.response.headers.get_all('Set-Cookie')
-        cookies = [re.sub(r';\s*secure\s*', '', s) for s in cookies]
-        flow.response.headers.set_all('Set-Cookie', cookies)
+    flow.response.headers.pop('Strict-Transport-Security', None)  # HSTS/HPKP are response headers
+    flow.response.headers.pop('Public-Key-Pins', None)
+
+    # strip links in response body (.content is bytes, so the patterns must be bytes too)
+    flow.response.content = flow.response.content.replace(b'https://', b'http://')
+
+    # strip links in 'Location' header
+    if flow.response.headers.get('Location', '').startswith('https://'):
+        location = flow.response.headers['Location']
+        hostname = urllib.parse.urlparse(location).hostname
+        if hostname:
+            context.secure_hosts.add(hostname)
+        flow.response.headers['Location'] = location.replace('https://', 'http://', 1)
+
+    # strip secure flag from 'Set-Cookie' headers
+    cookies = flow.response.headers.get_all('Set-Cookie')
+    cookies = [re.sub(r';\s*secure\s*', '', s) for s in cookies]
+    flow.response.headers.set_all('Set-Cookie', cookies)
diff --git a/examples/upsidedownternet.py b/examples/upsidedownternet.py
index 9aac9f05..58ed53d7 100644
--- a/examples/upsidedownternet.py
+++ b/examples/upsidedownternet.py
@@ -1,17 +1,15 @@
 from six.moves import cStringIO as StringIO
 from PIL import Image
-from mitmproxy.models import decoded
 
 
 def response(context, flow):
     if flow.response.headers.get("content-type", "").startswith("image"):
-        with decoded(flow.response):  # automatically decode gzipped responses.
-            try:
-                s = StringIO(flow.response.content)
-                img = Image.open(s).rotate(180)
-                s2 = StringIO()
-                img.save(s2, "png")
-                flow.response.content = s2.getvalue()
-                flow.response.headers["content-type"] = "image/png"
-            except:  # Unknown image types etc.
-                pass
+        try:
+            s = StringIO(flow.response.content)
+            img = Image.open(s).rotate(180)
+            s2 = StringIO()
+            img.save(s2, "png")
+            flow.response.content = s2.getvalue()
+            flow.response.headers["content-type"] = "image/png"
+        except:  # Unknown image types etc.
+            pass
-- 
cgit v1.2.3


From d9f797e7e6936809171d9c99144fb5ded3ee131f Mon Sep 17 00:00:00 2001
From: Maximilian Hils <git@maximilianhils.com>
Date: Sat, 2 Jul 2016 02:11:00 -0700
Subject: make the linter happy

---
 mitmproxy/contentviews.py          | 1 -
 netlib/http/message.py             | 2 +-
 test/mitmproxy/test_contentview.py | 1 -
 3 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/mitmproxy/contentviews.py b/mitmproxy/contentviews.py
index c9ea14ba..6072f959 100644
--- a/mitmproxy/contentviews.py
+++ b/mitmproxy/contentviews.py
@@ -31,7 +31,6 @@ from six import BytesIO
 from mitmproxy import exceptions
 from mitmproxy.contrib import jsbeautifier
 from mitmproxy.contrib.wbxml import ASCommandResponse
-from netlib import encoding
 from netlib import http
 from netlib import multidict
 from netlib.http import url
diff --git a/netlib/http/message.py b/netlib/http/message.py
index 668198f8..28278bd2 100644
--- a/netlib/http/message.py
+++ b/netlib/http/message.py
@@ -322,4 +322,4 @@ class decoded(object):
         pass
 
     def __exit__(self, type, value, tb):
-        pass
\ No newline at end of file
+        pass
diff --git a/test/mitmproxy/test_contentview.py b/test/mitmproxy/test_contentview.py
index 4b099d8d..7037745d 100644
--- a/test/mitmproxy/test_contentview.py
+++ b/test/mitmproxy/test_contentview.py
@@ -1,6 +1,5 @@
 from mitmproxy.exceptions import ContentViewException
 from netlib.http import Headers
-from netlib import encoding
 from netlib.http import url
 from netlib import multidict
 
-- 
cgit v1.2.3


From 2f8a1fd2cb1374941f436f36bbfa0d0b3d9213c7 Mon Sep 17 00:00:00 2001
From: Maximilian Hils <git@maximilianhils.com>
Date: Sat, 2 Jul 2016 03:03:42 -0700
Subject: tests++

---
 netlib/http/message.py           |  6 +++---
 test/netlib/http/test_message.py | 44 +++++++++++++++++++++++++++++++++++++++-
 2 files changed, 46 insertions(+), 4 deletions(-)

diff --git a/netlib/http/message.py b/netlib/http/message.py
index 28278bd2..ca3a4145 100644
--- a/netlib/http/message.py
+++ b/netlib/http/message.py
@@ -314,12 +314,12 @@ class decoded(object):
     :py:attr:`raw_content` has the encoded content.
     """
 
-    def __init__(self, message):
+    def __init__(self, message):  # pragma no cover
         warnings.warn("decoded() is deprecated, you can now directly use .content instead. "
                       ".raw_content has the encoded content.", DeprecationWarning)
 
-    def __enter__(self):
+    def __enter__(self):  # pragma no cover
         pass
 
-    def __exit__(self, type, value, tb):
+    def __exit__(self, type, value, tb):  # pragma no cover
         pass
diff --git a/test/netlib/http/test_message.py b/test/netlib/http/test_message.py
index aecde1ec..e1707a91 100644
--- a/test/netlib/http/test_message.py
+++ b/test/netlib/http/test_message.py
@@ -1,9 +1,11 @@
 # -*- coding: utf-8 -*-
 from __future__ import absolute_import, print_function, division
 
+import mock
 import six
 
 from netlib.tutils import tresp
+from netlib import http
 
 
 def _test_passthrough_attr(message, attr):
@@ -69,6 +71,15 @@ class TestMessage(object):
 
         assert resp != 0
 
+    def test_hash(self):
+        resp = tresp()
+        assert hash(resp)
+
+    def test_serializable(self):
+        resp = tresp()
+        resp2 = http.Response.from_state(resp.get_state())
+        assert resp == resp2
+
     def test_content_length_update(self):
         resp = tresp()
         resp.content = b"foo"
@@ -93,7 +104,7 @@ class TestMessage(object):
     def test_timestamp_end(self):
         _test_passthrough_attr(tresp(), "timestamp_end")
 
-    def teste_http_version(self):
+    def test_http_version(self):
         _test_decoded_attr(tresp(), "http_version")
 
 
@@ -109,6 +120,14 @@ class TestMessageContentEncoding(object):
         assert r.content == b"message"
         assert r.raw_content != b"message"
 
+        r.raw_content = b"foo"
+        with mock.patch("netlib.encoding.decode") as e:
+            assert r.content
+            assert e.call_count == 1
+            e.reset_mock()
+            assert r.content
+            assert e.call_count == 0
+
     def test_modify(self):
         r = tresp()
         assert "content-encoding" not in r.headers
@@ -119,6 +138,13 @@ class TestMessageContentEncoding(object):
         r.decode()
         assert r.raw_content == b"foo"
 
+        r.encode("identity")
+        with mock.patch("netlib.encoding.encode") as e:
+            r.content = b"foo"
+            assert e.call_count == 0
+            r.content = b"bar"
+            assert e.call_count == 1
+
     def test_unknown_ce(self):
         r = tresp()
         r.headers["content-encoding"] = "zopfli"
@@ -165,6 +191,15 @@ class TestMessageText(object):
         assert r.content == b"\xc3\xbc"
         assert r.text == u"Ã¼"
 
+        r.encode("identity")
+        r.raw_content = b"foo"
+        with mock.patch("netlib.encoding.decode") as e:
+            assert r.text
+            assert e.call_count == 2
+            e.reset_mock()
+            assert r.text
+            assert e.call_count == 0
+
     def test_modify(self):
         r = tresp()
 
@@ -176,6 +211,13 @@ class TestMessageText(object):
         assert r.raw_content == b"\xfc"
         assert r.headers["content-length"] == "1"
 
+        r.encode("identity")
+        with mock.patch("netlib.encoding.encode") as e:
+            r.text = u"ü"
+            assert e.call_count == 0
+            r.text = u"ä"
+            assert e.call_count == 2
+
     def test_unknown_ce(self):
         r = tresp()
         r.headers["content-type"] = "text/html; charset=wtf"
-- 
cgit v1.2.3


From a6b3551934e2b8768177d6831ca08f97f5bdae44 Mon Sep 17 00:00:00 2001
From: Maximilian Hils <git@maximilianhils.com>
Date: Mon, 4 Jul 2016 13:58:09 -0700
Subject: raise ValueError if content-encoding is invalid

---
 mitmproxy/console/common.py      | 40 +++++++++++++++++++++++++++-----------
 mitmproxy/console/flowview.py    | 38 +++++++++++++++++++++++++-----------
 mitmproxy/dump.py                | 13 +++++++++----
 mitmproxy/filt.py                | 36 ++++++++++++++++++++++------------
 mitmproxy/flow/export.py         | 18 +++++++++++------
 netlib/http/message.py           | 42 +++++++++++++++++++++++++++-------------
 netlib/http/request.py           | 12 +++++++++---
 netlib/wsgi.py                   |  6 +++++-
 test/netlib/http/test_message.py | 18 +++++++++--------
 9 files changed, 154 insertions(+), 69 deletions(-)

diff --git a/mitmproxy/console/common.py b/mitmproxy/console/common.py
index b4369c0c..ef220b4c 100644
--- a/mitmproxy/console/common.py
+++ b/mitmproxy/console/common.py
@@ -256,24 +256,34 @@ def copy_flow_format_data(part, scope, flow):
     else:
         data = ""
         if scope in ("q", "a"):
-            if flow.request.content is None:
+            request = flow.request.copy()
+            try:
+                request.decode()
+            except ValueError:
+                pass
+            if request.raw_content is None:
                 return None, "Request content is missing"
             if part == "h":
-                data += netlib.http.http1.assemble_request(flow.request)
+                data += netlib.http.http1.assemble_request(request)
             elif part == "c":
-                data += flow.request.content
+                data += request.raw_content
             else:
                 raise ValueError("Unknown part: {}".format(part))
-        if scope == "a" and flow.request.content and flow.response:
+        if scope == "a" and flow.request.raw_content and flow.response:
             # Add padding between request and response
             data += "\r\n" * 2
         if scope in ("s", "a") and flow.response:
-            if flow.response.content is None:
+            response = flow.response.copy()
+            try:
+                response.decode()
+            except ValueError:
+                pass
+            if response.raw_content is None:
                 return None, "Response content is missing"
             if part == "h":
-                data += netlib.http.http1.assemble_response(flow.response)
+                data += netlib.http.http1.assemble_response(response)
             elif part == "c":
-                data += flow.response.content
+                data += response.raw_content
             else:
                 raise ValueError("Unknown part: {}".format(part))
     return data, False
@@ -361,8 +371,8 @@ def ask_save_body(part, master, state, flow):
     "q" (request), "s" (response) or None (ask user if necessary).
     """
 
-    request_has_content = flow.request and flow.request.content
-    response_has_content = flow.response and flow.response.content
+    request_has_content = flow.request and flow.request.raw_content
+    response_has_content = flow.response and flow.response.raw_content
 
     if part is None:
         # We first need to determine whether we want to save the request or the
@@ -383,14 +393,22 @@ def ask_save_body(part, master, state, flow):
             ask_save_body("q", master, state, flow)
 
     elif part == "q" and request_has_content:
+        try:
+            content = flow.request.content
+        except ValueError:
+            content = flow.request.raw_content
         ask_save_path(
             "Save request content",
-            flow.request.content
+            content
         )
     elif part == "s" and response_has_content:
+        try:
+            content = flow.response.content
+        except ValueError:
+            content = flow.response.raw_content
         ask_save_path(
             "Save response content",
-            flow.response.content
+            content
         )
     else:
         signals.status_message.send(message="No content to save.")
diff --git a/mitmproxy/console/flowview.py b/mitmproxy/console/flowview.py
index 208b0d44..c4bb6c40 100644
--- a/mitmproxy/console/flowview.py
+++ b/mitmproxy/console/flowview.py
@@ -199,26 +199,34 @@ class FlowView(tabs.Tabs):
 
     def _get_content_view(self, viewmode, message, max_lines, _):
 
+        try:
+            content = message.content
+            if content != message.raw_content:
+                enc = "[decoded {}]".format(
+                    message.headers.get("content-encoding")
+                )
+            else:
+                enc = None
+        except ValueError:
+            content = message.raw_content
+            enc = "[cannot decode]"
         try:
             query = None
             if isinstance(message, models.HTTPRequest):
                 query = message.query
             description, lines = contentviews.get_content_view(
-                viewmode, message.content, headers=message.headers, query=query
+                viewmode, content, headers=message.headers, query=query
             )
         except exceptions.ContentViewException:
             s = "Content viewer failed: \n" + traceback.format_exc()
             signals.add_event(s, "error")
             description, lines = contentviews.get_content_view(
-                contentviews.get("Raw"), message.content, headers=message.headers
+                contentviews.get("Raw"), content, headers=message.headers
             )
             description = description.replace("Raw", "Couldn't parse: falling back to Raw")
 
-        if message.content != message.raw_content:
-            description = "[decoded {enc}] {desc}".format(
-                enc=message.headers.get("content-encoding"),
-                desc=description
-            )
+        if enc:
+            description = " ".join([enc, description])  # str.join takes a single iterable
 
         # Give hint that you have to tab for the response.
         if description == "No content" and isinstance(message, models.HTTPRequest):
@@ -419,10 +427,14 @@ class FlowView(tabs.Tabs):
             # editing message bodies, this can cause problems. For now, I
             # just strip the newlines off the end of the body when we return
             # from an editor.
-            c = self.master.spawn_editor(message.content or b"")
+            try:
+                content = message.content
+            except ValueError:
+                content = message.raw_content
+            c = self.master.spawn_editor(content or b"")
             message.content = c.rstrip(b"\n")
         elif part == "f":
-            if not message.urlencoded_form and message.content:
+            if not message.urlencoded_form and message.raw_content:
                 signals.status_prompt_onekey.send(
                     prompt = "Existing body is not a URL-encoded form. Clear and edit?",
                     keys = [
@@ -682,10 +694,14 @@ class FlowView(tabs.Tabs):
                 )
                 key = None
             elif key == "v":
-                if conn.content:
+                if conn.raw_content:
                     t = conn.headers.get("content-type")
                     if "EDITOR" in os.environ or "PAGER" in os.environ:
-                        self.master.spawn_external_viewer(conn.content, t)
+                        try:
+                            content = conn.content
+                        except ValueError:
+                            content = conn.raw_content
+                        self.master.spawn_external_viewer(content, t)
                     else:
                         signals.status_message.send(
                             message = "Error! Set $EDITOR or $PAGER."
diff --git a/mitmproxy/dump.py b/mitmproxy/dump.py
index ea242bba..0a9b76a7 100644
--- a/mitmproxy/dump.py
+++ b/mitmproxy/dump.py
@@ -187,15 +187,20 @@ class DumpMaster(flow.FlowMaster):
             )
             self.echo(headers, indent=4)
         if self.o.flow_detail >= 3:
-            if message.content is None:
+            try:
+                content = message.content
+            except ValueError:
+                content = message.raw_content
+
+            if content is None:
                 self.echo("(content missing)", indent=4)
-            elif message.content:
+            elif content:
                 self.echo("")
 
                 try:
                     type, lines = contentviews.get_content_view(
                         contentviews.get("Auto"),
-                        message.content,
+                        content,
                         headers=getattr(message, "headers", None)
                     )
                 except exceptions.ContentViewException:
@@ -203,7 +208,7 @@ class DumpMaster(flow.FlowMaster):
                     self.add_event(s, "debug")
                     type, lines = contentviews.get_content_view(
                         contentviews.get("Raw"),
-                        message.content,
+                        content,
                         headers=getattr(message, "headers", None)
                     )
 
diff --git a/mitmproxy/filt.py b/mitmproxy/filt.py
index 95bae1ae..e8687b9f 100644
--- a/mitmproxy/filt.py
+++ b/mitmproxy/filt.py
@@ -193,12 +193,18 @@ class FBod(_Rex):
     help = "Body"
 
     def __call__(self, f):
-        if f.request and f.request.content:
-            if self.re.search(f.request.content):
-                return True
-        if f.response and f.response.content:
-            if self.re.search(f.response.content):
-                return True
+        if f.request and f.request.raw_content:
+            try:
+                if self.re.search(f.request.content):
+                    return True
+            except ValueError:
+                pass
+        if f.response and f.response.raw_content:
+            try:
+                if self.re.search(f.response.content):
+                    return True
+            except ValueError:
+                pass
         return False
 
 
@@ -207,9 +213,12 @@ class FBodRequest(_Rex):
     help = "Request body"
 
     def __call__(self, f):
-        if f.request and f.request.content:
-            if self.re.search(f.request.content):
-                return True
+        if f.request and f.request.raw_content:
+            try:
+                if self.re.search(f.request.content):
+                    return True
+            except ValueError:
+                pass
 
 
 class FBodResponse(_Rex):
@@ -217,9 +226,12 @@ class FBodResponse(_Rex):
     help = "Response body"
 
     def __call__(self, f):
-        if f.response and f.response.content:
-            if self.re.search(f.response.content):
-                return True
+        if f.response and f.response.raw_content:
+            try:
+                if self.re.search(f.response.content):
+                    return True
+            except ValueError:
+                pass
 
 
 class FMethod(_Rex):
diff --git a/mitmproxy/flow/export.py b/mitmproxy/flow/export.py
index f0ac02ab..9da18f22 100644
--- a/mitmproxy/flow/export.py
+++ b/mitmproxy/flow/export.py
@@ -19,17 +19,23 @@ def dictstr(items, indent):
 def curl_command(flow):
     data = "curl "
 
-    for k, v in flow.request.headers.fields:
+    request = flow.request.copy()
+    try:
+        request.decode()
+    except ValueError:
+        pass
+
+    for k, v in request.headers.fields:
         data += "-H '%s:%s' " % (k, v)
 
-    if flow.request.method != "GET":
-        data += "-X %s " % flow.request.method
+    if request.method != "GET":
+        data += "-X %s " % request.method
 
-    full_url = flow.request.scheme + "://" + flow.request.host + flow.request.path
+    full_url = request.scheme + "://" + request.host + request.path
     data += "'%s'" % full_url
 
-    if flow.request.content:
-        data += " --data-binary '%s'" % flow.request.content
+    if request.raw_content:
+        data += " --data-binary '%s'" % request.raw_content
 
     return data
 
diff --git a/netlib/http/message.py b/netlib/http/message.py
index ca3a4145..86ff64d1 100644
--- a/netlib/http/message.py
+++ b/netlib/http/message.py
@@ -124,6 +124,9 @@ class Message(basetypes.Serializable):
         """
         The HTTP message body decoded with the content-encoding header (e.g. gzip)
 
+        Raises:
+            ValueError, when getting the content and the content-encoding is invalid.
+
         See also: :py:class:`raw_content`, :py:attr:`text`
         """
         ce = self.headers.get("content-encoding")
@@ -132,17 +135,21 @@ class Message(basetypes.Serializable):
             self._content_cache.encoding == ce
         )
         if not cached:
-            try:
-                if not ce:
-                    raise ValueError()
+            if ce:
                 decoded = encoding.decode(self.raw_content, ce)
-            except ValueError:
+            else:
                 decoded = self.raw_content
             self._content_cache = CachedDecode(self.raw_content, ce, decoded)
         return self._content_cache.decoded
 
     @content.setter
     def content(self, value):
+        if value is not None and not isinstance(value, bytes):
+            raise TypeError(
+                "Message content must be bytes, not {}. "
+                "Please use .text if you want to assign a str."
+                .format(type(value).__name__)
+            )
         ce = self.headers.get("content-encoding")
         cached = (
             self._content_cache.decoded == value and
@@ -150,15 +157,15 @@ class Message(basetypes.Serializable):
         )
         if not cached:
             try:
-                if not ce:
-                    raise ValueError()
-                encoded = encoding.encode(value, ce)
+                if ce and value is not None:
+                    encoded = encoding.encode(value, ce)
+                else:
+                    encoded = value
             except ValueError:
-                # Do we have an unknown content-encoding?
-                # If so, we want to remove it.
-                if value and ce:
-                    self.headers.pop("content-encoding", None)
-                    ce = None
+                # So we have an invalid content-encoding?
+                # Let's remove it!
+                del self.headers["content-encoding"]
+                ce = None
                 encoded = value
             self._content_cache = CachedDecode(encoded, ce, value)
         self.raw_content = self._content_cache.encoded
@@ -262,6 +269,9 @@ class Message(basetypes.Serializable):
         Decodes body based on the current Content-Encoding header, then
         removes the header. If there is no Content-Encoding header, no
         action is taken.
+
+        Raises:
+            ValueError, when the content-encoding is invalid.
         """
         self.raw_content = self.content
         self.headers.pop("content-encoding", None)
@@ -269,10 +279,16 @@ class Message(basetypes.Serializable):
     def encode(self, e):
         """
         Encodes body with the encoding e, where e is "gzip", "deflate" or "identity".
+        Any existing content-encodings are overwritten,
+        the content is not decoded beforehand.
+
+        Raises:
+            ValueError, when the specified content-encoding is invalid.
         """
-        self.decode()  # remove the current encoding
         self.headers["content-encoding"] = e
         self.content = self.raw_content
+        if "content-encoding" not in self.headers:
+            raise ValueError("Invalid content encoding {}".format(repr(e)))
 
     def replace(self, pattern, repl, flags=0):
         """
diff --git a/netlib/http/request.py b/netlib/http/request.py
index 4ce94549..a8ec6238 100644
--- a/netlib/http/request.py
+++ b/netlib/http/request.py
@@ -347,7 +347,10 @@ class Request(message.Message):
     def _get_urlencoded_form(self):
         is_valid_content_type = "application/x-www-form-urlencoded" in self.headers.get("content-type", "").lower()
         if is_valid_content_type:
-            return tuple(netlib.http.url.decode(self.content))
+            try:
+                return tuple(netlib.http.url.decode(self.content))
+            except ValueError:
+                pass
         return ()
 
     def _set_urlencoded_form(self, value):
@@ -356,7 +359,7 @@ class Request(message.Message):
         This will overwrite the existing content if there is one.
         """
         self.headers["content-type"] = "application/x-www-form-urlencoded"
-        self.content = netlib.http.url.encode(value)
+        self.content = netlib.http.url.encode(value).encode()
 
     @urlencoded_form.setter
     def urlencoded_form(self, value):
@@ -376,7 +379,10 @@ class Request(message.Message):
     def _get_multipart_form(self):
         is_valid_content_type = "multipart/form-data" in self.headers.get("content-type", "").lower()
         if is_valid_content_type:
-            return multipart.decode(self.headers, self.content)
+            try:
+                return multipart.decode(self.headers, self.content)
+            except ValueError:
+                pass
         return ()
 
     def _set_multipart_form(self, value):
diff --git a/netlib/wsgi.py b/netlib/wsgi.py
index c66fddc2..2444f449 100644
--- a/netlib/wsgi.py
+++ b/netlib/wsgi.py
@@ -60,10 +60,14 @@ class WSGIAdaptor(object):
         else:
             path_info = path
             query = ''
+        try:
+            content = flow.request.content
+        except ValueError:
+            content = flow.request.raw_content
         environ = {
             'wsgi.version': (1, 0),
             'wsgi.url_scheme': strutils.native(flow.request.scheme, "latin-1"),
-            'wsgi.input': BytesIO(flow.request.content or b""),
+            'wsgi.input': BytesIO(content or b""),
             'wsgi.errors': errsoc,
             'wsgi.multithread': True,
             'wsgi.multiprocess': False,
diff --git a/test/netlib/http/test_message.py b/test/netlib/http/test_message.py
index e1707a91..ed7d3da5 100644
--- a/test/netlib/http/test_message.py
+++ b/test/netlib/http/test_message.py
@@ -5,7 +5,7 @@ import mock
 import six
 
 from netlib.tutils import tresp
-from netlib import http
+from netlib import http, tutils
 
 
 def _test_passthrough_attr(message, attr):
@@ -92,9 +92,6 @@ class TestMessage(object):
         assert resp.data.content == b"bar"
         assert resp.headers["content-length"] == "0"
 
-    def test_content_basic(self):
-        _test_passthrough_attr(tresp(), "content")
-
     def test_headers(self):
         _test_passthrough_attr(tresp(), "headers")
 
@@ -149,18 +146,22 @@ class TestMessageContentEncoding(object):
         r = tresp()
         r.headers["content-encoding"] = "zopfli"
         r.raw_content = b"foo"
-        assert r.content == b"foo"
+        with tutils.raises(ValueError):
+            assert r.content
         assert r.headers["content-encoding"]
 
     def test_cannot_decode(self):
         r = tresp()
         r.encode("gzip")
         r.raw_content = b"foo"
-        assert r.content == b"foo"
+        with tutils.raises(ValueError):
+            assert r.content
         assert r.headers["content-encoding"]
-        r.decode()
+
+        with tutils.raises(ValueError):
+            r.decode()
         assert r.raw_content == b"foo"
-        assert "content-encoding" not in r.headers
+        assert "content-encoding" in r.headers
 
     def test_cannot_encode(self):
         r = tresp()
@@ -213,6 +214,7 @@ class TestMessageText(object):
 
         r.encode("identity")
         with mock.patch("netlib.encoding.encode") as e:
+            e.return_value = b""
             r.text = u"ü"
             assert e.call_count == 0
             r.text = u"ä"
-- 
cgit v1.2.3


From ca9de786fd7ed3edf7a485f7c019ac83d5abfc7f Mon Sep 17 00:00:00 2001
From: Maximilian Hils <git@maximilianhils.com>
Date: Mon, 4 Jul 2016 15:07:01 -0700
Subject: minor fix

---
 mitmproxy/console/flowview.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mitmproxy/console/flowview.py b/mitmproxy/console/flowview.py
index c4bb6c40..d994e670 100644
--- a/mitmproxy/console/flowview.py
+++ b/mitmproxy/console/flowview.py
@@ -226,7 +226,7 @@ class FlowView(tabs.Tabs):
             description = description.replace("Raw", "Couldn't parse: falling back to Raw")
 
         if enc:
-            description = " ".join(enc, description)
+            description = " ".join([enc, description])
 
         # Give hint that you have to tab for the response.
         if description == "No content" and isinstance(message, models.HTTPRequest):
-- 
cgit v1.2.3


From a3c7c84d49c3e6563e7f37ef60c989f99ed96788 Mon Sep 17 00:00:00 2001
From: Maximilian Hils <git@maximilianhils.com>
Date: Fri, 15 Jul 2016 22:50:33 -0700
Subject: improve message content semantics

---
 mitmproxy/console/common.py      |  30 +++------
 mitmproxy/console/flowview.py    |  12 +---
 mitmproxy/dump.py                |   2 +-
 mitmproxy/filt.py                |  28 +++------
 mitmproxy/flow/export.py         |  11 ++--
 netlib/http/headers.py           |  12 ++++
 netlib/http/message.py           | 133 ++++++++++++++++++++++++---------------
 netlib/wsgi.py                   |  10 +--
 test/netlib/http/test_headers.py |   9 ++-
 test/netlib/http/test_message.py |  77 +++++++++++++++++++----
 10 files changed, 194 insertions(+), 130 deletions(-)

diff --git a/mitmproxy/console/common.py b/mitmproxy/console/common.py
index ef220b4c..41f4f243 100644
--- a/mitmproxy/console/common.py
+++ b/mitmproxy/console/common.py
@@ -257,16 +257,13 @@ def copy_flow_format_data(part, scope, flow):
         data = ""
         if scope in ("q", "a"):
             request = flow.request.copy()
-            try:
-                request.decode()
-            except ValueError:
-                pass
-            if request.raw_content is None:
+            request.decode(strict=False)
+            if request.content is None:
                 return None, "Request content is missing"
             if part == "h":
                 data += netlib.http.http1.assemble_request(request)
             elif part == "c":
-                data += request.raw_content
+                data += request.content
             else:
                 raise ValueError("Unknown part: {}".format(part))
         if scope == "a" and flow.request.raw_content and flow.response:
@@ -274,16 +271,13 @@ def copy_flow_format_data(part, scope, flow):
             data += "\r\n" * 2
         if scope in ("s", "a") and flow.response:
             response = flow.response.copy()
-            try:
-                response.decode()
-            except ValueError:
-                pass
-            if response.raw_content is None:
+            response.decode(strict=False)
+            if response.content is None:
                 return None, "Response content is missing"
             if part == "h":
                 data += netlib.http.http1.assemble_response(response)
             elif part == "c":
-                data += response.raw_content
+                data += response.content
             else:
                 raise ValueError("Unknown part: {}".format(part))
     return data, False
@@ -393,22 +387,14 @@ def ask_save_body(part, master, state, flow):
             ask_save_body("q", master, state, flow)
 
     elif part == "q" and request_has_content:
-        try:
-            content = flow.request.content
-        except ValueError:
-            content = flow.request.raw_content
         ask_save_path(
             "Save request content",
-            content
+            flow.request.get_content(strict=False),
         )
     elif part == "s" and response_has_content:
-        try:
-            content = flow.response.content
-        except ValueError:
-            content = flow.response.raw_content
         ask_save_path(
             "Save response content",
-            content
+            flow.response.get_content(strict=False),
         )
     else:
         signals.status_message.send(message="No content to save.")
diff --git a/mitmproxy/console/flowview.py b/mitmproxy/console/flowview.py
index d994e670..f8686b41 100644
--- a/mitmproxy/console/flowview.py
+++ b/mitmproxy/console/flowview.py
@@ -427,11 +427,7 @@ class FlowView(tabs.Tabs):
             # editing message bodies, this can cause problems. For now, I
             # just strip the newlines off the end of the body when we return
             # from an editor.
-            try:
-                content = message.content
-            except ValueError:
-                content = message.raw_content
-            c = self.master.spawn_editor(content or b"")
+            c = self.master.spawn_editor(message.get_content(strict=False) or b"")
             message.content = c.rstrip(b"\n")
         elif part == "f":
             if not message.urlencoded_form and message.raw_content:
@@ -697,11 +693,7 @@ class FlowView(tabs.Tabs):
                 if conn.raw_content:
                     t = conn.headers.get("content-type")
                     if "EDITOR" in os.environ or "PAGER" in os.environ:
-                        try:
-                            content = conn.content
-                        except ValueError:
-                            content = conn.raw_content
-                        self.master.spawn_external_viewer(content, t)
+                        self.master.spawn_external_viewer(conn.get_content(strict=False), t)
                     else:
                         signals.status_message.send(
                             message = "Error! Set $EDITOR or $PAGER."
diff --git a/mitmproxy/dump.py b/mitmproxy/dump.py
index 0a9b76a7..14d55cd1 100644
--- a/mitmproxy/dump.py
+++ b/mitmproxy/dump.py
@@ -190,7 +190,7 @@ class DumpMaster(flow.FlowMaster):
             try:
                 content = message.content
             except ValueError:
-                content = message.raw_content
+                content = message.get_content(strict=False)
 
             if content is None:
                 self.echo("(content missing)", indent=4)
diff --git a/mitmproxy/filt.py b/mitmproxy/filt.py
index e8687b9f..a42988f1 100644
--- a/mitmproxy/filt.py
+++ b/mitmproxy/filt.py
@@ -194,17 +194,11 @@ class FBod(_Rex):
 
     def __call__(self, f):
         if f.request and f.request.raw_content:
-            try:
-                if self.re.search(f.request.content):
-                    return True
-            except ValueError:
-                pass
+            if self.re.search(f.request.get_content(strict=False)):
+                return True
         if f.response and f.response.raw_content:
-            try:
-                if self.re.search(f.response.content):
-                    return True
-            except ValueError:
-                pass
+            if self.re.search(f.response.get_content(strict=False)):
+                return True
         return False
 
 
@@ -214,11 +208,8 @@ class FBodRequest(_Rex):
 
     def __call__(self, f):
         if f.request and f.request.raw_content:
-            try:
-                if self.re.search(f.request.content):
-                    return True
-            except ValueError:
-                pass
+            if self.re.search(f.request.get_content(strict=False)):
+                return True
 
 
 class FBodResponse(_Rex):
@@ -227,11 +218,8 @@ class FBodResponse(_Rex):
 
     def __call__(self, f):
         if f.response and f.response.raw_content:
-            try:
-                if self.re.search(f.response.content):
-                    return True
-            except ValueError:
-                pass
+            if self.re.search(f.response.get_content(strict=False)):
+                return True
 
 
 class FMethod(_Rex):
diff --git a/mitmproxy/flow/export.py b/mitmproxy/flow/export.py
index 9da18f22..4659af7b 100644
--- a/mitmproxy/flow/export.py
+++ b/mitmproxy/flow/export.py
@@ -20,12 +20,9 @@ def curl_command(flow):
     data = "curl "
 
     request = flow.request.copy()
-    try:
-        request.decode()
-    except ValueError:
-        pass
+    request.decode(strict=False)
 
-    for k, v in request.headers.fields:
+    for k, v in request.headers.items(multi=True):
         data += "-H '%s:%s' " % (k, v)
 
     if request.method != "GET":
@@ -34,8 +31,8 @@ def curl_command(flow):
     full_url = request.scheme + "://" + request.host + request.path
     data += "'%s'" % full_url
 
-    if request.raw_content:
-        data += " --data-binary '%s'" % request.raw_content
+    if request.content:
+        data += " --data-binary '%s'" % request.content
 
     return data
 
diff --git a/netlib/http/headers.py b/netlib/http/headers.py
index f052a53b..13a8c98f 100644
--- a/netlib/http/headers.py
+++ b/netlib/http/headers.py
@@ -204,3 +204,15 @@ def parse_content_type(c):
             if len(clause) == 2:
                 d[clause[0].strip()] = clause[1].strip()
     return ts[0].lower(), ts[1].lower(), d
+
+
+def assemble_content_type(type, subtype, parameters):
+    if not parameters:
+        return "{}/{}".format(type, subtype)
+    params = "; ".join(
+        "{}={}".format(k, v)
+        for k, v in parameters.items()
+    )
+    return "{}/{}; {}".format(
+        type, subtype, params
+    )
diff --git a/netlib/http/message.py b/netlib/http/message.py
index 86ff64d1..1252ed25 100644
--- a/netlib/http/message.py
+++ b/netlib/http/message.py
@@ -53,14 +53,15 @@ class MessageData(basetypes.Serializable):
 
 
 class CachedDecode(object):
-    __slots__ = ["encoded", "encoding", "decoded"]
+    __slots__ = ["encoded", "encoding", "strict", "decoded"]
 
-    def __init__(self, object, encoding, decoded):
+    def __init__(self, object, encoding, strict, decoded):
         self.encoded = object
         self.encoding = encoding
+        self.strict = strict
         self.decoded = decoded
 
-no_cached_decode = CachedDecode(None, None, None)
+no_cached_decode = CachedDecode(None, None, None, None)
 
 
 class Message(basetypes.Serializable):
@@ -118,33 +119,44 @@ class Message(basetypes.Serializable):
     def raw_content(self, content):
         self.data.content = content
 
-    @property
-    def content(self):
-        # type: () -> bytes
+    def get_content(self, strict=True):
+        # type: (bool) -> bytes
         """
         The HTTP message body decoded with the content-encoding header (e.g. gzip)
 
         Raises:
-            ValueError, when getting the content and the content-encoding is invalid.
+            ValueError, when the content-encoding is invalid and strict is True.
 
         See also: :py:class:`raw_content`, :py:attr:`text`
         """
+        if self.raw_content is None:
+            return None
         ce = self.headers.get("content-encoding")
         cached = (
             self._content_cache.encoded == self.raw_content and
+            (self._content_cache.strict or not strict) and
             self._content_cache.encoding == ce
         )
         if not cached:
+            is_strict = True
             if ce:
-                decoded = encoding.decode(self.raw_content, ce)
+                try:
+                    decoded = encoding.decode(self.raw_content, ce)
+                except ValueError:
+                    if strict:
+                        raise
+                    is_strict = False
+                    decoded = self.raw_content
             else:
                 decoded = self.raw_content
-            self._content_cache = CachedDecode(self.raw_content, ce, decoded)
+            self._content_cache = CachedDecode(self.raw_content, ce, is_strict, decoded)
         return self._content_cache.decoded
 
-    @content.setter
-    def content(self, value):
-        if value is not None and not isinstance(value, bytes):
+    def set_content(self, value):
+        if value is None:
+            self.raw_content = None
+            return
+        if not isinstance(value, bytes):
             raise TypeError(
                 "Message content must be bytes, not {}. "
                 "Please use .text if you want to assign a str."
@@ -153,24 +165,23 @@ class Message(basetypes.Serializable):
         ce = self.headers.get("content-encoding")
         cached = (
             self._content_cache.decoded == value and
-            self._content_cache.encoding == ce
+            self._content_cache.encoding == ce and
+            self._content_cache.strict
         )
         if not cached:
             try:
-                if ce and value is not None:
-                    encoded = encoding.encode(value, ce)
-                else:
-                    encoded = value
+                encoded = encoding.encode(value, ce or "identity")
             except ValueError:
                 # So we have an invalid content-encoding?
                 # Let's remove it!
                 del self.headers["content-encoding"]
                 ce = None
                 encoded = value
-            self._content_cache = CachedDecode(encoded, ce, value)
+            self._content_cache = CachedDecode(encoded, ce, True, value)
         self.raw_content = self._content_cache.encoded
-        if isinstance(self.raw_content, bytes):
-            self.headers["content-length"] = str(len(self.raw_content))
+        self.headers["content-length"] = str(len(self.raw_content))
+
+    content = property(get_content, set_content)
 
     @property
     def http_version(self):
@@ -211,69 +222,87 @@ class Message(basetypes.Serializable):
         if ct:
             return ct[2].get("charset")
 
-    @property
-    def text(self):
-        # type: () -> six.text_type
+    def _guess_encoding(self):
+        # type: () -> str
+        enc = self._get_content_type_charset()
+        if enc:
+            return enc
+
+        if "json" in self.headers.get("content-type", ""):
+            return "utf8"
+        else:
+            # We may also want to check for HTML meta tags here at some point.
+            return "latin-1"
+
+    def get_text(self, strict=True):
+        # type: (bool) -> six.text_type
         """
         The HTTP message body decoded with both content-encoding header (e.g. gzip)
         and content-type header charset.
 
+        Raises:
+            ValueError, when either content-encoding or charset is invalid and strict is True.
+
         See also: :py:attr:`content`, :py:class:`raw_content`
         """
-        # This attribute should be called text, because that's what requests does.
-        enc = self._get_content_type_charset()
-
-        # We may also want to check for HTML meta tags here at some point.
+        if self.raw_content is None:
+            return None
+        enc = self._guess_encoding()
 
+        content = self.get_content(strict)
         cached = (
-            self._text_cache.encoded == self.content and
+            self._text_cache.encoded == content and
+            (self._text_cache.strict or not strict) and
             self._text_cache.encoding == enc
         )
         if not cached:
+            is_strict = self._content_cache.strict
             try:
-                if not enc:
-                    raise ValueError()
-                decoded = encoding.decode(self.content, enc)
+                decoded = encoding.decode(content, enc)
             except ValueError:
-                decoded = self.content.decode("utf8", "replace" if six.PY2 else "surrogateescape")
-            self._text_cache = CachedDecode(self.content, enc, decoded)
+                if strict:
+                    raise
+                is_strict = False
+                decoded = self.content.decode(enc, "replace" if six.PY2 else "surrogateescape")
+            self._text_cache = CachedDecode(content, enc, is_strict, decoded)
         return self._text_cache.decoded
 
-    @text.setter
-    def text(self, text):
-        enc = self._get_content_type_charset()
+    def set_text(self, text):
+        if text is None:
+            self.content = None
+            return
+        enc = self._guess_encoding()
+
         cached = (
             self._text_cache.decoded == text and
-            self._text_cache.encoding == enc
+            self._text_cache.encoding == enc and
+            self._text_cache.strict
         )
         if not cached:
             try:
-                if not enc:
-                    raise ValueError()
                 encoded = encoding.encode(text, enc)
             except ValueError:
-                # Do we have an unknown content-type charset?
-                # If so, we want to replace it with utf8.
-                if text and enc:
-                    self.headers["content-type"] = re.sub(
-                        "charset=[^;]+",
-                        "charset=utf-8",
-                        self.headers["content-type"]
-                    )
-                encoded = text.encode("utf8", "replace" if six.PY2 else "surrogateescape")
-            self._text_cache = CachedDecode(encoded, enc, text)
+                # Fall back to UTF-8 and update the content-type header.
+                ct = headers.parse_content_type(self.headers.get("content-type", "")) or ("text", "plain", {})
+                ct[2]["charset"] = "utf-8"
+                self.headers["content-type"] = headers.assemble_content_type(*ct)
+                enc = "utf8"
+                encoded = text.encode(enc, "replace" if six.PY2 else "surrogateescape")
+            self._text_cache = CachedDecode(encoded, enc, True, text)
         self.content = self._text_cache.encoded
 
-    def decode(self):
+    text = property(get_text, set_text)
+
+    def decode(self, strict=True):
         """
         Decodes body based on the current Content-Encoding header, then
         removes the header. If there is no Content-Encoding header, no
         action is taken.
 
         Raises:
-            ValueError, when the content-encoding is invalid.
+            ValueError, when the content-encoding is invalid and strict is True.
         """
-        self.raw_content = self.content
+        self.raw_content = self.get_content(strict)
         self.headers.pop("content-encoding", None)
 
     def encode(self, e):
diff --git a/netlib/wsgi.py b/netlib/wsgi.py
index 2444f449..0def75b5 100644
--- a/netlib/wsgi.py
+++ b/netlib/wsgi.py
@@ -54,20 +54,20 @@ class WSGIAdaptor(object):
         self.app, self.domain, self.port, self.sversion = app, domain, port, sversion
 
     def make_environ(self, flow, errsoc, **extra):
+        """
+        Raises:
+            ValueError, if the content-encoding is invalid.
+        """
         path = strutils.native(flow.request.path, "latin-1")
         if '?' in path:
             path_info, query = strutils.native(path, "latin-1").split('?', 1)
         else:
             path_info = path
             query = ''
-        try:
-            content = flow.request.content
-        except ValueError:
-            content = flow.request.raw_content
         environ = {
             'wsgi.version': (1, 0),
             'wsgi.url_scheme': strutils.native(flow.request.scheme, "latin-1"),
-            'wsgi.input': BytesIO(content or b""),
+            'wsgi.input': BytesIO(flow.request.content or b""),
             'wsgi.errors': errsoc,
             'wsgi.multithread': True,
             'wsgi.multiprocess': False,
diff --git a/test/netlib/http/test_headers.py b/test/netlib/http/test_headers.py
index 51819b86..8462a5af 100644
--- a/test/netlib/http/test_headers.py
+++ b/test/netlib/http/test_headers.py
@@ -1,4 +1,4 @@
-from netlib.http import Headers, parse_content_type
+from netlib.http.headers import Headers, parse_content_type, assemble_content_type
 from netlib.tutils import raises
 
 
@@ -81,3 +81,10 @@ def test_parse_content_type():
 
     v = p("text/html; charset=UTF-8")
     assert v == ('text', 'html', {'charset': 'UTF-8'})
+
+
+def test_assemble_content_type():
+    p = assemble_content_type
+    assert p("text", "html", {}) == "text/html"
+    assert p("text", "html", {"charset": "utf8"}) == "text/html; charset=utf8"
+    assert p("text", "html", {"charset": "utf8", "foo": "bar"}) == "text/html; charset=utf8; foo=bar"
diff --git a/test/netlib/http/test_message.py b/test/netlib/http/test_message.py
index ed7d3da5..8b178e04 100644
--- a/test/netlib/http/test_message.py
+++ b/test/netlib/http/test_message.py
@@ -142,6 +142,9 @@ class TestMessageContentEncoding(object):
             r.content = b"bar"
             assert e.call_count == 1
 
+        with tutils.raises(TypeError):
+            r.content = u"foo"
+
     def test_unknown_ce(self):
         r = tresp()
         r.headers["content-encoding"] = "zopfli"
@@ -149,6 +152,7 @@ class TestMessageContentEncoding(object):
         with tutils.raises(ValueError):
             assert r.content
         assert r.headers["content-encoding"]
+        assert r.get_content(strict=False) == b"foo"
 
     def test_cannot_decode(self):
         r = tresp()
@@ -157,12 +161,25 @@ class TestMessageContentEncoding(object):
         with tutils.raises(ValueError):
             assert r.content
         assert r.headers["content-encoding"]
+        assert r.get_content(strict=False) == b"foo"
 
         with tutils.raises(ValueError):
             r.decode()
         assert r.raw_content == b"foo"
         assert "content-encoding" in r.headers
 
+        r.decode(strict=False)
+        assert r.content == b"foo"
+        assert "content-encoding" not in r.headers
+
+    def test_none(self):
+        r = tresp(content=None)
+        assert r.content is None
+        r.content = b"foo"
+        assert r.content is not None
+        r.content = None
+        assert r.content is None
+
     def test_cannot_encode(self):
         r = tresp()
         r.encode("gzip")
@@ -175,12 +192,17 @@ class TestMessageContentEncoding(object):
         assert "content-encoding" not in r.headers
         assert r.raw_content == b"foo"
 
+        with tutils.raises(ValueError):
+            r.encode("zopfli")
+        assert r.raw_content == b"foo"
+        assert "content-encoding" not in r.headers
+
 
 class TestMessageText(object):
     def test_simple(self):
-        r = tresp(content=b'\xc3\xbc')
-        assert r.raw_content == b"\xc3\xbc"
-        assert r.content == b"\xc3\xbc"
+        r = tresp(content=b'\xfc')
+        assert r.raw_content == b"\xfc"
+        assert r.content == b"\xfc"
         assert r.text == u"ü"
 
         r.encode("gzip")
@@ -189,8 +211,10 @@ class TestMessageText(object):
         assert r.text == u"ü"
 
         r.headers["content-type"] = "text/html; charset=latin1"
-        assert r.content == b"\xc3\xbc"
+        r.content = b"\xc3\xbc"
         assert r.text == u"Ã¼"
+        r.headers["content-type"] = "text/html; charset=utf8"
+        assert r.text == u"ü"
 
         r.encode("identity")
         r.raw_content = b"foo"
@@ -201,16 +225,29 @@ class TestMessageText(object):
             assert r.text
             assert e.call_count == 0
 
+    def test_guess_json(self):
+        r = tresp(content=b'"\xc3\xbc"')
+        r.headers["content-type"] = "application/json"
+        assert r.text == u'"ü"'
+
+    def test_none(self):
+        r = tresp(content=None)
+        assert r.text is None
+        r.text = b"foo"
+        assert r.text is not None
+        r.text = None
+        assert r.text is None
+
     def test_modify(self):
         r = tresp()
 
         r.text = u"ü"
-        assert r.raw_content == b"\xc3\xbc"
+        assert r.raw_content == b"\xfc"
 
-        r.headers["content-type"] = "text/html; charset=latin1"
+        r.headers["content-type"] = "text/html; charset=utf8"
         r.text = u"ü"
-        assert r.raw_content == b"\xfc"
-        assert r.headers["content-length"] == "1"
+        assert r.raw_content == b"\xc3\xbc"
+        assert r.headers["content-length"] == "2"
 
         r.encode("identity")
         with mock.patch("netlib.encoding.encode") as e:
@@ -224,12 +261,18 @@ class TestMessageText(object):
         r = tresp()
         r.headers["content-type"] = "text/html; charset=wtf"
         r.raw_content = b"foo"
-        assert r.text == u"foo"
+        with tutils.raises(ValueError):
+            assert r.text == u"foo"
+        assert r.get_text(strict=False) == u"foo"
 
     def test_cannot_decode(self):
         r = tresp()
+        r.headers["content-type"] = "text/html; charset=utf8"
         r.raw_content = b"\xFF"
-        assert r.text == u'\ufffd' if six.PY2 else '\udcff'
+        with tutils.raises(ValueError):
+            assert r.text
+
+        assert r.get_text(strict=False) == (u'\ufffd' if six.PY2 else '\udcff')  # parens: == binds tighter than if/else
 
     def test_cannot_encode(self):
         r = tresp()
@@ -237,9 +280,19 @@ class TestMessageText(object):
         assert "content-type" not in r.headers
         assert r.raw_content is None
 
-        r.headers["content-type"] = "text/html; charset=latin1"
+        r.headers["content-type"] = "text/html; charset=latin1; foo=bar"
         r.text = u"☃"
-        assert r.headers["content-type"] == "text/html; charset=utf-8"
+        assert r.headers["content-type"] == "text/html; charset=utf-8; foo=bar"
+        assert r.raw_content == b'\xe2\x98\x83'
+
+        r.headers["content-type"] = "gibberish"
+        r.text = u"☃"
+        assert r.headers["content-type"] == "text/plain; charset=utf-8"
+        assert r.raw_content == b'\xe2\x98\x83'
+
+        del r.headers["content-type"]
+        r.text = u"☃"
+        assert r.headers["content-type"] == "text/plain; charset=utf-8"
         assert r.raw_content == b'\xe2\x98\x83'
 
         r.headers["content-type"] = "text/html; charset=latin1"
-- 
cgit v1.2.3


From e6e39ce80f4daaf6a1d6f8d87616409486d358a5 Mon Sep 17 00:00:00 2001
From: Maximilian Hils <git@maximilianhils.com>
Date: Fri, 15 Jul 2016 23:46:12 -0700
Subject: preserve content-type parameter order

---
 netlib/http/headers.py           | 3 ++-
 test/netlib/http/test_headers.py | 4 +++-
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/netlib/http/headers.py b/netlib/http/headers.py
index b8aa212a..9fa7e1e6 100644
--- a/netlib/http/headers.py
+++ b/netlib/http/headers.py
@@ -2,6 +2,7 @@ from __future__ import absolute_import, print_function, division
 
 import re
 
+import collections
 import six
 from netlib import multidict
 from netlib import strutils
@@ -206,7 +207,7 @@ def parse_content_type(c):
     ts = parts[0].split("/", 1)
     if len(ts) != 2:
         return None
-    d = {}
+    d = collections.OrderedDict()
     if len(parts) == 2:
         for i in parts[1].split(";"):
             clause = i.split("=", 1)
diff --git a/test/netlib/http/test_headers.py b/test/netlib/http/test_headers.py
index 8462a5af..51537310 100644
--- a/test/netlib/http/test_headers.py
+++ b/test/netlib/http/test_headers.py
@@ -1,3 +1,5 @@
+import collections
+
 from netlib.http.headers import Headers, parse_content_type, assemble_content_type
 from netlib.tutils import raises
 
@@ -87,4 +89,4 @@ def test_assemble_content_type():
     p = assemble_content_type
     assert p("text", "html", {}) == "text/html"
     assert p("text", "html", {"charset": "utf8"}) == "text/html; charset=utf8"
-    assert p("text", "html", {"charset": "utf8", "foo": "bar"}) == "text/html; charset=utf8; foo=bar"
+    assert p("text", "html", collections.OrderedDict([("charset", "utf8"), ("foo", "bar")])) == "text/html; charset=utf8; foo=bar"
-- 
cgit v1.2.3