From 6032c4f2352260d32032800a2ff694339e2af6b2 Mon Sep 17 00:00:00 2001 From: Maximilian Hils Date: Sat, 2 Jul 2016 01:51:47 -0700 Subject: message.content -> .raw_content, implement .text This PR improves our handling of HTTP message body encodings: - The unaltered message body is now accessible as `.raw_content` - The "content-encoding"-decoded content (i.e. gzip removed) is now `.content`, as this is what we want in 99% of the cases. - `.text` now provides the "content-encoding"-decoded and then "content-type charset"-decoded message body. - The decoded values for `.content` and `.text` are cached, so that repeated access and `x.text = x.text` are cheap. - The `decoded()` context manager is now deprecated, as we can now just use `.content`. Similarly `HTTPMessage.get_decoded_content()` is deprecated. --- docs/dev/models.rst | 2 - mitmproxy/console/common.py | 37 +++---- mitmproxy/console/flowview.py | 33 +++--- mitmproxy/contentviews.py | 17 +-- mitmproxy/dump.py | 4 +- mitmproxy/filt.py | 8 +- mitmproxy/flow/master.py | 8 +- mitmproxy/flow/modules.py | 4 +- mitmproxy/models/http.py | 8 +- mitmproxy/protocol/http.py | 4 +- mitmproxy/web/app.py | 4 +- netlib/encoding.py | 97 +++++++++++------ netlib/http/http1/assemble.py | 4 +- netlib/http/message.py | 192 +++++++++++++++++++++++----------- netlib/http/request.py | 4 +- netlib/http/response.py | 5 +- test/mitmproxy/test_contentview.py | 22 ---- test/mitmproxy/test_examples.py | 10 +- test/mitmproxy/test_flow.py | 18 +--- test/mitmproxy/test_protocol_http2.py | 6 +- test/mitmproxy/tservers.py | 1 - test/netlib/http/test_message.py | 117 +++++++++++++++------ test/netlib/test_encoding.py | 40 +++---- 23 files changed, 377 insertions(+), 268 deletions(-) diff --git a/docs/dev/models.rst b/docs/dev/models.rst index 02f36f58..7260f1f7 100644 --- a/docs/dev/models.rst +++ b/docs/dev/models.rst @@ -56,8 +56,6 @@ Datastructures :special-members: :no-undoc-members: - .. autoclass:: decoded - .. 
automodule:: netlib.multidict .. autoclass:: MultiDictView diff --git a/mitmproxy/console/common.py b/mitmproxy/console/common.py index b450c19d..b4369c0c 100644 --- a/mitmproxy/console/common.py +++ b/mitmproxy/console/common.py @@ -7,7 +7,6 @@ import urwid.util import netlib from mitmproxy import flow -from mitmproxy import models from mitmproxy import utils from mitmproxy.console import signals from netlib import human @@ -259,26 +258,24 @@ def copy_flow_format_data(part, scope, flow): if scope in ("q", "a"): if flow.request.content is None: return None, "Request content is missing" - with models.decoded(flow.request): - if part == "h": - data += netlib.http.http1.assemble_request(flow.request) - elif part == "c": - data += flow.request.content - else: - raise ValueError("Unknown part: {}".format(part)) + if part == "h": + data += netlib.http.http1.assemble_request(flow.request) + elif part == "c": + data += flow.request.content + else: + raise ValueError("Unknown part: {}".format(part)) if scope == "a" and flow.request.content and flow.response: # Add padding between request and response data += "\r\n" * 2 if scope in ("s", "a") and flow.response: if flow.response.content is None: return None, "Response content is missing" - with models.decoded(flow.response): - if part == "h": - data += netlib.http.http1.assemble_response(flow.response) - elif part == "c": - data += flow.response.content - else: - raise ValueError("Unknown part: {}".format(part)) + if part == "h": + data += netlib.http.http1.assemble_response(flow.response) + elif part == "c": + data += flow.response.content + else: + raise ValueError("Unknown part: {}".format(part)) return data, False @@ -388,12 +385,12 @@ def ask_save_body(part, master, state, flow): elif part == "q" and request_has_content: ask_save_path( "Save request content", - flow.request.get_decoded_content() + flow.request.content ) elif part == "s" and response_has_content: ask_save_path( "Save response content", - 
flow.response.get_decoded_content() + flow.response.content ) else: signals.status_message.send(message="No content to save.") @@ -418,9 +415,9 @@ def format_flow(f, focus, extended=False, hostheader=False, marked=False): marked = marked, ) if f.response: - if f.response.content: - contentdesc = human.pretty_size(len(f.response.content)) - elif f.response.content is None: + if f.response.raw_content: + contentdesc = human.pretty_size(len(f.response.raw_content)) + elif f.response.raw_content is None: contentdesc = "[content missing]" else: contentdesc = "[no content]" diff --git a/mitmproxy/console/flowview.py b/mitmproxy/console/flowview.py index e9b23176..208b0d44 100644 --- a/mitmproxy/console/flowview.py +++ b/mitmproxy/console/flowview.py @@ -176,7 +176,7 @@ class FlowView(tabs.Tabs): self.show() def content_view(self, viewmode, message): - if message.content is None: + if message.raw_content is None: msg, body = "", [urwid.Text([("error", "[content missing]")])] return msg, body else: @@ -214,6 +214,12 @@ class FlowView(tabs.Tabs): ) description = description.replace("Raw", "Couldn't parse: falling back to Raw") + if message.content != message.raw_content: + description = "[decoded {enc}] {desc}".format( + enc=message.headers.get("content-encoding"), + desc=description + ) + # Give hint that you have to tab for the response. if description == "No content" and isinstance(message, models.HTTPRequest): description = "No request content (press tab to view response)" @@ -407,15 +413,14 @@ class FlowView(tabs.Tabs): ) ) if part == "r": - with models.decoded(message): - # Fix an issue caused by some editors when editing a - # request/response body. Many editors make it hard to save a - # file without a terminating newline on the last line. When - # editing message bodies, this can cause problems. For now, I - # just strip the newlines off the end of the body when we return - # from an editor. 
- c = self.master.spawn_editor(message.content or "") - message.content = c.rstrip("\n") + # Fix an issue caused by some editors when editing a + # request/response body. Many editors make it hard to save a + # file without a terminating newline on the last line. When + # editing message bodies, this can cause problems. For now, I + # just strip the newlines off the end of the body when we return + # from an editor. + c = self.master.spawn_editor(message.content or b"") + message.content = c.rstrip(b"\n") elif part == "f": if not message.urlencoded_form and message.content: signals.status_prompt_onekey.send( @@ -512,14 +517,10 @@ class FlowView(tabs.Tabs): signals.flow_change.send(self, flow = self.flow) def delete_body(self, t): - if t == "m": - val = None - else: - val = None if self.tab_offset == TAB_REQ: - self.flow.request.content = val + self.flow.request.content = None else: - self.flow.response.content = val + self.flow.response.content = None signals.flow_change.send(self, flow = self.flow) def keypress(self, size, key): diff --git a/mitmproxy/contentviews.py b/mitmproxy/contentviews.py index de88c9ea..c9ea14ba 100644 --- a/mitmproxy/contentviews.py +++ b/mitmproxy/contentviews.py @@ -618,15 +618,6 @@ def get_content_view(viewmode, data, **metadata): Raises: ContentViewException, if the content view threw an error. """ - msg = [] - - headers = metadata.get("headers", {}) - enc = headers.get("content-encoding") - if enc and enc != "identity": - decoded = encoding.decode(enc, data) - if decoded: - data = decoded - msg.append("[decoded %s]" % enc) try: ret = viewmode(data, **metadata) # Third-party viewers can fail in unexpected ways... 
@@ -637,8 +628,8 @@ def get_content_view(viewmode, data, **metadata): sys.exc_info()[2] ) if not ret: - ret = get("Raw")(data, **metadata) - msg.append("Couldn't parse: falling back to Raw") + desc = "Couldn't parse: falling back to Raw" + _, content = get("Raw")(data, **metadata) else: - msg.append(ret[0]) - return " ".join(msg), safe_to_print(ret[1]) + desc, content = ret + return desc, safe_to_print(content) diff --git a/mitmproxy/dump.py b/mitmproxy/dump.py index 6670be9b..ea242bba 100644 --- a/mitmproxy/dump.py +++ b/mitmproxy/dump.py @@ -290,10 +290,10 @@ class DumpMaster(flow.FlowMaster): code = click.style(str(code), fg=code_color, bold=True, blink=(code == 418)) reason = click.style(strutils.bytes_to_escaped_str(flow.response.reason), fg=code_color, bold=True) - if flow.response.content is None: + if flow.response.raw_content is None: size = "(content missing)" else: - size = human.pretty_size(len(flow.response.content)) + size = human.pretty_size(len(flow.response.raw_content)) size = click.style(size, bold=True) arrows = click.style("<<", bold=True) diff --git a/mitmproxy/filt.py b/mitmproxy/filt.py index b1b72aa7..95bae1ae 100644 --- a/mitmproxy/filt.py +++ b/mitmproxy/filt.py @@ -194,10 +194,10 @@ class FBod(_Rex): def __call__(self, f): if f.request and f.request.content: - if self.re.search(f.request.get_decoded_content()): + if self.re.search(f.request.content): return True if f.response and f.response.content: - if self.re.search(f.response.get_decoded_content()): + if self.re.search(f.response.content): return True return False @@ -208,7 +208,7 @@ class FBodRequest(_Rex): def __call__(self, f): if f.request and f.request.content: - if self.re.search(f.request.get_decoded_content()): + if self.re.search(f.request.content): return True @@ -218,7 +218,7 @@ class FBodResponse(_Rex): def __call__(self, f): if f.response and f.response.content: - if self.re.search(f.response.get_decoded_content()): + if self.re.search(f.response.content): return True 
diff --git a/mitmproxy/flow/master.py b/mitmproxy/flow/master.py index efb5d013..a4aa9a7e 100644 --- a/mitmproxy/flow/master.py +++ b/mitmproxy/flow/master.py @@ -16,7 +16,6 @@ from mitmproxy.flow import modules from mitmproxy.onboarding import app from mitmproxy.protocol import http_replay from mitmproxy.proxy.config import HostMatcher -from netlib import strutils class FlowMaster(controller.Master): @@ -348,13 +347,16 @@ class FlowMaster(controller.Master): return "Can't replay live request." if f.intercepted: return "Can't replay while intercepting..." - if f.request.content is None: + if f.request.raw_content is None: return "Can't replay request with missing content..." if f.request: f.backup() f.request.is_replay = True + + # TODO: We should be able to remove this. if "Content-Length" in f.request.headers: - f.request.headers["Content-Length"] = str(len(f.request.content)) + f.request.headers["Content-Length"] = str(len(f.request.raw_content)) + f.response = None f.error = None self.process_new_request(f) diff --git a/mitmproxy/flow/modules.py b/mitmproxy/flow/modules.py index 2998d259..85dff0f1 100644 --- a/mitmproxy/flow/modules.py +++ b/mitmproxy/flow/modules.py @@ -157,7 +157,7 @@ class StreamLargeBodies(object): expected_size = http1.expected_http_body_size( flow.request, flow.response if not is_request else None ) - if not r.content and not (0 <= expected_size <= self.max_size): + if not r.raw_content and not (0 <= expected_size <= self.max_size): # r.stream may already be a callable, which we want to preserve. 
r.stream = r.stream or True @@ -251,7 +251,7 @@ class ServerPlaybackState: if p[0] not in self.ignore_payload_params ) else: - key.append(str(r.content)) + key.append(str(r.raw_content)) if not self.ignore_host: key.append(r.host) diff --git a/mitmproxy/models/http.py b/mitmproxy/models/http.py index 01f5f1ee..a50808ef 100644 --- a/mitmproxy/models/http.py +++ b/mitmproxy/models/http.py @@ -1,9 +1,9 @@ from __future__ import absolute_import, print_function, division import cgi +import warnings from mitmproxy.models.flow import Flow -from netlib import encoding from netlib import version from netlib.http import Headers from netlib.http import Request @@ -20,10 +20,8 @@ class MessageMixin(object): header. Doesn't change the message iteself or its headers. """ - ce = self.headers.get("content-encoding") - if not self.content or ce not in encoding.ENCODINGS: - return self.content - return encoding.decode(ce, self.content) + warnings.warn(".get_decoded_content() is deprecated, please use .content directly instead.", DeprecationWarning) + return self.content class HTTPRequest(MessageMixin, Request): diff --git a/mitmproxy/protocol/http.py b/mitmproxy/protocol/http.py index 187c17f6..2c70f288 100644 --- a/mitmproxy/protocol/http.py +++ b/mitmproxy/protocol/http.py @@ -41,10 +41,10 @@ class _HttpTransmissionLayer(base.Layer): yield "this is a generator" # pragma: no cover def send_response(self, response): - if response.content is None: + if response.data.content is None: raise netlib.exceptions.HttpException("Cannot assemble flow with missing content") self.send_response_headers(response) - self.send_response_body(response, [response.content]) + self.send_response_body(response, [response.data.content]) def send_response_headers(self, response): raise NotImplementedError() diff --git a/mitmproxy/web/app.py b/mitmproxy/web/app.py index a2798472..50fbaed8 100644 --- a/mitmproxy/web/app.py +++ b/mitmproxy/web/app.py @@ -272,7 +272,7 @@ class FlowContent(RequestHandler): def 
get(self, flow_id, message): message = getattr(self.flow, message) - if not message.content: + if not message.raw_content: raise APIError(400, "No content.") content_encoding = message.headers.get("Content-Encoding", None) @@ -295,7 +295,7 @@ class FlowContent(RequestHandler): self.set_header("Content-Type", "application/text") self.set_header("X-Content-Type-Options", "nosniff") self.set_header("X-Frame-Options", "DENY") - self.write(message.content) + self.write(message.raw_content) class Events(RequestHandler): diff --git a/netlib/encoding.py b/netlib/encoding.py index 98502451..8b67b543 100644 --- a/netlib/encoding.py +++ b/netlib/encoding.py @@ -1,39 +1,62 @@ """ - Utility functions for decoding response bodies. +Utility functions for decoding response bodies. """ from __future__ import absolute_import + +import codecs from io import BytesIO import gzip import zlib +from typing import Union # noqa + -ENCODINGS = {"identity", "gzip", "deflate"} +def decode(obj, encoding, errors='strict'): + # type: (Union[str, bytes], str) -> Union[str, bytes] + """ + Decode the given input object + Returns: + The decoded value -def decode(e, content): - if not isinstance(content, bytes): - return None - encoding_map = { - "identity": identity, - "gzip": decode_gzip, - "deflate": decode_deflate, - } - if e not in encoding_map: - return None - return encoding_map[e](content) + Raises: + ValueError, if decoding fails. 
+ """ + try: + try: + return custom_decode[encoding](obj) + except KeyError: + return codecs.decode(obj, encoding, errors) + except Exception as e: + raise ValueError("{} when decoding {} with {}".format( + type(e).__name__, + repr(obj)[:10], + repr(encoding), + )) + + +def encode(obj, encoding, errors='strict'): + # type: (Union[str, bytes], str) -> Union[str, bytes] + """ + Encode the given input object + Returns: + The encoded value -def encode(e, content): - if not isinstance(content, bytes): - return None - encoding_map = { - "identity": identity, - "gzip": encode_gzip, - "deflate": encode_deflate, - } - if e not in encoding_map: - return None - return encoding_map[e](content) + Raises: + ValueError, if encoding fails. + """ + try: + try: + return custom_encode[encoding](obj) + except KeyError: + return codecs.encode(obj, encoding, errors) + except Exception as e: + raise ValueError("{} when encoding {} with {}".format( + type(e).__name__, + repr(obj)[:10], + repr(encoding), + )) def identity(content): @@ -46,10 +69,7 @@ def identity(content): def decode_gzip(content): gfile = gzip.GzipFile(fileobj=BytesIO(content)) - try: - return gfile.read() - except (IOError, EOFError): - return None + return gfile.read() def encode_gzip(content): @@ -70,12 +90,9 @@ def decode_deflate(content): http://bugs.python.org/issue5784 """ try: - try: - return zlib.decompress(content) - except zlib.error: - return zlib.decompress(content, -15) + return zlib.decompress(content) except zlib.error: - return None + return zlib.decompress(content, -15) def encode_deflate(content): @@ -84,4 +101,16 @@ def encode_deflate(content): """ return zlib.compress(content) -__all__ = ["ENCODINGS", "encode", "decode"] + +custom_decode = { + "identity": identity, + "gzip": decode_gzip, + "deflate": decode_deflate, +} +custom_encode = { + "identity": identity, + "gzip": encode_gzip, + "deflate": encode_deflate, +} + +__all__ = ["encode", "decode"] diff --git a/netlib/http/http1/assemble.py 
b/netlib/http/http1/assemble.py index 511328f1..e74732d2 100644 --- a/netlib/http/http1/assemble.py +++ b/netlib/http/http1/assemble.py @@ -5,7 +5,7 @@ from netlib import exceptions def assemble_request(request): - if request.content is None: + if request.data.content is None: raise exceptions.HttpException("Cannot assemble flow with missing content") head = assemble_request_head(request) body = b"".join(assemble_body(request.data.headers, [request.data.content])) @@ -19,7 +19,7 @@ def assemble_request_head(request): def assemble_response(response): - if response.content is None: + if response.data.content is None: raise exceptions.HttpException("Cannot assemble flow with missing content") head = assemble_response_head(response) body = b"".join(assemble_body(response.data.headers, [response.data.content])) diff --git a/netlib/http/message.py b/netlib/http/message.py index 0583c246..668198f8 100644 --- a/netlib/http/message.py +++ b/netlib/http/message.py @@ -52,7 +52,22 @@ class MessageData(basetypes.Serializable): return cls(**state) +class CachedDecode(object): + __slots__ = ["encoded", "encoding", "decoded"] + + def __init__(self, object, encoding, decoded): + self.encoded = object + self.encoding = encoding + self.decoded = decoded + +no_cached_decode = CachedDecode(None, None, None) + + class Message(basetypes.Serializable): + def __init__(self): + self._content_cache = no_cached_decode # type: CachedDecode + self._text_cache = no_cached_decode # type: CachedDecode + def __eq__(self, other): if isinstance(other, Message): return self.data == other.data @@ -90,19 +105,65 @@ class Message(basetypes.Serializable): self.data.headers = h @property - def content(self): + def raw_content(self): + # type: () -> bytes """ The raw (encoded) HTTP message body - See also: :py:attr:`text` + See also: :py:attr:`content`, :py:class:`text` """ return self.data.content - @content.setter - def content(self, content): + @raw_content.setter + def raw_content(self, content): 
self.data.content = content - if isinstance(content, bytes): - self.headers["content-length"] = str(len(content)) + + @property + def content(self): + # type: () -> bytes + """ + The HTTP message body decoded with the content-encoding header (e.g. gzip) + + See also: :py:class:`raw_content`, :py:attr:`text` + """ + ce = self.headers.get("content-encoding") + cached = ( + self._content_cache.encoded == self.raw_content and + self._content_cache.encoding == ce + ) + if not cached: + try: + if not ce: + raise ValueError() + decoded = encoding.decode(self.raw_content, ce) + except ValueError: + decoded = self.raw_content + self._content_cache = CachedDecode(self.raw_content, ce, decoded) + return self._content_cache.decoded + + @content.setter + def content(self, value): + ce = self.headers.get("content-encoding") + cached = ( + self._content_cache.decoded == value and + self._content_cache.encoding == ce + ) + if not cached: + try: + if not ce: + raise ValueError() + encoded = encoding.encode(value, ce) + except ValueError: + # Do we have an unknown content-encoding? + # If so, we want to remove it. + if value and ce: + self.headers.pop("content-encoding", None) + ce = None + encoded = value + self._content_cache = CachedDecode(encoded, ce, value) + self.raw_content = self._content_cache.encoded + if isinstance(self.raw_content, bytes): + self.headers["content-length"] = str(len(self.raw_content)) @property def http_version(self): @@ -137,56 +198,81 @@ class Message(basetypes.Serializable): def timestamp_end(self, timestamp_end): self.data.timestamp_end = timestamp_end + def _get_content_type_charset(self): + # type: () -> Optional[str] + ct = headers.parse_content_type(self.headers.get("content-type", "")) + if ct: + return ct[2].get("charset") + @property def text(self): + # type: () -> six.text_type """ - The decoded HTTP message body. - Decoded contents are not cached, so accessing this attribute repeatedly is relatively expensive. - - .. 
note:: - This is not implemented yet. + The HTTP message body decoded with both content-encoding header (e.g. gzip) + and content-type header charset. - See also: :py:attr:`content`, :py:class:`decoded` + See also: :py:attr:`content`, :py:class:`raw_content` """ # This attribute should be called text, because that's what requests does. - raise NotImplementedError() + enc = self._get_content_type_charset() + + # We may also want to check for HTML meta tags here at some point. + + cached = ( + self._text_cache.encoded == self.content and + self._text_cache.encoding == enc + ) + if not cached: + try: + if not enc: + raise ValueError() + decoded = encoding.decode(self.content, enc) + except ValueError: + decoded = self.content.decode("utf8", "replace" if six.PY2 else "surrogateescape") + self._text_cache = CachedDecode(self.content, enc, decoded) + return self._text_cache.decoded @text.setter def text(self, text): - raise NotImplementedError() + enc = self._get_content_type_charset() + cached = ( + self._text_cache.decoded == text and + self._text_cache.encoding == enc + ) + if not cached: + try: + if not enc: + raise ValueError() + encoded = encoding.encode(text, enc) + except ValueError: + # Do we have an unknown content-type charset? + # If so, we want to replace it with utf8. + if text and enc: + self.headers["content-type"] = re.sub( + "charset=[^;]+", + "charset=utf-8", + self.headers["content-type"] + ) + encoded = text.encode("utf8", "replace" if six.PY2 else "surrogateescape") + self._text_cache = CachedDecode(encoded, enc, text) + self.content = self._text_cache.encoded def decode(self): """ - Decodes body based on the current Content-Encoding header, then - removes the header. If there is no Content-Encoding header, no - action is taken. - - Returns: - True, if decoding succeeded. - False, otherwise. + Decodes body based on the current Content-Encoding header, then + removes the header. If there is no Content-Encoding header, no + action is taken. 
""" - ce = self.headers.get("content-encoding") - data = encoding.decode(ce, self.content) - if data is None: - return False - self.content = data + self.raw_content = self.content self.headers.pop("content-encoding", None) - return True def encode(self, e): """ - Encodes body with the encoding e, where e is "gzip", "deflate" or "identity". - - Returns: - True, if decoding succeeded. - False, otherwise. + Encodes body with the encoding e, where e is "gzip", "deflate" or "identity". """ - data = encoding.encode(e, self.content) - if data is None: - return False - self.content = data + self.decode() # remove the current encoding self.headers["content-encoding"] = e - return True + self.content = self.raw_content def replace(self, pattern, repl, flags=0): """ @@ -203,10 +289,9 @@ class Message(basetypes.Serializable): repl = strutils.escaped_str_to_bytes(repl) replacements = 0 if self.content: - with decoded(self): - self.content, replacements = re.subn( - pattern, repl, self.content, flags=flags - ) + self.content, replacements = re.subn( + pattern, repl, self.content, flags=flags + ) replacements += self.headers.replace(pattern, repl, flags) return replacements @@ -225,29 +310,16 @@ class Message(basetypes.Serializable): class decoded(object): """ - A context manager that decodes a request or response, and then - re-encodes it with the same encoding after execution of the block. - - Example: - - .. code-block:: python - - with decoded(request): - request.content = request.content.replace("foo", "bar") + Deprecated: You can now directly use :py:attr:`content`. + :py:attr:`raw_content` has the encoded content. """ def __init__(self, message): - self.message = message - ce = message.headers.get("content-encoding") - if ce in encoding.ENCODINGS: - self.ce = ce - else: - self.ce = None + warnings.warn("decoded() is deprecated, you can now directly use .content instead. 
" + ".raw_content has the encoded content.", DeprecationWarning) def __enter__(self): - if self.ce: - self.message.decode() + pass def __exit__(self, type, value, tb): - if self.ce: - self.message.encode(self.ce) + pass \ No newline at end of file diff --git a/netlib/http/request.py b/netlib/http/request.py index d9f4ed00..4ce94549 100644 --- a/netlib/http/request.py +++ b/netlib/http/request.py @@ -5,7 +5,6 @@ import re import six from six.moves import urllib -from netlib import encoding from netlib import multidict from netlib import strutils from netlib.http import multipart @@ -44,6 +43,7 @@ class Request(message.Message): An HTTP request. """ def __init__(self, *args, **kwargs): + super(Request, self).__init__() self.data = RequestData(*args, **kwargs) def __repr__(self): @@ -327,7 +327,7 @@ class Request(message.Message): self.headers["accept-encoding"] = ( ', '.join( e - for e in encoding.ENCODINGS + for e in {"gzip", "identity", "deflate"} if e in accept_encoding ) ) diff --git a/netlib/http/response.py b/netlib/http/response.py index 17d69418..d2273edd 100644 --- a/netlib/http/response.py +++ b/netlib/http/response.py @@ -30,13 +30,14 @@ class Response(message.Message): An HTTP response. 
""" def __init__(self, *args, **kwargs): + super(Response, self).__init__() self.data = ResponseData(*args, **kwargs) def __repr__(self): - if self.content: + if self.raw_content: details = "{}, {}".format( self.headers.get("content-type", "unknown content type"), - human.pretty_size(len(self.content)) + human.pretty_size(len(self.raw_content)) ) else: details = "no content" diff --git a/test/mitmproxy/test_contentview.py b/test/mitmproxy/test_contentview.py index 52fceeac..4b099d8d 100644 --- a/test/mitmproxy/test_contentview.py +++ b/test/mitmproxy/test_contentview.py @@ -209,28 +209,6 @@ Larry headers=Headers() ) - r = cv.get_content_view( - cv.get("Auto"), - encoding.encode('gzip', b"[1, 2, 3]"), - headers=Headers( - content_type="application/json", - content_encoding="gzip" - ) - ) - assert "decoded gzip" in r[0] - assert "JSON" in r[0] - - r = cv.get_content_view( - cv.get("XML"), - encoding.encode('gzip', b"[1, 2, 3]"), - headers=Headers( - content_type="application/json", - content_encoding="gzip" - ) - ) - assert "decoded gzip" in r[0] - assert "Raw" in r[0] - def test_add_cv(self): class TestContentView(cv.View): name = "test" diff --git a/test/mitmproxy/test_examples.py b/test/mitmproxy/test_examples.py index 607d6faf..22d3c425 100644 --- a/test/mitmproxy/test_examples.py +++ b/test/mitmproxy/test_examples.py @@ -73,9 +73,9 @@ def test_add_header(): def test_custom_contentviews(): with example("custom_contentviews.py") as ex: pig = ex.ctx.contentview - _, fmt = pig("test!") - assert any('esttay!' in val[0][1] for val in fmt) - assert not pig("gobbledygook") + _, fmt = pig(b"test!") + assert any(b'esttay!' 
in val[0][1] for val in fmt) + assert not pig(b"gobbledygook") def test_iframe_injector(): @@ -103,7 +103,7 @@ def test_modify_form(): def test_modify_querystring(): - flow = tutils.tflow(req=netutils.treq(path="/search?q=term")) + flow = tutils.tflow(req=netutils.treq(path=b"/search?q=term")) with example("modify_querystring.py") as ex: ex.run("request", flow) assert flow.request.query["mitmproxy"] == "rocks" @@ -126,7 +126,7 @@ def test_modify_response_body(): def test_redirect_requests(): - flow = tutils.tflow(req=netutils.treq(host="example.org")) + flow = tutils.tflow(req=netutils.treq(host=b"example.org")) with example("redirect_requests.py") as ex: ex.run("request", flow) assert flow.request.host == "mitmproxy.org" diff --git a/test/mitmproxy/test_flow.py b/test/mitmproxy/test_flow.py index 9eaab9aa..5753e728 100644 --- a/test/mitmproxy/test_flow.py +++ b/test/mitmproxy/test_flow.py @@ -518,13 +518,13 @@ class TestFlow(object): f.replace("foo", "bar") - assert f.request.content != "abarb" + assert f.request.raw_content != "abarb" f.request.decode() - assert f.request.content == "abarb" + assert f.request.raw_content == "abarb" - assert f.response.content != "abarb" + assert f.response.raw_content != "abarb" f.response.decode() - assert f.response.content == "abarb" + assert f.response.raw_content == "abarb" class TestState: @@ -1102,16 +1102,6 @@ class TestRequest: r.constrain_encoding() assert "oink" not in r.headers["accept-encoding"] - def test_get_decoded_content(self): - r = HTTPRequest.wrap(netlib.tutils.treq()) - r.content = None - r.headers["content-encoding"] = "identity" - assert r.get_decoded_content() is None - - r.content = "falafel" - r.encode("gzip") - assert r.get_decoded_content() == "falafel" - def test_get_content_type(self): resp = HTTPResponse.wrap(netlib.tutils.tresp()) resp.headers = Headers(content_type="text/plain") diff --git a/test/mitmproxy/test_protocol_http2.py b/test/mitmproxy/test_protocol_http2.py index 932c8df2..6e021b2c 
100644 --- a/test/mitmproxy/test_protocol_http2.py +++ b/test/mitmproxy/test_protocol_http2.py @@ -120,7 +120,7 @@ class _Http2TestBase(object): client.wfile.flush() # read CONNECT response - while client.rfile.readline() != "\r\n": + while client.rfile.readline() != b"\r\n": pass client.convert_to_ssl(alpn_protos=[b'h2']) @@ -197,7 +197,7 @@ class TestSimple(_Http2TestBase, _Http2ServerBase): (':path', '/'), ('ClIeNt-FoO', 'client-bar-1'), ('ClIeNt-FoO', 'client-bar-2'), - ], body='my request body echoed back to me') + ], body=b'my request body echoed back to me') done = False while not done: @@ -269,7 +269,7 @@ class TestWithBodies(_Http2TestBase, _Http2ServerBase): (':scheme', 'https'), (':path', '/'), ], - body='foobar with request body', + body=b'foobar with request body', ) done = False diff --git a/test/mitmproxy/tservers.py b/test/mitmproxy/tservers.py index 51f4b4e2..6d8730f5 100644 --- a/test/mitmproxy/tservers.py +++ b/test/mitmproxy/tservers.py @@ -11,7 +11,6 @@ import pathod.pathoc from mitmproxy import flow, controller from mitmproxy.cmdline import APP_HOST, APP_PORT -from netlib import strutils testapp = flask.Flask(__name__) diff --git a/test/netlib/http/test_message.py b/test/netlib/http/test_message.py index f5bf7f0c..aecde1ec 100644 --- a/test/netlib/http/test_message.py +++ b/test/netlib/http/test_message.py @@ -1,7 +1,8 @@ # -*- coding: utf-8 -*- from __future__ import absolute_import, print_function, division -from netlib.http import decoded +import six + from netlib.tutils import tresp @@ -76,6 +77,9 @@ class TestMessage(object): resp.content = b"" assert resp.data.content == b"" assert resp.headers["content-length"] == "0" + resp.raw_content = b"bar" + assert resp.data.content == b"bar" + assert resp.headers["content-length"] == "0" def test_content_basic(self): _test_passthrough_attr(tresp(), "content") @@ -93,61 +97,108 @@ class TestMessage(object): _test_decoded_attr(tresp(), "http_version") -class TestDecodedDecorator(object): - +class 
TestMessageContentEncoding(object): def test_simple(self): r = tresp() - assert r.content == b"message" + assert r.raw_content == b"message" assert "content-encoding" not in r.headers - assert r.encode("gzip") + r.encode("gzip") assert r.headers["content-encoding"] - assert r.content != b"message" - with decoded(r): - assert "content-encoding" not in r.headers - assert r.content == b"message" - assert r.headers["content-encoding"] - assert r.content != b"message" + assert r.raw_content != b"message" + assert r.content == b"message" + assert r.raw_content != b"message" def test_modify(self): r = tresp() assert "content-encoding" not in r.headers - assert r.encode("gzip") - - with decoded(r): - r.content = b"foo" + r.encode("gzip") - assert r.content != b"foo" + r.content = b"foo" + assert r.raw_content != b"foo" r.decode() - assert r.content == b"foo" + assert r.raw_content == b"foo" def test_unknown_ce(self): r = tresp() r.headers["content-encoding"] = "zopfli" - r.content = b"foo" - with decoded(r): - assert r.headers["content-encoding"] - assert r.content == b"foo" - assert r.headers["content-encoding"] + r.raw_content = b"foo" assert r.content == b"foo" + assert r.headers["content-encoding"] def test_cannot_decode(self): r = tresp() - assert r.encode("gzip") - r.content = b"foo" - with decoded(r): - assert r.headers["content-encoding"] - assert r.content == b"foo" + r.encode("gzip") + r.raw_content = b"foo" + assert r.content == b"foo" assert r.headers["content-encoding"] - assert r.content != b"foo" r.decode() - assert r.content == b"foo" + assert r.raw_content == b"foo" + assert "content-encoding" not in r.headers def test_cannot_encode(self): r = tresp() - assert r.encode("gzip") - with decoded(r): - r.content = None + r.encode("gzip") + r.content = None + assert r.headers["content-encoding"] + assert r.raw_content is None + r.headers["content-encoding"] = "zopfli" + r.content = b"foo" assert "content-encoding" not in r.headers - assert r.content is None + 
assert r.raw_content == b"foo" + + +class TestMessageText(object): + def test_simple(self): + r = tresp(content=b'\xc3\xbc') + assert r.raw_content == b"\xc3\xbc" + assert r.content == b"\xc3\xbc" + assert r.text == u"ü" + + r.encode("gzip") + assert r.text == u"ü" + r.decode() + assert r.text == u"ü" + + r.headers["content-type"] = "text/html; charset=latin1" + assert r.content == b"\xc3\xbc" + assert r.text == u"ü" + + def test_modify(self): + r = tresp() + + r.text = u"ü" + assert r.raw_content == b"\xc3\xbc" + + r.headers["content-type"] = "text/html; charset=latin1" + r.text = u"ü" + assert r.raw_content == b"\xfc" + assert r.headers["content-length"] == "1" + + def test_unknown_ce(self): + r = tresp() + r.headers["content-type"] = "text/html; charset=wtf" + r.raw_content = b"foo" + assert r.text == u"foo" + + def test_cannot_decode(self): + r = tresp() + r.raw_content = b"\xFF" + assert r.text == u'\ufffd' if six.PY2 else '\udcff' + + def test_cannot_encode(self): + r = tresp() + r.content = None + assert "content-type" not in r.headers + assert r.raw_content is None + + r.headers["content-type"] = "text/html; charset=latin1" + r.text = u"☃" + assert r.headers["content-type"] == "text/html; charset=utf-8" + assert r.raw_content == b'\xe2\x98\x83' + + r.headers["content-type"] = "text/html; charset=latin1" + r.text = u'\udcff' + assert r.headers["content-type"] == "text/html; charset=utf-8" + assert r.raw_content == b'\xed\xb3\xbf' if six.PY2 else b"\xFF" diff --git a/test/netlib/test_encoding.py b/test/netlib/test_encoding.py index 0ff1aad1..de10fc48 100644 --- a/test/netlib/test_encoding.py +++ b/test/netlib/test_encoding.py @@ -1,37 +1,39 @@ -from netlib import encoding +from netlib import encoding, tutils def test_identity(): - assert b"string" == encoding.decode("identity", b"string") - assert b"string" == encoding.encode("identity", b"string") - assert not encoding.encode("nonexistent", b"string") - assert not encoding.decode("nonexistent encoding", 
b"string") + assert b"string" == encoding.decode(b"string", "identity") + assert b"string" == encoding.encode(b"string", "identity") + with tutils.raises(ValueError): + encoding.encode(b"string", "nonexistent encoding") def test_gzip(): assert b"string" == encoding.decode( - "gzip", encoding.encode( - "gzip", - b"string" - ) + b"string", + "gzip" + ), + "gzip" ) - assert encoding.decode("gzip", b"bogus") is None + with tutils.raises(ValueError): + encoding.decode(b"bogus", "gzip") def test_deflate(): assert b"string" == encoding.decode( - "deflate", encoding.encode( - "deflate", - b"string" - ) + b"string", + "deflate" + ), + "deflate" ) assert b"string" == encoding.decode( - "deflate", encoding.encode( - "deflate", - b"string" - )[2:-4] + b"string", + "deflate" + )[2:-4], + "deflate" ) - assert encoding.decode("deflate", b"bogus") is None + with tutils.raises(ValueError): + encoding.decode(b"bogus", "deflate") -- cgit v1.2.3 From dbf7cb1a442e2c0823d853ca310395048496996d Mon Sep 17 00:00:00 2001 From: Maximilian Hils Date: Sat, 2 Jul 2016 02:01:46 -0700 Subject: update examples: no decoded() anymore :tada: --- examples/custom_contentviews.py | 2 +- examples/har_extractor.py | 2 +- examples/iframe_injector.py | 24 +++++++++++------------- examples/modify_response_body.py | 10 ++++------ examples/redirect_requests.py | 4 ++-- examples/sslstrip.py | 38 ++++++++++++++++++-------------------- examples/upsidedownternet.py | 20 +++++++++----------- 7 files changed, 46 insertions(+), 54 deletions(-) diff --git a/examples/custom_contentviews.py b/examples/custom_contentviews.py index 05ebeb69..8a57bf74 100644 --- a/examples/custom_contentviews.py +++ b/examples/custom_contentviews.py @@ -20,7 +20,7 @@ class ViewPigLatin(contentviews.View): docinfo = d.getroottree().docinfo def piglify(src): - words = string.split(src) + words = src.split() ret = '' for word in words: idx = -1 diff --git a/examples/har_extractor.py b/examples/har_extractor.py index d6b50c21..54aa84d3 100644 
--- a/examples/har_extractor.py +++ b/examples/har_extractor.py @@ -127,7 +127,7 @@ def response(context, flow): for k, v in flow.request.query or {}] response_body_size = len(flow.response.content) - response_body_decoded_size = len(flow.response.get_decoded_content()) + response_body_decoded_size = len(flow.response.content) response_body_compression = response_body_decoded_size - response_body_size entry = HAR.entries({ diff --git a/examples/iframe_injector.py b/examples/iframe_injector.py index 9495da93..5803b4c1 100644 --- a/examples/iframe_injector.py +++ b/examples/iframe_injector.py @@ -2,7 +2,6 @@ # (this script works best with --anticache) import sys from bs4 import BeautifulSoup -from mitmproxy.models import decoded def start(context): @@ -14,15 +13,14 @@ def start(context): def response(context, flow): if flow.request.host in context.iframe_url: return - with decoded(flow.response): # Remove content encoding (gzip, ...) - html = BeautifulSoup(flow.response.content, "lxml") - if html.body: - iframe = html.new_tag( - "iframe", - src=context.iframe_url, - frameborder=0, - height=0, - width=0) - html.body.insert(0, iframe) - flow.response.content = str(html) - context.log("Iframe inserted.") + html = BeautifulSoup(flow.response.content, "lxml") + if html.body: + iframe = html.new_tag( + "iframe", + src=context.iframe_url, + frameborder=0, + height=0, + width=0) + html.body.insert(0, iframe) + flow.response.content = str(html) + context.log("Iframe inserted.") diff --git a/examples/modify_response_body.py b/examples/modify_response_body.py index 3034892e..03dfeaa4 100644 --- a/examples/modify_response_body.py +++ b/examples/modify_response_body.py @@ -2,8 +2,6 @@ # (this script works best with --anticache) import sys -from mitmproxy.models import decoded - def start(context): if len(sys.argv) != 3: @@ -14,7 +12,7 @@ def start(context): def response(context, flow): - with decoded(flow.response): # automatically decode gzipped responses. 
- flow.response.content = flow.response.content.replace( - context.old, - context.new) + flow.response.content = flow.response.content.replace( + context.old, + context.new + ) diff --git a/examples/redirect_requests.py b/examples/redirect_requests.py index d7db3f1c..bb1e6952 100644 --- a/examples/redirect_requests.py +++ b/examples/redirect_requests.py @@ -13,9 +13,9 @@ def request(context, flow): # Method 1: Answer with a locally generated response if flow.request.pretty_host.endswith("example.com"): resp = HTTPResponse( - "HTTP/1.1", 200, "OK", + b"HTTP/1.1", 200, b"OK", Headers(Content_Type="text/html"), - "helloworld") + b"helloworld") flow.reply.send(resp) # Method 2: Redirect the request to a different server diff --git a/examples/sslstrip.py b/examples/sslstrip.py index 8dde8e3e..77e91cc9 100644 --- a/examples/sslstrip.py +++ b/examples/sslstrip.py @@ -1,4 +1,3 @@ -from netlib.http import decoded import re from six.moves import urllib @@ -19,22 +18,21 @@ def request(context, flow): def response(context, flow): - with decoded(flow.response): - flow.request.headers.pop('Strict-Transport-Security', None) - flow.request.headers.pop('Public-Key-Pins', None) - - # strip links in response body - flow.response.content = flow.response.content.replace('https://', 'http://') - - # strip links in 'Location' header - if flow.response.headers.get('Location', '').startswith('https://'): - location = flow.response.headers['Location'] - hostname = urllib.parse.urlparse(location).hostname - if hostname: - context.secure_hosts.add(hostname) - flow.response.headers['Location'] = location.replace('https://', 'http://', 1) - - # strip secure flag from 'Set-Cookie' headers - cookies = flow.response.headers.get_all('Set-Cookie') - cookies = [re.sub(r';\s*secure\s*', '', s) for s in cookies] - flow.response.headers.set_all('Set-Cookie', cookies) + flow.request.headers.pop('Strict-Transport-Security', None) + flow.request.headers.pop('Public-Key-Pins', None) + + # strip links in 
response body + flow.response.content = flow.response.content.replace('https://', 'http://') + + # strip links in 'Location' header + if flow.response.headers.get('Location', '').startswith('https://'): + location = flow.response.headers['Location'] + hostname = urllib.parse.urlparse(location).hostname + if hostname: + context.secure_hosts.add(hostname) + flow.response.headers['Location'] = location.replace('https://', 'http://', 1) + + # strip secure flag from 'Set-Cookie' headers + cookies = flow.response.headers.get_all('Set-Cookie') + cookies = [re.sub(r';\s*secure\s*', '', s) for s in cookies] + flow.response.headers.set_all('Set-Cookie', cookies) diff --git a/examples/upsidedownternet.py b/examples/upsidedownternet.py index 9aac9f05..58ed53d7 100644 --- a/examples/upsidedownternet.py +++ b/examples/upsidedownternet.py @@ -1,17 +1,15 @@ from six.moves import cStringIO as StringIO from PIL import Image -from mitmproxy.models import decoded def response(context, flow): if flow.response.headers.get("content-type", "").startswith("image"): - with decoded(flow.response): # automatically decode gzipped responses. - try: - s = StringIO(flow.response.content) - img = Image.open(s).rotate(180) - s2 = StringIO() - img.save(s2, "png") - flow.response.content = s2.getvalue() - flow.response.headers["content-type"] = "image/png" - except: # Unknown image types etc. - pass + try: + s = StringIO(flow.response.content) + img = Image.open(s).rotate(180) + s2 = StringIO() + img.save(s2, "png") + flow.response.content = s2.getvalue() + flow.response.headers["content-type"] = "image/png" + except: # Unknown image types etc. 
+ pass -- cgit v1.2.3 From d9f797e7e6936809171d9c99144fb5ded3ee131f Mon Sep 17 00:00:00 2001 From: Maximilian Hils Date: Sat, 2 Jul 2016 02:11:00 -0700 Subject: make the linter happy --- mitmproxy/contentviews.py | 1 - netlib/http/message.py | 2 +- test/mitmproxy/test_contentview.py | 1 - 3 files changed, 1 insertion(+), 3 deletions(-) diff --git a/mitmproxy/contentviews.py b/mitmproxy/contentviews.py index c9ea14ba..6072f959 100644 --- a/mitmproxy/contentviews.py +++ b/mitmproxy/contentviews.py @@ -31,7 +31,6 @@ from six import BytesIO from mitmproxy import exceptions from mitmproxy.contrib import jsbeautifier from mitmproxy.contrib.wbxml import ASCommandResponse -from netlib import encoding from netlib import http from netlib import multidict from netlib.http import url diff --git a/netlib/http/message.py b/netlib/http/message.py index 668198f8..28278bd2 100644 --- a/netlib/http/message.py +++ b/netlib/http/message.py @@ -322,4 +322,4 @@ class decoded(object): pass def __exit__(self, type, value, tb): - pass \ No newline at end of file + pass diff --git a/test/mitmproxy/test_contentview.py b/test/mitmproxy/test_contentview.py index 4b099d8d..7037745d 100644 --- a/test/mitmproxy/test_contentview.py +++ b/test/mitmproxy/test_contentview.py @@ -1,6 +1,5 @@ from mitmproxy.exceptions import ContentViewException from netlib.http import Headers -from netlib import encoding from netlib.http import url from netlib import multidict -- cgit v1.2.3 From 2f8a1fd2cb1374941f436f36bbfa0d0b3d9213c7 Mon Sep 17 00:00:00 2001 From: Maximilian Hils Date: Sat, 2 Jul 2016 03:03:42 -0700 Subject: tests++ --- netlib/http/message.py | 6 +++--- test/netlib/http/test_message.py | 44 +++++++++++++++++++++++++++++++++++++++- 2 files changed, 46 insertions(+), 4 deletions(-) diff --git a/netlib/http/message.py b/netlib/http/message.py index 28278bd2..ca3a4145 100644 --- a/netlib/http/message.py +++ b/netlib/http/message.py @@ -314,12 +314,12 @@ class decoded(object): :py:attr:`raw_content` has 
the encoded content. """ - def __init__(self, message): + def __init__(self, message): # pragma no cover warnings.warn("decoded() is deprecated, you can now directly use .content instead. " ".raw_content has the encoded content.", DeprecationWarning) - def __enter__(self): + def __enter__(self): # pragma no cover pass - def __exit__(self, type, value, tb): + def __exit__(self, type, value, tb): # pragma no cover pass diff --git a/test/netlib/http/test_message.py b/test/netlib/http/test_message.py index aecde1ec..e1707a91 100644 --- a/test/netlib/http/test_message.py +++ b/test/netlib/http/test_message.py @@ -1,9 +1,11 @@ # -*- coding: utf-8 -*- from __future__ import absolute_import, print_function, division +import mock import six from netlib.tutils import tresp +from netlib import http def _test_passthrough_attr(message, attr): @@ -69,6 +71,15 @@ class TestMessage(object): assert resp != 0 + def test_hash(self): + resp = tresp() + assert hash(resp) + + def test_serializable(self): + resp = tresp() + resp2 = http.Response.from_state(resp.get_state()) + assert resp == resp2 + def test_content_length_update(self): resp = tresp() resp.content = b"foo" @@ -93,7 +104,7 @@ class TestMessage(object): def test_timestamp_end(self): _test_passthrough_attr(tresp(), "timestamp_end") - def teste_http_version(self): + def test_http_version(self): _test_decoded_attr(tresp(), "http_version") @@ -109,6 +120,14 @@ class TestMessageContentEncoding(object): assert r.content == b"message" assert r.raw_content != b"message" + r.raw_content = b"foo" + with mock.patch("netlib.encoding.decode") as e: + assert r.content + assert e.call_count == 1 + e.reset_mock() + assert r.content + assert e.call_count == 0 + def test_modify(self): r = tresp() assert "content-encoding" not in r.headers @@ -119,6 +138,13 @@ class TestMessageContentEncoding(object): r.decode() assert r.raw_content == b"foo" + r.encode("identity") + with mock.patch("netlib.encoding.encode") as e: + r.content = b"foo" + 
assert e.call_count == 0 + r.content = b"bar" + assert e.call_count == 1 + def test_unknown_ce(self): r = tresp() r.headers["content-encoding"] = "zopfli" @@ -165,6 +191,15 @@ class TestMessageText(object): assert r.content == b"\xc3\xbc" assert r.text == u"ü" + r.encode("identity") + r.raw_content = b"foo" + with mock.patch("netlib.encoding.decode") as e: + assert r.text + assert e.call_count == 2 + e.reset_mock() + assert r.text + assert e.call_count == 0 + def test_modify(self): r = tresp() @@ -176,6 +211,13 @@ class TestMessageText(object): assert r.raw_content == b"\xfc" assert r.headers["content-length"] == "1" + r.encode("identity") + with mock.patch("netlib.encoding.encode") as e: + r.text = u"ü" + assert e.call_count == 0 + r.text = u"ä" + assert e.call_count == 2 + def test_unknown_ce(self): r = tresp() r.headers["content-type"] = "text/html; charset=wtf" -- cgit v1.2.3 From a6b3551934e2b8768177d6831ca08f97f5bdae44 Mon Sep 17 00:00:00 2001 From: Maximilian Hils Date: Mon, 4 Jul 2016 13:58:09 -0700 Subject: raise ValueError if content-encoding is invalid --- mitmproxy/console/common.py | 40 +++++++++++++++++++++++++++----------- mitmproxy/console/flowview.py | 38 +++++++++++++++++++++++++----------- mitmproxy/dump.py | 13 +++++++++---- mitmproxy/filt.py | 36 ++++++++++++++++++++++------------ mitmproxy/flow/export.py | 18 +++++++++++------ netlib/http/message.py | 42 +++++++++++++++++++++++++++------------- netlib/http/request.py | 12 +++++++++--- netlib/wsgi.py | 6 +++++- test/netlib/http/test_message.py | 18 +++++++++-------- 9 files changed, 154 insertions(+), 69 deletions(-) diff --git a/mitmproxy/console/common.py b/mitmproxy/console/common.py index b4369c0c..ef220b4c 100644 --- a/mitmproxy/console/common.py +++ b/mitmproxy/console/common.py @@ -256,24 +256,34 @@ def copy_flow_format_data(part, scope, flow): else: data = "" if scope in ("q", "a"): - if flow.request.content is None: + request = flow.request.copy() + try: + request.decode() + except 
ValueError: + pass + if request.raw_content is None: return None, "Request content is missing" if part == "h": - data += netlib.http.http1.assemble_request(flow.request) + data += netlib.http.http1.assemble_request(request) elif part == "c": - data += flow.request.content + data += request.raw_content else: raise ValueError("Unknown part: {}".format(part)) - if scope == "a" and flow.request.content and flow.response: + if scope == "a" and flow.request.raw_content and flow.response: # Add padding between request and response data += "\r\n" * 2 if scope in ("s", "a") and flow.response: - if flow.response.content is None: + response = flow.response.copy() + try: + response.decode() + except ValueError: + pass + if response.raw_content is None: return None, "Response content is missing" if part == "h": - data += netlib.http.http1.assemble_response(flow.response) + data += netlib.http.http1.assemble_response(response) elif part == "c": - data += flow.response.content + data += response.raw_content else: raise ValueError("Unknown part: {}".format(part)) return data, False @@ -361,8 +371,8 @@ def ask_save_body(part, master, state, flow): "q" (request), "s" (response) or None (ask user if necessary). 
""" - request_has_content = flow.request and flow.request.content - response_has_content = flow.response and flow.response.content + request_has_content = flow.request and flow.request.raw_content + response_has_content = flow.response and flow.response.raw_content if part is None: # We first need to determine whether we want to save the request or the @@ -383,14 +393,22 @@ def ask_save_body(part, master, state, flow): ask_save_body("q", master, state, flow) elif part == "q" and request_has_content: + try: + content = flow.request.content + except ValueError: + content = flow.request.raw_content ask_save_path( "Save request content", - flow.request.content + content ) elif part == "s" and response_has_content: + try: + content = flow.response.content + except ValueError: + content = flow.response.raw_content ask_save_path( "Save response content", - flow.response.content + content ) else: signals.status_message.send(message="No content to save.") diff --git a/mitmproxy/console/flowview.py b/mitmproxy/console/flowview.py index 208b0d44..c4bb6c40 100644 --- a/mitmproxy/console/flowview.py +++ b/mitmproxy/console/flowview.py @@ -199,26 +199,34 @@ class FlowView(tabs.Tabs): def _get_content_view(self, viewmode, message, max_lines, _): + try: + content = message.content + if content != message.raw_content: + enc = "[decoded {}]".format( + message.headers.get("content-encoding") + ) + else: + enc = None + except ValueError: + content = message.raw_content + enc = "[cannot decode]" try: query = None if isinstance(message, models.HTTPRequest): query = message.query description, lines = contentviews.get_content_view( - viewmode, message.content, headers=message.headers, query=query + viewmode, content, headers=message.headers, query=query ) except exceptions.ContentViewException: s = "Content viewer failed: \n" + traceback.format_exc() signals.add_event(s, "error") description, lines = contentviews.get_content_view( - contentviews.get("Raw"), message.content, 
headers=message.headers + contentviews.get("Raw"), content, headers=message.headers ) description = description.replace("Raw", "Couldn't parse: falling back to Raw") - if message.content != message.raw_content: - description = "[decoded {enc}] {desc}".format( - enc=message.headers.get("content-encoding"), - desc=description - ) + if enc: + description = " ".join(enc, description) # Give hint that you have to tab for the response. if description == "No content" and isinstance(message, models.HTTPRequest): @@ -419,10 +427,14 @@ class FlowView(tabs.Tabs): # editing message bodies, this can cause problems. For now, I # just strip the newlines off the end of the body when we return # from an editor. - c = self.master.spawn_editor(message.content or b"") + try: + content = message.content + except ValueError: + content = message.raw_content + c = self.master.spawn_editor(content or b"") message.content = c.rstrip(b"\n") elif part == "f": - if not message.urlencoded_form and message.content: + if not message.urlencoded_form and message.raw_content: signals.status_prompt_onekey.send( prompt = "Existing body is not a URL-encoded form. Clear and edit?", keys = [ @@ -682,10 +694,14 @@ class FlowView(tabs.Tabs): ) key = None elif key == "v": - if conn.content: + if conn.raw_content: t = conn.headers.get("content-type") if "EDITOR" in os.environ or "PAGER" in os.environ: - self.master.spawn_external_viewer(conn.content, t) + try: + content = conn.content + except ValueError: + content = conn.raw_content + self.master.spawn_external_viewer(content, t) else: signals.status_message.send( message = "Error! Set $EDITOR or $PAGER." 
diff --git a/mitmproxy/dump.py b/mitmproxy/dump.py index ea242bba..0a9b76a7 100644 --- a/mitmproxy/dump.py +++ b/mitmproxy/dump.py @@ -187,15 +187,20 @@ class DumpMaster(flow.FlowMaster): ) self.echo(headers, indent=4) if self.o.flow_detail >= 3: - if message.content is None: + try: + content = message.content + except ValueError: + content = message.raw_content + + if content is None: self.echo("(content missing)", indent=4) - elif message.content: + elif content: self.echo("") try: type, lines = contentviews.get_content_view( contentviews.get("Auto"), - message.content, + content, headers=getattr(message, "headers", None) ) except exceptions.ContentViewException: @@ -203,7 +208,7 @@ class DumpMaster(flow.FlowMaster): self.add_event(s, "debug") type, lines = contentviews.get_content_view( contentviews.get("Raw"), - message.content, + content, headers=getattr(message, "headers", None) ) diff --git a/mitmproxy/filt.py b/mitmproxy/filt.py index 95bae1ae..e8687b9f 100644 --- a/mitmproxy/filt.py +++ b/mitmproxy/filt.py @@ -193,12 +193,18 @@ class FBod(_Rex): help = "Body" def __call__(self, f): - if f.request and f.request.content: - if self.re.search(f.request.content): - return True - if f.response and f.response.content: - if self.re.search(f.response.content): - return True + if f.request and f.request.raw_content: + try: + if self.re.search(f.request.content): + return True + except ValueError: + pass + if f.response and f.response.raw_content: + try: + if self.re.search(f.response.content): + return True + except ValueError: + pass return False @@ -207,9 +213,12 @@ class FBodRequest(_Rex): help = "Request body" def __call__(self, f): - if f.request and f.request.content: - if self.re.search(f.request.content): - return True + if f.request and f.request.raw_content: + try: + if self.re.search(f.request.content): + return True + except ValueError: + pass class FBodResponse(_Rex): @@ -217,9 +226,12 @@ class FBodResponse(_Rex): help = "Response body" def 
__call__(self, f): - if f.response and f.response.content: - if self.re.search(f.response.content): - return True + if f.response and f.response.raw_content: + try: + if self.re.search(f.response.content): + return True + except ValueError: + pass class FMethod(_Rex): diff --git a/mitmproxy/flow/export.py b/mitmproxy/flow/export.py index f0ac02ab..9da18f22 100644 --- a/mitmproxy/flow/export.py +++ b/mitmproxy/flow/export.py @@ -19,17 +19,23 @@ def dictstr(items, indent): def curl_command(flow): data = "curl " - for k, v in flow.request.headers.fields: + request = flow.request.copy() + try: + request.decode() + except ValueError: + pass + + for k, v in request.headers.fields: data += "-H '%s:%s' " % (k, v) - if flow.request.method != "GET": - data += "-X %s " % flow.request.method + if request.method != "GET": + data += "-X %s " % request.method - full_url = flow.request.scheme + "://" + flow.request.host + flow.request.path + full_url = request.scheme + "://" + request.host + request.path data += "'%s'" % full_url - if flow.request.content: - data += " --data-binary '%s'" % flow.request.content + if request.raw_content: + data += " --data-binary '%s'" % request.raw_content return data diff --git a/netlib/http/message.py b/netlib/http/message.py index ca3a4145..86ff64d1 100644 --- a/netlib/http/message.py +++ b/netlib/http/message.py @@ -124,6 +124,9 @@ class Message(basetypes.Serializable): """ The HTTP message body decoded with the content-encoding header (e.g. gzip) + Raises: + ValueError, when getting the content and the content-encoding is invalid. 
+ See also: :py:class:`raw_content`, :py:attr:`text` """ ce = self.headers.get("content-encoding") @@ -132,17 +135,21 @@ class Message(basetypes.Serializable): self._content_cache.encoding == ce ) if not cached: - try: - if not ce: - raise ValueError() + if ce: decoded = encoding.decode(self.raw_content, ce) - except ValueError: + else: decoded = self.raw_content self._content_cache = CachedDecode(self.raw_content, ce, decoded) return self._content_cache.decoded @content.setter def content(self, value): + if value is not None and not isinstance(value, bytes): + raise TypeError( + "Message content must be bytes, not {}. " + "Please use .text if you want to assign a str." + .format(type(value).__name__) + ) ce = self.headers.get("content-encoding") cached = ( self._content_cache.decoded == value and @@ -150,15 +157,15 @@ class Message(basetypes.Serializable): ) if not cached: try: - if not ce: - raise ValueError() - encoded = encoding.encode(value, ce) + if ce and value is not None: + encoded = encoding.encode(value, ce) + else: + encoded = value except ValueError: - # Do we have an unknown content-encoding? - # If so, we want to remove it. - if value and ce: - self.headers.pop("content-encoding", None) - ce = None + # So we have an invalid content-encoding? + # Let's remove it! + del self.headers["content-encoding"] + ce = None encoded = value self._content_cache = CachedDecode(encoded, ce, value) self.raw_content = self._content_cache.encoded @@ -262,6 +269,9 @@ class Message(basetypes.Serializable): Decodes body based on the current Content-Encoding header, then removes the header. If there is no Content-Encoding header, no action is taken. + + Raises: + ValueError, when the content-encoding is invalid. """ self.raw_content = self.content self.headers.pop("content-encoding", None) @@ -269,10 +279,16 @@ class Message(basetypes.Serializable): def encode(self, e): """ Encodes body with the encoding e, where e is "gzip", "deflate" or "identity". 
+ Any existing content-encodings are overwritten, + the content is not decoded beforehand. + + Raises: + ValueError, when the specified content-encoding is invalid. """ - self.decode() # remove the current encoding self.headers["content-encoding"] = e self.content = self.raw_content + if "content-encoding" not in self.headers: + raise ValueError("Invalid content encoding {}".format(repr(e))) def replace(self, pattern, repl, flags=0): """ diff --git a/netlib/http/request.py b/netlib/http/request.py index 4ce94549..a8ec6238 100644 --- a/netlib/http/request.py +++ b/netlib/http/request.py @@ -347,7 +347,10 @@ class Request(message.Message): def _get_urlencoded_form(self): is_valid_content_type = "application/x-www-form-urlencoded" in self.headers.get("content-type", "").lower() if is_valid_content_type: - return tuple(netlib.http.url.decode(self.content)) + try: + return tuple(netlib.http.url.decode(self.content)) + except ValueError: + pass return () def _set_urlencoded_form(self, value): @@ -356,7 +359,7 @@ class Request(message.Message): This will overwrite the existing content if there is one. 
""" self.headers["content-type"] = "application/x-www-form-urlencoded" - self.content = netlib.http.url.encode(value) + self.content = netlib.http.url.encode(value).encode() @urlencoded_form.setter def urlencoded_form(self, value): @@ -376,7 +379,10 @@ class Request(message.Message): def _get_multipart_form(self): is_valid_content_type = "multipart/form-data" in self.headers.get("content-type", "").lower() if is_valid_content_type: - return multipart.decode(self.headers, self.content) + try: + return multipart.decode(self.headers, self.content) + except ValueError: + pass return () def _set_multipart_form(self, value): diff --git a/netlib/wsgi.py b/netlib/wsgi.py index c66fddc2..2444f449 100644 --- a/netlib/wsgi.py +++ b/netlib/wsgi.py @@ -60,10 +60,14 @@ class WSGIAdaptor(object): else: path_info = path query = '' + try: + content = flow.request.content + except ValueError: + content = flow.request.raw_content environ = { 'wsgi.version': (1, 0), 'wsgi.url_scheme': strutils.native(flow.request.scheme, "latin-1"), - 'wsgi.input': BytesIO(flow.request.content or b""), + 'wsgi.input': BytesIO(content or b""), 'wsgi.errors': errsoc, 'wsgi.multithread': True, 'wsgi.multiprocess': False, diff --git a/test/netlib/http/test_message.py b/test/netlib/http/test_message.py index e1707a91..ed7d3da5 100644 --- a/test/netlib/http/test_message.py +++ b/test/netlib/http/test_message.py @@ -5,7 +5,7 @@ import mock import six from netlib.tutils import tresp -from netlib import http +from netlib import http, tutils def _test_passthrough_attr(message, attr): @@ -92,9 +92,6 @@ class TestMessage(object): assert resp.data.content == b"bar" assert resp.headers["content-length"] == "0" - def test_content_basic(self): - _test_passthrough_attr(tresp(), "content") - def test_headers(self): _test_passthrough_attr(tresp(), "headers") @@ -149,18 +146,22 @@ class TestMessageContentEncoding(object): r = tresp() r.headers["content-encoding"] = "zopfli" r.raw_content = b"foo" - assert r.content == 
b"foo" + with tutils.raises(ValueError): + assert r.content assert r.headers["content-encoding"] def test_cannot_decode(self): r = tresp() r.encode("gzip") r.raw_content = b"foo" - assert r.content == b"foo" + with tutils.raises(ValueError): + assert r.content assert r.headers["content-encoding"] - r.decode() + + with tutils.raises(ValueError): + r.decode() assert r.raw_content == b"foo" - assert "content-encoding" not in r.headers + assert "content-encoding" in r.headers def test_cannot_encode(self): r = tresp() @@ -213,6 +214,7 @@ class TestMessageText(object): r.encode("identity") with mock.patch("netlib.encoding.encode") as e: + e.return_value = b"" r.text = u"ü" assert e.call_count == 0 r.text = u"ä" -- cgit v1.2.3 From ca9de786fd7ed3edf7a485f7c019ac83d5abfc7f Mon Sep 17 00:00:00 2001 From: Maximilian Hils Date: Mon, 4 Jul 2016 15:07:01 -0700 Subject: minor fix --- mitmproxy/console/flowview.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mitmproxy/console/flowview.py b/mitmproxy/console/flowview.py index c4bb6c40..d994e670 100644 --- a/mitmproxy/console/flowview.py +++ b/mitmproxy/console/flowview.py @@ -226,7 +226,7 @@ class FlowView(tabs.Tabs): description = description.replace("Raw", "Couldn't parse: falling back to Raw") if enc: - description = " ".join(enc, description) + description = " ".join([enc, description]) # Give hint that you have to tab for the response. 
if description == "No content" and isinstance(message, models.HTTPRequest): -- cgit v1.2.3 From a3c7c84d49c3e6563e7f37ef60c989f99ed96788 Mon Sep 17 00:00:00 2001 From: Maximilian Hils Date: Fri, 15 Jul 2016 22:50:33 -0700 Subject: improve message content semantics --- mitmproxy/console/common.py | 30 +++------ mitmproxy/console/flowview.py | 12 +--- mitmproxy/dump.py | 2 +- mitmproxy/filt.py | 28 +++------ mitmproxy/flow/export.py | 11 ++-- netlib/http/headers.py | 12 ++++ netlib/http/message.py | 133 ++++++++++++++++++++++++--------------- netlib/wsgi.py | 10 +-- test/netlib/http/test_headers.py | 9 ++- test/netlib/http/test_message.py | 77 +++++++++++++++++++---- 10 files changed, 194 insertions(+), 130 deletions(-) diff --git a/mitmproxy/console/common.py b/mitmproxy/console/common.py index ef220b4c..41f4f243 100644 --- a/mitmproxy/console/common.py +++ b/mitmproxy/console/common.py @@ -257,16 +257,13 @@ def copy_flow_format_data(part, scope, flow): data = "" if scope in ("q", "a"): request = flow.request.copy() - try: - request.decode() - except ValueError: - pass - if request.raw_content is None: + request.decode(strict=False) + if request.content is None: return None, "Request content is missing" if part == "h": data += netlib.http.http1.assemble_request(request) elif part == "c": - data += request.raw_content + data += request.content else: raise ValueError("Unknown part: {}".format(part)) if scope == "a" and flow.request.raw_content and flow.response: @@ -274,16 +271,13 @@ def copy_flow_format_data(part, scope, flow): data += "\r\n" * 2 if scope in ("s", "a") and flow.response: response = flow.response.copy() - try: - response.decode() - except ValueError: - pass - if response.raw_content is None: + response.decode(strict=False) + if response.content is None: return None, "Response content is missing" if part == "h": data += netlib.http.http1.assemble_response(response) elif part == "c": - data += response.raw_content + data += response.content else: raise 
ValueError("Unknown part: {}".format(part)) return data, False @@ -393,22 +387,14 @@ def ask_save_body(part, master, state, flow): ask_save_body("q", master, state, flow) elif part == "q" and request_has_content: - try: - content = flow.request.content - except ValueError: - content = flow.request.raw_content ask_save_path( "Save request content", - content + flow.request.get_content(strict=False), ) elif part == "s" and response_has_content: - try: - content = flow.response.content - except ValueError: - content = flow.response.raw_content ask_save_path( "Save response content", - content + flow.response.get_content(strict=False), ) else: signals.status_message.send(message="No content to save.") diff --git a/mitmproxy/console/flowview.py b/mitmproxy/console/flowview.py index d994e670..f8686b41 100644 --- a/mitmproxy/console/flowview.py +++ b/mitmproxy/console/flowview.py @@ -427,11 +427,7 @@ class FlowView(tabs.Tabs): # editing message bodies, this can cause problems. For now, I # just strip the newlines off the end of the body when we return # from an editor. - try: - content = message.content - except ValueError: - content = message.raw_content - c = self.master.spawn_editor(content or b"") + c = self.master.spawn_editor(message.get_content(strict=False) or b"") message.content = c.rstrip(b"\n") elif part == "f": if not message.urlencoded_form and message.raw_content: @@ -697,11 +693,7 @@ class FlowView(tabs.Tabs): if conn.raw_content: t = conn.headers.get("content-type") if "EDITOR" in os.environ or "PAGER" in os.environ: - try: - content = conn.content - except ValueError: - content = conn.raw_content - self.master.spawn_external_viewer(content, t) + self.master.spawn_external_viewer(conn.get_content(strict=False), t) else: signals.status_message.send( message = "Error! Set $EDITOR or $PAGER." 
diff --git a/mitmproxy/dump.py b/mitmproxy/dump.py index 0a9b76a7..14d55cd1 100644 --- a/mitmproxy/dump.py +++ b/mitmproxy/dump.py @@ -190,7 +190,7 @@ class DumpMaster(flow.FlowMaster): try: content = message.content except ValueError: - content = message.raw_content + content = message.get_content(strict=False) if content is None: self.echo("(content missing)", indent=4) diff --git a/mitmproxy/filt.py b/mitmproxy/filt.py index e8687b9f..a42988f1 100644 --- a/mitmproxy/filt.py +++ b/mitmproxy/filt.py @@ -194,17 +194,11 @@ class FBod(_Rex): def __call__(self, f): if f.request and f.request.raw_content: - try: - if self.re.search(f.request.content): - return True - except ValueError: - pass + if self.re.search(f.request.get_content(strict=False)): + return True if f.response and f.response.raw_content: - try: - if self.re.search(f.response.content): - return True - except ValueError: - pass + if self.re.search(f.response.get_content(strict=False)): + return True return False @@ -214,11 +208,8 @@ class FBodRequest(_Rex): def __call__(self, f): if f.request and f.request.raw_content: - try: - if self.re.search(f.request.content): - return True - except ValueError: - pass + if self.re.search(f.request.get_content(strict=False)): + return True class FBodResponse(_Rex): @@ -227,11 +218,8 @@ class FBodResponse(_Rex): def __call__(self, f): if f.response and f.response.raw_content: - try: - if self.re.search(f.response.content): - return True - except ValueError: - pass + if self.re.search(f.response.get_content(strict=False)): + return True class FMethod(_Rex): diff --git a/mitmproxy/flow/export.py b/mitmproxy/flow/export.py index 9da18f22..4659af7b 100644 --- a/mitmproxy/flow/export.py +++ b/mitmproxy/flow/export.py @@ -20,12 +20,9 @@ def curl_command(flow): data = "curl " request = flow.request.copy() - try: - request.decode() - except ValueError: - pass + request.decode(strict=False) - for k, v in request.headers.fields: + for k, v in request.headers.items(multi=True): 
data += "-H '%s:%s' " % (k, v) if request.method != "GET": @@ -34,8 +31,8 @@ def curl_command(flow): full_url = request.scheme + "://" + request.host + request.path data += "'%s'" % full_url - if request.raw_content: - data += " --data-binary '%s'" % request.raw_content + if request.content: + data += " --data-binary '%s'" % request.content return data diff --git a/netlib/http/headers.py b/netlib/http/headers.py index f052a53b..13a8c98f 100644 --- a/netlib/http/headers.py +++ b/netlib/http/headers.py @@ -204,3 +204,15 @@ def parse_content_type(c): if len(clause) == 2: d[clause[0].strip()] = clause[1].strip() return ts[0].lower(), ts[1].lower(), d + + +def assemble_content_type(type, subtype, parameters): + if not parameters: + return "{}/{}".format(type, subtype) + params = "; ".join( + "{}={}".format(k, v) + for k, v in parameters.items() + ) + return "{}/{}; {}".format( + type, subtype, params + ) diff --git a/netlib/http/message.py b/netlib/http/message.py index 86ff64d1..1252ed25 100644 --- a/netlib/http/message.py +++ b/netlib/http/message.py @@ -53,14 +53,15 @@ class MessageData(basetypes.Serializable): class CachedDecode(object): - __slots__ = ["encoded", "encoding", "decoded"] + __slots__ = ["encoded", "encoding", "strict", "decoded"] - def __init__(self, object, encoding, decoded): + def __init__(self, object, encoding, strict, decoded): self.encoded = object self.encoding = encoding + self.strict = strict self.decoded = decoded -no_cached_decode = CachedDecode(None, None, None) +no_cached_decode = CachedDecode(None, None, None, None) class Message(basetypes.Serializable): @@ -118,33 +119,44 @@ class Message(basetypes.Serializable): def raw_content(self, content): self.data.content = content - @property - def content(self): - # type: () -> bytes + def get_content(self, strict=True): + # type: (bool) -> bytes """ The HTTP message body decoded with the content-encoding header (e.g. 
gzip) Raises: - ValueError, when getting the content and the content-encoding is invalid. + ValueError, when the content-encoding is invalid and strict is True. See also: :py:class:`raw_content`, :py:attr:`text` """ + if self.raw_content is None: + return None ce = self.headers.get("content-encoding") cached = ( self._content_cache.encoded == self.raw_content and + (self._content_cache.strict or not strict) and self._content_cache.encoding == ce ) if not cached: + is_strict = True if ce: - decoded = encoding.decode(self.raw_content, ce) + try: + decoded = encoding.decode(self.raw_content, ce) + except ValueError: + if strict: + raise + is_strict = False + decoded = self.raw_content else: decoded = self.raw_content - self._content_cache = CachedDecode(self.raw_content, ce, decoded) + self._content_cache = CachedDecode(self.raw_content, ce, is_strict, decoded) return self._content_cache.decoded - @content.setter - def content(self, value): - if value is not None and not isinstance(value, bytes): + def set_content(self, value): + if value is None: + self.raw_content = None + return + if not isinstance(value, bytes): raise TypeError( "Message content must be bytes, not {}. " "Please use .text if you want to assign a str." @@ -153,24 +165,23 @@ class Message(basetypes.Serializable): ce = self.headers.get("content-encoding") cached = ( self._content_cache.decoded == value and - self._content_cache.encoding == ce + self._content_cache.encoding == ce and + self._content_cache.strict ) if not cached: try: - if ce and value is not None: - encoded = encoding.encode(value, ce) - else: - encoded = value + encoded = encoding.encode(value, ce or "identity") except ValueError: # So we have an invalid content-encoding? # Let's remove it! 
del self.headers["content-encoding"] ce = None encoded = value - self._content_cache = CachedDecode(encoded, ce, value) + self._content_cache = CachedDecode(encoded, ce, True, value) self.raw_content = self._content_cache.encoded - if isinstance(self.raw_content, bytes): - self.headers["content-length"] = str(len(self.raw_content)) + self.headers["content-length"] = str(len(self.raw_content)) + + content = property(get_content, set_content) @property def http_version(self): @@ -211,69 +222,87 @@ class Message(basetypes.Serializable): if ct: return ct[2].get("charset") - @property - def text(self): - # type: () -> six.text_type + def _guess_encoding(self): + # type: () -> str + enc = self._get_content_type_charset() + if enc: + return enc + + if "json" in self.headers.get("content-type", ""): + return "utf8" + else: + # We may also want to check for HTML meta tags here at some point. + return "latin-1" + + def get_text(self, strict=True): + # type: (bool) -> six.text_type """ The HTTP message body decoded with both content-encoding header (e.g. gzip) and content-type header charset. + Raises: + ValueError, when either content-encoding or charset is invalid and strict is True. + See also: :py:attr:`content`, :py:class:`raw_content` """ - # This attribute should be called text, because that's what requests does. - enc = self._get_content_type_charset() - - # We may also want to check for HTML meta tags here at some point. 
+ if self.raw_content is None: + return None + enc = self._guess_encoding() + content = self.get_content(strict) cached = ( - self._text_cache.encoded == self.content and + self._text_cache.encoded == content and + (self._text_cache.strict or not strict) and self._text_cache.encoding == enc ) if not cached: + is_strict = self._content_cache.strict try: - if not enc: - raise ValueError() - decoded = encoding.decode(self.content, enc) + decoded = encoding.decode(content, enc) except ValueError: - decoded = self.content.decode("utf8", "replace" if six.PY2 else "surrogateescape") - self._text_cache = CachedDecode(self.content, enc, decoded) + if strict: + raise + is_strict = False + decoded = self.content.decode(enc, "replace" if six.PY2 else "surrogateescape") + self._text_cache = CachedDecode(content, enc, is_strict, decoded) return self._text_cache.decoded - @text.setter - def text(self, text): - enc = self._get_content_type_charset() + def set_text(self, text): + if text is None: + self.content = None + return + enc = self._guess_encoding() + cached = ( self._text_cache.decoded == text and - self._text_cache.encoding == enc + self._text_cache.encoding == enc and + self._text_cache.strict ) if not cached: try: - if not enc: - raise ValueError() encoded = encoding.encode(text, enc) except ValueError: - # Do we have an unknown content-type charset? - # If so, we want to replace it with utf8. - if text and enc: - self.headers["content-type"] = re.sub( - "charset=[^;]+", - "charset=utf-8", - self.headers["content-type"] - ) - encoded = text.encode("utf8", "replace" if six.PY2 else "surrogateescape") - self._text_cache = CachedDecode(encoded, enc, text) + # Fall back to UTF-8 and update the content-type header. 
+ ct = headers.parse_content_type(self.headers.get("content-type", "")) or ("text", "plain", {}) + ct[2]["charset"] = "utf-8" + self.headers["content-type"] = headers.assemble_content_type(*ct) + enc = "utf8" + encoded = text.encode(enc, "replace" if six.PY2 else "surrogateescape") + self._text_cache = CachedDecode(encoded, enc, True, text) self.content = self._text_cache.encoded - def decode(self): + text = property(get_text, set_text) + + def decode(self, strict=True): """ Decodes body based on the current Content-Encoding header, then removes the header. If there is no Content-Encoding header, no action is taken. Raises: - ValueError, when the content-encoding is invalid. + ValueError, when the content-encoding is invalid and strict is True. """ - self.raw_content = self.content + self.raw_content = self.get_content(strict) self.headers.pop("content-encoding", None) def encode(self, e): diff --git a/netlib/wsgi.py b/netlib/wsgi.py index 2444f449..0def75b5 100644 --- a/netlib/wsgi.py +++ b/netlib/wsgi.py @@ -54,20 +54,20 @@ class WSGIAdaptor(object): self.app, self.domain, self.port, self.sversion = app, domain, port, sversion def make_environ(self, flow, errsoc, **extra): + """ + Raises: + ValueError, if the content-encoding is invalid. + """ path = strutils.native(flow.request.path, "latin-1") if '?' 
in path: path_info, query = strutils.native(path, "latin-1").split('?', 1) else: path_info = path query = '' - try: - content = flow.request.content - except ValueError: - content = flow.request.raw_content environ = { 'wsgi.version': (1, 0), 'wsgi.url_scheme': strutils.native(flow.request.scheme, "latin-1"), - 'wsgi.input': BytesIO(content or b""), + 'wsgi.input': BytesIO(flow.request.content or b""), 'wsgi.errors': errsoc, 'wsgi.multithread': True, 'wsgi.multiprocess': False, diff --git a/test/netlib/http/test_headers.py b/test/netlib/http/test_headers.py index 51819b86..8462a5af 100644 --- a/test/netlib/http/test_headers.py +++ b/test/netlib/http/test_headers.py @@ -1,4 +1,4 @@ -from netlib.http import Headers, parse_content_type +from netlib.http.headers import Headers, parse_content_type, assemble_content_type from netlib.tutils import raises @@ -81,3 +81,10 @@ def test_parse_content_type(): v = p("text/html; charset=UTF-8") assert v == ('text', 'html', {'charset': 'UTF-8'}) + + +def test_assemble_content_type(): + p = assemble_content_type + assert p("text", "html", {}) == "text/html" + assert p("text", "html", {"charset": "utf8"}) == "text/html; charset=utf8" + assert p("text", "html", {"charset": "utf8", "foo": "bar"}) == "text/html; charset=utf8; foo=bar" diff --git a/test/netlib/http/test_message.py b/test/netlib/http/test_message.py index ed7d3da5..8b178e04 100644 --- a/test/netlib/http/test_message.py +++ b/test/netlib/http/test_message.py @@ -142,6 +142,9 @@ class TestMessageContentEncoding(object): r.content = b"bar" assert e.call_count == 1 + with tutils.raises(TypeError): + r.content = u"foo" + def test_unknown_ce(self): r = tresp() r.headers["content-encoding"] = "zopfli" @@ -149,6 +152,7 @@ class TestMessageContentEncoding(object): with tutils.raises(ValueError): assert r.content assert r.headers["content-encoding"] + assert r.get_content(strict=False) == b"foo" def test_cannot_decode(self): r = tresp() @@ -157,12 +161,25 @@ class 
TestMessageContentEncoding(object): with tutils.raises(ValueError): assert r.content assert r.headers["content-encoding"] + assert r.get_content(strict=False) == b"foo" with tutils.raises(ValueError): r.decode() assert r.raw_content == b"foo" assert "content-encoding" in r.headers + r.decode(strict=False) + assert r.content == b"foo" + assert "content-encoding" not in r.headers + + def test_none(self): + r = tresp(content=None) + assert r.content is None + r.content = b"foo" + assert r.content is not None + r.content = None + assert r.content is None + def test_cannot_encode(self): r = tresp() r.encode("gzip") @@ -175,12 +192,17 @@ class TestMessageContentEncoding(object): assert "content-encoding" not in r.headers assert r.raw_content == b"foo" + with tutils.raises(ValueError): + r.encode("zopfli") + assert r.raw_content == b"foo" + assert "content-encoding" not in r.headers + class TestMessageText(object): def test_simple(self): - r = tresp(content=b'\xc3\xbc') - assert r.raw_content == b"\xc3\xbc" - assert r.content == b"\xc3\xbc" + r = tresp(content=b'\xfc') + assert r.raw_content == b"\xfc" + assert r.content == b"\xfc" assert r.text == u"ü" r.encode("gzip") @@ -189,8 +211,10 @@ class TestMessageText(object): assert r.text == u"ü" r.headers["content-type"] = "text/html; charset=latin1" - assert r.content == b"\xc3\xbc" + r.content = b"\xc3\xbc" assert r.text == u"ü" + r.headers["content-type"] = "text/html; charset=utf8" + assert r.text == u"ü" r.encode("identity") r.raw_content = b"foo" @@ -201,16 +225,29 @@ class TestMessageText(object): assert r.text assert e.call_count == 0 + def test_guess_json(self): + r = tresp(content=b'"\xc3\xbc"') + r.headers["content-type"] = "application/json" + assert r.text == u'"ü"' + + def test_none(self): + r = tresp(content=None) + assert r.text is None + r.text = b"foo" + assert r.text is not None + r.text = None + assert r.text is None + def test_modify(self): r = tresp() r.text = u"ü" - assert r.raw_content == b"\xc3\xbc" 
+ assert r.raw_content == b"\xfc" - r.headers["content-type"] = "text/html; charset=latin1" + r.headers["content-type"] = "text/html; charset=utf8" r.text = u"ü" - assert r.raw_content == b"\xfc" - assert r.headers["content-length"] == "1" + assert r.raw_content == b"\xc3\xbc" + assert r.headers["content-length"] == "2" r.encode("identity") with mock.patch("netlib.encoding.encode") as e: @@ -224,12 +261,18 @@ class TestMessageText(object): r = tresp() r.headers["content-type"] = "text/html; charset=wtf" r.raw_content = b"foo" - assert r.text == u"foo" + with tutils.raises(ValueError): + assert r.text == u"foo" + assert r.get_text(strict=False) == u"foo" def test_cannot_decode(self): r = tresp() + r.headers["content-type"] = "text/html; charset=utf8" r.raw_content = b"\xFF" - assert r.text == u'\ufffd' if six.PY2 else '\udcff' + with tutils.raises(ValueError): + assert r.text + + assert r.get_text(strict=False) == u'\ufffd' if six.PY2 else '\udcff' def test_cannot_encode(self): r = tresp() @@ -237,9 +280,19 @@ class TestMessageText(object): assert "content-type" not in r.headers assert r.raw_content is None - r.headers["content-type"] = "text/html; charset=latin1" + r.headers["content-type"] = "text/html; charset=latin1; foo=bar" r.text = u"☃" - assert r.headers["content-type"] == "text/html; charset=utf-8" + assert r.headers["content-type"] == "text/html; charset=utf-8; foo=bar" + assert r.raw_content == b'\xe2\x98\x83' + + r.headers["content-type"] = "gibberish" + r.text = u"☃" + assert r.headers["content-type"] == "text/plain; charset=utf-8" + assert r.raw_content == b'\xe2\x98\x83' + + del r.headers["content-type"] + r.text = u"☃" + assert r.headers["content-type"] == "text/plain; charset=utf-8" assert r.raw_content == b'\xe2\x98\x83' r.headers["content-type"] = "text/html; charset=latin1" -- cgit v1.2.3 From e6e39ce80f4daaf6a1d6f8d87616409486d358a5 Mon Sep 17 00:00:00 2001 From: Maximilian Hils Date: Fri, 15 Jul 2016 23:46:12 -0700 Subject: preserve 
content-type parameter order --- netlib/http/headers.py | 3 ++- test/netlib/http/test_headers.py | 4 +++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/netlib/http/headers.py b/netlib/http/headers.py index b8aa212a..9fa7e1e6 100644 --- a/netlib/http/headers.py +++ b/netlib/http/headers.py @@ -2,6 +2,7 @@ from __future__ import absolute_import, print_function, division import re +import collections import six from netlib import multidict from netlib import strutils @@ -206,7 +207,7 @@ def parse_content_type(c): ts = parts[0].split("/", 1) if len(ts) != 2: return None - d = {} + d = collections.OrderedDict() if len(parts) == 2: for i in parts[1].split(";"): clause = i.split("=", 1) diff --git a/test/netlib/http/test_headers.py b/test/netlib/http/test_headers.py index 8462a5af..51537310 100644 --- a/test/netlib/http/test_headers.py +++ b/test/netlib/http/test_headers.py @@ -1,3 +1,5 @@ +import collections + from netlib.http.headers import Headers, parse_content_type, assemble_content_type from netlib.tutils import raises @@ -87,4 +89,4 @@ def test_assemble_content_type(): p = assemble_content_type assert p("text", "html", {}) == "text/html" assert p("text", "html", {"charset": "utf8"}) == "text/html; charset=utf8" - assert p("text", "html", {"charset": "utf8", "foo": "bar"}) == "text/html; charset=utf8; foo=bar" + assert p("text", "html", collections.OrderedDict([("charset", "utf8"), ("foo", "bar")])) == "text/html; charset=utf8; foo=bar" -- cgit v1.2.3