From 123ef043dc7214c9818985aeea565a82c2680c0e Mon Sep 17 00:00:00 2001 From: Maximilian Hils Date: Sat, 10 Dec 2016 11:36:32 +0100 Subject: add new xml/html pretty-printer :tada: --- mitmproxy/contentviews/__init__.py | 3 +- mitmproxy/contentviews/xml_html.py | 234 +++++++++++++++++++++ test/mitmproxy/contentviews/test_html_outline.py | 2 +- test/mitmproxy/contentviews/test_xml_html.py | 29 +++ .../test_xml_html_data/cdata-formatted.xml | 10 + .../contentviews/test_xml_html_data/cdata.xml | 10 + .../test_xml_html_data/comment-formatted.xml | 10 + .../contentviews/test_xml_html_data/comment.xml | 10 + .../test_xml_html_data/inline-formatted.html | 14 ++ .../contentviews/test_xml_html_data/inline.html | 7 + .../test_xml_html_data/simple-formatted.html | 10 + .../contentviews/test_xml_html_data/simple.html | 1 + 12 files changed, 338 insertions(+), 2 deletions(-) create mode 100644 mitmproxy/contentviews/xml_html.py create mode 100644 test/mitmproxy/contentviews/test_xml_html.py create mode 100644 test/mitmproxy/contentviews/test_xml_html_data/cdata-formatted.xml create mode 100644 test/mitmproxy/contentviews/test_xml_html_data/cdata.xml create mode 100644 test/mitmproxy/contentviews/test_xml_html_data/comment-formatted.xml create mode 100644 test/mitmproxy/contentviews/test_xml_html_data/comment.xml create mode 100644 test/mitmproxy/contentviews/test_xml_html_data/inline-formatted.html create mode 100644 test/mitmproxy/contentviews/test_xml_html_data/inline.html create mode 100644 test/mitmproxy/contentviews/test_xml_html_data/simple-formatted.html create mode 100644 test/mitmproxy/contentviews/test_xml_html_data/simple.html diff --git a/mitmproxy/contentviews/__init__.py b/mitmproxy/contentviews/__init__.py index 3857d5e5..357172e3 100644 --- a/mitmproxy/contentviews/__init__.py +++ b/mitmproxy/contentviews/__init__.py @@ -22,7 +22,7 @@ from mitmproxy import exceptions from mitmproxy.net import http from mitmproxy.utils import strutils from . 
import ( - auto, raw, hex, json, html_outline, wbxml, javascript, css, + auto, raw, hex, json, xml_html, html_outline, wbxml, javascript, css, urlencoded, multipart, image, query, protobuf ) from .base import View, VIEW_CUTOFF, KEY_MAX, format_text, format_dict @@ -163,6 +163,7 @@ add(auto.ViewAuto()) add(raw.ViewRaw()) add(hex.ViewHex()) add(json.ViewJSON()) +add(xml_html.ViewXmlHtml()) add(wbxml.ViewWBXML()) add(html_outline.ViewHTMLOutline()) add(javascript.ViewJavaScript()) diff --git a/mitmproxy/contentviews/xml_html.py b/mitmproxy/contentviews/xml_html.py new file mode 100644 index 00000000..0f2ce57d --- /dev/null +++ b/mitmproxy/contentviews/xml_html.py @@ -0,0 +1,234 @@ +import io +import re +import textwrap +from typing import Iterable + +from mitmproxy.contentviews import base +from mitmproxy.utils import sliding_window + +""" +A custom XML/HTML prettifier. Compared to other prettifiers, its main features are: + +- Implemented in pure Python. +- Modifies whitespace only. +- Works with any input. +- Lazy evaluation. + +The implementation is split into two main parts: tokenization and formatting of tokens. +""" + +# http://www.xml.com/pub/a/2001/07/25/namingparts.html - this is close enough for what we do. 
+REGEX_TAG = re.compile("[a-zA-Z0-9._:\-]+(?!=)") +# https://www.w3.org/TR/html5/syntax.html#void-elements +HTML_VOID_ELEMENTS = { + "area", "base", "br", "col", "embed", "hr", "img", "input", "keygen", "link", "meta", "param", + "source", "track", "wbr" +} +NO_INDENT_TAGS = {"xml", "doctype", "html"} +INDENT = 2 + + +class Token: + def __init__(self, data): + self.data = data + + def __repr__(self): + return "{}({})".format( + type(self).__name__, + self.data + ) + + +class Text(Token): + @property + def text(self): + return self.data.strip() + + +class Tag(Token): + @property + def tag(self): + t = REGEX_TAG.search(self.data) + if t is not None: + return t.group(0).lower() + return "" + + @property + def is_comment(self) -> bool: + return self.data.startswith("") + elif self.is_cdata: + return self.data.endswith("]]>") + else: + # This fails for attributes that contain an unescaped ">" + return self.data.endswith(">") + + +def tokenize(data: str) -> Iterable[Token]: + token = Text("") # type: Token + + i = 0 + + def readuntil(char, start, include=1): + nonlocal i + end = data.find(char, start) + if end == -1: + end = len(data) + ret = data[i:end + include] + i = end + include + return ret + + while i < len(data): + if isinstance(token, Text): + token.data = readuntil("<", i, 0) + if token.text: + yield token + token = Tag("") + elif isinstance(token, Tag): + token.data += readuntil(">", i, 1) + if token.done: + yield token + token = Text("") + if token.data.strip(): + yield token + + +def indent_text(data: str, prefix: str) -> str: + # Add spacing to first line so that we dedent in cases like this: + #
<li>This is + # example text + # over multiple lines + #
</li> + dedented = textwrap.dedent(" " * 32 + data).strip() + return textwrap.indent(dedented, prefix[:32]) + + +def is_inline_text(a: Token, b: Token, c: Token) -> bool: + if isinstance(a, Tag) and isinstance(b, Text) and isinstance(c, Tag): + if a.is_opening and "\n" not in b.data and c.is_closing and a.tag == c.tag: + return True + + +def is_inline(prev2: Token, prev1: Token, t: Token, next1: Token, next2: Token) -> bool: + if isinstance(t, Text): + return is_inline_text(prev1, t, next1) + elif isinstance(t, Tag): + if is_inline_text(prev2, prev1, t) or is_inline_text(t, next1, next2): + return True + if isinstance(next1, Tag) and t.is_opening and next1.is_closing and t.tag == next1.tag: + return True #
<div></div> (start tag) + if isinstance(prev1, Tag) and prev1.is_opening and t.is_closing and prev1.tag == t.tag: + return True #
<div></div> (end tag) + + +class ElementStack: + """ + Keep track of how deeply nested our document is. + """ + + def __init__(self): + self.open_tags = [] + self.indent = "" + + def push_tag(self, tag: str): + if len(self.open_tags) > 16: + return + self.open_tags.append(tag) + if tag not in NO_INDENT_TAGS: + self.indent += " " * INDENT + + def pop_tag(self, tag: str): + if tag in self.open_tags: + remove_indent = 0 + while True: + t = self.open_tags.pop() + if t not in NO_INDENT_TAGS: + remove_indent += INDENT + if t == tag: + break + self.indent = self.indent[:-remove_indent] + else: + pass # this closing tag has no start tag. let's keep indentation as-is. + + +def format_xml(tokens: Iterable[Token]) -> str: + out = io.StringIO() + + context = ElementStack() + + for prev2, prev1, token, next1, next2 in sliding_window.window(tokens, 2, 2): + if isinstance(token, Tag): + if token.is_opening: + out.write(indent_text(token.data, context.indent)) + + if not is_inline(prev2, prev1, token, next1, next2): + out.write("\n") + + context.push_tag(token.tag) + elif token.is_closing: + context.pop_tag(token.tag) + + if is_inline(prev2, prev1, token, next1, next2): + out.write(token.data) + else: + out.write(indent_text(token.data, context.indent)) + out.write("\n") + + else: # self-closing + out.write(indent_text(token.data, context.indent)) + out.write("\n") + elif isinstance(token, Text): + if is_inline(prev2, prev1, token, next1, next2): + out.write(token.text) + else: + out.write(indent_text(token.data, context.indent)) + out.write("\n") + else: # pragma: no cover + raise RuntimeError() + + return out.getvalue() + + +class ViewXmlHtml(base.View): + name = "XML/HTML" + prompt = ("xml/html", "x") + content_types = ["text/xml", "text/html"] + + def __call__(self, data, **metadata): + # TODO: + # We should really have the message text as str here, + # not the message content as bytes.
+ # https://github.com/mitmproxy/mitmproxy/issues/1662#issuecomment-266192578 + data = data.decode("utf8", "xmlcharrefreplace") + tokens = tokenize(data) + # TODO: + # Performance: Don't render the whole document right away. + # Let's wait with this until we have a sequence-like interface, + # this thing is reasonably fast right now anyway. + pretty = base.format_text(format_xml(tokens)) + if "html" in data.lower(): + t = "HTML" + else: + t = "XML" + return t, pretty diff --git a/test/mitmproxy/contentviews/test_html_outline.py b/test/mitmproxy/contentviews/test_html_outline.py index d9ccc406..9e664e52 100644 --- a/test/mitmproxy/contentviews/test_html_outline.py +++ b/test/mitmproxy/contentviews/test_html_outline.py @@ -6,4 +6,4 @@ def test_view_html_outline(): v = full_eval(html_outline.ViewHTMLOutline()) s = b"


    one

    " assert v(s) - assert v(b'\xfe') \ No newline at end of file + assert v(b'\xfe') diff --git a/test/mitmproxy/contentviews/test_xml_html.py b/test/mitmproxy/contentviews/test_xml_html.py new file mode 100644 index 00000000..899ecfde --- /dev/null +++ b/test/mitmproxy/contentviews/test_xml_html.py @@ -0,0 +1,29 @@ +import pytest + +from mitmproxy.contentviews import xml_html +from mitmproxy.test import tutils +from . import full_eval + +data = tutils.test_data.push("mitmproxy/contentviews/test_xml_html_data/") + + +def test_simple(): + v = full_eval(xml_html.ViewXmlHtml()) + assert v(b"foo") == ('XML', [[('text', 'foo')]]) + assert v(b"") == ('HTML', [[('text', '')]]) + + +@pytest.mark.parametrize("filename", [ + "simple.html", + "cdata.xml", + "comment.xml", + "inline.html", +]) +def test_format_xml(filename): + path = data.path(filename) + with open(path) as f: + input = f.read() + with open(path.replace(".", "-formatted.")) as f: + expected = f.read() + tokens = xml_html.tokenize(input) + assert xml_html.format_xml(tokens) == expected diff --git a/test/mitmproxy/contentviews/test_xml_html_data/cdata-formatted.xml b/test/mitmproxy/contentviews/test_xml_html_data/cdata-formatted.xml new file mode 100644 index 00000000..44a81a83 --- /dev/null +++ b/test/mitmproxy/contentviews/test_xml_html_data/cdata-formatted.xml @@ -0,0 +1,10 @@ + + < " and & + or write things like + + but my document is still well formed! + ]]> + diff --git a/test/mitmproxy/contentviews/test_xml_html_data/cdata.xml b/test/mitmproxy/contentviews/test_xml_html_data/cdata.xml new file mode 100644 index 00000000..b4c5dfca --- /dev/null +++ b/test/mitmproxy/contentviews/test_xml_html_data/cdata.xml @@ -0,0 +1,10 @@ + + < " and & +or write things like + + but my document is still well formed! 
+]]> + \ No newline at end of file diff --git a/test/mitmproxy/contentviews/test_xml_html_data/comment-formatted.xml b/test/mitmproxy/contentviews/test_xml_html_data/comment-formatted.xml new file mode 100644 index 00000000..d0da6665 --- /dev/null +++ b/test/mitmproxy/contentviews/test_xml_html_data/comment-formatted.xml @@ -0,0 +1,10 @@ + + + diff --git a/test/mitmproxy/contentviews/test_xml_html_data/comment.xml b/test/mitmproxy/contentviews/test_xml_html_data/comment.xml new file mode 100644 index 00000000..3f54ddba --- /dev/null +++ b/test/mitmproxy/contentviews/test_xml_html_data/comment.xml @@ -0,0 +1,10 @@ + + + \ No newline at end of file diff --git a/test/mitmproxy/contentviews/test_xml_html_data/inline-formatted.html b/test/mitmproxy/contentviews/test_xml_html_data/inline-formatted.html new file mode 100644 index 00000000..5253bf4f --- /dev/null +++ b/test/mitmproxy/contentviews/test_xml_html_data/inline-formatted.html @@ -0,0 +1,14 @@ + + + Test Page + + +

    + + Some things should be + inline + , some things shouldn't! +

    + + + diff --git a/test/mitmproxy/contentviews/test_xml_html_data/inline.html b/test/mitmproxy/contentviews/test_xml_html_data/inline.html new file mode 100644 index 00000000..3e4b16b9 --- /dev/null +++ b/test/mitmproxy/contentviews/test_xml_html_data/inline.html @@ -0,0 +1,7 @@ + +Test Page + +

    Some things should be inline, some things shouldn't!

    + + + \ No newline at end of file diff --git a/test/mitmproxy/contentviews/test_xml_html_data/simple-formatted.html b/test/mitmproxy/contentviews/test_xml_html_data/simple-formatted.html new file mode 100644 index 00000000..23438428 --- /dev/null +++ b/test/mitmproxy/contentviews/test_xml_html_data/simple-formatted.html @@ -0,0 +1,10 @@ + + + + title + + +

    Hello World

    + + + diff --git a/test/mitmproxy/contentviews/test_xml_html_data/simple.html b/test/mitmproxy/contentviews/test_xml_html_data/simple.html new file mode 100644 index 00000000..73e81a5e --- /dev/null +++ b/test/mitmproxy/contentviews/test_xml_html_data/simple.html @@ -0,0 +1 @@ +title

    Hello World

    -- cgit v1.2.3