aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r-- mitmproxy/contentviews/__init__.py | 3
-rw-r--r-- mitmproxy/contentviews/xml_html.py | 234
-rw-r--r-- test/mitmproxy/contentviews/test_html_outline.py | 2
-rw-r--r-- test/mitmproxy/contentviews/test_xml_html.py | 29
-rw-r--r-- test/mitmproxy/contentviews/test_xml_html_data/cdata-formatted.xml | 10
-rw-r--r-- test/mitmproxy/contentviews/test_xml_html_data/cdata.xml | 10
-rw-r--r-- test/mitmproxy/contentviews/test_xml_html_data/comment-formatted.xml | 10
-rw-r--r-- test/mitmproxy/contentviews/test_xml_html_data/comment.xml | 10
-rw-r--r-- test/mitmproxy/contentviews/test_xml_html_data/inline-formatted.html | 14
-rw-r--r-- test/mitmproxy/contentviews/test_xml_html_data/inline.html | 7
-rw-r--r-- test/mitmproxy/contentviews/test_xml_html_data/simple-formatted.html | 10
-rw-r--r-- test/mitmproxy/contentviews/test_xml_html_data/simple.html | 1
12 files changed, 338 insertions, 2 deletions
diff --git a/mitmproxy/contentviews/__init__.py b/mitmproxy/contentviews/__init__.py
index 3857d5e5..357172e3 100644
--- a/mitmproxy/contentviews/__init__.py
+++ b/mitmproxy/contentviews/__init__.py
@@ -22,7 +22,7 @@ from mitmproxy import exceptions
from mitmproxy.net import http
from mitmproxy.utils import strutils
from . import (
- auto, raw, hex, json, html_outline, wbxml, javascript, css,
+ auto, raw, hex, json, xml_html, html_outline, wbxml, javascript, css,
urlencoded, multipart, image, query, protobuf
)
from .base import View, VIEW_CUTOFF, KEY_MAX, format_text, format_dict
@@ -163,6 +163,7 @@ add(auto.ViewAuto())
add(raw.ViewRaw())
add(hex.ViewHex())
add(json.ViewJSON())
+add(xml_html.ViewXmlHtml())
add(wbxml.ViewWBXML())
add(html_outline.ViewHTMLOutline())
add(javascript.ViewJavaScript())
diff --git a/mitmproxy/contentviews/xml_html.py b/mitmproxy/contentviews/xml_html.py
new file mode 100644
index 00000000..0f2ce57d
--- /dev/null
+++ b/mitmproxy/contentviews/xml_html.py
@@ -0,0 +1,234 @@
+import io
+import re
+import textwrap
+from typing import Iterable
+
+from mitmproxy.contentviews import base
+from mitmproxy.utils import sliding_window
+
+"""
+A custom XML/HTML prettifier. Compared to other prettifiers, its main features are:
+
+- Implemented in pure Python.
+- Modifies whitespace only.
+- Works with any input.
+- Lazy evaluation.
+
+The implementation is split into two main parts: tokenization and formatting of tokens.
+"""
+
+# Tag names: http://www.xml.com/pub/a/2001/07/25/namingparts.html - this is close enough for what we do.
+# The match must not be immediately followed by "=".
+# Raw string on purpose: "\-" is not a valid escape sequence in a normal string literal.
+REGEX_TAG = re.compile(r"[a-zA-Z0-9._:\-]+(?!=)")
+# Elements that never have a closing tag: https://www.w3.org/TR/html5/syntax.html#void-elements
+HTML_VOID_ELEMENTS = {
+    "area", "base", "br", "col", "embed", "hr", "img", "input", "keygen", "link", "meta", "param",
+    "source", "track", "wbr"
+}
+# Tags whose children are not indented one level deeper.
+NO_INDENT_TAGS = {"xml", "doctype", "html"}
+# Number of spaces added per nesting level.
+INDENT = 2
+
+
+class Token:
+    """Base class for the pieces yielded by tokenize(); wraps a chunk of raw source text."""
+
+    def __init__(self, data):
+        # Raw text of this token, exactly as sliced from the input.
+        self.data = data
+
+    def __repr__(self):
+        cls_name = self.__class__.__name__
+        return "%s(%s)" % (cls_name, self.data)
+
+
+class Text(Token):
+    """A run of character data outside of any tag."""
+
+    @property
+    def text(self):
+        """The token's data with leading and trailing whitespace removed."""
+        stripped = self.data.strip()
+        return stripped
+
+
+class Tag(Token):
+    """Anything that starts with "<": element tags, comments and CDATA sections."""
+
+    @property
+    def tag(self):
+        """Lower-cased first name-like run in the tag text, or "<empty>" if there is none."""
+        match = REGEX_TAG.search(self.data)
+        if match is None:
+            return "<empty>"
+        return match.group(0).lower()
+
+    @property
+    def is_comment(self) -> bool:
+        """True if the tag text starts a comment."""
+        return self.data.startswith("<!--")
+
+    @property
+    def is_cdata(self) -> bool:
+        """True if the tag text starts a CDATA section."""
+        return self.data.startswith("<![CDATA[")
+
+    @property
+    def is_closing(self):
+        """True for end tags such as "</div>"."""
+        return self.data.startswith("</")
+
+    @property
+    def is_self_closing(self):
+        """True for tags that never open a nesting level."""
+        # Comments and CDATA sections are complete on their own.
+        if self.is_comment or self.is_cdata:
+            return True
+        # Explicit XML-style self-closing tag.
+        if self.data.endswith("/>"):
+            return True
+        # HTML void elements have no end tag.
+        return self.tag in HTML_VOID_ELEMENTS
+
+    @property
+    def is_opening(self):
+        """True for start tags, i.e. tags that are neither closing nor self-closing."""
+        return not (self.is_closing or self.is_self_closing)
+
+    @property
+    def done(self):
+        """True once the accumulated text ends with the terminator for this kind of tag."""
+        if self.is_comment:
+            terminator = "-->"
+        elif self.is_cdata:
+            terminator = "]]>"
+        else:
+            # This fails for attributes that contain an unescaped ">"
+            terminator = ">"
+        return self.data.endswith(terminator)
+
+
+def tokenize(data: str) -> Iterable[Token]:
+    """
+    Lazily split *data* into alternating Text and Tag tokens.
+
+    Text runs up to (not including) the next "<". A Tag is extended up to and
+    including each following ">" until Tag.done reports it complete, so comments
+    and CDATA sections containing ">" stay in one token. Whitespace-only text is
+    skipped. An unterminated trailing token is yielded if it is not blank.
+    """
+    size = len(data)
+    pos = 0
+    current = Text("")  # type: Token
+
+    while pos < size:
+        if isinstance(current, Text):
+            # Consume everything before the next tag start (or the rest of the input).
+            stop = data.find("<", pos)
+            if stop == -1:
+                stop = size
+            current.data = data[pos:stop]
+            pos = stop
+            if current.text:
+                yield current
+            current = Tag("")
+        elif isinstance(current, Tag):
+            # Consume up to and including the next ">" (or the rest of the input).
+            stop = data.find(">", pos)
+            if stop == -1:
+                stop = size
+            stop += 1
+            current.data += data[pos:stop]
+            pos = stop
+            if current.done:
+                yield current
+                current = Text("")
+
+    # Whatever is left over, e.g. a tag that was never terminated.
+    if current.data.strip():
+        yield current
+
+
+def indent_text(data: str, prefix: str) -> str:
+    """
+    Re-indent a (possibly multi-line) chunk of text with *prefix*.
+
+    The first line is padded before dedenting so that the common indentation is
+    determined by the continuation lines, as in:
+
+        <li>This is
+            example text
+            over multiple lines
+        </li>
+
+    At most the first 32 characters of *prefix* are used.
+    """
+    padded = " " * 32 + data
+    body = textwrap.dedent(padded).strip()
+    capped_prefix = prefix[:32]
+    return textwrap.indent(body, capped_prefix)
+
+
+def is_inline_text(a: Token, b: Token, c: Token) -> bool:
+    """
+    Return True if a, b, c form "<tag>single-line text</tag>".
+
+    That is: *a* is an opening Tag, *b* is a Text without a newline, and *c* is
+    the closing Tag with the same name. Always returns a real bool; any other
+    combination (including non-token placeholders at the window edges) is False.
+    """
+    if not (isinstance(a, Tag) and isinstance(b, Text) and isinstance(c, Tag)):
+        return False
+    return a.is_opening and "\n" not in b.data and c.is_closing and a.tag == c.tag
+
+
+def is_inline(prev2: Token, prev1: Token, t: Token, next1: Token, next2: Token) -> bool:
+    """
+    Decide whether token *t* should stay on the same line as its neighbours.
+
+    A Text is inline when it is the single-line body of a matching tag pair.
+    A Tag is inline when it is one end of such a pair, or one half of an empty
+    element like "<div></div>". Always returns a real bool.
+    """
+    if isinstance(t, Text):
+        return bool(is_inline_text(prev1, t, next1))
+    if isinstance(t, Tag):
+        if is_inline_text(prev2, prev1, t) or is_inline_text(t, next1, next2):
+            return True
+        if isinstance(next1, Tag) and t.is_opening and next1.is_closing and t.tag == next1.tag:
+            return True  # <div></div> (start tag)
+        if isinstance(prev1, Tag) and prev1.is_opening and t.is_closing and prev1.tag == t.tag:
+            return True  # <div></div> (end tag)
+    return False
+
+
+class ElementStack:
+    """
+    Keep track of how deeply nested our document is.
+
+    open_tags holds the names of the currently open elements (innermost last),
+    indent is the whitespace prefix for the current nesting level.
+    """
+
+    def __init__(self):
+        self.open_tags = []
+        self.indent = ""
+
+    def push_tag(self, tag: str):
+        """Register an opening tag; tags in NO_INDENT_TAGS do not deepen the indentation."""
+        # Stop tracking (and indenting) once the stack holds more than 16 tags.
+        if len(self.open_tags) > 16:
+            return
+        self.open_tags.append(tag)
+        if tag not in NO_INDENT_TAGS:
+            self.indent += " " * INDENT
+
+    def pop_tag(self, tag: str):
+        """
+        Register a closing tag.
+
+        Pops open tags up to and including the innermost *tag*, which implicitly
+        closes any unclosed children, and removes their indentation. A closing
+        tag without a matching open tag leaves the stack and indentation as-is.
+        """
+        if tag not in self.open_tags:
+            return  # this closing tag has no start tag. let's keep indentation as-is.
+
+        remove_indent = 0
+        while True:
+            t = self.open_tags.pop()
+            if t not in NO_INDENT_TAGS:
+                remove_indent += INDENT
+            if t == tag:
+                break
+        # Guard against remove_indent == 0: indent[:-0] is indent[:0], i.e. "",
+        # which would wipe the indentation of every enclosing element.
+        if remove_indent:
+            self.indent = self.indent[:-remove_indent]
+
+
+def format_xml(tokens: Iterable[Token]) -> str:
+    """
+    Render a token stream as indented text, changing whitespace only.
+
+    Each token is looked at together with its two predecessors and two
+    successors to decide whether it stays inline with its neighbours
+    (e.g. "<b>inline</b>") or goes on a line of its own.
+    """
+    buf = io.StringIO()
+    emit = buf.write
+    stack = ElementStack()
+
+    for before2, before1, current, after1, after2 in sliding_window.window(tokens, 2, 2):
+        if isinstance(current, Tag):
+            if current.is_opening:
+                emit(indent_text(current.data, stack.indent))
+                if not is_inline(before2, before1, current, after1, after2):
+                    emit("\n")
+                # Children are indented one level deeper than the tag itself.
+                stack.push_tag(current.tag)
+            elif current.is_closing:
+                # Leave the element first so the end tag lines up with its start tag.
+                stack.pop_tag(current.tag)
+                if is_inline(before2, before1, current, after1, after2):
+                    emit(current.data)
+                else:
+                    emit(indent_text(current.data, stack.indent))
+                # An end tag always finishes the current line.
+                emit("\n")
+            else:  # self-closing
+                emit(indent_text(current.data, stack.indent))
+                emit("\n")
+        elif isinstance(current, Text):
+            if is_inline(before2, before1, current, after1, after2):
+                # The closing tag follows on the same line, so no newline here.
+                emit(current.text)
+            else:
+                emit(indent_text(current.data, stack.indent))
+                emit("\n")
+        else:  # pragma: no cover
+            raise RuntimeError()
+
+    return buf.getvalue()
+
+
+class ViewXmlHtml(base.View):
+    """Content view that pretty-prints XML and HTML bodies."""
+
+    name = "XML/HTML"
+    prompt = ("xml/html", "x")
+    content_types = ["text/xml", "text/html"]
+
+    def __call__(self, data, **metadata):
+        """
+        Decode *data* (bytes) as UTF-8, re-indent it and return a
+        (description, formatted lines) pair, where description is "HTML" if the
+        text contains "html" (case-insensitive) and "XML" otherwise.
+        """
+        # TODO:
+        # We should really have the message text as str here,
+        # not the message content as bytes.
+        # https://github.com/mitmproxy/mitmproxy/issues/1662#issuecomment-266192578
+        #
+        # "xmlcharrefreplace" only works for encoding; when decoding it raises
+        # TypeError on the first invalid byte. "replace" keeps the view usable
+        # for arbitrary input.
+        data = data.decode("utf8", "replace")
+        tokens = tokenize(data)
+        # TODO:
+        # Performance: Don't render the whole document right away.
+        # Let's wait with this until we have a sequence-like interface,
+        # this thing is reasonably fast right now anyway.
+        pretty = base.format_text(format_xml(tokens))
+        if "html" in data.lower():
+            t = "HTML"
+        else:
+            t = "XML"
+        return t, pretty
diff --git a/test/mitmproxy/contentviews/test_html_outline.py b/test/mitmproxy/contentviews/test_html_outline.py
index d9ccc406..9e664e52 100644
--- a/test/mitmproxy/contentviews/test_html_outline.py
+++ b/test/mitmproxy/contentviews/test_html_outline.py
@@ -6,4 +6,4 @@ def test_view_html_outline():
v = full_eval(html_outline.ViewHTMLOutline())
s = b"<html><br><br></br><p>one</p></html>"
assert v(s)
- assert v(b'\xfe')
\ No newline at end of file
+ assert v(b'\xfe')
diff --git a/test/mitmproxy/contentviews/test_xml_html.py b/test/mitmproxy/contentviews/test_xml_html.py
new file mode 100644
index 00000000..899ecfde
--- /dev/null
+++ b/test/mitmproxy/contentviews/test_xml_html.py
@@ -0,0 +1,29 @@
+import pytest
+
+from mitmproxy.contentviews import xml_html
+from mitmproxy.test import tutils
+from . import full_eval
+
+# Presumably a handle on the fixture directory; data.path(name) is used below to locate a file in it.
+data = tutils.test_data.push("mitmproxy/contentviews/test_xml_html_data/")
+
+
+def test_simple():
+    """Plain text is reported as XML; anything mentioning "html" is reported as HTML."""
+    view = full_eval(xml_html.ViewXmlHtml())
+
+    plain_result = view(b"foo")
+    assert plain_result == ('XML', [[('text', 'foo')]])
+
+    html_result = view(b"<html></html>")
+    assert html_result == ('HTML', [[('text', '<html></html>')]])
+
+
+@pytest.mark.parametrize("filename", [
+    "simple.html",
+    "cdata.xml",
+    "comment.xml",
+    "inline.html",
+])
+def test_format_xml(filename):
+    """Formatting <name>.<ext> must reproduce <name>-formatted.<ext> exactly."""
+    path = data.path(filename)
+    # Insert "-formatted" before the extension only. Replacing every "." would
+    # also rewrite dots in directory names (or a leading "./") of the path.
+    stem, dot, extension = path.rpartition(".")
+    expected_path = stem + "-formatted" + dot + extension
+    with open(path, encoding="utf-8") as f:
+        source = f.read()
+    with open(expected_path, encoding="utf-8") as f:
+        expected = f.read()
+    tokens = xml_html.tokenize(source)
+    assert xml_html.format_xml(tokens) == expected
diff --git a/test/mitmproxy/contentviews/test_xml_html_data/cdata-formatted.xml b/test/mitmproxy/contentviews/test_xml_html_data/cdata-formatted.xml
new file mode 100644
index 00000000..44a81a83
--- /dev/null
+++ b/test/mitmproxy/contentviews/test_xml_html_data/cdata-formatted.xml
@@ -0,0 +1,10 @@
+<exampleOfACDATA>
+ <![CDATA[
+ Since this is a CDATA section
+ I can use all sorts of reserved characters
+ like > < " and &
+ or write things like
+ <foo></bar>
+ but my document is still well formed!
+ ]]>
+</exampleOfACDATA>
diff --git a/test/mitmproxy/contentviews/test_xml_html_data/cdata.xml b/test/mitmproxy/contentviews/test_xml_html_data/cdata.xml
new file mode 100644
index 00000000..b4c5dfca
--- /dev/null
+++ b/test/mitmproxy/contentviews/test_xml_html_data/cdata.xml
@@ -0,0 +1,10 @@
+<exampleOfACDATA>
+<![CDATA[
+ Since this is a CDATA section
+ I can use all sorts of reserved characters
+ like > < " and &
+or write things like
+ <foo></bar>
+ but my document is still well formed!
+]]>
+</exampleOfACDATA>
\ No newline at end of file
diff --git a/test/mitmproxy/contentviews/test_xml_html_data/comment-formatted.xml b/test/mitmproxy/contentviews/test_xml_html_data/comment-formatted.xml
new file mode 100644
index 00000000..d0da6665
--- /dev/null
+++ b/test/mitmproxy/contentviews/test_xml_html_data/comment-formatted.xml
@@ -0,0 +1,10 @@
+<exampleOfAComment>
+ <!--
+ Since this is a comment
+ I can use all sorts of reserved characters
+ like > < " and &
+ or write things like
+ <foo></bar>
+ but my document is still well formed!
+ -->
+</exampleOfAComment>
diff --git a/test/mitmproxy/contentviews/test_xml_html_data/comment.xml b/test/mitmproxy/contentviews/test_xml_html_data/comment.xml
new file mode 100644
index 00000000..3f54ddba
--- /dev/null
+++ b/test/mitmproxy/contentviews/test_xml_html_data/comment.xml
@@ -0,0 +1,10 @@
+<exampleOfAComment>
+<!--
+ Since this is a comment
+ I can use all sorts of reserved characters
+ like > < " and &
+ or write things like
+ <foo></bar>
+ but my document is still well formed!
+-->
+</exampleOfAComment>
\ No newline at end of file
diff --git a/test/mitmproxy/contentviews/test_xml_html_data/inline-formatted.html b/test/mitmproxy/contentviews/test_xml_html_data/inline-formatted.html
new file mode 100644
index 00000000..5253bf4f
--- /dev/null
+++ b/test/mitmproxy/contentviews/test_xml_html_data/inline-formatted.html
@@ -0,0 +1,14 @@
+<html>
+<head>
+ <title>Test Page</title>
+</head>
+<body>
+ <p>
+ <i class="fa fa-alert"></i>
+ Some things should be
+ <b>inline</b>
+ , some things shouldn't!
+ </p>
+ <i class="fa fa-warning"/>
+</body>
+</html>
diff --git a/test/mitmproxy/contentviews/test_xml_html_data/inline.html b/test/mitmproxy/contentviews/test_xml_html_data/inline.html
new file mode 100644
index 00000000..3e4b16b9
--- /dev/null
+++ b/test/mitmproxy/contentviews/test_xml_html_data/inline.html
@@ -0,0 +1,7 @@
+<html>
+<head><title>Test Page</title></head>
+<body>
+ <p><i class="fa fa-alert"></i>Some things should be <b>inline</b>, some things shouldn't!</p>
+ <i class="fa fa-warning"/>
+</body>
+</html>
\ No newline at end of file
diff --git a/test/mitmproxy/contentviews/test_xml_html_data/simple-formatted.html b/test/mitmproxy/contentviews/test_xml_html_data/simple-formatted.html
new file mode 100644
index 00000000..23438428
--- /dev/null
+++ b/test/mitmproxy/contentviews/test_xml_html_data/simple-formatted.html
@@ -0,0 +1,10 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+ <title>title</title>
+</head>
+<body>
+ <h1>Hello World</h1>
+ <!-- page content -->
+</body>
+</html>
diff --git a/test/mitmproxy/contentviews/test_xml_html_data/simple.html b/test/mitmproxy/contentviews/test_xml_html_data/simple.html
new file mode 100644
index 00000000..73e81a5e
--- /dev/null
+++ b/test/mitmproxy/contentviews/test_xml_html_data/simple.html
@@ -0,0 +1 @@
+<!DOCTYPE html><html lang="en"><head><title>title</title></head><body><h1>Hello World</h1><!-- page content --></body></html>