Integrate lxml for pretty-printing HTML and XML.

Tackling the pretty-printing performance problem head-on, at the cost of a major dependency.
author: Aldo Cortesi <aldo@nullcube.com> 2012-04-07 13:47:03 +1200
committer: Aldo Cortesi <aldo@nullcube.com> 2012-04-07 13:47:03 +1200
commit: f1dc3f2ab2d78ce2bc1a0418239fa6fa1f6c4429 (patch)
tree: fd1de610fc9a75bf962c18dedc83d4497a85efe9 /libmproxy
parent: 549512e93e7c0accf161b76b54f1338eb7aa5921 (diff)
download: mitmproxy-f1dc3f2ab2d78ce2bc1a0418239fa6fa1f6c4429.tar.gz mitmproxy-f1dc3f2ab2d78ce2bc1a0418239fa6fa1f6c4429.tar.bz2 mitmproxy-f1dc3f2ab2d78ce2bc1a0418239fa6fa1f6c4429.zip
2 files changed, 57 insertions, 55 deletions
diff --git a/libmproxy/console/contentview.py b/libmproxy/console/contentview.py
index 0d725c9d..02394c6f 100644
--- a/libmproxy/console/contentview.py
+++ b/libmproxy/console/contentview.py
@@ -2,11 +2,12 @@ import re, cStringIO
 import urwid
 from PIL import Image
 from PIL.ExifTags import TAGS
+import lxml.html, lxml.etree
 import common
 from .. import utils, encoding, flow
 from ..contrib import jsbeautifier
 
-VIEW_CUTOFF = 1024*20
+VIEW_CUTOFF = 1024*200
 
 VIEW_AUTO = 0
 VIEW_JSON = 1
@@ -17,6 +18,7 @@ VIEW_JAVASCRIPT = 5
 VIEW_IMAGE = 6
 VIEW_RAW = 7
 VIEW_HEX = 8
+VIEW_HTML = 9
 
 VIEW_NAMES = {
     VIEW_AUTO: "Auto",
@@ -28,35 +30,38 @@ VIEW_NAMES = {
     VIEW_IMAGE: "Image",
     VIEW_RAW: "Raw",
     VIEW_HEX: "Hex",
+    VIEW_HTML: "HTML",
 }
 
 
 VIEW_PROMPT = (
     ("auto detect", "a"),
-    ("hex", "h"),
+    ("hex", "e"),
+    ("html", "h"),
     ("image", "i"),
     ("javascript", "j"),
     ("json", "s"),
     ("raw", "r"),
     ("multipart", "m"),
     ("urlencoded", "u"),
-    ("xmlish", "x"),
+    ("xml", "x"),
 )
 
 VIEW_SHORTCUTS = {
     "a": VIEW_AUTO,
+    "x": VIEW_XML,
+    "h": VIEW_HTML,
     "i": VIEW_IMAGE,
     "j": VIEW_JAVASCRIPT,
     "s": VIEW_JSON,
     "u": VIEW_URLENCODED,
     "m": VIEW_MULTIPART,
-    "x": VIEW_XML,
     "r": VIEW_RAW,
-    "h": VIEW_HEX,
+    "e": VIEW_HEX,
 }
 
 CONTENT_TYPES_MAP = {
-    "text/html": VIEW_XML,
+    "text/html": VIEW_HTML,
     "application/json": VIEW_JSON,
     "text/xml": VIEW_XML,
     "multipart/form-data": VIEW_MULTIPART,
@@ -116,9 +121,34 @@ def view_hex(hdrs, content):
     return "Hex", txt
 
 
-def view_xmlish(hdrs, content):
+def view_xml(hdrs, content):
+    parser = lxml.etree.XMLParser(remove_blank_text=True, resolve_entities=False, strip_cdata=False, recover=False)
+    try:
+        document = lxml.etree.fromstring(content, parser)
+    except lxml.etree.XMLSyntaxError, v:
+        print v
+        return None
+    docinfo = document.getroottree().docinfo
+
+    prev = []
+    p = document.getroottree().getroot().getprevious()
+    while p is not None:
+        prev.insert(
+            0,
+            lxml.etree.tostring(p)
+        )
+        p = p.getprevious()
+
+    s = lxml.etree.tostring(
+            document,
+            pretty_print=True,
+            xml_declaration=True,
+            doctype=docinfo.doctype + "\n".join(prev),
+            encoding = docinfo.encoding
+        )
+
     txt = []
-    for i in utils.pretty_xmlish(content[:VIEW_CUTOFF]):
+    for i in s[:VIEW_CUTOFF].strip().split("\n"):
         txt.append(
             urwid.Text(("text", i)),
         )
@@ -126,6 +156,22 @@ def view_xmlish(hdrs, content):
     return "XML-like data", txt
 
 
+def view_html(hdrs, content):
+    if utils.isXML(content):
+        parser = lxml.etree.HTMLParser(strip_cdata=True, remove_blank_text=True)
+        d = lxml.html.fromstring(content, parser=parser)
+        docinfo = d.getroottree().docinfo
+        s = lxml.etree.tostring(d, pretty_print=True, doctype=docinfo.doctype)
+
+        txt = []
+        for i in s[:VIEW_CUTOFF].strip().split("\n"):
+            txt.append(
+                urwid.Text(("text", i)),
+            )
+        trailer(len(content), txt)
+        return "HTML", txt
+
+
 def view_json(hdrs, content):
     lines = utils.pretty_json(content)
     if lines:
@@ -229,7 +275,8 @@ def view_image(hdrs, content):
 
 
 PRETTY_FUNCTION_MAP = {
-    VIEW_XML: view_xmlish,
+    VIEW_XML: view_xml,
+    VIEW_HTML: view_html,
     VIEW_JSON: view_json,
     VIEW_URLENCODED: view_urlencoded,
     VIEW_MULTIPART: view_multipart,
@@ -274,7 +321,7 @@ def get_content_view(viewmode, hdrItems, content):
     if not ret:
         viewmode = VIEW_RAW
         ret = view_raw(hdrs, content)
-        msg.append("Fallback to Raw")
+        msg.append("Couldn't parse: falling back to Raw")
     else:
         msg.append(ret[0])
     return " ".join(msg), ret[1]
diff --git a/libmproxy/utils.py b/libmproxy/utils.py
index b4e317c5..d8345399 100644
--- a/libmproxy/utils.py
+++ b/libmproxy/utils.py
@@ -72,51 +72,6 @@ def cleanBin(s, fixspacing=False):
     return "".join(parts)
 
 
-TAG = r"""
-        <\s*
-        (?!\s*[!"])
-        (?P<close>\s*\/)?
-        (?P<name>\w+)
-        (
-            [^'"\t >]+ |
-            "[^\"]*"['\"]* |
-            '[^']*'['\"]* |
-            \s+
-        )*
-        (?P<selfcont>\s*\/\s*)?
-        \s*>
-      """
-UNI = set(["br", "hr", "img", "input", "area", "link"])
-INDENT = " "*4
-def pretty_xmlish(s):
-    """
-        A robust pretty-printer for XML-ish data.
-        Returns a list of lines.
-    """
-    s = cleanBin(s)
-    data, offset, indent, prev = [], 0, 0, None
-    for i in re.finditer(TAG, s, re.VERBOSE|re.MULTILINE):
-        start, end = i.span()
-        name = i.group("name")
-        if start > offset:
-            txt = []
-            for x in textwrap.dedent(s[offset:start]).split("\n"):
-                if x.strip():
-                    txt.append(indent*INDENT + x)
-            data.extend(txt)
-        if i.group("close") and not (name in UNI and name==prev):
-            indent = max(indent - 1, 0)
-        data.append(indent*INDENT + i.group().strip())
-        offset = end
-        if not any([i.group("close"), i.group("selfcont"), name in UNI]):
-            indent += 1
-        prev = name
-    trail = s[offset:]
-    if trail.strip():
-        data.append(s[offset:])
-    return data
-
-
 def pretty_json(s):
     try:
         p = json.loads(s)
author	Aldo Cortesi <aldo@nullcube.com>	2012-04-07 13:47:03 +1200
committer	Aldo Cortesi <aldo@nullcube.com>	2012-04-07 13:47:03 +1200
commit	f1dc3f2ab2d78ce2bc1a0418239fa6fa1f6c4429 (patch)
tree	fd1de610fc9a75bf962c18dedc83d4497a85efe9 /libmproxy
parent	549512e93e7c0accf161b76b54f1338eb7aa5921 (diff)
download	mitmproxy-f1dc3f2ab2d78ce2bc1a0418239fa6fa1f6c4429.tar.gz mitmproxy-f1dc3f2ab2d78ce2bc1a0418239fa6fa1f6c4429.tar.bz2 mitmproxy-f1dc3f2ab2d78ce2bc1a0418239fa6fa1f6c4429.zip