about summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r-- mitmproxy/net/http/message.py 36
-rw-r--r-- test/mitmproxy/net/http/test_message.py 10
2 files changed, 30 insertions, 16 deletions
diff --git a/mitmproxy/net/http/message.py b/mitmproxy/net/http/message.py
index 06d00377..86782e8a 100644
--- a/mitmproxy/net/http/message.py
+++ b/mitmproxy/net/http/message.py
@@ -68,7 +68,7 @@ class Message(serializable.Serializable):
@property
def raw_content(self) -> bytes:
"""
- The raw (encoded) HTTP message body
+ The raw (potentially compressed) HTTP message body as bytes.
See also: :py:attr:`content`, :py:class:`text`
"""
@@ -80,10 +80,10 @@ class Message(serializable.Serializable):
def get_content(self, strict: bool=True) -> bytes:
"""
- The HTTP message body decoded with the content-encoding header (e.g. gzip)
+ The uncompressed HTTP message body as bytes.
Raises:
- ValueError, when the content-encoding is invalid and strict is True.
+ ValueError, when the HTTP content-encoding is invalid and strict is True.
See also: :py:class:`raw_content`, :py:attr:`text`
"""
@@ -165,22 +165,26 @@ class Message(serializable.Serializable):
return ct[2].get("charset")
return None
- def _guess_encoding(self) -> str:
+ def _guess_encoding(self, content=b"") -> str:
enc = self._get_content_type_charset()
- if enc:
- return enc
-
- if "json" in self.headers.get("content-type", ""):
- return "utf8"
- else:
- # We may also want to check for HTML meta tags here at some point.
- # REGEX_ENCODING = re.compile(rb"""<meta[^>]+charset=['"]?([^'"]+)""")
- return "latin-1"
+ if not enc:
+ if "json" in self.headers.get("content-type", ""):
+ enc = "utf8"
+ if not enc:
+ meta_charset = re.search(rb"""<meta[^>]+charset=['"]?([^'">]+)""", content)
+ if meta_charset:
+ enc = meta_charset.group(1).decode("ascii", "ignore")
+ if not enc:
+ enc = "latin-1"
+ # Use GB 18030 as the superset of GB2312 and GBK to fix common encoding problems on Chinese websites.
+ if enc.lower() in ("gb2312", "gbk"):
+ enc = "gb18030"
+
+ return enc
def get_text(self, strict: bool=True) -> Optional[str]:
"""
- The HTTP message body decoded with both content-encoding header (e.g. gzip)
- and content-type header charset.
+ The uncompressed and decoded HTTP message body as text.
Raises:
ValueError, when either content-encoding or charset is invalid and strict is True.
@@ -189,9 +193,9 @@ class Message(serializable.Serializable):
"""
if self.raw_content is None:
return None
- enc = self._guess_encoding()
content = self.get_content(strict)
+ enc = self._guess_encoding(content)
try:
return encoding.decode(content, enc)
except ValueError:
diff --git a/test/mitmproxy/net/http/test_message.py b/test/mitmproxy/net/http/test_message.py
index 512f3199..7ad7890c 100644
--- a/test/mitmproxy/net/http/test_message.py
+++ b/test/mitmproxy/net/http/test_message.py
@@ -229,6 +229,16 @@ class TestMessageText:
r.headers["content-type"] = "application/json"
assert r.text == u'"ü"'
+ def test_guess_meta_charset(self):
+ r = tutils.tresp(content=b'<meta http-equiv="content-type" '
+ b'content="text/html;charset=gb2312">\xe6\x98\x8e\xe4\xbc\xaf')
+ # "鏄庝集" is the decoded form of \xe6\x98\x8e\xe4\xbc\xaf in gb18030
+ assert u"鏄庝集" in r.text
+
+ def test_guess_latin_1(self):
+ r = tutils.tresp(content=b"\xF0\xE2")
+ assert r.text == u"ðâ"
+
def test_none(self):
r = tutils.tresp(content=None)
assert r.text is None