diff options
author | Rajat Gupta <35985127+rjt-gupta@users.noreply.github.com> | 2018-12-13 20:04:12 +0530 |
---|---|---|
committer | Maximilian Hils <git@maximilianhils.com> | 2018-12-13 15:34:12 +0100 |
commit | e2bcca47b1ad8040451cbd95039acf200e9b0e84 (patch) | |
tree | 56c9bdf1797a77b2f24ca0dfe9158884080c6817 | |
parent | db658b12edf9a44e40ca79209a652e839ffa78dd (diff) | |
download | mitmproxy-e2bcca47b1ad8040451cbd95039acf200e9b0e84.tar.gz mitmproxy-e2bcca47b1ad8040451cbd95039acf200e9b0e84.tar.bz2 mitmproxy-e2bcca47b1ad8040451cbd95039acf200e9b0e84.zip |
charset in meta tags (#3411)
original contribution from @0xHJK in https://github.com/mitmproxy/mitmproxy/pull/3150
-rw-r--r-- | mitmproxy/net/http/message.py | 36 | ||||
-rw-r--r-- | test/mitmproxy/net/http/test_message.py | 10 |
2 files changed, 30 insertions, 16 deletions
```diff
diff --git a/mitmproxy/net/http/message.py b/mitmproxy/net/http/message.py
index 06d00377..86782e8a 100644
--- a/mitmproxy/net/http/message.py
+++ b/mitmproxy/net/http/message.py
@@ -68,7 +68,7 @@ class Message(serializable.Serializable):
     @property
     def raw_content(self) -> bytes:
         """
-        The raw (encoded) HTTP message body
+        The raw (potentially compressed) HTTP message body as bytes.
 
         See also: :py:attr:`content`, :py:class:`text`
         """
@@ -80,10 +80,10 @@ class Message(serializable.Serializable):
 
     def get_content(self, strict: bool=True) -> bytes:
         """
-        The HTTP message body decoded with the content-encoding header (e.g. gzip)
+        The uncompressed HTTP message body as bytes.
 
         Raises:
-            ValueError, when the content-encoding is invalid and strict is True.
+            ValueError, when the HTTP content-encoding is invalid and strict is True.
 
         See also: :py:class:`raw_content`, :py:attr:`text`
         """
@@ -165,22 +165,26 @@ class Message(serializable.Serializable):
             return ct[2].get("charset")
         return None
 
-    def _guess_encoding(self) -> str:
+    def _guess_encoding(self, content=b"") -> str:
         enc = self._get_content_type_charset()
-        if enc:
-            return enc
-
-        if "json" in self.headers.get("content-type", ""):
-            return "utf8"
-        else:
-            # We may also want to check for HTML meta tags here at some point.
-            # REGEX_ENCODING = re.compile(rb"""<meta[^>]+charset=['"]?([^'"]+)""")
-            return "latin-1"
+        if not enc:
+            if "json" in self.headers.get("content-type", ""):
+                enc = "utf8"
+        if not enc:
+            meta_charset = re.search(rb"""<meta[^>]+charset=['"]?([^'">]+)""", content)
+            if meta_charset:
+                enc = meta_charset.group(1).decode("ascii", "ignore")
+        if not enc:
+            enc = "latin-1"
+        # Use GB 18030 as the superset of GB2312 and GBK to fix common encoding problems on Chinese websites.
+        if enc.lower() in ("gb2312", "gbk"):
+            enc = "gb18030"
+
+        return enc
 
     def get_text(self, strict: bool=True) -> Optional[str]:
         """
-        The HTTP message body decoded with both content-encoding header (e.g. gzip)
-        and content-type header charset.
+        The uncompressed and decoded HTTP message body as text.
 
         Raises:
             ValueError, when either content-encoding or charset is invalid and strict is True.
@@ -189,9 +193,9 @@ class Message(serializable.Serializable):
         """
         if self.raw_content is None:
             return None
-        enc = self._guess_encoding()
 
         content = self.get_content(strict)
+        enc = self._guess_encoding(content)
         try:
             return encoding.decode(content, enc)
         except ValueError:
diff --git a/test/mitmproxy/net/http/test_message.py b/test/mitmproxy/net/http/test_message.py
index 512f3199..7ad7890c 100644
--- a/test/mitmproxy/net/http/test_message.py
+++ b/test/mitmproxy/net/http/test_message.py
@@ -229,6 +229,16 @@ class TestMessageText:
         r.headers["content-type"] = "application/json"
         assert r.text == u'"ü"'
 
+    def test_guess_meta_charset(self):
+        r = tutils.tresp(content=b'<meta http-equiv="content-type" '
+                                 b'content="text/html;charset=gb2312">\xe6\x98\x8e\xe4\xbc\xaf')
+        # "鏄庝集" is decoded form of \xe6\x98\x8e\xe4\xbc\xaf in gb18030
+        assert u"鏄庝集" in r.text
+
+    def test_guess_latin_1(self):
+        r = tutils.tresp(content=b"\xF0\xE2")
+        assert r.text == u"ðâ"
+
     def test_none(self):
         r = tutils.tresp(content=None)
         assert r.text is None
```