2 files changed, 30 insertions, 16 deletions
diff --git a/mitmproxy/net/http/message.py b/mitmproxy/net/http/message.py
index 06d00377..86782e8a 100644
--- a/mitmproxy/net/http/message.py
+++ b/mitmproxy/net/http/message.py
@@ -68,7 +68,7 @@ class Message(serializable.Serializable):
     @property
     def raw_content(self) -> bytes:
         """
-        The raw (encoded) HTTP message body
+        The raw (potentially compressed) HTTP message body as bytes.
 
         See also: :py:attr:`content`, :py:class:`text`
         """
@@ -80,10 +80,10 @@ class Message(serializable.Serializable):
 
     def get_content(self, strict: bool=True) -> bytes:
         """
-        The HTTP message body decoded with the content-encoding header (e.g. gzip)
+        The uncompressed HTTP message body as bytes.
 
         Raises:
-            ValueError, when the content-encoding is invalid and strict is True.
+            ValueError, when the HTTP content-encoding is invalid and strict is True.
 
         See also: :py:class:`raw_content`, :py:attr:`text`
         """
@@ -165,22 +165,26 @@ class Message(serializable.Serializable):
             return ct[2].get("charset")
         return None
 
-    def _guess_encoding(self) -> str:
+    def _guess_encoding(self, content=b"") -> str:
+        """Guess the body charset: content-type charset, utf8 for JSON, HTML <meta> charset, else latin-1."""
         enc = self._get_content_type_charset()
-        if enc:
-            return enc
-
-        if "json" in self.headers.get("content-type", ""):
-            return "utf8"
-        else:
-            # We may also want to check for HTML meta tags here at some point.
-            # REGEX_ENCODING = re.compile(rb"""<meta[^>]+charset=['"]?([^'">]+)""")
-            return "latin-1"
+        if not enc and "json" in self.headers.get("content-type", ""):
+            enc = "utf8"
+        if not enc:
+            # HTML is case-insensitive; stop at whitespace or "/" so unquoted `<META CHARSET=utf-8 />` yields "utf-8".
+            meta_charset = re.search(rb"""<meta[^>]+charset=['"]?([^'">\s/]+)""", content, re.IGNORECASE)
+            if meta_charset:
+                enc = meta_charset.group(1).decode("ascii", "ignore")
+        if not enc:
+            enc = "latin-1"
+        # Use GB 18030 as the superset of GB2312 and GBK to fix common encoding problems on Chinese websites.
+        if enc.lower() in ("gb2312", "gbk"):
+            enc = "gb18030"
+        return enc
 
     def get_text(self, strict: bool=True) -> Optional[str]:
         """
-        The HTTP message body decoded with both content-encoding header (e.g. gzip)
-        and content-type header charset.
+        The uncompressed and decoded HTTP message body as text.
 
         Raises:
             ValueError, when either content-encoding or charset is invalid and strict is True.
@@ -189,9 +193,9 @@ class Message(serializable.Serializable):
         """
         if self.raw_content is None:
             return None
-        enc = self._guess_encoding()
 
         content = self.get_content(strict)
+        enc = self._guess_encoding(content)
         try:
             return encoding.decode(content, enc)
         except ValueError:
diff --git a/test/mitmproxy/net/http/test_message.py b/test/mitmproxy/net/http/test_message.py
index 512f3199..7ad7890c 100644
--- a/test/mitmproxy/net/http/test_message.py
+++ b/test/mitmproxy/net/http/test_message.py
@@ -229,6 +229,16 @@ class TestMessageText:
         r.headers["content-type"] = "application/json"
         assert r.text == u'"ü"'
 
+    def test_guess_meta_charset(self):
+        body = (b'<meta http-equiv="content-type" '
+                b'content="text/html;charset=gb2312">\xe6\x98\x8e\xe4\xbc\xaf')
+        # The gb2312 meta charset is mapped to gb18030, which decodes the trailing bytes as "鏄庝集".
+        assert u"鏄庝集" in tutils.tresp(content=body).text
+
+    def test_guess_latin_1(self):
+        # No charset hint in the headers or the body: decoding falls back to latin-1.
+        assert tutils.tresp(content=b"\xF0\xE2").text == u"ðâ"
+
     def test_none(self):
         r = tutils.tresp(content=None)
         assert r.text is None