charset in meta tags (#3411)

original contribution from @0xHJK in https://github.com/mitmproxy/mitmproxy/pull/3150
author: Rajat Gupta <35985127+rjt-gupta@users.noreply.github.com> 2018-12-13 20:04:12 +0530
committer: Maximilian Hils <git@maximilianhils.com> 2018-12-13 15:34:12 +0100
commit: e2bcca47b1ad8040451cbd95039acf200e9b0e84 (patch)
tree: 56c9bdf1797a77b2f24ca0dfe9158884080c6817 /mitmproxy/net
parent: db658b12edf9a44e40ca79209a652e839ffa78dd (diff)
download: mitmproxy-e2bcca47b1ad8040451cbd95039acf200e9b0e84.tar.gz mitmproxy-e2bcca47b1ad8040451cbd95039acf200e9b0e84.tar.bz2 mitmproxy-e2bcca47b1ad8040451cbd95039acf200e9b0e84.zip
1 file changed, 20 insertions, 16 deletions
diff --git a/mitmproxy/net/http/message.py b/mitmproxy/net/http/message.py
index 06d00377..86782e8a 100644
--- a/mitmproxy/net/http/message.py
+++ b/mitmproxy/net/http/message.py
@@ -68,7 +68,7 @@ class Message(serializable.Serializable):
     @property
     def raw_content(self) -> bytes:
         """
-        The raw (encoded) HTTP message body
+        The raw (potentially compressed) HTTP message body as bytes.
 
         See also: :py:attr:`content`, :py:class:`text`
         """
@@ -80,10 +80,10 @@ class Message(serializable.Serializable):
 
     def get_content(self, strict: bool=True) -> bytes:
         """
-        The HTTP message body decoded with the content-encoding header (e.g. gzip)
+        The uncompressed HTTP message body as bytes.
 
         Raises:
-            ValueError, when the content-encoding is invalid and strict is True.
+            ValueError, when the HTTP content-encoding is invalid and strict is True.
 
         See also: :py:class:`raw_content`, :py:attr:`text`
         """
@@ -165,22 +165,26 @@ class Message(serializable.Serializable):
             return ct[2].get("charset")
         return None
 
-    def _guess_encoding(self) -> str:
+    def _guess_encoding(self, content=b"") -> str:
         enc = self._get_content_type_charset()
-        if enc:
-            return enc
-
-        if "json" in self.headers.get("content-type", ""):
-            return "utf8"
-        else:
-            # We may also want to check for HTML meta tags here at some point.
-            # REGEX_ENCODING = re.compile(rb"""<meta[^>]+charset=['"]?([^'"]+)""")
-            return "latin-1"
+        if not enc:
+            if "json" in self.headers.get("content-type", ""):
+                enc = "utf8"
+        if not enc:
+            meta_charset = re.search(rb"""<meta[^>]+charset=['"]?([^'">]+)""", content)
+            if meta_charset:
+                enc = meta_charset.group(1).decode("ascii", "ignore")
+        if not enc:
+            enc = "latin-1"
+        # Use GB 18030 as the superset of GB2312 and GBK to fix common encoding problems on Chinese websites.
+        if enc.lower() in ("gb2312", "gbk"):
+            enc = "gb18030"
+
+        return enc
 
     def get_text(self, strict: bool=True) -> Optional[str]:
         """
-        The HTTP message body decoded with both content-encoding header (e.g. gzip)
-        and content-type header charset.
+        The uncompressed and decoded HTTP message body as text.
 
         Raises:
             ValueError, when either content-encoding or charset is invalid and strict is True.
@@ -189,9 +193,9 @@ class Message(serializable.Serializable):
         """
         if self.raw_content is None:
             return None
-        enc = self._guess_encoding()
 
         content = self.get_content(strict)
+        enc = self._guess_encoding(content)
         try:
             return encoding.decode(content, enc)
         except ValueError:
author	Rajat Gupta <35985127+rjt-gupta@users.noreply.github.com>	2018-12-13 20:04:12 +0530
committer	Maximilian Hils <git@maximilianhils.com>	2018-12-13 15:34:12 +0100
commit	e2bcca47b1ad8040451cbd95039acf200e9b0e84 (patch)
tree	56c9bdf1797a77b2f24ca0dfe9158884080c6817 /mitmproxy/net
parent	db658b12edf9a44e40ca79209a652e839ffa78dd (diff)
download	mitmproxy-e2bcca47b1ad8040451cbd95039acf200e9b0e84.tar.gz mitmproxy-e2bcca47b1ad8040451cbd95039acf200e9b0e84.tar.bz2 mitmproxy-e2bcca47b1ad8040451cbd95039acf200e9b0e84.zip