about | summary | refs | log | tree | commit | diff | stats
path: root/mitmproxy/net
diff options
context:
space:
mode:
author: Rajat Gupta <35985127+rjt-gupta@users.noreply.github.com> 2018-12-13 20:04:12 +0530
committer: Maximilian Hils <git@maximilianhils.com> 2018-12-13 15:34:12 +0100
commit: e2bcca47b1ad8040451cbd95039acf200e9b0e84 (patch)
tree: 56c9bdf1797a77b2f24ca0dfe9158884080c6817 /mitmproxy/net
parent: db658b12edf9a44e40ca79209a652e839ffa78dd (diff)
download: mitmproxy-e2bcca47b1ad8040451cbd95039acf200e9b0e84.tar.gz
mitmproxy-e2bcca47b1ad8040451cbd95039acf200e9b0e84.tar.bz2
mitmproxy-e2bcca47b1ad8040451cbd95039acf200e9b0e84.zip
charset in meta tags (#3411)
original contribution from @0xHJK in https://github.com/mitmproxy/mitmproxy/pull/3150
Diffstat (limited to 'mitmproxy/net')
-rw-r--r-- mitmproxy/net/http/message.py 36
1 files changed, 20 insertions, 16 deletions
diff --git a/mitmproxy/net/http/message.py b/mitmproxy/net/http/message.py
index 06d00377..86782e8a 100644
--- a/mitmproxy/net/http/message.py
+++ b/mitmproxy/net/http/message.py
@@ -68,7 +68,7 @@ class Message(serializable.Serializable):
@property
def raw_content(self) -> bytes:
"""
- The raw (encoded) HTTP message body
+ The raw (potentially compressed) HTTP message body as bytes.
See also: :py:attr:`content`, :py:class:`text`
"""
@@ -80,10 +80,10 @@ class Message(serializable.Serializable):
def get_content(self, strict: bool=True) -> bytes:
"""
- The HTTP message body decoded with the content-encoding header (e.g. gzip)
+ The uncompressed HTTP message body as bytes.
Raises:
- ValueError, when the content-encoding is invalid and strict is True.
+ ValueError, when the HTTP content-encoding is invalid and strict is True.
See also: :py:class:`raw_content`, :py:attr:`text`
"""
@@ -165,22 +165,26 @@ class Message(serializable.Serializable):
return ct[2].get("charset")
return None
- def _guess_encoding(self) -> str:
+ def _guess_encoding(self, content=b"") -> str:
enc = self._get_content_type_charset()
- if enc:
- return enc
-
- if "json" in self.headers.get("content-type", ""):
- return "utf8"
- else:
- # We may also want to check for HTML meta tags here at some point.
- # REGEX_ENCODING = re.compile(rb"""<meta[^>]+charset=['"]?([^'"]+)""")
- return "latin-1"
+ if not enc:
+ if "json" in self.headers.get("content-type", ""):
+ enc = "utf8"
+ if not enc:
+ meta_charset = re.search(rb"""<meta[^>]+charset=['"]?([^'">]+)""", content)
+ if meta_charset:
+ enc = meta_charset.group(1).decode("ascii", "ignore")
+ if not enc:
+ enc = "latin-1"
+ # Use GB 18030 as the superset of GB2312 and GBK to fix common encoding problems on Chinese websites.
+ if enc.lower() in ("gb2312", "gbk"):
+ enc = "gb18030"
+
+ return enc
def get_text(self, strict: bool=True) -> Optional[str]:
"""
- The HTTP message body decoded with both content-encoding header (e.g. gzip)
- and content-type header charset.
+ The uncompressed and decoded HTTP message body as text.
Raises:
ValueError, when either content-encoding or charset is invalid and strict is True.
@@ -189,9 +193,9 @@ class Message(serializable.Serializable):
"""
if self.raw_content is None:
return None
- enc = self._guess_encoding()
content = self.get_content(strict)
+ enc = self._guess_encoding(content)
try:
return encoding.decode(content, enc)
except ValueError: