diff options
author | Thomas Kriechbaumer <thomas@kriechbaumer.name> | 2016-09-29 12:46:56 +0200 |
---|---|---|
committer | Thomas Kriechbaumer <thomas@kriechbaumer.name> | 2016-10-03 11:45:54 +0200 |
commit | ba842489103caba2691084a1f32f3c79f8f564d6 (patch) | |
tree | 0f59f5361da6db63a28ca6830101eccf300edfad /mitmproxy/flowfilter.py | |
parent | 611dad12a7a94fd90619a2db44cd42aab922b4e2 (diff) | |
download | mitmproxy-ba842489103caba2691084a1f32f3c79f8f564d6.tar.gz mitmproxy-ba842489103caba2691084a1f32f3c79f8f564d6.tar.bz2 mitmproxy-ba842489103caba2691084a1f32f3c79f8f564d6.zip |
rename mitmproxy.filt -> mitmproxy.flowfilter
Diffstat (limited to 'mitmproxy/flowfilter.py')
-rw-r--r-- | mitmproxy/flowfilter.py | 543 |
1 files changed, 543 insertions, 0 deletions
diff --git a/mitmproxy/flowfilter.py b/mitmproxy/flowfilter.py new file mode 100644 index 00000000..a6398957 --- /dev/null +++ b/mitmproxy/flowfilter.py @@ -0,0 +1,543 @@ +""" + The following operators are understood: + + ~q Request + ~s Response + + Headers: + + Patterns are matched against "name: value" strings. Field names are + all-lowercase. + + ~a Asset content-type in response. Asset content types are: + text/javascript + application/x-javascript + application/javascript + text/css + image/* + application/x-shockwave-flash + ~h rex Header line in either request or response + ~hq rex Header in request + ~hs rex Header in response + + ~b rex Expression in the body of either request or response + ~bq rex Expression in the body of request + ~bq rex Expression in the body of response + ~t rex Shortcut for content-type header. + + ~d rex Request domain + ~m rex Method + ~u rex URL + ~c CODE Response code. + rex Equivalent to ~u rex +""" +from __future__ import absolute_import, print_function, division + +import re +import sys +import functools + +from mitmproxy.models.http import HTTPFlow +from mitmproxy.models.tcp import TCPFlow +from mitmproxy.models.flow import Flow + +from netlib import strutils + +import pyparsing as pp +from typing import Callable + + +def only(*types): + def decorator(fn): + @functools.wraps(fn) + def filter_types(self, flow): + if isinstance(flow, types): + return fn(self, flow) + return False + return filter_types + return decorator + + +class _Token(object): + + def dump(self, indent=0, fp=sys.stdout): + print("{spacing}{name}{expr}".format( + spacing="\t" * indent, + name=self.__class__.__name__, + expr=getattr(self, "expr", "") + ), file=fp) + + +class _Action(_Token): + + @classmethod + def make(klass, s, loc, toks): + return klass(*toks[1:]) + + +class FErr(_Action): + code = "e" + help = "Match error" + + def __call__(self, f): + return True if f.error else False + + +class FMarked(_Action): + code = "marked" + help = "Match marked flows" + + def __call__(self, f): + return f.marked + + +class FHTTP(_Action): + code = "http" + help = "Match HTTP flows" + + @only(HTTPFlow) + def __call__(self, f): + return True + + +class FTCP(_Action): + code = "tcp" + help = "Match TCP flows" + + @only(TCPFlow) + def __call__(self, f): + return True + + +class FReq(_Action): + code = "q" + help = "Match request with no response" + + @only(HTTPFlow) + def __call__(self, f): + if not f.response: + return True + + +class FResp(_Action): + code = "s" + help = "Match response" + + @only(HTTPFlow) + def __call__(self, f): + return bool(f.response) + + +class _Rex(_Action): + flags = 0 + is_binary = True + + def __init__(self, expr): + self.expr = expr + if self.is_binary: + expr = strutils.escaped_str_to_bytes(expr) + try: + self.re = re.compile(expr, self.flags) + except: + raise ValueError("Cannot compile expression.") + + +def _check_content_type(rex, message): + return any( + name.lower() == b"content-type" and + rex.search(value) + for name, value in message.headers.fields + ) + + +class FAsset(_Action): + code = "a" + help = "Match asset in response: CSS, Javascript, Flash, images." + ASSET_TYPES = [ + b"text/javascript", + b"application/x-javascript", + b"application/javascript", + b"text/css", + b"image/.*", + b"application/x-shockwave-flash" + ] + ASSET_TYPES = [re.compile(x) for x in ASSET_TYPES] + + @only(HTTPFlow) + def __call__(self, f): + if f.response: + for i in self.ASSET_TYPES: + if _check_content_type(i, f.response): + return True + return False + + +class FContentType(_Rex): + code = "t" + help = "Content-type header" + + @only(HTTPFlow) + def __call__(self, f): + if _check_content_type(self.re, f.request): + return True + elif f.response and _check_content_type(self.re, f.response): + return True + return False + + +class FContentTypeRequest(_Rex): + code = "tq" + help = "Request Content-Type header" + + @only(HTTPFlow) + def __call__(self, f): + return _check_content_type(self.re, f.request) + + +class FContentTypeResponse(_Rex): + code = "ts" + help = "Response Content-Type header" + + @only(HTTPFlow) + def __call__(self, f): + if f.response: + return _check_content_type(self.re, f.response) + return False + + +class FHead(_Rex): + code = "h" + help = "Header" + flags = re.MULTILINE + + @only(HTTPFlow) + def __call__(self, f): + if f.request and self.re.search(bytes(f.request.headers)): + return True + if f.response and self.re.search(bytes(f.response.headers)): + return True + return False + + +class FHeadRequest(_Rex): + code = "hq" + help = "Request header" + flags = re.MULTILINE + + @only(HTTPFlow) + def __call__(self, f): + if f.request and self.re.search(bytes(f.request.headers)): + return True + + +class FHeadResponse(_Rex): + code = "hs" + help = "Response header" + flags = re.MULTILINE + + @only(HTTPFlow) + def __call__(self, f): + if f.response and self.re.search(bytes(f.response.headers)): + return True + + +class FBod(_Rex): + code = "b" + help = "Body" + flags = re.DOTALL + + @only(HTTPFlow, TCPFlow) + def __call__(self, f): + if isinstance(f, HTTPFlow): + if f.request and f.request.raw_content: + if self.re.search(f.request.get_content(strict=False)): + return True + if f.response and f.response.raw_content: + if self.re.search(f.response.get_content(strict=False)): + return True + elif isinstance(f, TCPFlow): + for msg in f.messages: + if self.re.search(msg.content): + return True + return False + + +class FBodRequest(_Rex): + code = "bq" + help = "Request body" + flags = re.DOTALL + + @only(HTTPFlow, TCPFlow) + def __call__(self, f): + if isinstance(f, HTTPFlow): + if f.request and f.request.raw_content: + if self.re.search(f.request.get_content(strict=False)): + return True + elif isinstance(f, TCPFlow): + for msg in f.messages: + if msg.from_client and self.re.search(msg.content): + return True + + +class FBodResponse(_Rex): + code = "bs" + help = "Response body" + flags = re.DOTALL + + @only(HTTPFlow, TCPFlow) + def __call__(self, f): + if isinstance(f, HTTPFlow): + if f.response and f.response.raw_content: + if self.re.search(f.response.get_content(strict=False)): + return True + elif isinstance(f, TCPFlow): + for msg in f.messages: + if not msg.from_client and self.re.search(msg.content): + return True + + +class FMethod(_Rex): + code = "m" + help = "Method" + flags = re.IGNORECASE + + @only(HTTPFlow) + def __call__(self, f): + return bool(self.re.search(f.request.data.method)) + + +class FDomain(_Rex): + code = "d" + help = "Domain" + flags = re.IGNORECASE + + @only(HTTPFlow) + def __call__(self, f): + return bool(self.re.search(f.request.data.host)) + + +class FUrl(_Rex): + code = "u" + help = "URL" + is_binary = False + # FUrl is special, because it can be "naked". + + @classmethod + def make(klass, s, loc, toks): + if len(toks) > 1: + toks = toks[1:] + return klass(*toks) + + @only(HTTPFlow) + def __call__(self, f): + return self.re.search(f.request.url) + + +class FSrc(_Rex): + code = "src" + help = "Match source address" + is_binary = False + + def __call__(self, f): + return f.client_conn.address and self.re.search(repr(f.client_conn.address)) + + +class FDst(_Rex): + code = "dst" + help = "Match destination address" + is_binary = False + + def __call__(self, f): + return f.server_conn.address and self.re.search(repr(f.server_conn.address)) + + +class _Int(_Action): + + def __init__(self, num): + self.num = int(num) + + +class FCode(_Int): + code = "c" + help = "HTTP response code" + + @only(HTTPFlow) + def __call__(self, f): + if f.response and f.response.status_code == self.num: + return True + + +class FAnd(_Token): + + def __init__(self, lst): + self.lst = lst + + def dump(self, indent=0, fp=sys.stdout): + super(FAnd, self).dump(indent, fp) + for i in self.lst: + i.dump(indent + 1, fp) + + def __call__(self, f): + return all(i(f) for i in self.lst) + + +class FOr(_Token): + + def __init__(self, lst): + self.lst = lst + + def dump(self, indent=0, fp=sys.stdout): + super(FOr, self).dump(indent, fp) + for i in self.lst: + i.dump(indent + 1, fp) + + def __call__(self, f): + return any(i(f) for i in self.lst) + + +class FNot(_Token): + + def __init__(self, itm): + self.itm = itm[0] + + def dump(self, indent=0, fp=sys.stdout): + super(FNot, self).dump(indent, fp) + self.itm.dump(indent + 1, fp) + + def __call__(self, f): + return not self.itm(f) + + +filter_unary = [ + FAsset, + FErr, + FHTTP, + FMarked, + FReq, + FResp, + FTCP, +] +filter_rex = [ + FBod, + FBodRequest, + FBodResponse, + FContentType, + FContentTypeRequest, + FContentTypeResponse, + FDomain, + FDst, + FHead, + FHeadRequest, + FHeadResponse, + FMethod, + FSrc, + FUrl, +] +filter_int = [ + FCode +] + + +def _make(): + # Order is important - multi-char expressions need to come before narrow + # ones. + parts = [] + for klass in filter_unary: + f = pp.Literal("~%s" % klass.code) + pp.WordEnd() + f.setParseAction(klass.make) + parts.append(f) + + simplerex = "".join(c for c in pp.printables if c not in "()~'\"") + rex = pp.Word(simplerex) |\ + pp.QuotedString("\"", escChar='\\') |\ + pp.QuotedString("'", escChar='\\') + for klass in filter_rex: + f = pp.Literal("~%s" % klass.code) + pp.WordEnd() + rex.copy() + f.setParseAction(klass.make) + parts.append(f) + + for klass in filter_int: + f = pp.Literal("~%s" % klass.code) + pp.WordEnd() + pp.Word(pp.nums) + f.setParseAction(klass.make) + parts.append(f) + + # A naked rex is a URL rex: + f = rex.copy() + f.setParseAction(FUrl.make) + parts.append(f) + + atom = pp.MatchFirst(parts) + expr = pp.operatorPrecedence(atom, + [(pp.Literal("!").suppress(), + 1, + pp.opAssoc.RIGHT, + lambda x: FNot(*x)), + (pp.Literal("&").suppress(), + 2, + pp.opAssoc.LEFT, + lambda x: FAnd(*x)), + (pp.Literal("|").suppress(), + 2, + pp.opAssoc.LEFT, + lambda x: FOr(*x)), + ]) + expr = pp.OneOrMore(expr) + return expr.setParseAction(lambda x: FAnd(x) if len(x) != 1 else x) +bnf = _make() + + +TFilter = Callable[[Flow], bool] + + +def parse(s): + # type: (str) -> TFilter + try: + flt = bnf.parseString(s, parseAll=True)[0] + flt.pattern = s + return flt + except pp.ParseException: + return None + except ValueError: + return None + + +def match(self, flow, flt): + """ + Match a flow against a compiled filter expression. + Returns True if matched, False if not. + + If flt is a string, it will be compiled as a filter expression. + If the expression is invalid, ValueError is raised. + """ + if isinstance(flt, six.string_types): + + flt = parse(flt) + if not flt: + raise ValueError("Invalid filter expression.") + if flt: + return flt(flow) + return True + + +help = [] +for i in filter_unary: + help.append( + ("~%s" % i.code, i.help) + ) +for i in filter_rex: + help.append( + ("~%s regex" % i.code, i.help) + ) +for i in filter_int: + help.append( + ("~%s int" % i.code, i.help) + ) +help.sort() +help.extend( + [ + ("!", "unary not"), + ("&", "and"), + ("|", "or"), + ("(...)", "grouping"), + ] +) |