diff options
| author | Maximilian Hils <git@maximilianhils.com> | 2015-09-02 01:16:48 +0200 | 
|---|---|---|
| committer | Maximilian Hils <git@maximilianhils.com> | 2015-09-02 01:16:48 +0200 | 
| commit | c14fbc7794eee2a60d3c90f818ec481cf9db544b (patch) | |
| tree | 529949dc40052291460b485142330932cf51819a /libmproxy/models/http.py | |
| parent | e8de7595c2e8a98418593e90b886e45a745e234a (diff) | |
| parent | f1c8b47b1eb153d448061c0ddce21030c31af2b7 (diff) | |
| download | mitmproxy-c14fbc7794eee2a60d3c90f818ec481cf9db544b.tar.gz mitmproxy-c14fbc7794eee2a60d3c90f818ec481cf9db544b.tar.bz2 mitmproxy-c14fbc7794eee2a60d3c90f818ec481cf9db544b.zip | |
Merge pull request #741 from mitmproxy/proxy-refactor-cb
Proxy Refactor
Diffstat (limited to 'libmproxy/models/http.py')
| -rw-r--r-- | libmproxy/models/http.py | 554 | 
1 files changed, 554 insertions, 0 deletions
| diff --git a/libmproxy/models/http.py b/libmproxy/models/http.py new file mode 100644 index 00000000..fb2f305b --- /dev/null +++ b/libmproxy/models/http.py @@ -0,0 +1,554 @@ +from __future__ import (absolute_import, print_function, division) +import Cookie +import copy +from email.utils import parsedate_tz, formatdate, mktime_tz +import time + +from libmproxy import utils +from netlib import odict, encoding +from netlib.http import status_codes +from netlib.tcp import Address +from netlib.http.semantics import Request, Response, CONTENT_MISSING +from .. import version, stateobject +from .flow import Flow + + +class MessageMixin(stateobject.StateObject): +    _stateobject_attributes = dict( +        httpversion=tuple, +        headers=odict.ODictCaseless, +        body=str, +        timestamp_start=float, +        timestamp_end=float +    ) +    _stateobject_long_attributes = {"body"} + +    def get_state(self, short=False): +        ret = super(MessageMixin, self).get_state(short) +        if short: +            if self.body: +                ret["contentLength"] = len(self.body) +            elif self.body == CONTENT_MISSING: +                ret["contentLength"] = None +            else: +                ret["contentLength"] = 0 +        return ret + +    def get_decoded_content(self): +        """ +            Returns the decoded content based on the current Content-Encoding +            header. +            Doesn't change the message iteself or its headers. +        """ +        ce = self.headers.get_first("content-encoding") +        if not self.body or ce not in encoding.ENCODINGS: +            return self.body +        return encoding.decode(ce, self.body) + +    def decode(self): +        """ +            Decodes body based on the current Content-Encoding header, then +            removes the header. If there is no Content-Encoding header, no +            action is taken. + +            Returns True if decoding succeeded, False otherwise. +        """ +        ce = self.headers.get_first("content-encoding") +        if not self.body or ce not in encoding.ENCODINGS: +            return False +        data = encoding.decode(ce, self.body) +        if data is None: +            return False +        self.body = data +        del self.headers["content-encoding"] +        return True + +    def encode(self, e): +        """ +            Encodes body with the encoding e, where e is "gzip", "deflate" +            or "identity". +        """ +        # FIXME: Error if there's an existing encoding header? +        self.body = encoding.encode(e, self.body) +        self.headers["content-encoding"] = [e] + +    def copy(self): +        c = copy.copy(self) +        c.headers = self.headers.copy() +        return c + +    def replace(self, pattern, repl, *args, **kwargs): +        """ +            Replaces a regular expression pattern with repl in both the headers +            and the body of the message. Encoded body will be decoded +            before replacement, and re-encoded afterwards. + +            Returns the number of replacements made. +        """ +        with decoded(self): +            self.body, c = utils.safe_subn( +                pattern, repl, self.body, *args, **kwargs +            ) +        c += self.headers.replace(pattern, repl, *args, **kwargs) +        return c + + +class HTTPRequest(MessageMixin, Request): +    """ +    An HTTP request. + +    Exposes the following attributes: + +        method: HTTP method + +        scheme: URL scheme (http/https) + +        host: Target hostname of the request. This is not neccessarily the +        directy upstream server (which could be another proxy), but it's always +        the target server we want to reach at the end. This attribute is either +        inferred from the request itself (absolute-form, authority-form) or from +        the connection metadata (e.g. the host in reverse proxy mode). + +        port: Destination port + +        path: Path portion of the URL (not present in authority-form) + +        httpversion: HTTP version tuple, e.g. (1,1) + +        headers: odict.ODictCaseless object + +        content: Content of the request, None, or CONTENT_MISSING if there +        is content associated, but not present. CONTENT_MISSING evaluates +        to False to make checking for the presence of content natural. + +        form_in: The request form which mitmproxy has received. The following +        values are possible: + +             - relative (GET /index.html, OPTIONS *) (covers origin form and +               asterisk form) +             - absolute (GET http://example.com:80/index.html) +             - authority-form (CONNECT example.com:443) +             Details: http://tools.ietf.org/html/draft-ietf-httpbis-p1-messaging-25#section-5.3 + +        form_out: The request form which mitmproxy will send out to the +        destination + +        timestamp_start: Timestamp indicating when request transmission started + +        timestamp_end: Timestamp indicating when request transmission ended +    """ + +    def __init__( +            self, +            form_in, +            method, +            scheme, +            host, +            port, +            path, +            httpversion, +            headers, +            body, +            timestamp_start=None, +            timestamp_end=None, +            form_out=None, +    ): +        Request.__init__( +            self, +            form_in, +            method, +            scheme, +            host, +            port, +            path, +            httpversion, +            headers, +            body, +            timestamp_start, +            timestamp_end, +        ) +        self.form_out = form_out or form_in + +        # Have this request's cookies been modified by sticky cookies or auth? +        self.stickycookie = False +        self.stickyauth = False + +        # Is this request replayed? +        self.is_replay = False + +    _stateobject_attributes = MessageMixin._stateobject_attributes.copy() +    _stateobject_attributes.update( +        form_in=str, +        method=str, +        scheme=str, +        host=str, +        port=int, +        path=str, +        form_out=str, +        is_replay=bool +    ) + +    @classmethod +    def from_state(cls, state): +        f = cls( +            None, +            None, +            None, +            None, +            None, +            None, +            None, +            None, +            None, +            None, +            None) +        f.load_state(state) +        return f + +    @classmethod +    def from_protocol( +            self, +            protocol, +            *args, +            **kwargs +    ): +        req = protocol.read_request(*args, **kwargs) +        return self.wrap(req) + +    @classmethod +    def wrap(self, request): +        req = HTTPRequest( +            form_in=request.form_in, +            method=request.method, +            scheme=request.scheme, +            host=request.host, +            port=request.port, +            path=request.path, +            httpversion=request.httpversion, +            headers=request.headers, +            body=request.body, +            timestamp_start=request.timestamp_start, +            timestamp_end=request.timestamp_end, +            form_out=(request.form_out if hasattr(request, 'form_out') else None), +        ) +        if hasattr(request, 'stream_id'): +            req.stream_id = request.stream_id +        return req + +    def __hash__(self): +        return id(self) + +    def replace(self, pattern, repl, *args, **kwargs): +        """ +            Replaces a regular expression pattern with repl in the headers, the +            request path and the body of the request. Encoded content will be +            decoded before replacement, and re-encoded afterwards. + +            Returns the number of replacements made. +        """ +        c = MessageMixin.replace(self, pattern, repl, *args, **kwargs) +        self.path, pc = utils.safe_subn( +            pattern, repl, self.path, *args, **kwargs +        ) +        c += pc +        return c + + +class HTTPResponse(MessageMixin, Response): +    """ +    An HTTP response. + +    Exposes the following attributes: + +        httpversion: HTTP version tuple, e.g. (1, 0), (1, 1), or (2, 0) + +        status_code: HTTP response status code + +        msg: HTTP response message + +        headers: ODict Caseless object + +        content: Content of the request, None, or CONTENT_MISSING if there +        is content associated, but not present. CONTENT_MISSING evaluates +        to False to make checking for the presence of content natural. + +        timestamp_start: Timestamp indicating when request transmission started + +        timestamp_end: Timestamp indicating when request transmission ended +    """ + +    def __init__( +            self, +            httpversion, +            status_code, +            msg, +            headers, +            body, +            timestamp_start=None, +            timestamp_end=None, +    ): +        Response.__init__( +            self, +            httpversion, +            status_code, +            msg, +            headers, +            body, +            timestamp_start=timestamp_start, +            timestamp_end=timestamp_end, +        ) + +        # Is this request replayed? +        self.is_replay = False +        self.stream = False + +    _stateobject_attributes = MessageMixin._stateobject_attributes.copy() +    _stateobject_attributes.update( +        status_code=int, +        msg=str +    ) + +    @classmethod +    def from_state(cls, state): +        f = cls(None, None, None, None, None) +        f.load_state(state) +        return f + +    @classmethod +    def from_protocol( +            self, +            protocol, +            *args, +            **kwargs +    ): +        resp = protocol.read_response(*args, **kwargs) +        return self.wrap(resp) + +    @classmethod +    def wrap(self, response): +        resp = HTTPResponse( +            httpversion=response.httpversion, +            status_code=response.status_code, +            msg=response.msg, +            headers=response.headers, +            body=response.body, +            timestamp_start=response.timestamp_start, +            timestamp_end=response.timestamp_end, +        ) +        if hasattr(response, 'stream_id'): +            resp.stream_id = response.stream_id +        return resp + +    def _refresh_cookie(self, c, delta): +        """ +            Takes a cookie string c and a time delta in seconds, and returns +            a refreshed cookie string. +        """ +        c = Cookie.SimpleCookie(str(c)) +        for i in c.values(): +            if "expires" in i: +                d = parsedate_tz(i["expires"]) +                if d: +                    d = mktime_tz(d) + delta +                    i["expires"] = formatdate(d) +                else: +                    # This can happen when the expires tag is invalid. +                    # reddit.com sends a an expires tag like this: "Thu, 31 Dec +                    # 2037 23:59:59 GMT", which is valid RFC 1123, but not +                    # strictly correct according to the cookie spec. Browsers +                    # appear to parse this tolerantly - maybe we should too. +                    # For now, we just ignore this. +                    del i["expires"] +        return c.output(header="").strip() + +    def refresh(self, now=None): +        """ +            This fairly complex and heuristic function refreshes a server +            response for replay. + +                - It adjusts date, expires and last-modified headers. +                - It adjusts cookie expiration. +        """ +        if not now: +            now = time.time() +        delta = now - self.timestamp_start +        refresh_headers = [ +            "date", +            "expires", +            "last-modified", +        ] +        for i in refresh_headers: +            if i in self.headers: +                d = parsedate_tz(self.headers[i][0]) +                if d: +                    new = mktime_tz(d) + delta +                    self.headers[i] = [formatdate(new)] +        c = [] +        for i in self.headers["set-cookie"]: +            c.append(self._refresh_cookie(i, delta)) +        if c: +            self.headers["set-cookie"] = c + + +class HTTPFlow(Flow): +    """ +    A HTTPFlow is a collection of objects representing a single HTTP +    transaction. The main attributes are: + +        request: HTTPRequest object +        response: HTTPResponse object +        error: Error object +        server_conn: ServerConnection object +        client_conn: ClientConnection object + +    Note that it's possible for a Flow to have both a response and an error +    object. This might happen, for instance, when a response was received +    from the server, but there was an error sending it back to the client. + +    The following additional attributes are exposed: + +        intercepted: Is this flow currently being intercepted? +        live: Does this flow have a live client connection? +    """ + +    def __init__(self, client_conn, server_conn, live=None): +        super(HTTPFlow, self).__init__("http", client_conn, server_conn, live) +        self.request = None +        """@type: HTTPRequest""" +        self.response = None +        """@type: HTTPResponse""" + +    _stateobject_attributes = Flow._stateobject_attributes.copy() +    _stateobject_attributes.update( +        request=HTTPRequest, +        response=HTTPResponse +    ) + +    @classmethod +    def from_state(cls, state): +        f = cls(None, None) +        f.load_state(state) +        return f + +    def __repr__(self): +        s = "<HTTPFlow" +        for a in ("request", "response", "error", "client_conn", "server_conn"): +            if getattr(self, a, False): +                s += "\r\n  %s = {flow.%s}" % (a, a) +        s += ">" +        return s.format(flow=self) + +    def copy(self): +        f = super(HTTPFlow, self).copy() +        if self.request: +            f.request = self.request.copy() +        if self.response: +            f.response = self.response.copy() +        return f + +    def match(self, f): +        """ +            Match this flow against a compiled filter expression. Returns True +            if matched, False if not. + +            If f is a string, it will be compiled as a filter expression. If +            the expression is invalid, ValueError is raised. +        """ +        if isinstance(f, basestring): +            from .. import filt + +            f = filt.parse(f) +            if not f: +                raise ValueError("Invalid filter expression.") +        if f: +            return f(self) +        return True + +    def replace(self, pattern, repl, *args, **kwargs): +        """ +            Replaces a regular expression pattern with repl in both request and +            response of the flow. Encoded content will be decoded before +            replacement, and re-encoded afterwards. + +            Returns the number of replacements made. +        """ +        c = self.request.replace(pattern, repl, *args, **kwargs) +        if self.response: +            c += self.response.replace(pattern, repl, *args, **kwargs) +        return c + + +class decoded(object): +    """ +    A context manager that decodes a request or response, and then +    re-encodes it with the same encoding after execution of the block. + +    Example: +    with decoded(request): +        request.content = request.content.replace("foo", "bar") +    """ + +    def __init__(self, o): +        self.o = o +        ce = o.headers.get_first("content-encoding") +        if ce in encoding.ENCODINGS: +            self.ce = ce +        else: +            self.ce = None + +    def __enter__(self): +        if self.ce: +            self.o.decode() + +    def __exit__(self, type, value, tb): +        if self.ce: +            self.o.encode(self.ce) + + +def make_error_response(status_code, message, headers=None): +    response = status_codes.RESPONSES.get(status_code, "Unknown") +    body = """ +        <html> +            <head> +                <title>%d %s</title> +            </head> +            <body>%s</body> +        </html> +    """.strip() % (status_code, response, message) + +    if not headers: +        headers = odict.ODictCaseless() +    headers["Server"] = [version.NAMEVERSION] +    headers["Connection"] = ["close"] +    headers["Content-Length"] = [len(body)] +    headers["Content-Type"] = ["text/html"] + +    return HTTPResponse( +        (1, 1),  # FIXME: Should be a string. +        status_code, +        response, +        headers, +        body, +    ) + + +def make_connect_request(address): +    address = Address.wrap(address) +    return HTTPRequest( +        "authority", "CONNECT", None, address.host, address.port, None, (1, 1), +        odict.ODictCaseless(), "" +    ) + + +def make_connect_response(httpversion): +    headers = odict.ODictCaseless([ +        ["Content-Length", "0"], +        ["Proxy-Agent", version.NAMEVERSION] +    ]) +    return HTTPResponse( +        httpversion, +        200, +        "Connection established", +        headers, +        "", +    ) | 
