aboutsummaryrefslogtreecommitdiffstats
path: root/examples/har_extractor.py
blob: cc2cf5d7c6db26578cb93e3d89a6dca207244f48 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
"""
    This inline script utilizes harparser.HAR from https://github.com/JustusW/harparser
    to generate a HAR log object.
"""
from harparser import HAR
from datetime import datetime, timedelta, tzinfo


class UTC(tzinfo):
    """Concrete tzinfo for UTC: zero offset, no DST, named "Z"."""

    _ZERO = timedelta(0)

    def utcoffset(self, dt):
        # UTC is, by definition, at zero offset from UTC.
        return self._ZERO

    def dst(self, dt):
        # UTC never observes daylight saving time.
        return self._ZERO

    def tzname(self, dt):
        # "Z" is the ISO 8601 designator for UTC.
        return "Z"


class _HARLog(HAR.log):
    """HAR.log subclass pre-filled with this extractor's creator metadata,
    with helpers to reset the log and to append pages/entries objects."""

    def __init__(self):
        skeleton = {"version": "1.2",
                    "creator": {"name": "MITMPROXY HARExtractor",
                                "version": "0.1",
                                "comment": ""},
                    "pages": [],
                    "entries": []}
        HAR.log.__init__(self, skeleton)

    def reset(self):
        # Re-running the constructor drops every collected page and entry.
        self.__init__()

    def add(self, obj):
        # Route the object into the matching top-level HAR list; objects of
        # any other type are silently ignored, as in the original dispatch.
        for har_type, key in ((HAR.pages, 'pages'), (HAR.entries, 'entries')):
            if isinstance(obj, har_type):
                self[key].append(obj)


def start(context, argv):
    """
        Event hook run when the script is loaded. Resets the shared module-level
        HARLog (not strictly necessary, since it was instantiated during initial
        parsing of this file) and initializes the per-run bookkeeping sets on the
        context. Adapt this to suit your actual needs of HAR generation.
    """
    HARLog.reset()
    # Fresh sets so connect/ssl setup times are only recorded once per connection.
    context.seen_server_connect, context.seen_server_ssl = set(), set()


def response(context, flow):
    """
       Called when a server response has been received. At the time of this message both
       a request and a response are present and completely done.

       Builds a HAR.entries object (timings, request, response) for the flow and
       appends it to the module-level HARLog, creating a HAR.pages object first
       when the request looks like a page navigation (listed in HARPAGE_LIST or
       has no Referer header).
    """
    connect_time = -1
    if flow.server_conn not in context.seen_server_connect:
        # Calculate the connect_time for this server_conn. Afterwards add it to seen list, in
        # order to avoid the connect_time being present in entries that use an existing connection.
        connect_time = flow.server_conn.timestamp_tcp_setup - flow.server_conn.timestamp_start
        context.seen_server_connect.add(flow.server_conn)

    ssl_time = -1
    # BUG FIX: the original guard re-checked seen_server_connect, but the
    # connection was just added to that set above, so the branch never ran and
    # ssl_time was always -1. The dedicated seen_server_ssl set is the correct
    # guard here.
    if flow.server_conn not in context.seen_server_ssl \
            and flow.server_conn.timestamp_ssl_setup is not None:
        # Get the ssl_time for this server_conn as the difference between the start of the successful
        # tcp setup and the successful ssl setup. Afterwards add it to seen list, in order to avoid
        # the ssl_time being present in entries that use an existing connection. If  no ssl setup has
        # been made it is also left as -1 since it doesn't apply to this connection.
        ssl_time = flow.server_conn.timestamp_ssl_setup - flow.server_conn.timestamp_tcp_setup
        context.seen_server_ssl.add(flow.server_conn)

    # Calculate the raw timings from the different timestamps present in the request and response object.
    # For lack of a way to measure it dns timings can not be calculated. The same goes for HAR blocked:
    # MITMProxy will open a server connection as soon as it receives the host and port from the client
    # connection. So the time spent waiting is actually spent waiting between request.timestamp_end and
    # response.timestamp_start thus it correlates to HAR wait instead.
    timings_raw = {'send': flow.request.timestamp_end - flow.request.timestamp_start,
                   'wait': flow.response.timestamp_start - flow.request.timestamp_end,
                   'receive': flow.response.timestamp_end - flow.response.timestamp_start,
                   'connect': connect_time,
                   'ssl': ssl_time}

    # HAR timings are integers in ms, so we have to re-encode the raw timings to that format.
    timings = dict((key, int(1000 * value)) for key, value in timings_raw.iteritems())

    # The full_time is the sum of all timings. Timings set to -1 will be ignored as per spec.
    full_time = sum(v for v in timings.values() if v > -1)

    started_date_time = datetime.fromtimestamp(flow.request.timestamp_start, tz=UTC()).isoformat()

    request_query_string = [{"name": k, "value": v} for k, v in flow.request.get_query()]
    request_http_version = ".".join([str(v) for v in flow.request.httpversion])
    # Cookies are shaped as tuples by MITMProxy.
    request_cookies = [{"name": k.strip(), "value": v[0]} for k, v in (flow.request.get_cookies() or {}).iteritems()]
    request_headers = [{"name": k, "value": v} for k, v in flow.request.headers]
    request_headers_size = len(str(flow.request.headers))
    request_body_size = len(flow.request.content)

    response_http_version = ".".join([str(v) for v in flow.response.httpversion])
    # Cookies are shaped as tuples by MITMProxy.
    response_cookies = [{"name": k.strip(), "value": v[0]} for k, v in (flow.response.get_cookies() or {}).iteritems()]
    response_headers = [{"name": k, "value": v} for k, v in flow.response.headers]
    response_headers_size = len(str(flow.response.headers))
    response_body_size = len(flow.response.content)
    # NOTE(review): decoded size uses the raw content too, so "compression" is
    # always 0 here. Presumably it should measure the content after
    # decompressing Content-Encoding -- confirm against the mitmproxy API.
    response_body_decoded_size = len(flow.response.content)
    response_body_compression = response_body_decoded_size - response_body_size
    response_mime_type = flow.response.headers.get('Content-Type', [''])[0]
    response_redirect_url = flow.response.headers.get('Location', [''])[0]

    entry = HAR.entries({"startedDateTime": started_date_time,
                         "time": full_time,
                         "request": {"method": flow.request.method,
                                     "url": flow.request.url,
                                     "httpVersion": request_http_version,
                                     "cookies": request_cookies,
                                     "headers": request_headers,
                                     "queryString": request_query_string,
                                     "headersSize": request_headers_size,
                                     "bodySize": request_body_size, },
                         "response": {"status": flow.response.code,
                                      "statusText": flow.response.msg,
                                      "httpVersion": response_http_version,
                                      "cookies": response_cookies,
                                      "headers": response_headers,
                                      "content": {"size": response_body_size,
                                                  "compression": response_body_compression,
                                                  "mimeType": response_mime_type},
                                      "redirectURL": response_redirect_url,
                                      "headersSize": response_headers_size,
                                      "bodySize": response_body_size, },
                         "cache": {},
                         "timings": timings, })

    # If the current url is in HARPAGE_LIST or does not have a referer we add it as a new pages object.
    if flow.request.url in HARPAGE_LIST or flow.request.headers.get('Referer', None) is None:
        PAGE_COUNT[1] += 1
        page_id = "_".join([str(v) for v in PAGE_COUNT])
        HARLog.add(HAR.pages({"startedDateTime": entry['startedDateTime'],
                              "id": page_id,
                              "title": flow.request.url, }))
        PAGE_REF[flow.request.url] = page_id
        entry['pageref'] = page_id

    # Lookup the referer in our PAGE_REF dict to point this entries pageref attribute to the right pages object.
    elif flow.request.headers.get('Referer', (None, ))[0] in PAGE_REF.keys():
        entry['pageref'] = PAGE_REF[flow.request.headers['Referer'][0]]
        PAGE_REF[flow.request.url] = entry['pageref']

    HARLog.add(entry)


def done(context):
    """
        Called once on script shutdown, after any other events.
    """
    from pprint import pprint
    import json

    json_dump = HARLog.json()
    compressed_json_dump = HARLog.compress()

    print "=" * 100
    pprint(json.loads(json_dump))
    print "=" * 100
    print "HAR log finished with %s bytes (%s bytes compressed)" % (len(json_dump), len(compressed_json_dump))
    print "Compression rate is %s%%" % str(100. * len(compressed_json_dump) / len(json_dump))
    print "=" * 100


def print_attributes(obj, filter_string=None, hide_privates=False):
    """
        Useful helper method to quickly get all attributes of an object and its values.
    """
    for attr in dir(obj):
        if hide_privates and "__" in attr:
            continue
        if filter_string is not None and filter_string not in attr:
            continue
        value = getattr(obj, attr)
        print "%s.%s" % ('obj', attr), value, type(value)


# Some initializations. Add any page you want to have its own pages object to HARPAGE_LIST
HARPAGE_LIST = ['https://github.com/']  # URLs that always start a new HAR pages object
HARLog = _HARLog()  # module-level HAR log shared by all event hooks

CONNECT_TIMES = {}  # not referenced in this file; presumably for external use -- TODO confirm
SSL_TIMES = {}  # not referenced in this file; presumably for external use -- TODO confirm
PAGE_REF = {}  # maps page URL -> page id, used to set entry['pageref'] in response()
PAGE_COUNT = ['autopage', 0]  # (prefix, counter) joined with "_" to build unique page ids