aboutsummaryrefslogtreecommitdiffstats
path: root/examples/har_extractor.py
blob: 504e98df18905a1e5e2a28200af9a68787ea5e17 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
"""
    This inline script utilizes harparser.HAR from https://github.com/JustusW/harparser
    to generate a HAR log object.
"""
from harparser import HAR
from datetime import datetime, timedelta, tzinfo


class UTC(tzinfo):
    """
        Fixed-offset tzinfo for UTC: zero offset, no DST, and the name "Z".
    """
    # Shared zero offset; timedelta objects are immutable, so reuse is safe.
    _NO_OFFSET = timedelta(0)

    def utcoffset(self, dt):
        """Return the offset from UTC, which is always zero."""
        return self._NO_OFFSET

    def dst(self, dt):
        """Return the DST adjustment, which is always zero."""
        return self._NO_OFFSET

    def tzname(self, dt):
        """Return the zone name "Z"."""
        return "Z"


class _HARLog(HAR.log):
    """
        HAR.log subclass that starts out as an empty HAR 1.2 log carrying this
        script's creator metadata.
    """

    def __init__(self):
        # Empty log skeleton: no pages and no entries yet.
        skeleton = {
            "version": "1.2",
            "creator": {
                "name": "MITMPROXY HARExtractor",
                "version": "0.1",
                "comment": "",
            },
            "pages": [],
            "entries": [],
        }
        HAR.log.__init__(self, skeleton)

    def reset(self):
        """Re-run the initialiser so the log is set up from the empty skeleton again."""
        self.__init__()

    def add(self, obj):
        """
            Append obj to the 'pages' list if it is a HAR.pages instance and to
            the 'entries' list if it is a HAR.entries instance. Objects of any
            other type are ignored.
        """
        for key, har_type in (('pages', HAR.pages), ('entries', HAR.entries)):
            if isinstance(obj, har_type):
                self[key].append(obj)


def start(context, argv):
    """
        Script start hook: resets the module-level HARLog so it begins empty.
        Neither context nor argv is used.
    """
    HARLog.reset()


def clientconnect(context, conn_handler):
    """
        Called when a client initiates a connection to the proxy. Note that a
        connection can correspond to multiple HTTP requests.

        Logs the event name followed by the current time.time() value through
        context.log. conn_handler is not used.
    """
    import time
    # Separate the event name from the timestamp with a space, the same
    # "<event> <timestamp>" layout the serverconnect, request and response
    # hooks in this script log.
    context.log("clientconnect " + str(time.time()))


def serverconnect(context, conn_handler):
    """
        Called when the proxy initiates a connection to the target server. Note that a
        connection can correspond to multiple HTTP requests

        Drops any cached connect/SSL timing marker stored for the server
        address, so the next response() on that address computes the timings
        afresh, then logs the event with the current time.
    """
    import time

    server_address = conn_handler.server_conn.address.address
    # Missing keys are fine: pop() with a default never raises.
    for timing_cache in (CONNECT_TIMES, SSL_TIMES):
        timing_cache.pop(server_address, None)

    context.log("serverconnect " + str(time.time()))


def request(context, flow):
    """
        Called when a client request has been received.

        Logs "request", the current time.time() value and the request's
        timestamp_start, separated by single spaces.
    """
    import time

    now = str(time.time())
    started = str(flow.request.timestamp_start)
    context.log(" ".join(("request", now, started)))


def responseheaders(context, flow):
    """
        Called once the headers of a server response have arrived but before
        the response body has been processed. Can be used to tell mitmproxy
        to stream the response.

        This script only logs the event name; flow is not inspected.
    """
    event_name = "responseheaders"
    context.log(event_name)


def response(context, flow):
    """
       Called when a server response has been received.

       Builds a HAR entry from the finished flow (request, response and
       timings in milliseconds), assigns it to a page where possible and adds
       it to the module-level HARLog.

       Page assignment:
         * a request whose URL is listed in HARPAGE_LIST, or that carries no
           Referer header, starts a new page and the entry belongs to it;
         * otherwise, if the Referer is a URL already registered in PAGE_REF,
           the entry joins that page and its own URL is registered under the
           same page, so requests referred by it resolve to that page too;
         * otherwise the entry is added without a pageref.
    """
    import time
    context.log("response " + str(time.time()) + " " + str(flow.request.timestamp_start))
    context.log("response " + str(time.time()) + " " + str(flow.response.timestamp_end))

    server_conn = flow.server_conn
    address = server_conn.address.address

    # Only the first response seen for a server address reports the TCP
    # connect time. The -1 stored afterwards marks "not applicable" for later
    # responses, until serverconnect() removes the cache entry again.
    connect_time = CONNECT_TIMES.get(address,
                                     int((server_conn.timestamp_tcp_setup
                                          - server_conn.timestamp_start)
                                         * 1000))
    CONNECT_TIMES[address] = -1

    # Same scheme for the SSL handshake, which only exists on SSL connections.
    ssl_time = -1
    if server_conn.timestamp_ssl_setup is not None:
        ssl_time = SSL_TIMES.get(address,
                                 int((server_conn.timestamp_ssl_setup
                                      - server_conn.timestamp_tcp_setup)
                                     * 1000))
        SSL_TIMES[address] = -1

    timings = {'send': int((flow.request.timestamp_end - flow.request.timestamp_start) * 1000),
               'wait': int((flow.response.timestamp_start - flow.request.timestamp_end) * 1000),
               'receive': int((flow.response.timestamp_end - flow.response.timestamp_start) * 1000),
               'connect': connect_time,
               'ssl': ssl_time}

    # Total time is the sum of all phases that apply; -1 means "not applicable".
    full_time = sum(value for value in timings.values() if value > -1)

    entry = HAR.entries({"startedDateTime": datetime.fromtimestamp(flow.request.timestamp_start, tz=UTC()).isoformat(),
                         "time": full_time,
                         "request": {"method": flow.request.method,
                                     "url": flow.request.url,
                                     "httpVersion": ".".join([str(v) for v in flow.request.httpversion]),
                                     "cookies": [{"name": k.strip(), "value": v[0]}
                                                 for k, v in (flow.request.get_cookies() or {}).iteritems()],
                                     "headers": [{"name": k, "value": v}
                                                 for k, v in flow.request.headers],
                                     "queryString": [{"name": k, "value": v}
                                                     for k, v in flow.request.get_query()],
                                     "headersSize": len(str(flow.request.headers).split("\r\n\r\n")[0]),
                                     "bodySize": len(flow.request.content), },
                         "response": {"status": flow.response.code,
                                      "statusText": flow.response.msg,
                                      "httpVersion": ".".join([str(v) for v in flow.response.httpversion]),
                                      "cookies": [{"name": k.strip(), "value": v[0]}
                                                  for k, v in (flow.response.get_cookies() or {}).iteritems()],
                                      "headers": [{"name": k, "value": v}
                                                  for k, v in flow.response.headers],
                                      "content": {"size": len(flow.response.content),
                                                  "compression": len(flow.response.get_decoded_content()) - len(
                                                      flow.response.content),
                                                  "mimeType": flow.response.headers.get('Content-Type', ('', ))[0]},
                                      # Header lookups yield a sequence of values (as for
                                      # Content-Type above); take the first one so that
                                      # redirectURL is a plain string.
                                      "redirectURL": flow.response.headers.get('Location', ('', ))[0],
                                      "headersSize": len(str(flow.response.headers).split("\r\n\r\n")[0]),
                                      "bodySize": len(flow.response.content), },
                         "cache": {},
                         "timings": timings, })

    referer_values = flow.request.headers.get('Referer', None)
    if flow.request.url in HARPAGE_LIST or referer_values is None:
        # Start a new page for this request.
        PAGE_COUNT[1] += 1
        page_id = "_".join([str(v) for v in PAGE_COUNT])
        HARLog.add(HAR.pages({"startedDateTime": entry['startedDateTime'],
                              "id": page_id,
                              "title": flow.request.url, }))
        PAGE_REF[flow.request.url] = page_id
        entry['pageref'] = page_id
    elif referer_values[0] in PAGE_REF:
        # Only consult the referer when no new page was started; otherwise the
        # page created above would immediately lose its own entry to the
        # referring page.
        entry['pageref'] = PAGE_REF[referer_values[0]]
        PAGE_REF[flow.request.url] = entry['pageref']

    HARLog.add(entry)


def error(context, flow):
    """
        Called when a flow error has occurred, for example an invalid server
        response or an interrupted connection. This is different from a valid
        server HTTP error response, which is just a response carrying an HTTP
        error code.

        This script takes no action on flow errors.
    """
    # Intentionally empty; add context.log("error") here when debugging.


def clientdisconnect(context, conn_handler):
    """
        Called when a client disconnects from the proxy.

        This script takes no action on disconnects.
    """
    # Intentionally empty. When debugging, print_attributes() can be called
    # here on context._master or conn_handler to inspect them.


def done(context):
    """
        Called once on script shutdown, after any other events.

        Writes the collected HAR log to stdout four times over: pretty-printed
        as a parsed structure, as the raw JSON string, in the form returned by
        HARLog.compress(), and finally the size of the compressed form as a
        percentage of the JSON size.
    """
    from pprint import pprint
    import json

    # Serialise once and reuse the result instead of re-serialising the whole
    # log for every output line.
    har_json = HARLog.json()
    pprint(json.loads(har_json))
    # A single parenthesised argument prints identically under Python 2's
    # print statement and also parses as a function call.
    print(har_json)

    har_compressed = HARLog.compress()
    print(har_compressed)
    print("%s%%" % str(100. * len(har_compressed) / len(har_json)))


def print_attributes(obj, filter=None):
    for attr in dir(obj):
        # if "__" in attr:
        # continue
        if filter is not None and filter not in attr:
            continue
        value = getattr(obj, attr)
        print "%s.%s" % ('obj', attr), value, type(value)


# URLs for which response() always creates a new HAR page.
HARPAGE_LIST = ['https://github.com/']

# Per-server-address timing markers. response() stores -1 after the first
# response on an address; serverconnect() removes the entry again.
CONNECT_TIMES = {}
SSL_TIMES = {}

# Maps a request URL to the id of the HAR page its entry was assigned to.
PAGE_REF = {}
# [prefix, counter]: joined with "_" to build page ids such as "autopage_1".
# Kept as a list because response() increments the counter in place.
PAGE_COUNT = ['autopage', 0]

# The HAR log shared by all hooks in this script.
HARLog = _HARLog()