aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorJustus Wingert <justus@abi007.info>2014-11-15 18:38:59 +0100
committerJustus Wingert <justus@abi007.info>2014-11-15 18:38:59 +0100
commitfd48a70128581c508420901910c285f247c930c7 (patch)
treecef1ccfa5f62c9d5e1d793b495556b58fced2de4
parentacce67e1fd468bf1ac4a536d007ac56d9ab652e3 (diff)
downloadmitmproxy-fd48a70128581c508420901910c285f247c930c7.tar.gz
mitmproxy-fd48a70128581c508420901910c285f247c930c7.tar.bz2
mitmproxy-fd48a70128581c508420901910c285f247c930c7.zip
Updated documentation and cleaned up the code.
-rw-r--r--examples/har_extractor.py196
1 files changed, 91 insertions, 105 deletions
diff --git a/examples/har_extractor.py b/examples/har_extractor.py
index 504e98df..bc67d299 100644
--- a/examples/har_extractor.py
+++ b/examples/har_extractor.py
@@ -37,110 +37,104 @@ class _HARLog(HAR.log):
def start(context, argv):
- HARLog.reset()
-
-
-def clientconnect(context, conn_handler):
- """
- Called when a client initiates a connection to the proxy. Note that a
- connection can correspond to multiple HTTP requests
- """
- import time
- context.log("clientconnect" + str(time.time()))
-
-
-def serverconnect(context, conn_handler):
- """
- Called when the proxy initiates a connection to the target server. Note that a
- connection can correspond to multiple HTTP requests
- """
- CONNECT_TIMES.pop(conn_handler.server_conn.address.address, None)
- SSL_TIMES.pop(conn_handler.server_conn.address.address, None)
- import time
- context.log("serverconnect " + str(time.time()))
-
-
-def request(context, flow):
- """
- Called when a client request has been received.
- """
- # print_attributes(flow)
- # print_attributes(context)
- import time
- context.log("request " + str(time.time()) + " " + str(flow.request.timestamp_start))
-
-
-def responseheaders(context, flow):
"""
- Called when the response headers for a server response have been received,
- but the response body has not been processed yet. Can be used to tell mitmproxy
- to stream the response.
+ On start we reset the HAR, it's not really necessary since it will have been
+ instantiated earlier during initial parsing of this file. You will have to
+ adapt this to suit your actual needs of HAR generation.
"""
- context.log("responseheaders")
+ HARLog.reset()
+ context.seen_server_connect = set()
+ context.seen_server_ssl = set()
def response(context, flow):
"""
- Called when a server response has been received.
+ Called when a server response has been received. At the time of this message both
+ a request and a response are present and completely done.
"""
- import time
- context.log("response " + str(time.time()) + " " + str(flow.request.timestamp_start))
- context.log("response " + str(time.time()) + " " + str(flow.response.timestamp_end))
- connect_time = CONNECT_TIMES.get(flow.server_conn.address.address,
- int((flow.server_conn.timestamp_tcp_setup
- - flow.server_conn.timestamp_start)
- * 1000))
- CONNECT_TIMES[flow.server_conn.address.address] = -1
+ connect_time = -1
+ if flow.server_conn not in context.seen_server_connect:
+ # Calculate the connect_time for this server_conn. Afterwards add it to seen list, in
+ # order to avoid the connect_time being present in entries that use an existing connection.
+ connect_time = flow.server_conn.timestamp_tcp_setup - flow.server_conn.timestamp_start
+ context.seen_server_connect.add(flow.server_conn)
ssl_time = -1
- if flow.server_conn.timestamp_ssl_setup is not None:
- ssl_time = SSL_TIMES.get(flow.server_conn.address.address,
- int((flow.server_conn.timestamp_ssl_setup
- - flow.server_conn.timestamp_tcp_setup)
- * 1000))
- SSL_TIMES[flow.server_conn.address.address] = -1
-
- timings = {'send': int((flow.request.timestamp_end - flow.request.timestamp_start) * 1000),
- 'wait': int((flow.response.timestamp_start - flow.request.timestamp_end) * 1000),
- 'receive': int((flow.response.timestamp_end - flow.response.timestamp_start) * 1000),
- 'connect': connect_time,
- 'ssl': ssl_time}
-
+ if flow.server_conn not in context.seen_server_connect \
+ and flow.server_conn.timestamp_ssl_setup is not None:
+ # Get the ssl_time for this server_conn as the difference between the start of the successful
+ # tcp setup and the successful ssl setup. Afterwards add it to seen list, in order to avoid
+ # the ssl_time being present in entries that use an existing connection. If no ssl setup has
+ # been made initiate it is also left as -1 since it doesn't apply to this connection.
+ ssl_time = flow.server_conn.timestamp_ssl_setup - flow.server_conn.timestamp_tcp_setup
+ context.seen_server_ssl.add(flow.server_conn)
+
+ # Calculate the raw timings from the different timestamps present in the request and response object.
+ # For lack of a way to measure it dns timings can not be calculated. The same goes for HAR blocked:
+ # MITMProxy will open a server connection as soon as it receives the host and port from the client
+ # connection. So the time spent waiting is actually spent waiting between request.timestamp_end and
+ # response.timestamp_start thus it correlates to HAR wait instead.
+ timings_raw = {'send': flow.request.timestamp_end - flow.request.timestamp_start,
+ 'wait': flow.response.timestamp_start - flow.request.timestamp_end,
+ 'receive': flow.response.timestamp_end - flow.response.timestamp_start,
+ 'connect': connect_time,
+ 'ssl': ssl_time}
+
+ # HAR timings are integers in ms, so we have to re-encode the raw timings to that format.
+ timings = dict([(key, int(1000 * value)) for key, value in timings_raw.iteritems()])
+
+ # The full_time is the sum of all timings. Timings set to -1 will be ignored as per spec.
full_time = 0
for item in timings.values():
if item > -1:
full_time += item
- entry = HAR.entries({"startedDateTime": datetime.fromtimestamp(flow.request.timestamp_start, tz=UTC()).isoformat(),
+ started_date_time = datetime.fromtimestamp(flow.request.timestamp_start, tz=UTC()).isoformat()
+
+ request_query_string = [{"name": k, "value": v} for k, v in flow.request.get_query()]
+ request_http_version = ".".join([str(v) for v in flow.request.httpversion])
+ # Cookies are shaped as tuples by MITMProxy.
+ request_cookies = [{"name": k.strip(), "value": v[0]} for k, v in (flow.request.get_cookies() or {}).iteritems()]
+ request_headers = [{"name": k, "value": v} for k, v in flow.request.headers]
+ request_headers_size = len(str(flow.request.headers))
+ request_body_size = len(flow.request.content)
+
+ response_http_version = ".".join([str(v) for v in flow.response.httpversion])
+ # Cookies are shaped as tuples by MITMProxy.
+ response_cookies = [{"name": k.strip(), "value": v[0]} for k, v in (flow.response.get_cookies() or {}).iteritems()]
+ response_headers = [{"name": k, "value": v} for k, v in flow.response.headers]
+ response_headers_size = len(str(flow.response.headers))
+ response_body_size = len(flow.response.content)
+ response_body_decoded_size = len(flow.response.content)
+ response_body_compression = response_body_decoded_size - response_body_size
+ response_mime_type = flow.response.headers.get('Content-Type', [''])[0]
+ response_redirect_url = flow.response.headers.get('Location', [''])[0]
+
+ entry = HAR.entries({"startedDateTime": started_date_time,
"time": full_time,
"request": {"method": flow.request.method,
"url": flow.request.url,
- "httpVersion": ".".join([str(v) for v in flow.request.httpversion]),
- "cookies": [{"name": k.strip(), "value": v[0]}
- for k, v in (flow.request.get_cookies() or {}).iteritems()],
- "headers": [{"name": k, "value": v}
- for k, v in flow.request.headers],
- "queryString": [{"name": k, "value": v}
- for k, v in flow.request.get_query()],
- "headersSize": len(str(flow.request.headers).split("\r\n\r\n")[0]),
- "bodySize": len(flow.request.content), },
+ "httpVersion": request_http_version,
+ "cookies": request_cookies,
+ "headers": request_headers,
+ "queryString": request_query_string,
+ "headersSize": request_headers_size,
+ "bodySize": request_body_size, },
"response": {"status": flow.response.code,
"statusText": flow.response.msg,
- "httpVersion": ".".join([str(v) for v in flow.response.httpversion]),
- "cookies": [{"name": k.strip(), "value": v[0]}
- for k, v in (flow.response.get_cookies() or {}).iteritems()],
- "headers": [{"name": k, "value": v}
- for k, v in flow.response.headers],
- "content": {"size": len(flow.response.content),
- "compression": len(flow.response.get_decoded_content()) - len(
- flow.response.content),
- "mimeType": flow.response.headers.get('Content-Type', ('', ))[0]},
- "redirectURL": flow.response.headers.get('Location', ''),
- "headersSize": len(str(flow.response.headers).split("\r\n\r\n")[0]),
- "bodySize": len(flow.response.content), },
+ "httpVersion": response_http_version,
+ "cookies": response_cookies,
+ "headers": response_headers,
+ "content": {"size": response_body_size,
+ "compression": response_body_compression,
+ "mimeType": response_mime_type},
+ "redirectURL": response_redirect_url,
+ "headersSize": response_headers_size,
+ "bodySize": response_body_size, },
"cache": {},
"timings": timings, })
+ # If the current url is in HARPAGE_LIST or does not have a referer we add it as a new pages object.
if flow.request.url in HARPAGE_LIST or flow.request.headers.get('Referer', None) is None:
PAGE_COUNT[1] += 1
page_id = "_".join([str(v) for v in PAGE_COUNT])
@@ -150,31 +144,14 @@ def response(context, flow):
PAGE_REF[flow.request.url] = page_id
entry['pageref'] = page_id
- if flow.request.headers.get('Referer', (None, ))[0] in PAGE_REF.keys():
+ # Lookup the referer in our PAGE_REF dict to point this entries pageref attribute to the right pages object.
+ elif flow.request.headers.get('Referer', (None, ))[0] in PAGE_REF.keys():
entry['pageref'] = PAGE_REF[flow.request.headers['Referer'][0]]
PAGE_REF[flow.request.url] = entry['pageref']
HARLog.add(entry)
-def error(context, flow):
- """
- Called when a flow error has occured, e.g. invalid server responses, or
- interrupted connections. This is distinct from a valid server HTTP error
- response, which is simply a response with an HTTP error code.
- """
- # context.log("error")
-
-
-def clientdisconnect(context, conn_handler):
- """
- Called when a client disconnects from the proxy.
- """
- # print "clientdisconnect"
- # print_attributes(context._master)
- # print_attributes(conn_handler)
-
-
def done(context):
"""
Called once on script shutdown, after any other events.
@@ -182,13 +159,21 @@ def done(context):
from pprint import pprint
import json
- pprint(json.loads(HARLog.json()))
- print HARLog.json()
- print HARLog.compress()
- print "%s%%" % str(100. * len(HARLog.compress()) / len(HARLog.json()))
+ json_dump = HARLog.json()
+ compressed_json_dump = HARLog.compress()
+
+ print "=" * 100
+ pprint(json.loads(json_dump))
+ print "=" * 100
+ print "HAR log finished with %s bytes (%s bytes compressed)" % (len(json_dump), len(compressed_json_dump))
+ print "Compression rate is %s%%" % str(100. * len(compressed_json_dump) / len(json_dump))
+ print "=" * 100
def print_attributes(obj, filter=None):
+ """
+ Useful helper method to quickly get all attributes of an object and its values.
+ """
for attr in dir(obj):
# if "__" in attr:
# continue
@@ -198,6 +183,7 @@ def print_attributes(obj, filter=None):
print "%s.%s" % ('obj', attr), value, type(value)
+# Some initializations. Add any page you want to have its own pages object to HARPAGE_LIST
HARPAGE_LIST = ['https://github.com/']
HARLog = _HARLog()