Added link_expander.py example to simple examples (#3348)

author: jbrule <jbrulz@gmail.com> 2018-10-12 14:56:58 -0500
committer: Thomas Kriechbaumer <Kriechi@users.noreply.github.com> 2018-10-12 21:56:58 +0200
commit: 57868ef5fab5c9f101126ec7981db2d769b22a4f (patch)
tree: 2d7fe8b30ece9cfcac2085e4e8a3669e7f8030f2 /examples
parent: 82797efe9f1fed48022cc9bb3f3133db3635c07b (diff)
download: mitmproxy-57868ef5fab5c9f101126ec7981db2d769b22a4f.tar.gz
mitmproxy-57868ef5fab5c9f101126ec7981db2d769b22a4f.tar.bz2
mitmproxy-57868ef5fab5c9f101126ec7981db2d769b22a4f.zip
2 files changed, 29 insertions, 0 deletions
diff --git a/examples/simple/README.md b/examples/simple/README.md
index 2fafdd5a..66a05b30 100644
--- a/examples/simple/README.md
+++ b/examples/simple/README.md
@@ -8,6 +8,7 @@
 | filter_flows.py                | This script demonstrates how to use mitmproxy's filter pattern in scripts.   |
 | io_read_dumpfile.py            | Read a dumpfile generated by mitmproxy.                                      |
 | io_write_dumpfile.py           | Only write selected flows into a mitmproxy dumpfile.                         |
+| link_expander.py               | Discover relative links in HTML traffic and replace them with absolute URLs  |
 | log_events.py                  | Use mitmproxy's logging API.                                                 |
 | modify_body_inject_iframe.py   | Inject configurable iframe into pages.                                       |
 | modify_form.py                 | Modify HTTP form submissions.                                                |
diff --git a/examples/simple/link_expander.py b/examples/simple/link_expander.py
new file mode 100644
index 00000000..0edf7c98
--- /dev/null
+++ b/examples/simple/link_expander.py
@@ -0,0 +1,28 @@
+# This script checks whether a response is an HTML page and, if so, seeks out
+# relative links (<a href="./about.html">) and expands them to absolute links.
+# In practice this can be used to front an indexing spider that may not have the capability to expand relative page links.
+# Usage: mitmdump -s link_expander.py or mitmproxy -s link_expander.py
+
+import re
+from urllib.parse import urljoin
+
+
+def response(flow):
+
+    if "Content-Type" in flow.response.headers and flow.response.headers["Content-Type"].find("text/html") != -1:
+        pageUrl = flow.request.url
+        pageText = flow.response.text
+        pattern = (r"<a\s+(?:[^>]*?\s+)?href=(?P<delimiter>[\"'])"
+                   r"(?P<link>(?!https?:\/\/|ftps?:\/\/|\/\/|#|javascript:|mailto:).*?)(?P=delimiter)")
+        rel_matcher = re.compile(pattern, flags=re.IGNORECASE)
+        rel_matches = rel_matcher.finditer(pageText)
+        map_dict = {}
+        for match_num, match in enumerate(rel_matches):
+            (delimiter, rel_link) = match.group("delimiter", "link")
+            abs_link = urljoin(pageUrl, rel_link)
+            map_dict["{0}{1}{0}".format(delimiter, rel_link)] = "{0}{1}{0}".format(delimiter, abs_link)
+        for map in map_dict.items():
+            pageText = pageText.replace(*map)
+            # Uncomment the following to print the expansion mapping
+            # print("{0} -> {1}".format(*map))
+        flow.response.text = pageText
+\ No newline at end of file
author	jbrule <jbrulz@gmail.com>	2018-10-12 14:56:58 -0500
committer	Thomas Kriechbaumer <Kriechi@users.noreply.github.com>	2018-10-12 21:56:58 +0200
commit	57868ef5fab5c9f101126ec7981db2d769b22a4f (patch)
tree	2d7fe8b30ece9cfcac2085e4e8a3669e7f8030f2 /examples
parent	82797efe9f1fed48022cc9bb3f3133db3635c07b (diff)
download	mitmproxy-57868ef5fab5c9f101126ec7981db2d769b22a4f.tar.gz mitmproxy-57868ef5fab5c9f101126ec7981db2d769b22a4f.tar.bz2 mitmproxy-57868ef5fab5c9f101126ec7981db2d769b22a4f.zip