aboutsummaryrefslogtreecommitdiffstats
path: root/examples/simple/link_expander.py
diff options
context:
space:
mode:
Diffstat (limited to 'examples/simple/link_expander.py')
-rw-r--r--examples/simple/link_expander.py28
1 files changed, 28 insertions, 0 deletions
diff --git a/examples/simple/link_expander.py b/examples/simple/link_expander.py
new file mode 100644
index 00000000..0edf7c98
--- /dev/null
+++ b/examples/simple/link_expander.py
@@ -0,0 +1,28 @@
+# This script determines if request is an HTML webpage and if so seeks out
+# relative links (<a href="./about.html">) and expands them to absolute links
+# In practice this can be used to front an indexing spider that may not have the capability to expand relative page links.
+# Usage: mitmdump -s link_expander.py or mitmproxy -s link_expander.py
+
+import re
+from urllib.parse import urljoin
+
+
+def response(flow):
+
+ if "Content-Type" in flow.response.headers and flow.response.headers["Content-Type"].find("text/html") != -1:
+ pageUrl = flow.request.url
+ pageText = flow.response.text
+ pattern = (r"<a\s+(?:[^>]*?\s+)?href=(?P<delimiter>[\"'])"
+ r"(?P<link>(?!https?:\/\/|ftps?:\/\/|\/\/|#|javascript:|mailto:).*?)(?P=delimiter)")
+ rel_matcher = re.compile(pattern, flags=re.IGNORECASE)
+ rel_matches = rel_matcher.finditer(pageText)
+ map_dict = {}
+ for match_num, match in enumerate(rel_matches):
+ (delimiter, rel_link) = match.group("delimiter", "link")
+ abs_link = urljoin(pageUrl, rel_link)
+ map_dict["{0}{1}{0}".format(delimiter, rel_link)] = "{0}{1}{0}".format(delimiter, abs_link)
+ for map in map_dict.items():
+ pageText = pageText.replace(*map)
+ # Uncomment the following to print the expansion mapping
+ # print("{0} -> {1}".format(*map))
+ flow.response.text = pageText \ No newline at end of file