From 45f4768a5cc4e820d1b5d7d221e2974f6ddf390d Mon Sep 17 00:00:00 2001 From: Aldo Cortesi Date: Fri, 19 Aug 2011 21:38:05 +1200 Subject: Add attribution and license for tnetstring.py --- libmproxy/flow.py | 8 +- libmproxy/netstring.py | 393 ----------------------------------------------- libmproxy/tnetstring.py | 398 ++++++++++++++++++++++++++++++++++++++++++++++++ test/.pry | 1 + 4 files changed, 403 insertions(+), 397 deletions(-) delete mode 100644 libmproxy/netstring.py create mode 100644 libmproxy/tnetstring.py diff --git a/libmproxy/flow.py b/libmproxy/flow.py index 8ec1a6a3..c0130ec3 100644 --- a/libmproxy/flow.py +++ b/libmproxy/flow.py @@ -2,9 +2,9 @@ This module provides more sophisticated flow tracking. These match requests with their responses, and provide filtering and interception facilities. """ -import json, hashlib, Cookie, cookielib, base64, copy, re +import hashlib, Cookie, cookielib, copy, re import time -import netstring, filt, script, utils, encoding, proxy +import tnetstring, filt, script, utils, encoding, proxy from email.utils import parsedate_tz, formatdate, mktime_tz import controller, version @@ -1319,7 +1319,7 @@ class FlowWriter: def add(self, flow): d = flow._get_state() - netstring.dump(d, self.fo) + tnetstring.dump(d, self.fo) class FlowReadError(Exception): @@ -1339,7 +1339,7 @@ class FlowReader: off = 0 try: while 1: - data = netstring.load(self.fo) + data = tnetstring.load(self.fo) off = self.fo.tell() yield Flow._from_state(data) except ValueError, v: diff --git a/libmproxy/netstring.py b/libmproxy/netstring.py deleted file mode 100644 index 03e38c6a..00000000 --- a/libmproxy/netstring.py +++ /dev/null @@ -1,393 +0,0 @@ -""" - -tnetstring: data serialization using typed netstrings -====================================================== - - -This is a data serialization library. It's a lot like JSON but it uses a -new syntax called "typed netstrings" that Zed has proposed for use in the -Mongrel2 webserver. 
It's designed to be simpler and easier to implement -than JSON, with a happy consequence of also being faster in many cases. - -An ordinary netstring is a blob of data prefixed with its length and postfixed -with a sanity-checking comma. The string "hello world" encodes like this:: - - 11:hello world, - -Typed netstrings add other datatypes by replacing the comma with a type tag. -Here's the integer 12345 encoded as a tnetstring:: - - 5:12345# - -And here's the list [12345,True,0] which mixes integers and bools:: - - 19:5:12345#4:true!1:0#] - -Simple enough? This module gives you the following functions: - - :dump: dump an object as a tnetstring to a file - :dumps: dump an object as a tnetstring to a string - :load: load a tnetstring-encoded object from a file - :loads: load a tnetstring-encoded object from a string - :pop: pop a tnetstring-encoded object from the front of a string - -Note that since parsing a tnetstring requires reading all the data into memory -at once, there's no efficiency gain from using the file-based versions of these -functions. They're only here so you can use load() to read precisely one -item from a file or socket without consuming any extra data. - -By default tnetstrings work only with byte strings, not unicode. If you want -unicode strings then pass an optional encoding to the various functions, -like so:: - - >>> print repr(tnetstring.loads("2:\\xce\\xb1,")) - '\\xce\\xb1' - >>> - >>> print repr(tnetstring.loads("2:\\xce\\xb1,","utf8")) - u'\u03b1' - -""" - -__ver_major__ = 0 -__ver_minor__ = 2 -__ver_patch__ = 0 -__ver_sub__ = "" -__version__ = "%d.%d.%d%s" % (__ver_major__,__ver_minor__,__ver_patch__,__ver_sub__) - - -from collections import deque - - -def dumps(value,encoding=None): - """dumps(object,encoding=None) -> string - - This function dumps a python object as a tnetstring. - """ - # This uses a deque to collect output fragments in reverse order, - # then joins them together at the end. 
It's measurably faster - # than creating all the intermediate strings. - # If you're reading this to get a handle on the tnetstring format, - # consider the _gdumps() function instead; it's a standard top-down - # generator that's simpler to understand but much less efficient. - q = deque() - _rdumpq(q,0,value,encoding) - return "".join(q) - - -def dump(value,file,encoding=None): - """dump(object,file,encoding=None) - - This function dumps a python object as a tnetstring and writes it to - the given file. - """ - file.write(dumps(value,encoding)) - - -def _rdumpq(q,size,value,encoding=None): - """Dump value as a tnetstring, to a deque instance, last chunks first. - - This function generates the tnetstring representation of the given value, - pushing chunks of the output onto the given deque instance. It pushes - the last chunk first, then recursively generates more chunks. - - When passed in the current size of the string in the queue, it will return - the new size of the string in the queue. - - Operating last-chunk-first makes it easy to calculate the size written - for recursive structures without having to build their representation as - a string. This is measurably faster than generating the intermediate - strings, especially on deeply nested structures. - """ - write = q.appendleft - if value is None: - write("0:~") - return size + 3 - if value is True: - write("4:true!") - return size + 7 - if value is False: - write("5:false!") - return size + 8 - if isinstance(value,(int,long)): - data = str(value) - ldata = len(data) - span = str(ldata) - write("#") - write(data) - write(":") - write(span) - return size + 2 + len(span) + ldata - if isinstance(value,(float,)): - # Use repr() for float rather than str(). - # It round-trips more accurately. - # Probably unnecessary in later python versions that - # use David Gay's ftoa routines. 
- data = repr(value) - ldata = len(data) - span = str(ldata) - write("^") - write(data) - write(":") - write(span) - return size + 2 + len(span) + ldata - if isinstance(value,str): - lvalue = len(value) - span = str(lvalue) - write(",") - write(value) - write(":") - write(span) - return size + 2 + len(span) + lvalue - if isinstance(value,(list,tuple,)): - write("]") - init_size = size = size + 1 - for item in reversed(value): - size = _rdumpq(q,size,item,encoding) - span = str(size - init_size) - write(":") - write(span) - return size + 1 + len(span) - if isinstance(value,dict): - write("}") - init_size = size = size + 1 - for (k,v) in value.iteritems(): - size = _rdumpq(q,size,v,encoding) - size = _rdumpq(q,size,k,encoding) - span = str(size - init_size) - write(":") - write(span) - return size + 1 + len(span) - if isinstance(value,unicode): - if encoding is None: - raise ValueError("must specify encoding to dump unicode strings") - value = value.encode(encoding) - lvalue = len(value) - span = str(lvalue) - write(",") - write(value) - write(":") - write(span) - return size + 2 + len(span) + lvalue - raise ValueError("unserializable object") - - -def _gdumps(value,encoding): - """Generate fragments of value dumped as a tnetstring. - - This is the naive dumping algorithm, implemented as a generator so that - it's easy to pass to "".join() without building a new list. - - This is mainly here for comparison purposes; the _rdumpq version is - measurably faster as it doesn't have to build intermediate strins. - """ - if value is None: - yield "0:~" - elif value is True: - yield "4:true!" - elif value is False: - yield "5:false!" 
- elif isinstance(value,(int,long)): - data = str(value) - yield str(len(data)) - yield ":" - yield data - yield "#" - elif isinstance(value,(float,)): - data = repr(value) - yield str(len(data)) - yield ":" - yield data - yield "^" - elif isinstance(value,(str,)): - yield str(len(value)) - yield ":" - yield value - yield "," - elif isinstance(value,(list,tuple,)): - sub = [] - for item in value: - sub.extend(_gdumps(item)) - sub = "".join(sub) - yield str(len(sub)) - yield ":" - yield sub - yield "]" - elif isinstance(value,(dict,)): - sub = [] - for (k,v) in value.iteritems(): - sub.extend(_gdumps(k)) - sub.extend(_gdumps(v)) - sub = "".join(sub) - yield str(len(sub)) - yield ":" - yield sub - yield "}" - elif isinstance(value,(unicode,)): - if encoding is None: - raise ValueError("must specify encoding to dump unicode strings") - value = value.encode(encoding) - yield str(len(value)) - yield ":" - yield value - yield "," - else: - raise ValueError("unserializable object") - - -def loads(string,encoding=None): - """loads(string,encoding=None) -> object - - This function parses a tnetstring into a python object. - """ - # No point duplicating effort here. In the C-extension version, - # loads() is measurably faster then pop() since it can avoid - # the overhead of building a second string. - return pop(string,encoding)[0] - - -def load(file,encoding=None): - """load(file,encoding=None) -> object - - This function reads a tnetstring from a file and parses it into a - python object. The file must support the read() method, and this - function promises not to read more data than necessary. - """ - # Read the length prefix one char at a time. - # Note that the netstring spec explicitly forbids padding zeros. 
- c = file.read(1) - if not c.isdigit(): - raise ValueError("not a tnetstring: missing or invalid length prefix") - datalen = ord(c) - ord("0") - c = file.read(1) - if datalen != 0: - while c.isdigit(): - datalen = (10 * datalen) + (ord(c) - ord("0")) - if datalen > 999999999: - errmsg = "not a tnetstring: absurdly large length prefix" - raise ValueError(errmsg) - c = file.read(1) - if c != ":": - raise ValueError("not a tnetstring: missing or invalid length prefix") - # Now we can read and parse the payload. - # This repeats the dispatch logic of pop() so we can avoid - # re-constructing the outermost tnetstring. - data = file.read(datalen) - if len(data) != datalen: - raise ValueError("not a tnetstring: length prefix too big") - type = file.read(1) - if type == ",": - if encoding is not None: - return data.decode(encoding) - return data - if type == "#": - try: - return int(data) - except ValueError: - raise ValueError("not a tnetstring: invalid integer literal") - if type == "^": - try: - return float(data) - except ValueError: - raise ValueError("not a tnetstring: invalid float literal") - if type == "!": - if data == "true": - return True - elif data == "false": - return False - else: - raise ValueError("not a tnetstring: invalid boolean literal") - if type == "~": - if data: - raise ValueError("not a tnetstring: invalid null literal") - return None - if type == "]": - l = [] - while data: - (item,data) = pop(data,encoding) - l.append(item) - return l - if type == "}": - d = {} - while data: - (key,data) = pop(data,encoding) - (val,data) = pop(data,encoding) - d[key] = val - return d - raise ValueError("unknown type tag") - - - -def pop(string,encoding=None): - """pop(string,encoding=None) -> (object, remain) - - This function parses a tnetstring into a python object. - It returns a tuple giving the parsed object and a string - containing any unparsed data from the end of the string. - """ - # Parse out data length, type and remaining string. 
- try: - (dlen,rest) = string.split(":",1) - dlen = int(dlen) - except ValueError: - raise ValueError("not a tnetstring: missing or invalid length prefix") - try: - (data,type,remain) = (rest[:dlen],rest[dlen],rest[dlen+1:]) - except IndexError: - # This fires if len(rest) < dlen, meaning we don't need - # to further validate that data is the right length. - raise ValueError("not a tnetstring: invalid length prefix") - # Parse the data based on the type tag. - if type == ",": - if encoding is not None: - return (data.decode(encoding),remain) - return (data,remain) - if type == "#": - try: - return (int(data),remain) - except ValueError: - raise ValueError("not a tnetstring: invalid integer literal") - if type == "^": - try: - return (float(data),remain) - except ValueError: - raise ValueError("not a tnetstring: invalid float literal") - if type == "!": - if data == "true": - return (True,remain) - elif data == "false": - return (False,remain) - else: - raise ValueError("not a tnetstring: invalid boolean literal") - if type == "~": - if data: - raise ValueError("not a tnetstring: invalid null literal") - return (None,remain) - if type == "]": - l = [] - while data: - (item,data) = pop(data,encoding) - l.append(item) - return (l,remain) - if type == "}": - d = {} - while data: - (key,data) = pop(data,encoding) - (val,data) = pop(data,encoding) - d[key] = val - return (d,remain) - raise ValueError("unknown type tag") - - - -# Use the c-extension version if available -try: - import _tnetstring -except ImportError: - pass -else: - dumps = _tnetstring.dumps - load = _tnetstring.load - loads = _tnetstring.loads - pop = _tnetstring.pop - - diff --git a/libmproxy/tnetstring.py b/libmproxy/tnetstring.py new file mode 100644 index 00000000..76b15a02 --- /dev/null +++ b/libmproxy/tnetstring.py @@ -0,0 +1,398 @@ +# imported from the tnetstring project: https://github.com/rfk/tnetstring +# +# Copyright (c) 2011 Ryan Kelly +# +# Permission is hereby granted, free of charge, to 
any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. +""" +tnetstring: data serialization using typed netstrings +====================================================== + + +This is a data serialization library. It's a lot like JSON but it uses a +new syntax called "typed netstrings" that Zed has proposed for use in the +Mongrel2 webserver. It's designed to be simpler and easier to implement +than JSON, with a happy consequence of also being faster in many cases. + +An ordinary netstring is a blob of data prefixed with its length and postfixed +with a sanity-checking comma. The string "hello world" encodes like this:: + + 11:hello world, + +Typed netstrings add other datatypes by replacing the comma with a type tag. +Here's the integer 12345 encoded as a tnetstring:: + + 5:12345# + +And here's the list [12345,True,0] which mixes integers and bools:: + + 19:5:12345#4:true!1:0#] + +Simple enough? 
This module gives you the following functions: + + :dump: dump an object as a tnetstring to a file + :dumps: dump an object as a tnetstring to a string + :load: load a tnetstring-encoded object from a file + :loads: load a tnetstring-encoded object from a string + :pop: pop a tnetstring-encoded object from the front of a string + +Note that since parsing a tnetstring requires reading all the data into memory +at once, there's no efficiency gain from using the file-based versions of these +functions. They're only here so you can use load() to read precisely one +item from a file or socket without consuming any extra data. + +By default tnetstrings work only with byte strings, not unicode. If you want +unicode strings then pass an optional encoding to the various functions, +like so:: + + >>> print repr(tnetstring.loads("2:\\xce\\xb1,")) + '\\xce\\xb1' + >>> + >>> print repr(tnetstring.loads("2:\\xce\\xb1,","utf8")) + u'\u03b1' + +""" + +__ver_major__ = 0 +__ver_minor__ = 2 +__ver_patch__ = 0 +__ver_sub__ = "" +__version__ = "%d.%d.%d%s" % (__ver_major__,__ver_minor__,__ver_patch__,__ver_sub__) + + +from collections import deque + + +def dumps(value,encoding=None): + """dumps(object,encoding=None) -> string + + This function dumps a python object as a tnetstring. + """ + # This uses a deque to collect output fragments in reverse order, + # then joins them together at the end. It's measurably faster + # than creating all the intermediate strings. + # If you're reading this to get a handle on the tnetstring format, + # consider the _gdumps() function instead; it's a standard top-down + # generator that's simpler to understand but much less efficient. + q = deque() + _rdumpq(q,0,value,encoding) + return "".join(q) + + +def dump(value,file,encoding=None): + """dump(object,file,encoding=None) + + This function dumps a python object as a tnetstring and writes it to + the given file. 
+ """ + file.write(dumps(value,encoding)) + + +def _rdumpq(q,size,value,encoding=None): + """Dump value as a tnetstring, to a deque instance, last chunks first. + + This function generates the tnetstring representation of the given value, + pushing chunks of the output onto the given deque instance. It pushes + the last chunk first, then recursively generates more chunks. + + When passed in the current size of the string in the queue, it will return + the new size of the string in the queue. + + Operating last-chunk-first makes it easy to calculate the size written + for recursive structures without having to build their representation as + a string. This is measurably faster than generating the intermediate + strings, especially on deeply nested structures. + """ + write = q.appendleft + if value is None: + write("0:~") + return size + 3 + if value is True: + write("4:true!") + return size + 7 + if value is False: + write("5:false!") + return size + 8 + if isinstance(value,(int,long)): + data = str(value) + ldata = len(data) + span = str(ldata) + write("#") + write(data) + write(":") + write(span) + return size + 2 + len(span) + ldata + if isinstance(value,(float,)): + # Use repr() for float rather than str(). + # It round-trips more accurately. + # Probably unnecessary in later python versions that + # use David Gay's ftoa routines. 
+ data = repr(value) + ldata = len(data) + span = str(ldata) + write("^") + write(data) + write(":") + write(span) + return size + 2 + len(span) + ldata + if isinstance(value,str): + lvalue = len(value) + span = str(lvalue) + write(",") + write(value) + write(":") + write(span) + return size + 2 + len(span) + lvalue + if isinstance(value,(list,tuple,)): + write("]") + init_size = size = size + 1 + for item in reversed(value): + size = _rdumpq(q,size,item,encoding) + span = str(size - init_size) + write(":") + write(span) + return size + 1 + len(span) + if isinstance(value,dict): + write("}") + init_size = size = size + 1 + for (k,v) in value.iteritems(): + size = _rdumpq(q,size,v,encoding) + size = _rdumpq(q,size,k,encoding) + span = str(size - init_size) + write(":") + write(span) + return size + 1 + len(span) + if isinstance(value,unicode): + if encoding is None: + raise ValueError("must specify encoding to dump unicode strings") + value = value.encode(encoding) + lvalue = len(value) + span = str(lvalue) + write(",") + write(value) + write(":") + write(span) + return size + 2 + len(span) + lvalue + raise ValueError("unserializable object") + + +def _gdumps(value,encoding=None): + """Generate fragments of value dumped as a tnetstring. + + This is the naive dumping algorithm, implemented as a generator so that + it's easy to pass to "".join() without building a new list. + + This is mainly here for comparison purposes; the _rdumpq version is + measurably faster as it doesn't have to build intermediate strings. + """ + if value is None: + yield "0:~" + elif value is True: + yield "4:true!" + elif value is False: + yield "5:false!" 
+ elif isinstance(value,(int,long)): + data = str(value) + yield str(len(data)) + yield ":" + yield data + yield "#" + elif isinstance(value,(float,)): + data = repr(value) + yield str(len(data)) + yield ":" + yield data + yield "^" + elif isinstance(value,(str,)): + yield str(len(value)) + yield ":" + yield value + yield "," + elif isinstance(value,(list,tuple,)): + sub = [] + for item in value: + sub.extend(_gdumps(item,encoding)) + sub = "".join(sub) + yield str(len(sub)) + yield ":" + yield sub + yield "]" + elif isinstance(value,(dict,)): + sub = [] + for (k,v) in value.iteritems(): + sub.extend(_gdumps(k,encoding)) + sub.extend(_gdumps(v,encoding)) + sub = "".join(sub) + yield str(len(sub)) + yield ":" + yield sub + yield "}" + elif isinstance(value,(unicode,)): + if encoding is None: + raise ValueError("must specify encoding to dump unicode strings") + value = value.encode(encoding) + yield str(len(value)) + yield ":" + yield value + yield "," + else: + raise ValueError("unserializable object") + + +def loads(string,encoding=None): + """loads(string,encoding=None) -> object + + This function parses a tnetstring into a python object. + """ + # No point duplicating effort here. In the C-extension version, + # loads() is measurably faster than pop() since it can avoid + # the overhead of building a second string. + return pop(string,encoding)[0] + + +def load(file,encoding=None): + """load(file,encoding=None) -> object + + This function reads a tnetstring from a file and parses it into a + python object. The file must support the read() method, and this + function promises not to read more data than necessary. + """ + # Read the length prefix one char at a time. + # Note that the netstring spec explicitly forbids padding zeros. 
+ c = file.read(1) + if not c.isdigit(): + raise ValueError("not a tnetstring: missing or invalid length prefix") + datalen = ord(c) - ord("0") + c = file.read(1) + if datalen != 0: + while c.isdigit(): + datalen = (10 * datalen) + (ord(c) - ord("0")) + if datalen > 999999999: + errmsg = "not a tnetstring: absurdly large length prefix" + raise ValueError(errmsg) + c = file.read(1) + if c != ":": + raise ValueError("not a tnetstring: missing or invalid length prefix") + # Now we can read and parse the payload. + # This repeats the dispatch logic of pop() so we can avoid + # re-constructing the outermost tnetstring. + data = file.read(datalen) + if len(data) != datalen: + raise ValueError("not a tnetstring: length prefix too big") + type = file.read(1) + if type == ",": + if encoding is not None: + return data.decode(encoding) + return data + if type == "#": + try: + return int(data) + except ValueError: + raise ValueError("not a tnetstring: invalid integer literal") + if type == "^": + try: + return float(data) + except ValueError: + raise ValueError("not a tnetstring: invalid float literal") + if type == "!": + if data == "true": + return True + elif data == "false": + return False + else: + raise ValueError("not a tnetstring: invalid boolean literal") + if type == "~": + if data: + raise ValueError("not a tnetstring: invalid null literal") + return None + if type == "]": + l = [] + while data: + (item,data) = pop(data,encoding) + l.append(item) + return l + if type == "}": + d = {} + while data: + (key,data) = pop(data,encoding) + (val,data) = pop(data,encoding) + d[key] = val + return d + raise ValueError("unknown type tag") + + + +def pop(string,encoding=None): + """pop(string,encoding=None) -> (object, remain) + + This function parses a tnetstring into a python object. + It returns a tuple giving the parsed object and a string + containing any unparsed data from the end of the string. + """ + # Parse out data length, type and remaining string. 
+ try: + (dlen,rest) = string.split(":",1) + dlen = int(dlen) + except ValueError: + raise ValueError("not a tnetstring: missing or invalid length prefix") + try: + (data,type,remain) = (rest[:dlen],rest[dlen],rest[dlen+1:]) + except IndexError: + # This fires if len(rest) < dlen, meaning we don't need + # to further validate that data is the right length. + raise ValueError("not a tnetstring: invalid length prefix") + # Parse the data based on the type tag. + if type == ",": + if encoding is not None: + return (data.decode(encoding),remain) + return (data,remain) + if type == "#": + try: + return (int(data),remain) + except ValueError: + raise ValueError("not a tnetstring: invalid integer literal") + if type == "^": + try: + return (float(data),remain) + except ValueError: + raise ValueError("not a tnetstring: invalid float literal") + if type == "!": + if data == "true": + return (True,remain) + elif data == "false": + return (False,remain) + else: + raise ValueError("not a tnetstring: invalid boolean literal") + if type == "~": + if data: + raise ValueError("not a tnetstring: invalid null literal") + return (None,remain) + if type == "]": + l = [] + while data: + (item,data) = pop(data,encoding) + l.append(item) + return (l,remain) + if type == "}": + d = {} + while data: + (key,data) = pop(data,encoding) + (val,data) = pop(data,encoding) + d[key] = val + return (d,remain) + raise ValueError("unknown type tag") diff --git a/test/.pry b/test/.pry index 63b74633..f6f18e7b 100644 --- a/test/.pry +++ b/test/.pry @@ -2,4 +2,5 @@ base = .. coverage = ../libmproxy exclude = . ../libmproxy/contrib + ../libmproxy/tnetstring.py -- cgit v1.2.3