diff options
Diffstat (limited to 'mitmproxy/libmproxy/tnetstring.py')
-rw-r--r-- | mitmproxy/libmproxy/tnetstring.py | 399 |
1 files changed, 399 insertions, 0 deletions
# Source: mitmproxy/libmproxy/tnetstring.py (new file, blob c5c185c6)
#
# imported from the tnetstring project: https://github.com/rfk/tnetstring
#
# Copyright (c) 2011 Ryan Kelly
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
"""
tnetstring: data serialization using typed netstrings
======================================================


This is a data serialization library. It's a lot like JSON but it uses a
new syntax called "typed netstrings" that Zed has proposed for use in the
Mongrel2 webserver. It's designed to be simpler and easier to implement
than JSON, with a happy consequence of also being faster in many cases.

An ordinary netstring is a blob of data prefixed with its length and postfixed
with a sanity-checking comma. The string "hello world" encodes like this::

    11:hello world,

Typed netstrings add other datatypes by replacing the comma with a type tag.
Here's the integer 12345 encoded as a tnetstring::

    5:12345#

And here's the list [12345,True,0] which mixes integers and bools::

    19:5:12345#4:true!1:0#]

Simple enough? This module gives you the following functions:

    :dump: dump an object as a tnetstring to a file
    :dumps: dump an object as a tnetstring to a string
    :load: load a tnetstring-encoded object from a file
    :loads: load a tnetstring-encoded object from a string
    :pop: pop a tnetstring-encoded object from the front of a string

Note that since parsing a tnetstring requires reading all the data into memory
at once, there's no efficiency gain from using the file-based versions of these
functions. They're only here so you can use load() to read precisely one
item from a file or socket without consuming any extra data.

By default tnetstrings work only with byte strings, not unicode. If you want
unicode strings then pass an optional encoding to the various functions,
like so::

    >>> print(repr(tnetstring.loads("2:\\xce\\xb1,")))
    '\\xce\\xb1'
    >>>
    >>> print(repr(tnetstring.loads("2:\\xce\\xb1,","utf8")))
    u'\u03b1'

"""

__ver_major__ = 0
__ver_minor__ = 2
__ver_patch__ = 0
__ver_sub__ = ""
__version__ = "%d.%d.%d%s" % (
    __ver_major__, __ver_minor__, __ver_patch__, __ver_sub__)


from collections import deque

# NOTE: this module targets Python 2; the dumping code relies on the
# ``long`` and ``unicode`` builtins and on ``dict.iteritems()``.


def dumps(value, encoding=None):
    """dumps(object,encoding=None) -> string

    This function dumps a python object as a tnetstring.

    :param value: the object to serialize (None, bool, int/long, float, str,
        unicode, list/tuple or dict, nested arbitrarily).
    :param encoding: codec used to encode unicode strings; when it is None,
        dumping a unicode string raises ValueError.
    :returns: the tnetstring representation, as a string.
    :raises ValueError: if the value (or something nested in it) cannot be
        serialized.
    """
    #  This uses a deque to collect output fragments in reverse order,
    #  then joins them together at the end.  It's measurably faster
    #  than creating all the intermediate strings.
    #  If you're reading this to get a handle on the tnetstring format,
    #  consider the _gdumps() function instead; it's a standard top-down
    #  generator that's simpler to understand but much less efficient.
    q = deque()
    _rdumpq(q, 0, value, encoding)
    return "".join(q)


def dump(value, file, encoding=None):
    """dump(object,file,encoding=None)

    This function dumps a python object as a tnetstring and writes it to
    the given file.

    :param value: the object to serialize; see dumps().
    :param file: an object providing write() and flush() methods.
    :param encoding: codec used to encode unicode strings; see dumps().
    :raises ValueError: if the value cannot be serialized.
    """
    file.write(dumps(value, encoding))
    file.flush()


def _rdumpq(q, size, value, encoding=None):
    """Dump value as a tnetstring, to a deque instance, last chunks first.

    This function generates the tnetstring representation of the given value,
    pushing chunks of the output onto the given deque instance.  It pushes
    the last chunk first, then recursively generates more chunks.

    When passed in the current size of the string in the queue, it will return
    the new size of the string in the queue.

    Operating last-chunk-first makes it easy to calculate the size written
    for recursive structures without having to build their representation as
    a string.  This is measurably faster than generating the intermediate
    strings, especially on deeply nested structures.

    :param q: deque that receives the output chunks (via appendleft).
    :param size: total length of the chunks already in the deque.
    :param value: the object to serialize.
    :param encoding: codec used to encode unicode strings, or None.
    :returns: the total length of the chunks in the deque after the dump.
    :raises ValueError: if the value cannot be serialized.
    """
    write = q.appendleft
    # The singletons are tested by identity first; this also keeps True and
    # False from being caught by the integer branch (bool subclasses int).
    if value is None:
        write("0:~")
        return size + 3
    if value is True:
        write("4:true!")
        return size + 7
    if value is False:
        write("5:false!")
        return size + 8
    if isinstance(value, (int, long)):
        data = str(value)
        ldata = len(data)
        span = str(ldata)
        # Chunks are pushed in reverse: tag, payload, colon, length.
        write("#")
        write(data)
        write(":")
        write(span)
        return size + 2 + len(span) + ldata
    if isinstance(value, (float,)):
        #  Use repr() for float rather than str().
        #  It round-trips more accurately.
        #  Probably unnecessary in later python versions that
        #  use David Gay's ftoa routines.
        data = repr(value)
        ldata = len(data)
        span = str(ldata)
        write("^")
        write(data)
        write(":")
        write(span)
        return size + 2 + len(span) + ldata
    if isinstance(value, str):
        lvalue = len(value)
        span = str(lvalue)
        write(",")
        write(value)
        write(":")
        write(span)
        return size + 2 + len(span) + lvalue
    if isinstance(value, (list, tuple,)):
        write("]")
        init_size = size = size + 1
        # Items go in back-to-front so they come out in order.
        for item in reversed(value):
            size = _rdumpq(q, size, item, encoding)
        # The payload length is whatever the items added to the queue.
        span = str(size - init_size)
        write(":")
        write(span)
        return size + 1 + len(span)
    if isinstance(value, dict):
        write("}")
        init_size = size = size + 1
        for (k, v) in value.iteritems():
            # Value before key, since output is built last-chunk-first.
            size = _rdumpq(q, size, v, encoding)
            size = _rdumpq(q, size, k, encoding)
        span = str(size - init_size)
        write(":")
        write(span)
        return size + 1 + len(span)
    if isinstance(value, unicode):
        if encoding is None:
            raise ValueError("must specify encoding to dump unicode strings")
        value = value.encode(encoding)
        lvalue = len(value)
        span = str(lvalue)
        write(",")
        write(value)
        write(":")
        write(span)
        return size + 2 + len(span) + lvalue
    raise ValueError("unserializable object")


def _gdumps(value, encoding=None):
    """Generate fragments of value dumped as a tnetstring.

    This is the naive dumping algorithm, implemented as a generator so that
    it's easy to pass to "".join() without building a new list.

    This is mainly here for comparison purposes; the _rdumpq version is
    measurably faster as it doesn't have to build intermediate strings.

    :param value: the object to serialize.
    :param encoding: codec used to encode unicode strings, or None.
    :returns: a generator yielding string fragments of the tnetstring.
    :raises ValueError: if the value cannot be serialized (raised lazily,
        while the generator is being consumed).
    """
    if value is None:
        yield "0:~"
    elif value is True:
        yield "4:true!"
    elif value is False:
        yield "5:false!"
    elif isinstance(value, (int, long)):
        data = str(value)
        yield str(len(data))
        yield ":"
        yield data
        yield "#"
    elif isinstance(value, (float,)):
        data = repr(value)
        yield str(len(data))
        yield ":"
        yield data
        yield "^"
    elif isinstance(value, (str,)):
        yield str(len(value))
        yield ":"
        yield value
        yield ","
    elif isinstance(value, (list, tuple,)):
        sub = []
        for item in value:
            # The encoding must be passed down so that nested unicode
            # strings are handled the same way as top-level ones.
            sub.extend(_gdumps(item, encoding))
        sub = "".join(sub)
        yield str(len(sub))
        yield ":"
        yield sub
        yield "]"
    elif isinstance(value, (dict,)):
        sub = []
        for (k, v) in value.iteritems():
            sub.extend(_gdumps(k, encoding))
            sub.extend(_gdumps(v, encoding))
        sub = "".join(sub)
        yield str(len(sub))
        yield ":"
        yield sub
        yield "}"
    elif isinstance(value, (unicode,)):
        if encoding is None:
            raise ValueError("must specify encoding to dump unicode strings")
        value = value.encode(encoding)
        yield str(len(value))
        yield ":"
        yield value
        yield ","
    else:
        raise ValueError("unserializable object")


def loads(string, encoding=None):
    """loads(string,encoding=None) -> object

    This function parses a tnetstring into a python object.

    :param string: the tnetstring to parse; any data after the first
        complete tnetstring is ignored.
    :param encoding: if given, string payloads are decoded with this codec.
    :returns: the parsed object.
    :raises ValueError: if the input is not a valid tnetstring.
    """
    #  No point duplicating effort here.  In the C-extension version,
    #  loads() is measurably faster than pop() since it can avoid
    #  the overhead of building a second string.
    return pop(string, encoding)[0]


def load(file, encoding=None):
    """load(file,encoding=None) -> object

    This function reads a tnetstring from a file and parses it into a
    python object.  The file must support the read() method, and this
    function promises not to read more data than necessary.

    :param file: an object providing a read(n) method.
    :param encoding: if given, string payloads are decoded with this codec.
    :returns: the parsed object.
    :raises ValueError: if the data read is not a valid tnetstring.
    """
    #  Read the length prefix one char at a time.
    #  Note that the netstring spec explicitly forbids padding zeros.
    c = file.read(1)
    if not c.isdigit():
        raise ValueError("not a tnetstring: missing or invalid length prefix")
    datalen = ord(c) - ord("0")
    c = file.read(1)
    # A leading "0" must be the whole prefix, so further digits are only
    # consumed when the first digit was non-zero.
    if datalen != 0:
        while c.isdigit():
            datalen = (10 * datalen) + (ord(c) - ord("0"))
            if datalen > 999999999:
                errmsg = "not a tnetstring: absurdly large length prefix"
                raise ValueError(errmsg)
            c = file.read(1)
    if c != ":":
        raise ValueError("not a tnetstring: missing or invalid length prefix")
    #  Now we can read and parse the payload.  The payload and tag are
    #  handed straight to the shared parser, which avoids re-constructing
    #  the outermost tnetstring just to pass it through pop().
    data = file.read(datalen)
    if len(data) != datalen:
        raise ValueError("not a tnetstring: length prefix too big")
    tag = file.read(1)
    return _parse_payload(data, tag, encoding)


def pop(string, encoding=None):
    """pop(string,encoding=None) -> (object, remain)

    This function parses a tnetstring into a python object.
    It returns a tuple giving the parsed object and a string
    containing any unparsed data from the end of the string.

    :param string: a string starting with a complete tnetstring.
    :param encoding: if given, string payloads are decoded with this codec.
    :returns: a tuple (object, remain).
    :raises ValueError: if the input does not start with a valid tnetstring.
    """
    #  Parse out data length, type and remaining string.
    try:
        (prefix, rest) = string.split(":", 1)
        # int() alone would accept signs and surrounding whitespace; a
        # negative length would then silently mis-slice the payload below.
        if not prefix.isdigit():
            raise ValueError(prefix)
        dlen = int(prefix)
    except ValueError:
        raise ValueError("not a tnetstring: missing or invalid length prefix")
    try:
        (data, tag, remain) = (rest[:dlen], rest[dlen], rest[dlen + 1:])
    except IndexError:
        #  This fires if len(rest) < dlen, meaning we don't need
        #  to further validate that data is the right length.
        raise ValueError("not a tnetstring: invalid length prefix")
    return (_parse_payload(data, tag, encoding), remain)


def _parse_payload(data, tag, encoding=None):
    """Convert a tnetstring payload into a python object, based on its tag.

    :param data: the payload (the text between the colon and the type tag).
    :param tag: the single-character type tag that followed the payload.
    :param encoding: if given, string payloads are decoded with this codec.
    :returns: the parsed object.
    :raises ValueError: if the payload is invalid for the tag, or the tag
        is not recognised.
    """
    if tag == ",":
        if encoding is not None:
            return data.decode(encoding)
        return data
    if tag == "#":
        try:
            return int(data)
        except ValueError:
            raise ValueError("not a tnetstring: invalid integer literal")
    if tag == "^":
        try:
            return float(data)
        except ValueError:
            raise ValueError("not a tnetstring: invalid float literal")
    if tag == "!":
        if data == "true":
            return True
        elif data == "false":
            return False
        else:
            raise ValueError("not a tnetstring: invalid boolean literal")
    if tag == "~":
        if data:
            raise ValueError("not a tnetstring: invalid null literal")
        return None
    if tag == "]":
        # A list payload is a plain concatenation of tnetstrings.
        items = []
        while data:
            (item, data) = pop(data, encoding)
            items.append(item)
        return items
    if tag == "}":
        # A dict payload is a concatenation of key/value tnetstring pairs.
        d = {}
        while data:
            (key, data) = pop(data, encoding)
            (val, data) = pop(data, encoding)
            d[key] = val
        return d
    raise ValueError("unknown type tag")