9 files changed, 331 insertions, 89 deletions
diff --git a/netlib/http/cookies.py b/netlib/http/cookies.py
index 768a85df..dd0af99c 100644
--- a/netlib/http/cookies.py
+++ b/netlib/http/cookies.py
@@ -1,7 +1,8 @@
 import collections
+import email.utils
 import re
+import time
 
-import email.utils
 from netlib import multidict
 
 """
@@ -260,3 +261,29 @@ def refresh_set_cookie_header(c, delta):
     if not ret:
         raise ValueError("Invalid Cookie")
     return ret
+
+
+def is_expired(cookie_attrs):
+    """
+        Determines whether a cookie has expired.
+
+        Returns: boolean
+    """
+
+    # See if 'expires' time is in the past
+    expires = False
+    if 'expires' in cookie_attrs:
+        e = email.utils.parsedate_tz(cookie_attrs["expires"])
+        if e:
+            exp_ts = email.utils.mktime_tz(e)
+            now_ts = time.time()
+            expires = exp_ts < now_ts
+
+    # or if Max-Age is 0
+    max_age = False
+    try:
+        max_age = int(cookie_attrs.get('Max-Age', 1)) == 0
+    except ValueError:
+        pass
+
+    return expires or max_age
diff --git a/netlib/http/headers.py b/netlib/http/headers.py
index 14888ea9..36e5060c 100644
--- a/netlib/http/headers.py
+++ b/netlib/http/headers.py
@@ -2,6 +2,7 @@ from __future__ import absolute_import, print_function, division
 
 import re
 
+import collections
 import six
 from netlib import multidict
 from netlib import strutils
@@ -148,6 +149,15 @@ class Headers(multidict.MultiDict):
         value = _always_bytes(value)
         super(Headers, self).insert(index, key, value)
 
+    def items(self, multi=False):
+        if multi:
+            return (
+                (_native(k), _native(v))
+                for k, v in self.fields
+            )
+        else:
+            return super(Headers, self).items()
+
     def replace(self, pattern, repl, flags=0):
         """
         Replaces a regular expression pattern with repl in each "name: value"
@@ -156,8 +166,10 @@ class Headers(multidict.MultiDict):
         Returns:
             The number of replacements made.
         """
-        pattern = _always_bytes(pattern)
-        repl = _always_bytes(repl)
+        if isinstance(pattern, six.text_type):
+            pattern = strutils.escaped_str_to_bytes(pattern)
+        if isinstance(repl, six.text_type):
+            repl = strutils.escaped_str_to_bytes(repl)
         pattern = re.compile(pattern, flags)
         replacements = 0
 
@@ -172,8 +184,8 @@ class Headers(multidict.MultiDict):
                 pass
             else:
                 replacements += n
-            fields.append([name, value])
-        self.fields = fields
+            fields.append((name, value))
+        self.fields = tuple(fields)
         return replacements
 
 
@@ -195,10 +207,22 @@ def parse_content_type(c):
     ts = parts[0].split("/", 1)
     if len(ts) != 2:
         return None
-    d = {}
+    d = collections.OrderedDict()
     if len(parts) == 2:
         for i in parts[1].split(";"):
             clause = i.split("=", 1)
             if len(clause) == 2:
                 d[clause[0].strip()] = clause[1].strip()
     return ts[0].lower(), ts[1].lower(), d
+
+
+def assemble_content_type(type, subtype, parameters):
+    if not parameters:
+        return "{}/{}".format(type, subtype)
+    params = "; ".join(
+        "{}={}".format(k, v)
+        for k, v in parameters.items()
+    )
+    return "{}/{}; {}".format(
+        type, subtype, params
+    )
diff --git a/netlib/http/http1/assemble.py b/netlib/http/http1/assemble.py
index 511328f1..e74732d2 100644
--- a/netlib/http/http1/assemble.py
+++ b/netlib/http/http1/assemble.py
@@ -5,7 +5,7 @@ from netlib import exceptions
 
 
 def assemble_request(request):
-    if request.content is None:
+    if request.data.content is None:
         raise exceptions.HttpException("Cannot assemble flow with missing content")
     head = assemble_request_head(request)
     body = b"".join(assemble_body(request.data.headers, [request.data.content]))
@@ -19,7 +19,7 @@ def assemble_request_head(request):
 
 
 def assemble_response(response):
-    if response.content is None:
+    if response.data.content is None:
         raise exceptions.HttpException("Cannot assemble flow with missing content")
     head = assemble_response_head(response)
     body = b"".join(assemble_body(response.data.headers, [response.data.content]))
diff --git a/netlib/http/http1/read.py b/netlib/http/http1/read.py
index a4c341fd..70fffbd4 100644
--- a/netlib/http/http1/read.py
+++ b/netlib/http/http1/read.py
@@ -244,7 +244,7 @@ def _read_request_line(rfile):
         raise exceptions.HttpReadDisconnect("Client disconnected")
 
     try:
-        method, path, http_version = line.split(b" ")
+        method, path, http_version = line.split()
 
         if path == b"*" or path.startswith(b"/"):
             form = "relative"
@@ -291,8 +291,7 @@ def _read_response_line(rfile):
         raise exceptions.HttpReadDisconnect("Server disconnected")
 
     try:
-
-        parts = line.split(b" ", 2)
+        parts = line.split(None, 2)
         if len(parts) == 2:  # handle missing message gracefully
             parts.append(b"")
 
diff --git a/netlib/http/http2/__init__.py b/netlib/http/http2/__init__.py
index 6a979a0d..60064190 100644
--- a/netlib/http/http2/__init__.py
+++ b/netlib/http/http2/__init__.py
@@ -1,6 +1,8 @@
 from __future__ import absolute_import, print_function, division
 from netlib.http.http2 import framereader
+from netlib.http.http2.utils import parse_headers
 
 __all__ = [
     "framereader",
+    "parse_headers",
 ]
diff --git a/netlib/http/http2/utils.py b/netlib/http/http2/utils.py
new file mode 100644
index 00000000..164bacc8
--- /dev/null
+++ b/netlib/http/http2/utils.py
@@ -0,0 +1,37 @@
+from netlib.http import url
+
+
+def parse_headers(headers):
+    authority = headers.get(':authority', '').encode()
+    method = headers.get(':method', 'GET').encode()
+    scheme = headers.get(':scheme', 'https').encode()
+    path = headers.get(':path', '/').encode()
+
+    headers.pop(":method", None)
+    headers.pop(":scheme", None)
+    headers.pop(":path", None)
+
+    host = None
+    port = None
+
+    if path == b'*' or path.startswith(b"/"):
+        first_line_format = "relative"
+    elif method == b'CONNECT':  # pragma: no cover
+        raise NotImplementedError("CONNECT over HTTP/2 is not implemented.")
+    else:  # pragma: no cover
+        first_line_format = "absolute"
+        # FIXME: verify if path or :authority contains what we need
+        scheme, host, port, _ = url.parse(path)
+
+    if authority:
+        host, _, port = authority.partition(b':')
+
+    if not host:
+        host = b'localhost'
+
+    if not port:
+        port = 443 if scheme == b'https' else 80
+
+    port = int(port)
+
+    return first_line_format, method, scheme, host, port, path
diff --git a/netlib/http/message.py b/netlib/http/message.py
index b633b671..34709f0a 100644
--- a/netlib/http/message.py
+++ b/netlib/http/message.py
@@ -1,5 +1,6 @@
 from __future__ import absolute_import, print_function, division
 
+import re
 import warnings
 
 import six
@@ -51,7 +52,23 @@ class MessageData(basetypes.Serializable):
         return cls(**state)
 
 
+class CachedDecode(object):
+    __slots__ = ["encoded", "encoding", "strict", "decoded"]
+
+    def __init__(self, object, encoding, strict, decoded):
+        self.encoded = object
+        self.encoding = encoding
+        self.strict = strict
+        self.decoded = decoded
+
+no_cached_decode = CachedDecode(None, None, None, None)
+
+
 class Message(basetypes.Serializable):
+    def __init__(self):
+        self._content_cache = no_cached_decode  # type: CachedDecode
+        self._text_cache = no_cached_decode  # type: CachedDecode
+
     def __eq__(self, other):
         if isinstance(other, Message):
             return self.data == other.data
@@ -89,19 +106,82 @@ class Message(basetypes.Serializable):
         self.data.headers = h
 
     @property
-    def content(self):
+    def raw_content(self):
+        # type: () -> bytes
         """
         The raw (encoded) HTTP message body
 
-        See also: :py:attr:`text`
+        See also: :py:attr:`content`, :py:attr:`text`
         """
         return self.data.content
 
-    @content.setter
-    def content(self, content):
+    @raw_content.setter
+    def raw_content(self, content):
         self.data.content = content
-        if isinstance(content, bytes):
-            self.headers["content-length"] = str(len(content))
+
+    def get_content(self, strict=True):
+        # type: (bool) -> bytes
+        """
+        The HTTP message body decoded with the content-encoding header (e.g. gzip)
+
+        Raises:
+            ValueError, when the content-encoding is invalid and strict is True.
+
+        See also: :py:attr:`raw_content`, :py:attr:`text`
+        """
+        if self.raw_content is None:
+            return None
+        ce = self.headers.get("content-encoding")
+        cached = (
+            self._content_cache.encoded == self.raw_content and
+            (self._content_cache.strict or not strict) and
+            self._content_cache.encoding == ce
+        )
+        if not cached:
+            is_strict = True
+            if ce:
+                try:
+                    decoded = encoding.decode(self.raw_content, ce)
+                except ValueError:
+                    if strict:
+                        raise
+                    is_strict = False
+                    decoded = self.raw_content
+            else:
+                decoded = self.raw_content
+            self._content_cache = CachedDecode(self.raw_content, ce, is_strict, decoded)
+        return self._content_cache.decoded
+
+    def set_content(self, value):
+        if value is None:
+            self.raw_content = None
+            return
+        if not isinstance(value, bytes):
+            raise TypeError(
+                "Message content must be bytes, not {}. "
+                "Please use .text if you want to assign a str."
+                .format(type(value).__name__)
+            )
+        ce = self.headers.get("content-encoding")
+        cached = (
+            self._content_cache.decoded == value and
+            self._content_cache.encoding == ce and
+            self._content_cache.strict
+        )
+        if not cached:
+            try:
+                encoded = encoding.encode(value, ce or "identity")
+            except ValueError:
+                # So we have an invalid content-encoding?
+                # Let's remove it!
+                del self.headers["content-encoding"]
+                ce = None
+                encoded = value
+            self._content_cache = CachedDecode(encoded, ce, True, value)
+        self.raw_content = self._content_cache.encoded
+        self.headers["content-length"] = str(len(self.raw_content))
+
+    content = property(get_content, set_content)
 
     @property
     def http_version(self):
@@ -136,56 +216,108 @@ class Message(basetypes.Serializable):
     def timestamp_end(self, timestamp_end):
         self.data.timestamp_end = timestamp_end
 
-    @property
-    def text(self):
-        """
-        The decoded HTTP message body.
-        Decoded contents are not cached, so accessing this attribute repeatedly is relatively expensive.
+    def _get_content_type_charset(self):
+        # type: () -> Optional[str]
+        ct = headers.parse_content_type(self.headers.get("content-type", ""))
+        if ct:
+            return ct[2].get("charset")
 
-        .. note::
-            This is not implemented yet.
+    def _guess_encoding(self):
+        # type: () -> str
+        enc = self._get_content_type_charset()
+        if enc:
+            return enc
 
-        See also: :py:attr:`content`, :py:class:`decoded`
+        if "json" in self.headers.get("content-type", ""):
+            return "utf8"
+        else:
+            # We may also want to check for HTML meta tags here at some point.
+            return "latin-1"
+
+    def get_text(self, strict=True):
+        # type: (bool) -> six.text_type
         """
-        # This attribute should be called text, because that's what requests does.
-        raise NotImplementedError()
+        The HTTP message body decoded with both content-encoding header (e.g. gzip)
+        and content-type header charset.
 
-    @text.setter
-    def text(self, text):
-        raise NotImplementedError()
+        Raises:
+            ValueError, when either content-encoding or charset is invalid and strict is True.
 
-    def decode(self):
+        See also: :py:attr:`content`, :py:attr:`raw_content`
+        """
+        if self.raw_content is None:
+            return None
+        enc = self._guess_encoding()
+
+        content = self.get_content(strict)
+        cached = (
+            self._text_cache.encoded == content and
+            (self._text_cache.strict or not strict) and
+            self._text_cache.encoding == enc
+        )
+        if not cached:
+            is_strict = self._content_cache.strict
+            try:
+                decoded = encoding.decode(content, enc)
+            except ValueError:
+                if strict:
+                    raise
+                is_strict = False
+                decoded = self.content.decode("utf8", "replace" if six.PY2 else "surrogateescape")
+            self._text_cache = CachedDecode(content, enc, is_strict, decoded)
+        return self._text_cache.decoded
+
+    def set_text(self, text):
+        if text is None:
+            self.content = None
+            return
+        enc = self._guess_encoding()
+
+        cached = (
+            self._text_cache.decoded == text and
+            self._text_cache.encoding == enc and
+            self._text_cache.strict
+        )
+        if not cached:
+            try:
+                encoded = encoding.encode(text, enc)
+            except ValueError:
+                # Fall back to UTF-8 and update the content-type header.
+                ct = headers.parse_content_type(self.headers.get("content-type", "")) or ("text", "plain", {})
+                ct[2]["charset"] = "utf-8"
+                self.headers["content-type"] = headers.assemble_content_type(*ct)
+                enc = "utf8"
+                encoded = text.encode(enc, "replace" if six.PY2 else "surrogateescape")
+            self._text_cache = CachedDecode(encoded, enc, True, text)
+        self.content = self._text_cache.encoded
+
+    text = property(get_text, set_text)
+
+    def decode(self, strict=True):
         """
-            Decodes body based on the current Content-Encoding header, then
-            removes the header. If there is no Content-Encoding header, no
-            action is taken.
+        Decodes body based on the current Content-Encoding header, then
+        removes the header. If there is no Content-Encoding header, no
+        action is taken.
 
-            Returns:
-                True, if decoding succeeded.
-                False, otherwise.
+        Raises:
+            ValueError, when the content-encoding is invalid and strict is True.
         """
-        ce = self.headers.get("content-encoding")
-        data = encoding.decode(ce, self.content)
-        if data is None:
-            return False
-        self.content = data
+        self.raw_content = self.get_content(strict)
         self.headers.pop("content-encoding", None)
-        return True
 
     def encode(self, e):
         """
-            Encodes body with the encoding e, where e is "gzip", "deflate" or "identity".
+        Encodes body with the encoding e, where e is "gzip", "deflate" or "identity".
+        Any existing content-encodings are overwritten,
+        the content is not decoded beforehand.
 
-            Returns:
-                True, if decoding succeeded.
-                False, otherwise.
+        Raises:
+            ValueError, when the specified content-encoding is invalid.
         """
-        data = encoding.encode(e, self.content)
-        if data is None:
-            return False
-        self.content = data
         self.headers["content-encoding"] = e
-        return True
+        self.content = self.raw_content
+        if "content-encoding" not in self.headers:
+            raise ValueError("Invalid content encoding {}".format(repr(e)))
 
     def replace(self, pattern, repl, flags=0):
         """
@@ -196,13 +328,15 @@ class Message(basetypes.Serializable):
         Returns:
             The number of replacements made.
         """
-        # TODO: Proper distinction between text and bytes.
+        if isinstance(pattern, six.text_type):
+            pattern = strutils.escaped_str_to_bytes(pattern)
+        if isinstance(repl, six.text_type):
+            repl = strutils.escaped_str_to_bytes(repl)
         replacements = 0
         if self.content:
-            with decoded(self):
-                self.content, replacements = strutils.safe_subn(
-                    pattern, repl, self.content, flags=flags
-                )
+            self.content, replacements = re.subn(
+                pattern, repl, self.content, flags=flags
+            )
         replacements += self.headers.replace(pattern, repl, flags)
         return replacements
 
@@ -221,29 +355,16 @@ class Message(basetypes.Serializable):
 
 class decoded(object):
     """
-    A context manager that decodes a request or response, and then
-    re-encodes it with the same encoding after execution of the block.
-
-    Example:
-
-    .. code-block:: python
-
-        with decoded(request):
-            request.content = request.content.replace("foo", "bar")
+    Deprecated: You can now directly use :py:attr:`content`.
+    :py:attr:`raw_content` has the encoded content.
     """
 
-    def __init__(self, message):
-        self.message = message
-        ce = message.headers.get("content-encoding")
-        if ce in encoding.ENCODINGS:
-            self.ce = ce
-        else:
-            self.ce = None
+    def __init__(self, message):  # pragma no cover
+        warnings.warn("decoded() is deprecated, you can now directly use .content instead. "
+                      ".raw_content has the encoded content.", DeprecationWarning)
 
-    def __enter__(self):
-        if self.ce:
-            self.message.decode()
+    def __enter__(self):  # pragma no cover
+        pass
 
-    def __exit__(self, type, value, tb):
-        if self.ce:
-            self.message.encode(self.ce)
+    def __exit__(self, type, value, tb):  # pragma no cover
+        pass
diff --git a/netlib/http/request.py b/netlib/http/request.py
index 01801d42..ecaa9b79 100644
--- a/netlib/http/request.py
+++ b/netlib/http/request.py
@@ -5,7 +5,6 @@ import re
 import six
 from six.moves import urllib
 
-from netlib import encoding
 from netlib import multidict
 from netlib import strutils
 from netlib.http import multipart
@@ -23,8 +22,20 @@ host_header_re = re.compile(r"^(?P<host>[^:]+|\[.+\])(?::(?P<port>\d+))?$")
 class RequestData(message.MessageData):
     def __init__(self, first_line_format, method, scheme, host, port, path, http_version, headers=(), content=None,
                  timestamp_start=None, timestamp_end=None):
+        if isinstance(method, six.text_type):
+            method = method.encode("ascii", "strict")
+        if isinstance(scheme, six.text_type):
+            scheme = scheme.encode("ascii", "strict")
+        if isinstance(host, six.text_type):
+            host = host.encode("idna", "strict")
+        if isinstance(path, six.text_type):
+            path = path.encode("ascii", "strict")
+        if isinstance(http_version, six.text_type):
+            http_version = http_version.encode("ascii", "strict")
         if not isinstance(headers, nheaders.Headers):
             headers = nheaders.Headers(headers)
+        if isinstance(content, six.text_type):
+            raise ValueError("Content must be bytes, not {}".format(type(content).__name__))
 
         self.first_line_format = first_line_format
         self.method = method
@@ -44,6 +55,7 @@ class Request(message.Message):
     An HTTP request.
     """
     def __init__(self, *args, **kwargs):
+        super(Request, self).__init__()
         self.data = RequestData(*args, **kwargs)
 
     def __repr__(self):
@@ -65,10 +77,14 @@ class Request(message.Message):
             Returns:
                 The number of replacements made.
         """
-        # TODO: Proper distinction between text and bytes.
+        if isinstance(pattern, six.text_type):
+            pattern = strutils.escaped_str_to_bytes(pattern)
+        if isinstance(repl, six.text_type):
+            repl = strutils.escaped_str_to_bytes(repl)
+
         c = super(Request, self).replace(pattern, repl, flags)
-        self.path, pc = strutils.safe_subn(
-            pattern, repl, self.path, flags=flags
+        self.path, pc = re.subn(
+            pattern, repl, self.data.path, flags=flags
         )
         c += pc
         return c
@@ -102,6 +118,8 @@ class Request(message.Message):
         """
         HTTP request scheme, which should be "http" or "https".
         """
+        if not self.data.scheme:
+            return self.data.scheme
         return message._native(self.data.scheme)
 
     @scheme.setter
@@ -321,7 +339,7 @@ class Request(message.Message):
             self.headers["accept-encoding"] = (
                 ', '.join(
                     e
-                    for e in encoding.ENCODINGS
+                    for e in {"gzip", "identity", "deflate"}
                     if e in accept_encoding
                 )
             )
@@ -341,7 +359,10 @@ class Request(message.Message):
     def _get_urlencoded_form(self):
         is_valid_content_type = "application/x-www-form-urlencoded" in self.headers.get("content-type", "").lower()
         if is_valid_content_type:
-            return tuple(netlib.http.url.decode(self.content))
+            try:
+                return tuple(netlib.http.url.decode(self.content))
+            except ValueError:
+                pass
         return ()
 
     def _set_urlencoded_form(self, value):
@@ -350,7 +371,7 @@ class Request(message.Message):
         This will overwrite the existing content if there is one.
         """
         self.headers["content-type"] = "application/x-www-form-urlencoded"
-        self.content = netlib.http.url.encode(value)
+        self.content = netlib.http.url.encode(value).encode()
 
     @urlencoded_form.setter
     def urlencoded_form(self, value):
@@ -370,7 +391,10 @@ class Request(message.Message):
     def _get_multipart_form(self):
         is_valid_content_type = "multipart/form-data" in self.headers.get("content-type", "").lower()
         if is_valid_content_type:
-            return multipart.decode(self.headers, self.content)
+            try:
+                return multipart.decode(self.headers, self.content)
+            except ValueError:
+                pass
         return ()
 
     def _set_multipart_form(self, value):
diff --git a/netlib/http/response.py b/netlib/http/response.py
index 17d69418..85f54940 100644
--- a/netlib/http/response.py
+++ b/netlib/http/response.py
@@ -2,6 +2,7 @@ from __future__ import absolute_import, print_function, division
 
 from email.utils import parsedate_tz, formatdate, mktime_tz
 import time
+import six
 
 from netlib.http import cookies
 from netlib.http import headers as nheaders
@@ -13,8 +14,14 @@ from netlib import human
 class ResponseData(message.MessageData):
     def __init__(self, http_version, status_code, reason=None, headers=(), content=None,
                  timestamp_start=None, timestamp_end=None):
+        if isinstance(http_version, six.text_type):
+            http_version = http_version.encode("ascii", "strict")
+        if isinstance(reason, six.text_type):
+            reason = reason.encode("ascii", "strict")
         if not isinstance(headers, nheaders.Headers):
             headers = nheaders.Headers(headers)
+        if isinstance(content, six.text_type):
+            raise ValueError("Content must be bytes, not {}".format(type(content).__name__))
 
         self.http_version = http_version
         self.status_code = status_code
@@ -30,13 +37,14 @@ class Response(message.Message):
     An HTTP response.
     """
     def __init__(self, *args, **kwargs):
+        super(Response, self).__init__()
         self.data = ResponseData(*args, **kwargs)
 
     def __repr__(self):
-        if self.content:
+        if self.raw_content:
             details = "{}, {}".format(
                 self.headers.get("content-type", "unknown content type"),
-                human.pretty_size(len(self.content))
+                human.pretty_size(len(self.raw_content))
             )
         else:
             details = "no content"