diff options
Diffstat (limited to 'netlib')
| -rw-r--r-- | netlib/encoding.py | 97 | ||||
| -rw-r--r-- | netlib/http/http1/assemble.py | 4 | ||||
| -rw-r--r-- | netlib/http/message.py | 192 | ||||
| -rw-r--r-- | netlib/http/request.py | 4 | ||||
| -rw-r--r-- | netlib/http/response.py | 5 | 
5 files changed, 202 insertions, 100 deletions
diff --git a/netlib/encoding.py b/netlib/encoding.py index 98502451..8b67b543 100644 --- a/netlib/encoding.py +++ b/netlib/encoding.py @@ -1,39 +1,62 @@  """ -    Utility functions for decoding response bodies. +Utility functions for decoding response bodies.  """  from __future__ import absolute_import + +import codecs  from io import BytesIO  import gzip  import zlib +from typing import Union  # noqa + -ENCODINGS = {"identity", "gzip", "deflate"} +def decode(obj, encoding, errors='strict'): +    # type: (Union[str, bytes], str) -> Union[str, bytes] +    """ +    Decode the given input object +    Returns: +        The decoded value -def decode(e, content): -    if not isinstance(content, bytes): -        return None -    encoding_map = { -        "identity": identity, -        "gzip": decode_gzip, -        "deflate": decode_deflate, -    } -    if e not in encoding_map: -        return None -    return encoding_map[e](content) +    Raises: +        ValueError, if decoding fails. +    """ +    try: +        try: +            return custom_decode[encoding](obj) +        except KeyError: +            return codecs.decode(obj, encoding, errors) +    except Exception as e: +        raise ValueError("{} when decoding {} with {}".format( +            type(e).__name__, +            repr(obj)[:10], +            repr(encoding), +        )) + + +def encode(obj, encoding, errors='strict'): +    # type: (Union[str, bytes], str) -> Union[str, bytes] +    """ +    Encode the given input object +    Returns: +        The encoded value -def encode(e, content): -    if not isinstance(content, bytes): -        return None -    encoding_map = { -        "identity": identity, -        "gzip": encode_gzip, -        "deflate": encode_deflate, -    } -    if e not in encoding_map: -        return None -    return encoding_map[e](content) +    Raises: +        ValueError, if encoding fails. +    """ +    try: +        try: +            return custom_encode[encoding](obj) +        except KeyError: +            return codecs.encode(obj, encoding, errors) +    except Exception as e: +        raise ValueError("{} when encoding {} with {}".format( +            type(e).__name__, +            repr(obj)[:10], +            repr(encoding), +        ))  def identity(content): @@ -46,10 +69,7 @@ def identity(content):  def decode_gzip(content):      gfile = gzip.GzipFile(fileobj=BytesIO(content)) -    try: -        return gfile.read() -    except (IOError, EOFError): -        return None +    return gfile.read()  def encode_gzip(content): @@ -70,12 +90,9 @@ def decode_deflate(content):          http://bugs.python.org/issue5784      """      try: -        try: -            return zlib.decompress(content) -        except zlib.error: -            return zlib.decompress(content, -15) +        return zlib.decompress(content)      except zlib.error: -        return None +        return zlib.decompress(content, -15)  def encode_deflate(content): @@ -84,4 +101,16 @@ def encode_deflate(content):      """      return zlib.compress(content) -__all__ = ["ENCODINGS", "encode", "decode"] + +custom_decode = { +    "identity": identity, +    "gzip": decode_gzip, +    "deflate": decode_deflate, +} +custom_encode = { +    "identity": identity, +    "gzip": encode_gzip, +    "deflate": encode_deflate, +} + +__all__ = ["encode", "decode"] diff --git a/netlib/http/http1/assemble.py b/netlib/http/http1/assemble.py index 511328f1..e74732d2 100644 --- a/netlib/http/http1/assemble.py +++ b/netlib/http/http1/assemble.py @@ -5,7 +5,7 @@ from netlib import exceptions  def assemble_request(request): -    if request.content is None: +    if request.data.content is None:          raise exceptions.HttpException("Cannot assemble flow with missing content")      head = assemble_request_head(request)      body = b"".join(assemble_body(request.data.headers, [request.data.content])) @@ -19,7 +19,7 @@ def assemble_request_head(request):  def assemble_response(response): -    if response.content is None: +    if response.data.content is None:          raise exceptions.HttpException("Cannot assemble flow with missing content")      head = assemble_response_head(response)      body = b"".join(assemble_body(response.data.headers, [response.data.content])) diff --git a/netlib/http/message.py b/netlib/http/message.py index 0583c246..668198f8 100644 --- a/netlib/http/message.py +++ b/netlib/http/message.py @@ -52,7 +52,22 @@ class MessageData(basetypes.Serializable):          return cls(**state) +class CachedDecode(object): +    __slots__ = ["encoded", "encoding", "decoded"] + +    def __init__(self, object, encoding, decoded): +        self.encoded = object +        self.encoding = encoding +        self.decoded = decoded + +no_cached_decode = CachedDecode(None, None, None) + +  class Message(basetypes.Serializable): +    def __init__(self): +        self._content_cache = no_cached_decode  # type: CachedDecode +        self._text_cache = no_cached_decode  # type: CachedDecode +      def __eq__(self, other):          if isinstance(other, Message):              return self.data == other.data @@ -90,19 +105,65 @@ class Message(basetypes.Serializable):          self.data.headers = h      @property -    def content(self): +    def raw_content(self): +        # type: () -> bytes          """          The raw (encoded) HTTP message body -        See also: :py:attr:`text` +        See also: :py:attr:`content`, :py:class:`text`          """          return self.data.content -    @content.setter -    def content(self, content): +    @raw_content.setter +    def raw_content(self, content):          self.data.content = content -        if isinstance(content, bytes): -            self.headers["content-length"] = str(len(content)) + +    @property +    def content(self): +        # type: () -> bytes +        """ +        The HTTP message body decoded with the content-encoding header (e.g. gzip) + +        See also: :py:class:`raw_content`, :py:attr:`text` +        """ +        ce = self.headers.get("content-encoding") +        cached = ( +            self._content_cache.encoded == self.raw_content and +            self._content_cache.encoding == ce +        ) +        if not cached: +            try: +                if not ce: +                    raise ValueError() +                decoded = encoding.decode(self.raw_content, ce) +            except ValueError: +                decoded = self.raw_content +            self._content_cache = CachedDecode(self.raw_content, ce, decoded) +        return self._content_cache.decoded + +    @content.setter +    def content(self, value): +        ce = self.headers.get("content-encoding") +        cached = ( +            self._content_cache.decoded == value and +            self._content_cache.encoding == ce +        ) +        if not cached: +            try: +                if not ce: +                    raise ValueError() +                encoded = encoding.encode(value, ce) +            except ValueError: +                # Do we have an unknown content-encoding? +                # If so, we want to remove it. +                if value and ce: +                    self.headers.pop("content-encoding", None) +                    ce = None +                encoded = value +            self._content_cache = CachedDecode(encoded, ce, value) +        self.raw_content = self._content_cache.encoded +        if isinstance(self.raw_content, bytes): +            self.headers["content-length"] = str(len(self.raw_content))      @property      def http_version(self): @@ -137,56 +198,81 @@ class Message(basetypes.Serializable):      def timestamp_end(self, timestamp_end):          self.data.timestamp_end = timestamp_end +    def _get_content_type_charset(self): +        # type: () -> Optional[str] +        ct = headers.parse_content_type(self.headers.get("content-type", "")) +        if ct: +            return ct[2].get("charset") +      @property      def text(self): +        # type: () -> six.text_type          """ -        The decoded HTTP message body. -        Decoded contents are not cached, so accessing this attribute repeatedly is relatively expensive. - -        .. note:: -            This is not implemented yet. +        The HTTP message body decoded with both content-encoding header (e.g. gzip) +        and content-type header charset. -        See also: :py:attr:`content`, :py:class:`decoded` +        See also: :py:attr:`content`, :py:class:`raw_content`          """          # This attribute should be called text, because that's what requests does. -        raise NotImplementedError() +        enc = self._get_content_type_charset() + +        # We may also want to check for HTML meta tags here at some point. + +        cached = ( +            self._text_cache.encoded == self.content and +            self._text_cache.encoding == enc +        ) +        if not cached: +            try: +                if not enc: +                    raise ValueError() +                decoded = encoding.decode(self.content, enc) +            except ValueError: +                decoded = self.content.decode("utf8", "replace" if six.PY2 else "surrogateescape") +            self._text_cache = CachedDecode(self.content, enc, decoded) +        return self._text_cache.decoded      @text.setter      def text(self, text): -        raise NotImplementedError() +        enc = self._get_content_type_charset() +        cached = ( +            self._text_cache.decoded == text and +            self._text_cache.encoding == enc +        ) +        if not cached: +            try: +                if not enc: +                    raise ValueError() +                encoded = encoding.encode(text, enc) +            except ValueError: +                # Do we have an unknown content-type charset? +                # If so, we want to replace it with utf8. +                if text and enc: +                    self.headers["content-type"] = re.sub( +                        "charset=[^;]+", +                        "charset=utf-8", +                        self.headers["content-type"] +                    ) +                encoded = text.encode("utf8", "replace" if six.PY2 else "surrogateescape") +            self._text_cache = CachedDecode(encoded, enc, text) +        self.content = self._text_cache.encoded      def decode(self):          """ -            Decodes body based on the current Content-Encoding header, then -            removes the header. If there is no Content-Encoding header, no -            action is taken. - -            Returns: -                True, if decoding succeeded. -                False, otherwise. +        Decodes body based on the current Content-Encoding header, then +        removes the header. If there is no Content-Encoding header, no +        action is taken.          """ -        ce = self.headers.get("content-encoding") -        data = encoding.decode(ce, self.content) -        if data is None: -            return False -        self.content = data +        self.raw_content = self.content          self.headers.pop("content-encoding", None) -        return True      def encode(self, e):          """ -            Encodes body with the encoding e, where e is "gzip", "deflate" or "identity". - -            Returns: -                True, if decoding succeeded. -                False, otherwise. +        Encodes body with the encoding e, where e is "gzip", "deflate" or "identity".          """ -        data = encoding.encode(e, self.content) -        if data is None: -            return False -        self.content = data +        self.decode()  # remove the current encoding          self.headers["content-encoding"] = e -        return True +        self.content = self.raw_content      def replace(self, pattern, repl, flags=0):          """ @@ -203,10 +289,9 @@ class Message(basetypes.Serializable):              repl = strutils.escaped_str_to_bytes(repl)          replacements = 0          if self.content: -            with decoded(self): -                self.content, replacements = re.subn( -                    pattern, repl, self.content, flags=flags -                ) +            self.content, replacements = re.subn( +                pattern, repl, self.content, flags=flags +            )          replacements += self.headers.replace(pattern, repl, flags)          return replacements @@ -225,29 +310,16 @@ class Message(basetypes.Serializable):  class decoded(object):      """ -    A context manager that decodes a request or response, and then -    re-encodes it with the same encoding after execution of the block. - -    Example: - -    .. code-block:: python - -        with decoded(request): -            request.content = request.content.replace("foo", "bar") +    Deprecated: You can now directly use :py:attr:`content`. +    :py:attr:`raw_content` has the encoded content.      """      def __init__(self, message): -        self.message = message -        ce = message.headers.get("content-encoding") -        if ce in encoding.ENCODINGS: -            self.ce = ce -        else: -            self.ce = None +        warnings.warn("decoded() is deprecated, you can now directly use .content instead. " +                      ".raw_content has the encoded content.", DeprecationWarning)      def __enter__(self): -        if self.ce: -            self.message.decode() +        pass      def __exit__(self, type, value, tb): -        if self.ce: -            self.message.encode(self.ce) +        pass
\ No newline at end of file diff --git a/netlib/http/request.py b/netlib/http/request.py index d9f4ed00..4ce94549 100644 --- a/netlib/http/request.py +++ b/netlib/http/request.py @@ -5,7 +5,6 @@ import re  import six  from six.moves import urllib -from netlib import encoding  from netlib import multidict  from netlib import strutils  from netlib.http import multipart @@ -44,6 +43,7 @@ class Request(message.Message):      An HTTP request.      """      def __init__(self, *args, **kwargs): +        super(Request, self).__init__()          self.data = RequestData(*args, **kwargs)      def __repr__(self): @@ -327,7 +327,7 @@ class Request(message.Message):              self.headers["accept-encoding"] = (                  ', '.join(                      e -                    for e in encoding.ENCODINGS +                    for e in {"gzip", "identity", "deflate"}                      if e in accept_encoding                  )              ) diff --git a/netlib/http/response.py b/netlib/http/response.py index 17d69418..d2273edd 100644 --- a/netlib/http/response.py +++ b/netlib/http/response.py @@ -30,13 +30,14 @@ class Response(message.Message):      An HTTP response.      """      def __init__(self, *args, **kwargs): +        super(Response, self).__init__()          self.data = ResponseData(*args, **kwargs)      def __repr__(self): -        if self.content: +        if self.raw_content:              details = "{}, {}".format(                  self.headers.get("content-type", "unknown content type"), -                human.pretty_size(len(self.content)) +                human.pretty_size(len(self.raw_content))              )          else:              details = "no content"  | 
