Merge pull request #3387 from bdarnell/chunked-parsing

bdarnell · web-flow · commit d65f6e71a77f · 2024-06-06T13:40:06.000-04:00
http1connection: Stricter handling of transfer-encoding and whitespace
diff --git a/tornado/http1connection.py b/tornado/http1connection.py
@@ -391,14 +391,11 @@ def write_headers(
             self._request_start_line = start_line
             lines.append(utf8("%s %s HTTP/1.1" % (start_line[0], start_line[1])))
             # Client requests with a non-empty body must have either a
-            # Content-Length or a Transfer-Encoding.
+            # Content-Length or a Transfer-Encoding. If Content-Length is not
+            # present we'll add our Transfer-Encoding below.
             self._chunking_output = (
                 start_line.method in ("POST", "PUT", "PATCH")
                 and "Content-Length" not in headers
-                and (
-                    "Transfer-Encoding" not in headers
-                    or headers["Transfer-Encoding"] == "chunked"
-                )
             )
         else:
             assert isinstance(start_line, httputil.ResponseStartLine)
@@ -420,9 +417,6 @@ def write_headers(
                 and (start_line.code < 100 or start_line.code >= 200)
                 # No need to chunk the output if a Content-Length is specified.
                 and "Content-Length" not in headers
-                # Applications are discouraged from touching Transfer-Encoding,
-                # but if they do, leave it alone.
-                and "Transfer-Encoding" not in headers
             )
             # If connection to a 1.1 client will be closed, inform client
             if (
@@ -562,7 +556,7 @@ def _can_keep_alive(
             return connection_header != "close"
         elif (
             "Content-Length" in headers
-            or headers.get("Transfer-Encoding", "").lower() == "chunked"
+            or is_transfer_encoding_chunked(headers)
             or getattr(start_line, "method", None) in ("HEAD", "GET")
         ):
             # start_line may be a request or response start line; only
@@ -600,13 +594,6 @@ def _read_body(
         delegate: httputil.HTTPMessageDelegate,
     ) -> Optional[Awaitable[None]]:
         if "Content-Length" in headers:
-            if "Transfer-Encoding" in headers:
-                # Response cannot contain both Content-Length and
-                # Transfer-Encoding headers.
-                # http://tools.ietf.org/html/rfc7230#section-3.3.3
-                raise httputil.HTTPInputError(
-                    "Response with both Transfer-Encoding and Content-Length"
-                )
             if "," in headers["Content-Length"]:
                 # Proxies sometimes cause Content-Length headers to get
                 # duplicated.  If all the values are identical then we can
@@ -633,20 +620,22 @@ def _read_body(
         else:
             content_length = None
 
+        is_chunked = is_transfer_encoding_chunked(headers)
+
         if code == 204:
             # This response code is not allowed to have a non-empty body,
             # and has an implicit length of zero instead of read-until-close.
             # http://www.w3.org/Protocols/rfc2616/rfc2616-sec4.html#sec4.3
-            if "Transfer-Encoding" in headers or content_length not in (None, 0):
+            if is_chunked or content_length not in (None, 0):
                 raise httputil.HTTPInputError(
                     "Response with code %d should not have body" % code
                 )
             content_length = 0
 
+        if is_chunked:
+            return self._read_chunked_body(delegate)
         if content_length is not None:
             return self._read_fixed_body(content_length, delegate)
-        if headers.get("Transfer-Encoding", "").lower() == "chunked":
-            return self._read_chunked_body(delegate)
         if self.is_client:
             return self._read_body_until_close(delegate)
         return None
@@ -865,3 +854,33 @@ def parse_hex_int(s: str) -> int:
     if HEXDIGITS.fullmatch(s) is None:
         raise ValueError("not a hexadecimal integer: %r" % s)
     return int(s, 16)
+
+
+def is_transfer_encoding_chunked(headers: httputil.HTTPHeaders) -> bool:
+    """Returns true if the headers specify Transfer-Encoding: chunked.
+
+    Raise httputil.HTTPInputError if any other transfer encoding is used.
+    """
+    # Note that transfer-encoding is an area in which postel's law can lead
+    # us astray. If a proxy and a backend server are liberal in what they accept,
+    # but accept slightly different things, this can lead to mismatched framing
+    # and request smuggling issues. Therefore we are as strict as possible here
+    # (even technically going beyond the requirements of the RFCs: a value of
+    # ",chunked" is legal but doesn't appear in practice for legitimate traffic)
+    if "Transfer-Encoding" not in headers:
+        return False
+    if "Content-Length" in headers:
+        # Message cannot contain both Content-Length and
+        # Transfer-Encoding headers.
+        # http://tools.ietf.org/html/rfc7230#section-3.3.3
+        raise httputil.HTTPInputError(
+            "Message with both Transfer-Encoding and Content-Length"
+        )
+    if headers["Transfer-Encoding"].lower() == "chunked":
+        return True
+    # We do not support any transfer-encodings other than chunked, and we do not
+    # expect to add any support because the concept of transfer-encoding has
+    # been removed in HTTP/2.
+    raise httputil.HTTPInputError(
+        "Unsupported Transfer-Encoding %s" % headers["Transfer-Encoding"]
+    )
diff --git a/tornado/httputil.py b/tornado/httputil.py
@@ -62,6 +62,9 @@
     from asyncio import Future  # noqa: F401
     import unittest  # noqa: F401
 
+# To be used with str.strip() and related methods.
+HTTP_WHITESPACE = " \t"
+
 
 @lru_cache(1000)
 def _normalize_header(name: str) -> str:
@@ -171,15 +174,15 @@ def parse_line(self, line: str) -> None:
             # continuation of a multi-line header
             if self._last_key is None:
                 raise HTTPInputError("first header line cannot start with whitespace")
-            new_part = " " + line.lstrip()
+            new_part = " " + line.lstrip(HTTP_WHITESPACE)
             self._as_list[self._last_key][-1] += new_part
             self._dict[self._last_key] += new_part
         else:
             try:
                 name, value = line.split(":", 1)
             except ValueError:
                 raise HTTPInputError("no colon in header line")
-            self.add(name, value.strip())
+            self.add(name, value.strip(HTTP_WHITESPACE))
 
     @classmethod
     def parse(cls, headers: str) -> "HTTPHeaders":
diff --git a/tornado/test/httpserver_test.py b/tornado/test/httpserver_test.py
@@ -581,6 +581,76 @@ def test_chunked_request_body_invalid_size(self):
             )
         self.assertEqual(400, start_line.code)
 
+    def test_chunked_request_body_duplicate_header(self):
+        # Repeated Transfer-Encoding headers should be an error (and not confuse
+        # the chunked-encoding detection to mess up framing).
+        self.stream.write(
+            b"""\
+POST /echo HTTP/1.1
+Transfer-Encoding: chunked
+Transfer-encoding: chunked
+
+2
+ok
+0
+
+"""
+        )
+        with ExpectLog(
+            gen_log,
+            ".*Unsupported Transfer-Encoding chunked,chunked",
+            level=logging.INFO,
+        ):
+            start_line, headers, response = self.io_loop.run_sync(
+                lambda: read_stream_body(self.stream)
+            )
+        self.assertEqual(400, start_line.code)
+
+    def test_chunked_request_body_unsupported_transfer_encoding(self):
+        # We don't support transfer-encodings other than chunked.
+        self.stream.write(
+            b"""\
+POST /echo HTTP/1.1
+Transfer-Encoding: gzip, chunked
+
+2
+ok
+0
+
+"""
+        )
+        with ExpectLog(
+            gen_log, ".*Unsupported Transfer-Encoding gzip, chunked", level=logging.INFO
+        ):
+            start_line, headers, response = self.io_loop.run_sync(
+                lambda: read_stream_body(self.stream)
+            )
+        self.assertEqual(400, start_line.code)
+
+    def test_chunked_request_body_transfer_encoding_and_content_length(self):
+        # Transfer-encoding and content-length are mutually exclusive
+        self.stream.write(
+            b"""\
+POST /echo HTTP/1.1
+Transfer-Encoding: chunked
+Content-Length: 2
+
+2
+ok
+0
+
+"""
+        )
+        with ExpectLog(
+            gen_log,
+            ".*Message with both Transfer-Encoding and Content-Length",
+            level=logging.INFO,
+        ):
+            start_line, headers, response = self.io_loop.run_sync(
+                lambda: read_stream_body(self.stream)
+            )
+        self.assertEqual(400, start_line.code)
+
     @gen_test
     def test_invalid_content_length(self):
         # HTTP only allows decimal digits in content-length. Make sure we don't
diff --git a/tornado/test/httputil_test.py b/tornado/test/httputil_test.py
@@ -334,6 +334,25 @@ def test_unicode_newlines(self):
                     gen_log.warning("failed while trying %r in %s", newline, encoding)
                     raise
 
+    def test_unicode_whitespace(self):
+        # Only tabs and spaces are to be stripped according to the HTTP standard.
+        # Other unicode whitespace is to be left as-is. In the context of headers,
+        # this specifically means the whitespace characters falling within the
+        # latin1 charset.
+        whitespace = [
+            (" ", True),  # SPACE
+            ("\t", True),  # TAB
+            ("\u00a0", False),  # NON-BREAKING SPACE
+            ("\u0085", False),  # NEXT LINE
+        ]
+        for c, stripped in whitespace:
+            headers = HTTPHeaders.parse("Transfer-Encoding: %schunked" % c)
+            if stripped:
+                expected = [("Transfer-Encoding", "chunked")]
+            else:
+                expected = [("Transfer-Encoding", "%schunked" % c)]
+            self.assertEqual(expected, list(headers.get_all()))
+
     def test_optional_cr(self):
         # Both CRLF and LF should be accepted as separators. CR should not be
         # part of the data when followed by LF, but it is a normal char
diff --git a/tornado/test/simple_httpclient_test.py b/tornado/test/simple_httpclient_test.py
@@ -828,7 +828,7 @@ def test_chunked_with_content_length(self):
         with ExpectLog(
             gen_log,
             (
-                "Malformed HTTP message from None: Response "
+                "Malformed HTTP message from None: Message "
                 "with both Transfer-Encoding and Content-Length"
             ),
             level=logging.INFO,

Original file line number	Diff line number	Diff line change
`@@ -828,7 +828,7 @@ def test_chunked_with_content_length(self):`
`828`	`828`	`with ExpectLog(`
`829`	`829`	`gen_log,`
`830`	`830`	`(`
`831`		`- "Malformed HTTP message from None: Response "`
	`831`	`+ "Malformed HTTP message from None: Message "`
`832`	`832`	`"with both Transfer-Encoding and Content-Length"`
`833`	`833`	`),`
`834`	`834`	`level=logging.INFO,`