Merge pull request 1175 from iex-rs/faster-backslash-u

2025-10-02 15:26:00 +00:00 · 2024-08-14 22:23:15 -07:00 · 2024-08-14 22:23:15 -07:00 · 0f942e5b52
commit 0f942e5b52
parent d8921cd29b 96ae60445d f50e29656a
3 changed files with 175 additions and 86 deletions
--- a/src/de.rs
+++ b/src/de.rs
@ -1575,7 +1575,10 @@ impl<'de, 'a, R: Read<'de>> de::Deserializer<'de> for &'a mut Deserializer<R> {
    ///
    /// The behavior of serde_json is specified to fail on non-UTF-8 strings
    /// when deserializing into Rust UTF-8 string types such as String, and
-    /// succeed with non-UTF-8 bytes when deserializing using this method.
+    /// succeed with the bytes representing the [WTF-8] encoding of code points
+    /// when deserializing using this method.
+    ///
+    /// [WTF-8]: https://simonsapin.github.io/wtf-8
    ///
    /// Escape sequences are processed as usual, and for `\uXXXX` escapes it is
    /// still checked if the hex number represents a valid Unicode code point.
--- a/src/read.rs
+++ b/src/read.rs
@ -1,6 +1,5 @@
 use crate::error::{Error, ErrorCode, Result};
 use alloc::vec::Vec;
-use core::char;
 use core::cmp;
 use core::mem;
 use core::ops::Deref;
@ -877,82 +876,7 @@ fn parse_escape<'de, R: Read<'de>>(
        b'n' => scratch.push(b'\n'),
        b'r' => scratch.push(b'\r'),
        b't' => scratch.push(b'\t'),
-        b'u' => {
-            fn encode_surrogate(scratch: &mut Vec<u8>, n: u16) {
-                scratch.extend_from_slice(&[
-                    (n >> 12 & 0b0000_1111) as u8 | 0b1110_0000,
-                    (n >> 6 & 0b0011_1111) as u8 | 0b1000_0000,
-                    (n & 0b0011_1111) as u8 | 0b1000_0000,
-                ]);
-            }
-
-            let c = match tri!(read.decode_hex_escape()) {
-                n @ 0xDC00..=0xDFFF => {
-                    return if validate {
-                        error(read, ErrorCode::LoneLeadingSurrogateInHexEscape)
-                    } else {
-                        encode_surrogate(scratch, n);
-                        Ok(())
-                    };
-                }
-
-                // Non-BMP characters are encoded as a sequence of two hex
-                // escapes, representing UTF-16 surrogates. If deserializing a
-                // utf-8 string the surrogates are required to be paired,
-                // whereas deserializing a byte string accepts lone surrogates.
-                n1 @ 0xD800..=0xDBFF => {
-                    if tri!(peek_or_eof(read)) == b'\\' {
-                        read.discard();
-                    } else {
-                        return if validate {
-                            read.discard();
-                            error(read, ErrorCode::UnexpectedEndOfHexEscape)
-                        } else {
-                            encode_surrogate(scratch, n1);
-                            Ok(())
-                        };
-                    }
-
-                    if tri!(peek_or_eof(read)) == b'u' {
-                        read.discard();
-                    } else {
-                        return if validate {
-                            read.discard();
-                            error(read, ErrorCode::UnexpectedEndOfHexEscape)
-                        } else {
-                            encode_surrogate(scratch, n1);
-                            // The \ prior to this byte started an escape sequence,
-                            // so we need to parse that now. This recursive call
-                            // does not blow the stack on malicious input because
-                            // the escape is not \u, so it will be handled by one
-                            // of the easy nonrecursive cases.
-                            parse_escape(read, validate, scratch)
-                        };
-                    }
-
-                    let n2 = tri!(read.decode_hex_escape());
-
-                    if n2 < 0xDC00 || n2 > 0xDFFF {
-                        return error(read, ErrorCode::LoneLeadingSurrogateInHexEscape);
-                    }
-
-                    let n = (((n1 - 0xD800) as u32) << 10 | (n2 - 0xDC00) as u32) + 0x1_0000;
-
-                    match char::from_u32(n) {
-                        Some(c) => c,
-                        None => {
-                            return error(read, ErrorCode::InvalidUnicodeCodePoint);
-                        }
-                    }
-                }
-
-                // Every u16 outside of the surrogate ranges above is guaranteed
-                // to be a legal char.
-                n => char::from_u32(n as u32).unwrap(),
-            };
-
-            scratch.extend_from_slice(c.encode_utf8(&mut [0_u8; 4]).as_bytes());
-        }
+        b'u' => return parse_unicode_escape(read, validate, scratch),
        _ => {
            return error(read, ErrorCode::InvalidEscape);
        }
@ -961,6 +885,126 @@ fn parse_escape<'de, R: Read<'de>>(
    Ok(())
 }

+/// Parses a JSON \u escape and appends it into the scratch space. Assumes \u
+/// has just been read.
+#[cold]
+fn parse_unicode_escape<'de, R: Read<'de>>(
+    read: &mut R,
+    validate: bool,
+    scratch: &mut Vec<u8>,
+) -> Result<()> {
+    let mut n = tri!(read.decode_hex_escape());
+
+    // Non-BMP characters are encoded as a sequence of two hex
+    // escapes, representing UTF-16 surrogates. If deserializing a
+    // utf-8 string the surrogates are required to be paired,
+    // whereas deserializing a byte string accepts lone surrogates.
+    if validate && n >= 0xDC00 && n <= 0xDFFF {
+        // XXX: n is a lone *trailing* surrogate here, despite the error code's name.
+        return error(read, ErrorCode::LoneLeadingSurrogateInHexEscape);
+    }
+
+    loop {
+        if n < 0xD800 || n > 0xDBFF {
+            // n is not a leading surrogate: it is either a legal char or, when
+            // not validating, a lone trailing surrogate pushed as WTF-8.
+            push_wtf8_codepoint(n as u32, scratch);
+            return Ok(());
+        }
+
+        // n is a leading surrogate, we now expect a trailing surrogate.
+        let n1 = n;
+
+        if tri!(peek_or_eof(read)) == b'\\' {
+            read.discard();
+        } else {
+            return if validate {
+                read.discard();
+                error(read, ErrorCode::UnexpectedEndOfHexEscape)
+            } else {
+                push_wtf8_codepoint(n1 as u32, scratch);
+                Ok(())
+            };
+        }
+
+        if tri!(peek_or_eof(read)) == b'u' {
+            read.discard();
+        } else {
+            return if validate {
+                read.discard();
+                error(read, ErrorCode::UnexpectedEndOfHexEscape)
+            } else {
+                push_wtf8_codepoint(n1 as u32, scratch);
+                // The \ prior to this byte started an escape sequence,
+                // so we need to parse that now. This recursive call
+                // does not blow the stack on malicious input because
+                // the escape is not \u, so it will be handled by one
+                // of the easy nonrecursive cases.
+                parse_escape(read, validate, scratch)
+            };
+        }
+
+        let n2 = tri!(read.decode_hex_escape());
+
+        if n2 < 0xDC00 || n2 > 0xDFFF {
+            if validate {
+                return error(read, ErrorCode::LoneLeadingSurrogateInHexEscape);
+            }
+            push_wtf8_codepoint(n1 as u32, scratch);
+            // If n2 is a leading surrogate, we need to restart.
+            n = n2;
+            continue;
+        }
+
+        // This value is in range U+10000..=U+10FFFF, which is always a
+        // valid codepoint.
+        let n = (((n1 - 0xD800) as u32) << 10 | (n2 - 0xDC00) as u32) + 0x1_0000;
+        push_wtf8_codepoint(n, scratch);
+        return Ok(());
+    }
+}
+
+/// Appends the WTF-8 encoding of code point `n` to the buffer; like String::push
+/// but more efficient. `n` may be a surrogate and must not exceed 0x10FFFF.
+#[inline]
+fn push_wtf8_codepoint(n: u32, scratch: &mut Vec<u8>) {
+    if n < 0x80 {
+        scratch.push(n as u8);
+        return;
+    }
+
+    scratch.reserve(4);
+
+    unsafe {
+        let ptr = scratch.as_mut_ptr().add(scratch.len());
+
+        let encoded_len = match n {
+            0..=0x7F => unreachable!(),
+            0x80..=0x7FF => {
+                ptr.write((n >> 6 & 0b0001_1111) as u8 | 0b1100_0000);
+                2
+            }
+            0x800..=0xFFFF => {
+                ptr.write((n >> 12 & 0b0000_1111) as u8 | 0b1110_0000);
+                ptr.add(1).write((n >> 6 & 0b0011_1111) as u8 | 0b1000_0000);
+                3
+            }
+            0x1_0000..=0x10_FFFF => {
+                ptr.write((n >> 18 & 0b0000_0111) as u8 | 0b1111_0000);
+                ptr.add(1)
+                    .write((n >> 12 & 0b0011_1111) as u8 | 0b1000_0000);
+                ptr.add(2).write((n >> 6 & 0b0011_1111) as u8 | 0b1000_0000);
+                4
+            }
+            0x11_0000.. => unreachable!(),
+        };
+        ptr.add(encoded_len - 1)
+            .write((n & 0b0011_1111) as u8 | 0b1000_0000);
+
+        scratch.set_len(scratch.len() + encoded_len);
+    }
+}
+
 /// Parses a JSON escape sequence and discards the value. Assumes the previous
 /// byte read was a backslash.
 fn ignore_escape<'de, R>(read: &mut R) -> Result<()>
--- a/tests/test.rs
+++ b/tests/test.rs
@ -1707,7 +1707,7 @@ fn test_byte_buf_de() {
 }

 #[test]
-fn test_byte_buf_de_lone_surrogate() {
+fn test_byte_buf_de_invalid_surrogates() {
    let bytes = ByteBuf::from(vec![237, 160, 188]);
    let v: ByteBuf = from_str(r#""\ud83c""#).unwrap();
    assert_eq!(v, bytes);
@ -1720,23 +1720,54 @@ fn test_byte_buf_de_lone_surrogate() {
    let v: ByteBuf = from_str(r#""\ud83c ""#).unwrap();
    assert_eq!(v, bytes);

-    let bytes = ByteBuf::from(vec![237, 176, 129]);
-    let v: ByteBuf = from_str(r#""\udc01""#).unwrap();
-    assert_eq!(v, bytes);
-
    let res = from_str::<ByteBuf>(r#""\ud83c\!""#);
    assert!(res.is_err());

    let res = from_str::<ByteBuf>(r#""\ud83c\u""#);
    assert!(res.is_err());

-    let res = from_str::<ByteBuf>(r#""\ud83c\ud83c""#);
-    assert!(res.is_err());
+    // lone trailing surrogate
+    let bytes = ByteBuf::from(vec![237, 176, 129]);
+    let v: ByteBuf = from_str(r#""\udc01""#).unwrap();
+    assert_eq!(v, bytes);
+
+    // leading surrogate followed by other leading surrogate
+    let bytes = ByteBuf::from(vec![237, 160, 188, 237, 160, 188]);
+    let v: ByteBuf = from_str(r#""\ud83c\ud83c""#).unwrap();
+    assert_eq!(v, bytes);
+
+    // leading surrogate followed by "a" (U+0061) in \u encoding
+    let bytes = ByteBuf::from(vec![237, 160, 188, 97]);
+    let v: ByteBuf = from_str(r#""\ud83c\u0061""#).unwrap();
+    assert_eq!(v, bytes);
+
+    // leading surrogate followed by U+0080
+    let bytes = ByteBuf::from(vec![237, 160, 188, 194, 128]);
+    let v: ByteBuf = from_str(r#""\ud83c\u0080""#).unwrap();
+    assert_eq!(v, bytes);
+
+    // leading surrogate followed by U+FFFF
+    let bytes = ByteBuf::from(vec![237, 160, 188, 239, 191, 191]);
+    let v: ByteBuf = from_str(r#""\ud83c\uffff""#).unwrap();
+    assert_eq!(v, bytes);
+}
+
+#[test]
+fn test_byte_buf_de_surrogate_pair() {
+    // leading surrogate followed by trailing surrogate
+    let bytes = ByteBuf::from(vec![240, 159, 128, 128]);
+    let v: ByteBuf = from_str(r#""\ud83c\udc00""#).unwrap();
+    assert_eq!(v, bytes);
+
+    // leading surrogate followed by a surrogate pair
+    let bytes = ByteBuf::from(vec![237, 160, 188, 240, 159, 128, 128]);
+    let v: ByteBuf = from_str(r#""\ud83c\ud83c\udc00""#).unwrap();
+    assert_eq!(v, bytes);
 }

 #[cfg(feature = "raw_value")]
 #[test]
-fn test_raw_de_lone_surrogate() {
+fn test_raw_de_invalid_surrogates() {
    use serde_json::value::RawValue;

    assert!(from_str::<Box<RawValue>>(r#""\ud83c""#).is_ok());
@ -1746,6 +1777,17 @@ fn test_raw_de_lone_surrogate() {
    assert!(from_str::<Box<RawValue>>(r#""\udc01\!""#).is_err());
    assert!(from_str::<Box<RawValue>>(r#""\udc01\u""#).is_err());
    assert!(from_str::<Box<RawValue>>(r#""\ud83c\ud83c""#).is_ok());
+    assert!(from_str::<Box<RawValue>>(r#""\ud83c\u0061""#).is_ok());
+    assert!(from_str::<Box<RawValue>>(r#""\ud83c\u0080""#).is_ok());
+    assert!(from_str::<Box<RawValue>>(r#""\ud83c\uffff""#).is_ok());
+}
+
+#[cfg(feature = "raw_value")]
+#[test]
+fn test_raw_de_surrogate_pair() {
+    use serde_json::value::RawValue;
+
+    assert!(from_str::<Box<RawValue>>(r#""\ud83c\udc00""#).is_ok());
 }

 #[test]