mirror of
https://github.com/serde-rs/json.git
synced 2025-10-02 15:26:00 +00:00
Merge pull request 1175 from iex-rs/faster-backslash-u
This commit is contained in:
commit
0f942e5b52
@ -1575,7 +1575,10 @@ impl<'de, 'a, R: Read<'de>> de::Deserializer<'de> for &'a mut Deserializer<R> {
|
||||
///
|
||||
/// The behavior of serde_json is specified to fail on non-UTF-8 strings
|
||||
/// when deserializing into Rust UTF-8 string types such as String, and
|
||||
/// succeed with non-UTF-8 bytes when deserializing using this method.
|
||||
/// succeed with the bytes representing the [WTF-8] encoding of code points
|
||||
/// when deserializing using this method.
|
||||
///
|
||||
/// [WTF-8]: https://simonsapin.github.io/wtf-8
|
||||
///
|
||||
/// Escape sequences are processed as usual, and for `\uXXXX` escapes it is
|
||||
/// still checked if the hex number represents a valid Unicode code point.
|
||||
|
114
src/read.rs
114
src/read.rs
@ -1,6 +1,5 @@
|
||||
use crate::error::{Error, ErrorCode, Result};
|
||||
use alloc::vec::Vec;
|
||||
use core::char;
|
||||
use core::cmp;
|
||||
use core::mem;
|
||||
use core::ops::Deref;
|
||||
@ -877,30 +876,45 @@ fn parse_escape<'de, R: Read<'de>>(
|
||||
b'n' => scratch.push(b'\n'),
|
||||
b'r' => scratch.push(b'\r'),
|
||||
b't' => scratch.push(b'\t'),
|
||||
b'u' => {
|
||||
fn encode_surrogate(scratch: &mut Vec<u8>, n: u16) {
|
||||
scratch.extend_from_slice(&[
|
||||
(n >> 12 & 0b0000_1111) as u8 | 0b1110_0000,
|
||||
(n >> 6 & 0b0011_1111) as u8 | 0b1000_0000,
|
||||
(n & 0b0011_1111) as u8 | 0b1000_0000,
|
||||
]);
|
||||
b'u' => return parse_unicode_escape(read, validate, scratch),
|
||||
_ => {
|
||||
return error(read, ErrorCode::InvalidEscape);
|
||||
}
|
||||
}
|
||||
|
||||
let c = match tri!(read.decode_hex_escape()) {
|
||||
n @ 0xDC00..=0xDFFF => {
|
||||
return if validate {
|
||||
error(read, ErrorCode::LoneLeadingSurrogateInHexEscape)
|
||||
} else {
|
||||
encode_surrogate(scratch, n);
|
||||
Ok(())
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
/// Parses a JSON \u escape and appends it into the scratch space. Assumes \u
|
||||
/// has just been read.
|
||||
#[cold]
|
||||
fn parse_unicode_escape<'de, R: Read<'de>>(
|
||||
read: &mut R,
|
||||
validate: bool,
|
||||
scratch: &mut Vec<u8>,
|
||||
) -> Result<()> {
|
||||
let mut n = tri!(read.decode_hex_escape());
|
||||
|
||||
// Non-BMP characters are encoded as a sequence of two hex
|
||||
// escapes, representing UTF-16 surrogates. If deserializing a
|
||||
// utf-8 string the surrogates are required to be paired,
|
||||
// whereas deserializing a byte string accepts lone surrogates.
|
||||
n1 @ 0xD800..=0xDBFF => {
|
||||
if validate && n >= 0xDC00 && n <= 0xDFFF {
|
||||
// XXX: This is actually a trailing surrogate.
|
||||
return error(read, ErrorCode::LoneLeadingSurrogateInHexEscape);
|
||||
}
|
||||
|
||||
loop {
|
||||
if n < 0xD800 || n > 0xDBFF {
|
||||
// Every u16 outside of the surrogate ranges is guaranteed to be a
|
||||
// legal char.
|
||||
push_wtf8_codepoint(n as u32, scratch);
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
// n is a leading surrogate, we now expect a trailing surrogate.
|
||||
let n1 = n;
|
||||
|
||||
if tri!(peek_or_eof(read)) == b'\\' {
|
||||
read.discard();
|
||||
} else {
|
||||
@ -908,7 +922,7 @@ fn parse_escape<'de, R: Read<'de>>(
|
||||
read.discard();
|
||||
error(read, ErrorCode::UnexpectedEndOfHexEscape)
|
||||
} else {
|
||||
encode_surrogate(scratch, n1);
|
||||
push_wtf8_codepoint(n1 as u32, scratch);
|
||||
Ok(())
|
||||
};
|
||||
}
|
||||
@ -920,7 +934,7 @@ fn parse_escape<'de, R: Read<'de>>(
|
||||
read.discard();
|
||||
error(read, ErrorCode::UnexpectedEndOfHexEscape)
|
||||
} else {
|
||||
encode_surrogate(scratch, n1);
|
||||
push_wtf8_codepoint(n1 as u32, scratch);
|
||||
// The \ prior to this byte started an escape sequence,
|
||||
// so we need to parse that now. This recursive call
|
||||
// does not blow the stack on malicious input because
|
||||
@ -933,32 +947,62 @@ fn parse_escape<'de, R: Read<'de>>(
|
||||
let n2 = tri!(read.decode_hex_escape());
|
||||
|
||||
if n2 < 0xDC00 || n2 > 0xDFFF {
|
||||
if validate {
|
||||
return error(read, ErrorCode::LoneLeadingSurrogateInHexEscape);
|
||||
}
|
||||
push_wtf8_codepoint(n1 as u32, scratch);
|
||||
// If n2 is a leading surrogate, we need to restart.
|
||||
n = n2;
|
||||
continue;
|
||||
}
|
||||
|
||||
// This value is in range U+10000..=U+10FFFF, which is always a
|
||||
// valid codepoint.
|
||||
let n = (((n1 - 0xD800) as u32) << 10 | (n2 - 0xDC00) as u32) + 0x1_0000;
|
||||
push_wtf8_codepoint(n, scratch);
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
|
||||
match char::from_u32(n) {
|
||||
Some(c) => c,
|
||||
None => {
|
||||
return error(read, ErrorCode::InvalidUnicodeCodePoint);
|
||||
}
|
||||
}
|
||||
/// Adds a WTF-8 codepoint to the end of the buffer. This is a more efficient
|
||||
/// implementation of String::push. The codepoint may be a surrogate.
|
||||
#[inline]
|
||||
fn push_wtf8_codepoint(n: u32, scratch: &mut Vec<u8>) {
|
||||
if n < 0x80 {
|
||||
scratch.push(n as u8);
|
||||
return;
|
||||
}
|
||||
|
||||
// Every u16 outside of the surrogate ranges above is guaranteed
|
||||
// to be a legal char.
|
||||
n => char::from_u32(n as u32).unwrap(),
|
||||
scratch.reserve(4);
|
||||
|
||||
unsafe {
|
||||
let ptr = scratch.as_mut_ptr().add(scratch.len());
|
||||
|
||||
let encoded_len = match n {
|
||||
0..=0x7F => unreachable!(),
|
||||
0x80..=0x7FF => {
|
||||
ptr.write((n >> 6 & 0b0001_1111) as u8 | 0b1100_0000);
|
||||
2
|
||||
}
|
||||
0x800..=0xFFFF => {
|
||||
ptr.write((n >> 12 & 0b0000_1111) as u8 | 0b1110_0000);
|
||||
ptr.add(1).write((n >> 6 & 0b0011_1111) as u8 | 0b1000_0000);
|
||||
3
|
||||
}
|
||||
0x1_0000..=0x10_FFFF => {
|
||||
ptr.write((n >> 18 & 0b0000_0111) as u8 | 0b1111_0000);
|
||||
ptr.add(1)
|
||||
.write((n >> 12 & 0b0011_1111) as u8 | 0b1000_0000);
|
||||
ptr.add(2).write((n >> 6 & 0b0011_1111) as u8 | 0b1000_0000);
|
||||
4
|
||||
}
|
||||
0x11_0000.. => unreachable!(),
|
||||
};
|
||||
ptr.add(encoded_len - 1)
|
||||
.write((n & 0b0011_1111) as u8 | 0b1000_0000);
|
||||
|
||||
scratch.extend_from_slice(c.encode_utf8(&mut [0_u8; 4]).as_bytes());
|
||||
scratch.set_len(scratch.len() + encoded_len);
|
||||
}
|
||||
_ => {
|
||||
return error(read, ErrorCode::InvalidEscape);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Parses a JSON escape sequence and discards the value. Assumes the previous
|
||||
|
@ -1707,7 +1707,7 @@ fn test_byte_buf_de() {
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_byte_buf_de_lone_surrogate() {
|
||||
fn test_byte_buf_de_invalid_surrogates() {
|
||||
let bytes = ByteBuf::from(vec![237, 160, 188]);
|
||||
let v: ByteBuf = from_str(r#""\ud83c""#).unwrap();
|
||||
assert_eq!(v, bytes);
|
||||
@ -1720,23 +1720,54 @@ fn test_byte_buf_de_lone_surrogate() {
|
||||
let v: ByteBuf = from_str(r#""\ud83c ""#).unwrap();
|
||||
assert_eq!(v, bytes);
|
||||
|
||||
let bytes = ByteBuf::from(vec![237, 176, 129]);
|
||||
let v: ByteBuf = from_str(r#""\udc01""#).unwrap();
|
||||
assert_eq!(v, bytes);
|
||||
|
||||
let res = from_str::<ByteBuf>(r#""\ud83c\!""#);
|
||||
assert!(res.is_err());
|
||||
|
||||
let res = from_str::<ByteBuf>(r#""\ud83c\u""#);
|
||||
assert!(res.is_err());
|
||||
|
||||
let res = from_str::<ByteBuf>(r#""\ud83c\ud83c""#);
|
||||
assert!(res.is_err());
|
||||
// lone trailing surrogate
|
||||
let bytes = ByteBuf::from(vec![237, 176, 129]);
|
||||
let v: ByteBuf = from_str(r#""\udc01""#).unwrap();
|
||||
assert_eq!(v, bytes);
|
||||
|
||||
// leading surrogate followed by other leading surrogate
|
||||
let bytes = ByteBuf::from(vec![237, 160, 188, 237, 160, 188]);
|
||||
let v: ByteBuf = from_str(r#""\ud83c\ud83c""#).unwrap();
|
||||
assert_eq!(v, bytes);
|
||||
|
||||
// leading surrogate followed by "a" (U+0061) in \u encoding
|
||||
let bytes = ByteBuf::from(vec![237, 160, 188, 97]);
|
||||
let v: ByteBuf = from_str(r#""\ud83c\u0061""#).unwrap();
|
||||
assert_eq!(v, bytes);
|
||||
|
||||
// leading surrogate followed by U+0080
|
||||
let bytes = ByteBuf::from(vec![237, 160, 188, 194, 128]);
|
||||
let v: ByteBuf = from_str(r#""\ud83c\u0080""#).unwrap();
|
||||
assert_eq!(v, bytes);
|
||||
|
||||
// leading surrogate followed by U+FFFF
|
||||
let bytes = ByteBuf::from(vec![237, 160, 188, 239, 191, 191]);
|
||||
let v: ByteBuf = from_str(r#""\ud83c\uffff""#).unwrap();
|
||||
assert_eq!(v, bytes);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_byte_buf_de_surrogate_pair() {
|
||||
// leading surrogate followed by trailing surrogate
|
||||
let bytes = ByteBuf::from(vec![240, 159, 128, 128]);
|
||||
let v: ByteBuf = from_str(r#""\ud83c\udc00""#).unwrap();
|
||||
assert_eq!(v, bytes);
|
||||
|
||||
// leading surrogate followed by a surrogate pair
|
||||
let bytes = ByteBuf::from(vec![237, 160, 188, 240, 159, 128, 128]);
|
||||
let v: ByteBuf = from_str(r#""\ud83c\ud83c\udc00""#).unwrap();
|
||||
assert_eq!(v, bytes);
|
||||
}
|
||||
|
||||
#[cfg(feature = "raw_value")]
|
||||
#[test]
|
||||
fn test_raw_de_lone_surrogate() {
|
||||
fn test_raw_de_invalid_surrogates() {
|
||||
use serde_json::value::RawValue;
|
||||
|
||||
assert!(from_str::<Box<RawValue>>(r#""\ud83c""#).is_ok());
|
||||
@ -1746,6 +1777,17 @@ fn test_raw_de_lone_surrogate() {
|
||||
assert!(from_str::<Box<RawValue>>(r#""\udc01\!""#).is_err());
|
||||
assert!(from_str::<Box<RawValue>>(r#""\udc01\u""#).is_err());
|
||||
assert!(from_str::<Box<RawValue>>(r#""\ud83c\ud83c""#).is_ok());
|
||||
assert!(from_str::<Box<RawValue>>(r#""\ud83c\u0061""#).is_ok());
|
||||
assert!(from_str::<Box<RawValue>>(r#""\ud83c\u0080""#).is_ok());
|
||||
assert!(from_str::<Box<RawValue>>(r#""\ud83c\uffff""#).is_ok());
|
||||
}
|
||||
|
||||
#[cfg(feature = "raw_value")]
|
||||
#[test]
|
||||
fn test_raw_de_surrogate_pair() {
|
||||
use serde_json::value::RawValue;
|
||||
|
||||
assert!(from_str::<Box<RawValue>>(r#""\ud83c\udc00""#).is_ok());
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
Loading…
x
Reference in New Issue
Block a user