mirror of
https://github.com/serde-rs/json.git
synced 2025-10-02 07:21:29 +00:00
Correct WTF-8 parsing
Closes #877. This is a good time to make ByteBuf parsing more consistent as I'm rewriting it anyway. This commit integrates the changes from #877 and also handles a leading surrogate followed by a surrogate pair correctly.

This does not affect performance significantly.

Co-authored-by: Luca Casonato <hello@lcas.dev>
This commit is contained in:
parent
236cc8247d
commit
96ae60445d
@ -1575,7 +1575,10 @@ impl<'de, 'a, R: Read<'de>> de::Deserializer<'de> for &'a mut Deserializer<R> {
|
|||||||
///
|
///
|
||||||
/// The behavior of serde_json is specified to fail on non-UTF-8 strings
|
/// The behavior of serde_json is specified to fail on non-UTF-8 strings
|
||||||
/// when deserializing into Rust UTF-8 string types such as String, and
|
/// when deserializing into Rust UTF-8 string types such as String, and
|
||||||
/// succeed with non-UTF-8 bytes when deserializing using this method.
|
/// succeed with the bytes representing the [WTF-8] encoding of code points
|
||||||
|
/// when deserializing using this method.
|
||||||
|
///
|
||||||
|
/// [WTF-8]: https://simonsapin.github.io/wtf-8
|
||||||
///
|
///
|
||||||
/// Escape sequences are processed as usual, and for `\uXXXX` escapes it is
|
/// Escape sequences are processed as usual, and for `\uXXXX` escapes it is
|
||||||
/// still checked if the hex number represents a valid Unicode code point.
|
/// still checked if the hex number represents a valid Unicode code point.
|
||||||
|
106
src/read.rs
106
src/read.rs
@ -898,7 +898,7 @@ fn parse_unicode_escape<'de, R: Read<'de>>(
|
|||||||
validate: bool,
|
validate: bool,
|
||||||
scratch: &mut Vec<u8>,
|
scratch: &mut Vec<u8>,
|
||||||
) -> Result<()> {
|
) -> Result<()> {
|
||||||
let n = tri!(read.decode_hex_escape());
|
let mut n = tri!(read.decode_hex_escape());
|
||||||
|
|
||||||
// Non-BMP characters are encoded as a sequence of two hex
|
// Non-BMP characters are encoded as a sequence of two hex
|
||||||
// escapes, representing UTF-16 surrogates. If deserializing a
|
// escapes, representing UTF-16 surrogates. If deserializing a
|
||||||
@ -909,56 +909,64 @@ fn parse_unicode_escape<'de, R: Read<'de>>(
|
|||||||
return error(read, ErrorCode::LoneLeadingSurrogateInHexEscape);
|
return error(read, ErrorCode::LoneLeadingSurrogateInHexEscape);
|
||||||
}
|
}
|
||||||
|
|
||||||
if n < 0xD800 || n > 0xDBFF {
|
loop {
|
||||||
// Every u16 outside of the surrogate ranges is guaranteed to be a
|
if n < 0xD800 || n > 0xDBFF {
|
||||||
// legal char.
|
// Every u16 outside of the surrogate ranges is guaranteed to be a
|
||||||
push_wtf8_codepoint(n as u32, scratch);
|
// legal char.
|
||||||
|
push_wtf8_codepoint(n as u32, scratch);
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
|
|
||||||
|
// n is a leading surrogate, we now expect a trailing surrogate.
|
||||||
|
let n1 = n;
|
||||||
|
|
||||||
|
if tri!(peek_or_eof(read)) == b'\\' {
|
||||||
|
read.discard();
|
||||||
|
} else {
|
||||||
|
return if validate {
|
||||||
|
read.discard();
|
||||||
|
error(read, ErrorCode::UnexpectedEndOfHexEscape)
|
||||||
|
} else {
|
||||||
|
push_wtf8_codepoint(n1 as u32, scratch);
|
||||||
|
Ok(())
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
if tri!(peek_or_eof(read)) == b'u' {
|
||||||
|
read.discard();
|
||||||
|
} else {
|
||||||
|
return if validate {
|
||||||
|
read.discard();
|
||||||
|
error(read, ErrorCode::UnexpectedEndOfHexEscape)
|
||||||
|
} else {
|
||||||
|
push_wtf8_codepoint(n1 as u32, scratch);
|
||||||
|
// The \ prior to this byte started an escape sequence,
|
||||||
|
// so we need to parse that now. This recursive call
|
||||||
|
// does not blow the stack on malicious input because
|
||||||
|
// the escape is not \u, so it will be handled by one
|
||||||
|
// of the easy nonrecursive cases.
|
||||||
|
parse_escape(read, validate, scratch)
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
let n2 = tri!(read.decode_hex_escape());
|
||||||
|
|
||||||
|
if n2 < 0xDC00 || n2 > 0xDFFF {
|
||||||
|
if validate {
|
||||||
|
return error(read, ErrorCode::LoneLeadingSurrogateInHexEscape);
|
||||||
|
}
|
||||||
|
push_wtf8_codepoint(n1 as u32, scratch);
|
||||||
|
// If n2 is a leading surrogate, we need to restart.
|
||||||
|
n = n2;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
// This value is in range U+10000..=U+10FFFF, which is always a
|
||||||
|
// valid codepoint.
|
||||||
|
let n = (((n1 - 0xD800) as u32) << 10 | (n2 - 0xDC00) as u32) + 0x1_0000;
|
||||||
|
push_wtf8_codepoint(n, scratch);
|
||||||
return Ok(());
|
return Ok(());
|
||||||
}
|
}
|
||||||
|
|
||||||
// n is a leading surrogate, we now expect a trailing surrogate.
|
|
||||||
let n1 = n;
|
|
||||||
|
|
||||||
if tri!(peek_or_eof(read)) == b'\\' {
|
|
||||||
read.discard();
|
|
||||||
} else {
|
|
||||||
return if validate {
|
|
||||||
read.discard();
|
|
||||||
error(read, ErrorCode::UnexpectedEndOfHexEscape)
|
|
||||||
} else {
|
|
||||||
push_wtf8_codepoint(n1 as u32, scratch);
|
|
||||||
Ok(())
|
|
||||||
};
|
|
||||||
}
|
|
||||||
|
|
||||||
if tri!(peek_or_eof(read)) == b'u' {
|
|
||||||
read.discard();
|
|
||||||
} else {
|
|
||||||
return if validate {
|
|
||||||
read.discard();
|
|
||||||
error(read, ErrorCode::UnexpectedEndOfHexEscape)
|
|
||||||
} else {
|
|
||||||
push_wtf8_codepoint(n1 as u32, scratch);
|
|
||||||
// The \ prior to this byte started an escape sequence,
|
|
||||||
// so we need to parse that now. This recursive call
|
|
||||||
// does not blow the stack on malicious input because
|
|
||||||
// the escape is not \u, so it will be handled by one
|
|
||||||
// of the easy nonrecursive cases.
|
|
||||||
parse_escape(read, validate, scratch)
|
|
||||||
};
|
|
||||||
}
|
|
||||||
|
|
||||||
let n2 = tri!(read.decode_hex_escape());
|
|
||||||
|
|
||||||
if n2 < 0xDC00 || n2 > 0xDFFF {
|
|
||||||
return error(read, ErrorCode::LoneLeadingSurrogateInHexEscape);
|
|
||||||
}
|
|
||||||
|
|
||||||
// This value is in range U+10000..=U+10FFFF, which is always a
|
|
||||||
// valid codepoint.
|
|
||||||
let n = (((n1 - 0xD800) as u32) << 10 | (n2 - 0xDC00) as u32) + 0x1_0000;
|
|
||||||
push_wtf8_codepoint(n, scratch);
|
|
||||||
Ok(())
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Adds a WTF-8 codepoint to the end of the buffer. This is a more efficient
|
/// Adds a WTF-8 codepoint to the end of the buffer. This is a more efficient
|
||||||
|
@ -1707,7 +1707,7 @@ fn test_byte_buf_de() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_byte_buf_de_lone_surrogate() {
|
fn test_byte_buf_de_invalid_surrogates() {
|
||||||
let bytes = ByteBuf::from(vec![237, 160, 188]);
|
let bytes = ByteBuf::from(vec![237, 160, 188]);
|
||||||
let v: ByteBuf = from_str(r#""\ud83c""#).unwrap();
|
let v: ByteBuf = from_str(r#""\ud83c""#).unwrap();
|
||||||
assert_eq!(v, bytes);
|
assert_eq!(v, bytes);
|
||||||
@ -1720,23 +1720,54 @@ fn test_byte_buf_de_lone_surrogate() {
|
|||||||
let v: ByteBuf = from_str(r#""\ud83c ""#).unwrap();
|
let v: ByteBuf = from_str(r#""\ud83c ""#).unwrap();
|
||||||
assert_eq!(v, bytes);
|
assert_eq!(v, bytes);
|
||||||
|
|
||||||
let bytes = ByteBuf::from(vec![237, 176, 129]);
|
|
||||||
let v: ByteBuf = from_str(r#""\udc01""#).unwrap();
|
|
||||||
assert_eq!(v, bytes);
|
|
||||||
|
|
||||||
let res = from_str::<ByteBuf>(r#""\ud83c\!""#);
|
let res = from_str::<ByteBuf>(r#""\ud83c\!""#);
|
||||||
assert!(res.is_err());
|
assert!(res.is_err());
|
||||||
|
|
||||||
let res = from_str::<ByteBuf>(r#""\ud83c\u""#);
|
let res = from_str::<ByteBuf>(r#""\ud83c\u""#);
|
||||||
assert!(res.is_err());
|
assert!(res.is_err());
|
||||||
|
|
||||||
let res = from_str::<ByteBuf>(r#""\ud83c\ud83c""#);
|
// lone trailing surrogate
|
||||||
assert!(res.is_err());
|
let bytes = ByteBuf::from(vec![237, 176, 129]);
|
||||||
|
let v: ByteBuf = from_str(r#""\udc01""#).unwrap();
|
||||||
|
assert_eq!(v, bytes);
|
||||||
|
|
||||||
|
// leading surrogate followed by other leading surrogate
|
||||||
|
let bytes = ByteBuf::from(vec![237, 160, 188, 237, 160, 188]);
|
||||||
|
let v: ByteBuf = from_str(r#""\ud83c\ud83c""#).unwrap();
|
||||||
|
assert_eq!(v, bytes);
|
||||||
|
|
||||||
|
// leading surrogate followed by "a" (U+0061) in \u encoding
|
||||||
|
let bytes = ByteBuf::from(vec![237, 160, 188, 97]);
|
||||||
|
let v: ByteBuf = from_str(r#""\ud83c\u0061""#).unwrap();
|
||||||
|
assert_eq!(v, bytes);
|
||||||
|
|
||||||
|
// leading surrogate followed by U+0080
|
||||||
|
let bytes = ByteBuf::from(vec![237, 160, 188, 194, 128]);
|
||||||
|
let v: ByteBuf = from_str(r#""\ud83c\u0080""#).unwrap();
|
||||||
|
assert_eq!(v, bytes);
|
||||||
|
|
||||||
|
// leading surrogate followed by U+FFFF
|
||||||
|
let bytes = ByteBuf::from(vec![237, 160, 188, 239, 191, 191]);
|
||||||
|
let v: ByteBuf = from_str(r#""\ud83c\uffff""#).unwrap();
|
||||||
|
assert_eq!(v, bytes);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_byte_buf_de_surrogate_pair() {
|
||||||
|
// leading surrogate followed by trailing surrogate
|
||||||
|
let bytes = ByteBuf::from(vec![240, 159, 128, 128]);
|
||||||
|
let v: ByteBuf = from_str(r#""\ud83c\udc00""#).unwrap();
|
||||||
|
assert_eq!(v, bytes);
|
||||||
|
|
||||||
|
// leading surrogate followed by a surrogate pair
|
||||||
|
let bytes = ByteBuf::from(vec![237, 160, 188, 240, 159, 128, 128]);
|
||||||
|
let v: ByteBuf = from_str(r#""\ud83c\ud83c\udc00""#).unwrap();
|
||||||
|
assert_eq!(v, bytes);
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(feature = "raw_value")]
|
#[cfg(feature = "raw_value")]
|
||||||
#[test]
|
#[test]
|
||||||
fn test_raw_de_lone_surrogate() {
|
fn test_raw_de_invalid_surrogates() {
|
||||||
use serde_json::value::RawValue;
|
use serde_json::value::RawValue;
|
||||||
|
|
||||||
assert!(from_str::<Box<RawValue>>(r#""\ud83c""#).is_ok());
|
assert!(from_str::<Box<RawValue>>(r#""\ud83c""#).is_ok());
|
||||||
@ -1746,6 +1777,17 @@ fn test_raw_de_lone_surrogate() {
|
|||||||
assert!(from_str::<Box<RawValue>>(r#""\udc01\!""#).is_err());
|
assert!(from_str::<Box<RawValue>>(r#""\udc01\!""#).is_err());
|
||||||
assert!(from_str::<Box<RawValue>>(r#""\udc01\u""#).is_err());
|
assert!(from_str::<Box<RawValue>>(r#""\udc01\u""#).is_err());
|
||||||
assert!(from_str::<Box<RawValue>>(r#""\ud83c\ud83c""#).is_ok());
|
assert!(from_str::<Box<RawValue>>(r#""\ud83c\ud83c""#).is_ok());
|
||||||
|
assert!(from_str::<Box<RawValue>>(r#""\ud83c\u0061""#).is_ok());
|
||||||
|
assert!(from_str::<Box<RawValue>>(r#""\ud83c\u0080""#).is_ok());
|
||||||
|
assert!(from_str::<Box<RawValue>>(r#""\ud83c\uffff""#).is_ok());
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(feature = "raw_value")]
|
||||||
|
#[test]
|
||||||
|
fn test_raw_de_surrogate_pair() {
|
||||||
|
use serde_json::value::RawValue;
|
||||||
|
|
||||||
|
assert!(from_str::<Box<RawValue>>(r#""\ud83c\udc00""#).is_ok());
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
|
Loading…
x
Reference in New Issue
Block a user