diff --git a/src/read.rs b/src/read.rs index c6186c2..1319d89 100644 --- a/src/read.rs +++ b/src/read.rs @@ -954,34 +954,15 @@ where match ch { b'"' | b'\\' | b'/' | b'b' | b'f' | b'n' | b'r' | b't' => {} - b'u' => match tri!(read.decode_hex_escape()) { - 0xDC00..=0xDFFF => { - return error(read, ErrorCode::LoneLeadingSurrogateInHexEscape); - } + b'u' => { + // At this point we don't care if the codepoint is valid. We just + // want to consume it. We don't actually know what is valid or not + // at this point, because that depends on if this string will + // ultimately be parsed into a string or a byte buffer in the "real" + // parse. - // Non-BMP characters are encoded as a sequence of - // two hex escapes, representing UTF-16 surrogates. - n1 @ 0xD800..=0xDBFF => { - if tri!(next_or_eof(read)) != b'\\' { - return error(read, ErrorCode::UnexpectedEndOfHexEscape); - } - if tri!(next_or_eof(read)) != b'u' { - return error(read, ErrorCode::UnexpectedEndOfHexEscape); - } - - let n2 = tri!(read.decode_hex_escape()); - if n2 < 0xDC00 || n2 > 0xDFFF { - return error(read, ErrorCode::LoneLeadingSurrogateInHexEscape); - } - - let n = (((n1 - 0xD800) as u32) << 10 | (n2 - 0xDC00) as u32) + 0x1_0000; - if char::from_u32(n).is_none() { - return error(read, ErrorCode::InvalidUnicodeCodePoint); - } - } - - _ => {} - }, + tri!(read.decode_hex_escape()); + } _ => { return error(read, ErrorCode::InvalidEscape); } diff --git a/tests/test.rs b/tests/test.rs index 304007a..b11635e 100644 --- a/tests/test.rs +++ b/tests/test.rs @@ -1740,6 +1740,20 @@ fn test_byte_buf_de_lone_surrogate() { assert!(res.is_err()); } +#[cfg(feature = "raw_value")] +#[test] +fn test_raw_de_lone_surrogate() { + use serde_json::value::RawValue; + + assert!(from_str::>(r#""\ud83c""#).is_ok()); + assert!(from_str::>(r#""\ud83c\n""#).is_ok()); + assert!(from_str::>(r#""\ud83c ""#).is_ok()); + assert!(from_str::>(r#""\udc01 ""#).is_ok()); + assert!(from_str::>(r#""\udc01\!""#).is_err()); + assert!(from_str::>(r#""\udc01\u""#).is_err()); + assert!(from_str::>(r#""\ud83c\ud83c""#).is_ok()); +} + #[test] fn test_byte_buf_de_multiple() { let s: Vec = from_str(r#"["ab\nc", "cd\ne"]"#).unwrap();