mirror of
https://github.com/serde-rs/json.git
synced 2025-10-04 00:06:04 +00:00
Simplify unicode escape handling
This does not affect performance.
This commit is contained in:
parent
2f28d106e6
commit
236cc8247d
107
src/read.rs
107
src/read.rs
@ -898,67 +898,66 @@ fn parse_unicode_escape<'de, R: Read<'de>>(
|
|||||||
validate: bool,
|
validate: bool,
|
||||||
scratch: &mut Vec<u8>,
|
scratch: &mut Vec<u8>,
|
||||||
) -> Result<()> {
|
) -> Result<()> {
|
||||||
let c = match tri!(read.decode_hex_escape()) {
|
let n = tri!(read.decode_hex_escape());
|
||||||
n @ 0xDC00..=0xDFFF => {
|
|
||||||
return if validate {
|
|
||||||
error(read, ErrorCode::LoneLeadingSurrogateInHexEscape)
|
|
||||||
} else {
|
|
||||||
push_wtf8_codepoint(n as u32, scratch);
|
|
||||||
Ok(())
|
|
||||||
};
|
|
||||||
}
|
|
||||||
|
|
||||||
// Non-BMP characters are encoded as a sequence of two hex
|
// Non-BMP characters are encoded as a sequence of two hex
|
||||||
// escapes, representing UTF-16 surrogates. If deserializing a
|
// escapes, representing UTF-16 surrogates. If deserializing a
|
||||||
// utf-8 string the surrogates are required to be paired,
|
// utf-8 string the surrogates are required to be paired,
|
||||||
// whereas deserializing a byte string accepts lone surrogates.
|
// whereas deserializing a byte string accepts lone surrogates.
|
||||||
n1 @ 0xD800..=0xDBFF => {
|
if validate && n >= 0xDC00 && n <= 0xDFFF {
|
||||||
if tri!(peek_or_eof(read)) == b'\\' {
|
// XXX: This is actually a trailing surrogate.
|
||||||
read.discard();
|
return error(read, ErrorCode::LoneLeadingSurrogateInHexEscape);
|
||||||
} else {
|
}
|
||||||
return if validate {
|
|
||||||
read.discard();
|
|
||||||
error(read, ErrorCode::UnexpectedEndOfHexEscape)
|
|
||||||
} else {
|
|
||||||
push_wtf8_codepoint(n1 as u32, scratch);
|
|
||||||
Ok(())
|
|
||||||
};
|
|
||||||
}
|
|
||||||
|
|
||||||
if tri!(peek_or_eof(read)) == b'u' {
|
if n < 0xD800 || n > 0xDBFF {
|
||||||
read.discard();
|
// Every u16 outside of the surrogate ranges is guaranteed to be a
|
||||||
} else {
|
// legal char.
|
||||||
return if validate {
|
push_wtf8_codepoint(n as u32, scratch);
|
||||||
read.discard();
|
return Ok(());
|
||||||
error(read, ErrorCode::UnexpectedEndOfHexEscape)
|
}
|
||||||
} else {
|
|
||||||
push_wtf8_codepoint(n1 as u32, scratch);
|
|
||||||
// The \ prior to this byte started an escape sequence,
|
|
||||||
// so we need to parse that now. This recursive call
|
|
||||||
// does not blow the stack on malicious input because
|
|
||||||
// the escape is not \u, so it will be handled by one
|
|
||||||
// of the easy nonrecursive cases.
|
|
||||||
parse_escape(read, validate, scratch)
|
|
||||||
};
|
|
||||||
}
|
|
||||||
|
|
||||||
let n2 = tri!(read.decode_hex_escape());
|
// n is a leading surrogate, we now expect a trailing surrogate.
|
||||||
|
let n1 = n;
|
||||||
|
|
||||||
if n2 < 0xDC00 || n2 > 0xDFFF {
|
if tri!(peek_or_eof(read)) == b'\\' {
|
||||||
return error(read, ErrorCode::LoneLeadingSurrogateInHexEscape);
|
read.discard();
|
||||||
}
|
} else {
|
||||||
|
return if validate {
|
||||||
|
read.discard();
|
||||||
|
error(read, ErrorCode::UnexpectedEndOfHexEscape)
|
||||||
|
} else {
|
||||||
|
push_wtf8_codepoint(n1 as u32, scratch);
|
||||||
|
Ok(())
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
// This value is in range U+10000..=U+10FFFF, which is always a
|
if tri!(peek_or_eof(read)) == b'u' {
|
||||||
// valid codepoint.
|
read.discard();
|
||||||
(((n1 - 0xD800) as u32) << 10 | (n2 - 0xDC00) as u32) + 0x1_0000
|
} else {
|
||||||
}
|
return if validate {
|
||||||
|
read.discard();
|
||||||
|
error(read, ErrorCode::UnexpectedEndOfHexEscape)
|
||||||
|
} else {
|
||||||
|
push_wtf8_codepoint(n1 as u32, scratch);
|
||||||
|
// The \ prior to this byte started an escape sequence,
|
||||||
|
// so we need to parse that now. This recursive call
|
||||||
|
// does not blow the stack on malicious input because
|
||||||
|
// the escape is not \u, so it will be handled by one
|
||||||
|
// of the easy nonrecursive cases.
|
||||||
|
parse_escape(read, validate, scratch)
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
// Every u16 outside of the surrogate ranges above is guaranteed
|
let n2 = tri!(read.decode_hex_escape());
|
||||||
// to be a legal char.
|
|
||||||
n => n as u32,
|
|
||||||
};
|
|
||||||
|
|
||||||
push_wtf8_codepoint(c, scratch);
|
if n2 < 0xDC00 || n2 > 0xDFFF {
|
||||||
|
return error(read, ErrorCode::LoneLeadingSurrogateInHexEscape);
|
||||||
|
}
|
||||||
|
|
||||||
|
// This value is in range U+10000..=U+10FFFF, which is always a
|
||||||
|
// valid codepoint.
|
||||||
|
let n = (((n1 - 0xD800) as u32) << 10 | (n2 - 0xDC00) as u32) + 0x1_0000;
|
||||||
|
push_wtf8_codepoint(n, scratch);
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user