Merge pull request #301 from Kijewski/pr-digits

Speed up HTML escaping a bit
This commit is contained in:
René Kijewski 2025-01-01 18:09:37 +01:00 committed by GitHub
commit 0373645eb9
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -41,13 +41,10 @@ pub(crate) fn write_escaped_char(mut dest: impl fmt::Write, c: char) -> fmt::Res
}
/// Returns the decimal representation of the codepoint if the character needs HTML escaping.
#[inline(always)]
#[inline]
fn get_escaped(byte: u8) -> Option<[u8; 2]> {
match byte {
MIN_CHAR..=MAX_CHAR => match TABLE.lookup[(byte - MIN_CHAR) as usize] {
0 => None,
escaped => Some(escaped.to_ne_bytes()),
},
MIN_CHAR..=MAX_CHAR => Some(TABLE.0[(byte - MIN_CHAR) as usize]?.to_bytes()),
_ => None,
}
}
@ -93,24 +90,17 @@ const MAX_CHAR: u8 = {
/// Number of codepoints between the lowest and highest character that needs escaping, incl.
const CHAR_RANGE: usize = (MAX_CHAR - MIN_CHAR + 1) as usize;
struct Table {
_align: [usize; 0],
lookup: [u16; CHAR_RANGE],
}
#[repr(align(64))]
struct Table([Option<Digits>; CHAR_RANGE]);
/// For characters that need HTML escaping, the codepoint is formatted as decimal digits,
/// otherwise `None`. Starting at [`MIN_CHAR`].
const TABLE: Table = {
let mut table = Table {
_align: [],
lookup: [0; CHAR_RANGE],
};
const TABLE: &Table = &{
let mut table = Table([None; CHAR_RANGE]);
let mut i = 0;
while i < CHARS.len() {
let c = CHARS[i];
let h = c / 10 + b'0';
let l = c % 10 + b'0';
table.lookup[(c - MIN_CHAR) as usize] = u16::from_ne_bytes([h, l]);
table.0[(c - MIN_CHAR) as usize] = Some(Digits::new(c));
i += 1;
}
table
@ -120,6 +110,45 @@ const TABLE: Table = {
const ESCAPED_BUF_INIT: [u8; 8] = *b"&#__;\0\0\0";
const ESCAPED_BUF_LEN: usize = b"&#__;".len();
/// The decimal representation of every codepoint that needs escaping in HTML / XML.
///
/// Each discriminant is the two ASCII digits of the codepoint packed into a native-endian `u16`,
/// so unpacking it with `to_ne_bytes()` yields the digits in reading order.
///
/// Using this type instead of e.g. `Option<NonZeroU16>` allows rustc to select any and all of the
/// 65,531 unused representations of this type as niche, which can help speed up the generated
/// machine code. If not all needed representations were present, then the `TABLE` definition
/// would fail to compile.
#[derive(Debug, Clone, Copy)]
#[repr(u16)]
enum Digits {
    /// `'"'`
    V34 = u16::from_ne_bytes([b'3', b'4']),
    /// `'&'`
    V38 = u16::from_ne_bytes([b'3', b'8']),
    /// `'\''`
    V39 = u16::from_ne_bytes([b'3', b'9']),
    /// `'<'`
    V60 = u16::from_ne_bytes([b'6', b'0']),
    /// `'>'`
    V62 = u16::from_ne_bytes([b'6', b'2']),
}
impl Digits {
    /// Returns the two ASCII decimal digits of the codepoint, in reading order.
    ///
    /// The discriminants are built with `u16::from_ne_bytes`, so `to_ne_bytes` is the exact
    /// inverse regardless of the endianness of the target.
    #[inline]
    const fn to_bytes(self) -> [u8; 2] {
        (self as u16).to_ne_bytes()
    }

    /// Maps the codepoint of a character that needs escaping to its [`Digits`] variant.
    ///
    /// # Panics
    ///
    /// Panics if `v` has no corresponding variant. When evaluated in a const context, the panic
    /// surfaces as a compile-time error carrying the message below.
    const fn new(v: u8) -> Self {
        match v {
            34 => Self::V34,
            38 => Self::V38,
            39 => Self::V39,
            60 => Self::V60,
            62 => Self::V62,
            // A message makes the const-eval failure diagnosable instead of a bare
            // "explicit panic".
            _ => panic!("codepoint has no `Digits` variant; add one for every escaped character"),
        }
    }
}
#[test]
#[cfg(feature = "alloc")]
fn test_simple_html_string_escaping() {