From 39647e57a645ab5efc4fed9f0627e77c163fd322 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20Kijewski?= Date: Wed, 1 Jan 2025 10:15:01 +0100 Subject: [PATCH] Speed-up HTML escaping a bit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This PR adds an `enum Digits` that contains all possible values for two-digit decimal numbers: `b"00" ..= b"99"`. This seems to enable rust and/or llvm to make better use of niches. Previously there were no niches, and we compared against `0` manually. Now most of the u16-space are niches. ```text Escaping time: [3.3437 µs 3.3497 µs 3.3578 µs] change: [-18.807% -18.620% -18.452%] (p = 0.00 < 0.05) Performance has improved. ``` --- rinja/src/html.rs | 63 ++++++++++++++++++++++++++++++++++------------- 1 file changed, 46 insertions(+), 17 deletions(-) diff --git a/rinja/src/html.rs b/rinja/src/html.rs index f8281e9b..fe410adb 100644 --- a/rinja/src/html.rs +++ b/rinja/src/html.rs @@ -41,13 +41,10 @@ pub(crate) fn write_escaped_char(mut dest: impl fmt::Write, c: char) -> fmt::Res } /// Returns the decimal representation of the codepoint if the character needs HTML escaping. -#[inline(always)] +#[inline] fn get_escaped(byte: u8) -> Option<[u8; 2]> { match byte { - MIN_CHAR..=MAX_CHAR => match TABLE.lookup[(byte - MIN_CHAR) as usize] { - 0 => None, - escaped => Some(escaped.to_ne_bytes()), - }, + MIN_CHAR..=MAX_CHAR => Some(TABLE.0[(byte - MIN_CHAR) as usize]?.to_bytes()), _ => None, } } @@ -93,24 +90,17 @@ const MAX_CHAR: u8 = { /// Number of codepoints between the lowest and highest character that needs escaping, incl. const CHAR_RANGE: usize = (MAX_CHAR - MIN_CHAR + 1) as usize; -struct Table { - _align: [usize; 0], - lookup: [u16; CHAR_RANGE], -} +#[repr(align(64))] +struct Table([Option<Digits>; CHAR_RANGE]); /// For characters that need HTML escaping, the codepoint is formatted as decimal digits, /// otherwise `b"\0\0"`. Starting at [`MIN_CHAR`].
-const TABLE: Table = { - let mut table = Table { - _align: [], - lookup: [0; CHAR_RANGE], - }; +const TABLE: &Table = &{ + let mut table = Table([None; CHAR_RANGE]); let mut i = 0; while i < CHARS.len() { let c = CHARS[i]; - let h = c / 10 + b'0'; - let l = c % 10 + b'0'; - table.lookup[(c - MIN_CHAR) as usize] = u16::from_ne_bytes([h, l]); + table.0[(c - MIN_CHAR) as usize] = Some(Digits::new(c)); i += 1; } table @@ -120,6 +110,45 @@ const TABLE: Table = { const ESCAPED_BUF_INIT: [u8; 8] = *b"&#__;\0\0\0"; const ESCAPED_BUF_LEN: usize = b"&#__;".len(); +/// All possible decimal representations of codepoints that need escaping in HTML / XML. +/// +/// Using this type instead of e.g. `Option<NonZeroU16>` allows rustc to select any and all of the +/// 65,531 unused representations of this type as niche, which can help speed up the +/// generated byte code. If not all needed representations were present, then the `TABLE` definition +/// would fail to compile. +#[derive(Debug, Clone, Copy)] +#[repr(u16)] +enum Digits { + /// `'"'` + V34 = u16::from_ne_bytes(*b"34"), + /// `'&'` + V38 = u16::from_ne_bytes(*b"38"), + /// `'\''` + V39 = u16::from_ne_bytes(*b"39"), + /// `'<'` + V60 = u16::from_ne_bytes(*b"60"), + /// `'>'` + V62 = u16::from_ne_bytes(*b"62"), +} + +impl Digits { + #[inline] + const fn to_bytes(self) -> [u8; 2] { + (self as u16).to_ne_bytes() + } + + const fn new(v: u8) -> Self { + match v { + 34 => Self::V34, + 38 => Self::V38, + 39 => Self::V39, + 60 => Self::V60, + 62 => Self::V62, + _ => panic!(), + } + } +} + #[test] #[cfg(feature = "alloc")] fn test_simple_html_string_escaping() {