Use unicode-ident to determine what is an identifier

Resolves <https://github.com/askama-rs/askama/issues/442>.

The performance is actually slightly better than before: `unicode-ident`
is highly optimized and jump-free.

One test derived from fuzzer output had to be deleted, because it contained
names that are not valid Unicode identifiers. A test covering the same
problem remains, but in it every identifier is simply `x`. In another
fuzzed test, the character [`U+E0049`] was removed.

[`U+E0049`]: https://en.wikipedia.org/w/index.php?oldid=1278382889

<details>
<summary>Benchmark results</summary>

```text
$ cd askama_parser && cargo bench

librustdoc/all          time:   [184.47 µs 185.30 µs 186.08 µs]
                        thrpt:  [75.887 MiB/s 76.207 MiB/s 76.548 MiB/s]
                 change:
                        time:   [−1.4364% −0.9672% −0.4738%] (p = 0.00 < 0.05)
                        thrpt:  [+0.4761% +0.9767% +1.4573%]
                        Change within noise threshold.

librustdoc/item_info    time:   [3.3880 µs 3.3892 µs 3.3906 µs]
                        thrpt:  [46.409 MiB/s 46.428 MiB/s 46.445 MiB/s]
                 change:
                        time:   [−3.8735% −3.5927% −3.2949%] (p = 0.00 < 0.05)
                        thrpt:  [+3.4071% +3.7266% +4.0296%]
                        Performance has improved.

librustdoc/item_union   time:   [20.052 µs 20.087 µs 20.126 µs]
                        thrpt:  [49.044 MiB/s 49.140 MiB/s 49.224 MiB/s]
                 change:
                        time:   [−2.2419% −1.8647% −1.5113%] (p = 0.00 < 0.05)
                        thrpt:  [+1.5345% +1.9002% +2.2933%]
                        Performance has improved.

librustdoc/page         time:   [85.828 µs 86.157 µs 86.518 µs]
                        thrpt:  [71.571 MiB/s 71.871 MiB/s 72.147 MiB/s]
                 change:
                        time:   [−1.2728% −0.7668% −0.2512%] (p = 0.00 < 0.05)
                        thrpt:  [+0.2518% +0.7727% +1.2892%]
                        Change within noise threshold.

librustdoc/print_item   time:   [10.065 µs 10.101 µs 10.138 µs]
                        thrpt:  [93.132 MiB/s 93.469 MiB/s 93.806 MiB/s]
                 change:
                        time:   [−3.3793% −2.8352% −2.3267%] (p = 0.00 < 0.05)
                        thrpt:  [+2.3821% +2.9180% +3.4975%]
                        Performance has improved.

librustdoc/short_item_info
                        time:   [9.0741 µs 9.1018 µs 9.1377 µs]
                        thrpt:  [99.148 MiB/s 99.540 MiB/s 99.843 MiB/s]
                 change:
                        time:   [−4.7480% −4.2335% −3.7763%] (p = 0.00 < 0.05)
                        thrpt:  [+3.9245% +4.4207% +4.9847%]
                        Performance has improved.

librustdoc/sidebar      time:   [21.468 µs 21.555 µs 21.648 µs]
                        thrpt:  [57.004 MiB/s 57.252 MiB/s 57.482 MiB/s]
                 change:
                        time:   [−3.7641% −3.0465% −2.4191%] (p = 0.00 < 0.05)
                        thrpt:  [+2.4791% +3.1423% +3.9114%]
                        Performance has improved.

librustdoc/source       time:   [7.9602 µs 7.9780 µs 7.9929 µs]
                        thrpt:  [92.230 MiB/s 92.403 MiB/s 92.609 MiB/s]
                 change:
                        time:   [−1.6386% −1.0684% −0.5875%] (p = 0.00 < 0.05)
                        thrpt:  [+0.5910% +1.0799% +1.6659%]
                        Change within noise threshold.

librustdoc/type_layout_size
                        time:   [4.7821 µs 4.7915 µs 4.8017 µs]
                        thrpt:  [56.406 MiB/s 56.526 MiB/s 56.637 MiB/s]
                 change:
                        time:   [−1.9743% −1.4867% −1.0153%] (p = 0.00 < 0.05)
                        thrpt:  [+1.0257% +1.5091% +2.0141%]
                        Performance has improved.

librustdoc/type_layout  time:   [15.022 µs 15.051 µs 15.076 µs]
                        thrpt:  [178.57 MiB/s 178.88 MiB/s 179.22 MiB/s]
                 change:
                        time:   [−1.5028% −1.0358% −0.5705%] (p = 0.00 < 0.05)
                        thrpt:  [+0.5738% +1.0466% +1.5257%]
                        Change within noise threshold.
```
</details>
This commit is contained in:
René Kijewski 2025-05-18 04:42:56 +02:00 committed by René Kijewski
parent 37101cb95d
commit 3775f4e3a3
5 changed files with 16 additions and 17 deletions

View File

@ -24,6 +24,7 @@ harness = false
memchr = "2"
serde = { version = "1.0", optional = true }
serde_derive = { version = "1.0", optional = true }
unicode-ident = "1.0.12"
winnow = "0.7.0"
[dev-dependencies]

View File

@ -21,7 +21,7 @@ use std::{fmt, str};
use winnow::ascii::take_escaped;
use winnow::combinator::{alt, cut_err, delimited, fail, not, opt, peek, preceded, repeat};
use winnow::error::FromExternalError;
use winnow::stream::{AsChar, Stream as _};
use winnow::stream::Stream as _;
use winnow::token::{any, one_of, take_till, take_while};
use winnow::{ModalParser, Parser};
@ -399,13 +399,9 @@ fn keyword(k: &str) -> impl ModalParser<&str, &str, ErrorContext<'_>> {
}
fn identifier<'i>(input: &mut &'i str) -> ParseResult<'i> {
let start = take_while(1.., |c: char| c.is_alpha() || c == '_' || c >= '\u{0080}');
let tail = take_while(1.., |c: char| {
c.is_alphanum() || c == '_' || c >= '\u{0080}'
});
(start, opt(tail)).take().parse_next(input)
let head = any.verify(|&c| c == '_' || unicode_ident::is_xid_start(c));
let tail = take_while(.., unicode_ident::is_xid_continue);
(head, tail).take().parse_next(input)
}
fn bool_lit<'i>(i: &mut &'i str) -> ParseResult<'i> {

View File

@ -1,3 +1,5 @@
use winnow::Parser;
use crate::node::{Lit, Whitespace, Ws};
use crate::{
Ast, Expr, Filter, InnerSyntax, Node, Num, PathOrIdentifier, Span, StrLit, Syntax,
@ -1216,13 +1218,6 @@ fn fuzzed_excessive_filter_block() {
err.to_string().lines().next(),
Some("your template code is too deeply nested, or the last expression is too complex"),
);
let src = include!("../tests/fuzzed_excessive_filter_block.inc");
let err = Ast::from_str(src, None, &Syntax::default()).unwrap_err();
assert_eq!(
err.to_string().lines().next(),
Some("your template code is too deeply nested, or the last expression is too complex"),
);
}
#[test]
@ -1412,3 +1407,11 @@ fn test_filter_with_path() {
"failed to parse template source near offset 16",
);
}
#[test]
fn underscore_is_an_identifier() {
let mut input = "_";
let result = crate::identifier.parse_next(&mut input);
assert_eq!(result.unwrap(), "_");
assert_eq!(input, "");
}

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long