From b2268e7885fbddeb17a9eae5d9f4b9aba099ff7d Mon Sep 17 00:00:00 2001 From: Jorge Aparicio Date: Tue, 10 May 2022 12:47:14 +0200 Subject: [PATCH] optimize the codegen of Vec::clone these changes optimize `Vec::clone` down to these operations 1. reserve the stack space (1028 bytes on 32-bit ARM) and leave it uninitialized 2. zero the `len` field 3. memcpy `len` bytes of data from the parent analyzed source code ``` rust use heapless::Vec; fn clone(vec: &Vec<u8, 1024>) { let mut vec = vec.clone(); black_box(&mut vec); } fn black_box<T>(val: &mut T) { unsafe { asm!("// {0}", in(reg) val) } } ``` machine code with `lto = fat`, `codegen-units = 1` and `opt-level = 'z'` ('z' instead of 3 to avoid loop unrolling and keep the machine code readable) ``` armasm 00020100 <clone>: 20100: b5d0 push {r4, r6, r7, lr} 20102: af02 add r7, sp, #8 20104: f5ad 6d81 sub.w sp, sp, #1032 ; 0x408 20108: 2300 movs r3, #0 2010a: c802 ldmia r0!, {r1} 2010c: 9301 str r3, [sp, #4] 2010e: aa01 add r2, sp, #4 20110: /--/-X b141 cbz r1, 20124 20112: | | 4413 add r3, r2 20114: | | f810 4b01 ldrb.w r4, [r0], #1 20118: | | 3901 subs r1, #1 2011a: | | 711c strb r4, [r3, #4] 2011c: | | 9b01 ldr r3, [sp, #4] 2011e: | | 3301 adds r3, #1 20120: | | 9301 str r3, [sp, #4] 20122: | \-- e7f5 b.n 20110 20124: \----> a801 add r0, sp, #4 20126: f50d 6d81 add.w sp, sp, #1032 ; 0x408 2012a: bdd0 pop {r4, r6, r7, pc} ``` note that it's not optimizing step (3) to an actual `memcpy` because we lack the 'trait specialization' code that libstd uses --- before `clone` was optimized to 1. reserve and zero (`memclr`) 1028 (!?) bytes of stack space 2. (unnecessarily) runtime check if `len` is equal or less than 1024 (capacity) -- this included a panicking branch 3. 
memcpy `len` bytes of data from the parent --- CHANGELOG.md | 2 ++ src/vec.rs | 19 +++++++++++++++---- 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7808dfac..7499d5ef 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -16,6 +16,8 @@ and this project adheres to [Semantic Versioning](http://semver.org/). ### Changed +* Optimize the codegen of `Vec::clone` + ### Fixed * Inserting an item that replaces an already present item will no longer diff --git a/src/vec.rs b/src/vec.rs index 31d98ca7..bef2edd0 100644 --- a/src/vec.rs +++ b/src/vec.rs @@ -34,12 +34,18 @@ use hash32; /// assert_eq!(*vec, [7, 1, 2, 3]); /// ``` pub struct Vec<T, const N: usize> { - buffer: [MaybeUninit<T>; N], + // NOTE order is important for optimizations. the `len` first layout lets the compiler optimize + // `new` to: reserve stack space and zero the first word. With the fields in the reverse order + // the compiler optimizes `new` to `memclr`-ing the *entire* stack space, including the `buffer` + // field which should be left uninitialized. Optimizations were last checked with Rust 1.60 len: usize, + + buffer: [MaybeUninit<T>; N], } impl<T, const N: usize> Vec<T, N> { - const INIT: MaybeUninit<T> = MaybeUninit::uninit(); + const ELEM: MaybeUninit<T> = MaybeUninit::uninit(); + const INIT: [MaybeUninit<T>; N] = [Self::ELEM; N]; // important for optimization of `new` /// Constructs a new, empty vector with a fixed capacity of `N` /// @@ -60,8 +66,8 @@ impl<T, const N: usize> Vec<T, N> { crate::sealed::greater_than_eq_0::<N>(); Self { - buffer: [Self::INIT; N], len: 0, + buffer: Self::INIT, } } @@ -92,7 +98,12 @@ impl<T, const N: usize> Vec<T, N> { T: Clone, { let mut new = Self::new(); - new.extend_from_slice(self.as_slice()).unwrap(); + // avoid `extend_from_slice` as that introduces a runtime check / panicking branch + for elem in self { + unsafe { + new.push_unchecked(elem.clone()); + } + } new }