optimize the codegen of Vec::clone

these changes optimize `Vec<u8, 1024>::clone` down to these operations

1. reserve the stack space (1028 bytes on 32-bit ARM) and leave it uninitialized
2. zero the `len` field
3. memcpy `len` bytes of data from the parent

analyzed source code

``` rust
use heapless::Vec;

fn clone(vec: &Vec<u8, 1024>) {
    let mut vec = vec.clone();
    black_box(&mut vec);
}

fn black_box<T>(val: &mut T) {
    unsafe { asm!("// {0}", in(reg) val) }
}
```

machine code with `lto = fat`, `codegen-units = 1` and `opt-level = 'z'` ('z' instead of 3 to avoid loop unrolling and keep the machine code readable)

``` armasm
00020100 <clone>:
   20100:              b5d0             push    {r4, r6, r7, lr}
   20102:              af02             add     r7, sp, #8
   20104:              f5ad 6d81        sub.w   sp, sp, #1032   ; 0x408
   20108:              2300             movs    r3, #0
   2010a:              c802             ldmia   r0!, {r1}
   2010c:              9301             str     r3, [sp, #4]
   2010e:              aa01             add     r2, sp, #4
   20110:       /--/-X b141             cbz     r1, 20124 <clone+0x24>
   20112:       |  |   4413             add     r3, r2
   20114:       |  |   f810 4b01        ldrb.w  r4, [r0], #1
   20118:       |  |   3901             subs    r1, #1
   2011a:       |  |   711c             strb    r4, [r3, #4]
   2011c:       |  |   9b01             ldr     r3, [sp, #4]
   2011e:       |  |   3301             adds    r3, #1
   20120:       |  |   9301             str     r3, [sp, #4]
   20122:       |  \-- e7f5             b.n     20110 <clone+0x10>
   20124:       \----> a801             add     r0, sp, #4
   20126:              f50d 6d81        add.w   sp, sp, #1032   ; 0x408
   2012a:              bdd0             pop     {r4, r6, r7, pc}
```

note that it's not optimizing step (3) to an actual `memcpy` because we lack the 'trait specialization' code that libstd uses

---

before `clone` was optimized to

1. reserve and zero (`memclr`) 1028 (!?) bytes of stack space
2. (unnecessarily) runtime check if `len` is equal or less than 1024 (capacity) -- this included a panicking branch
3. memcpy `len` bytes of data from the parent
This commit is contained in:
Jorge Aparicio 2022-05-10 12:47:14 +02:00
parent aacc35997b
commit b2268e7885
2 changed files with 17 additions and 4 deletions

View File

@ -16,6 +16,8 @@ and this project adheres to [Semantic Versioning](http://semver.org/).
### Changed
* Optimize the codegen of `Vec::clone`
### Fixed
* Inserting an item that replaces an already present item will no longer

View File

@ -34,12 +34,18 @@ use hash32;
/// assert_eq!(*vec, [7, 1, 2, 3]);
/// ```
pub struct Vec<T, const N: usize> {
buffer: [MaybeUninit<T>; N],
// NOTE order is important for optimizations. the `len` first layout lets the compiler optimize
// `new` to: reserve stack space and zero the first word. With the fields in the reverse order
// the compiler optimizes `new` to `memclr`-ing the *entire* stack space, including the `buffer`
// field which should be left uninitialized. Optimizations were last checked with Rust 1.60
len: usize,
buffer: [MaybeUninit<T>; N],
}
impl<T, const N: usize> Vec<T, N> {
const INIT: MaybeUninit<T> = MaybeUninit::uninit();
const ELEM: MaybeUninit<T> = MaybeUninit::uninit();
const INIT: [MaybeUninit<T>; N] = [Self::ELEM; N]; // important for optimization of `new`
/// Constructs a new, empty vector with a fixed capacity of `N`
///
@ -60,8 +66,8 @@ impl<T, const N: usize> Vec<T, N> {
crate::sealed::greater_than_eq_0::<N>();
Self {
buffer: [Self::INIT; N],
len: 0,
buffer: Self::INIT,
}
}
@ -92,7 +98,12 @@ impl<T, const N: usize> Vec<T, N> {
T: Clone,
{
let mut new = Self::new();
new.extend_from_slice(self.as_slice()).unwrap();
// avoid `extend_from_slice` as that introduces a runtime check / panicking branch
for elem in self {
unsafe {
new.push_unchecked(elem.clone());
}
}
new
}