mirror of
https://github.com/embassy-rs/embassy.git
synced 2025-09-30 22:01:07 +00:00
otg: Improve IN write performance
`chunks_exact()` can be handled by the compiler more efficiently than `chunks()`. The previous code was making a memcpy call for each 4-byte chunk slice. Hoisting the fifo out of the loop avoids recalculating the pointer on each iteration. In my benchmark I see a jump from ~13 MB/s to ~25 MB/s after this change (opt-level = 3); with opt-level = "z" it goes from 9 MB/s to 18 MB/s. The benchmark was run on an STM32H7S3L8 with a 600 MHz clock, 512-byte bulk writes, and data in DTCM. The benchmark isn't just USB writes; it also includes some unrelated memcpys for packet construction.
This commit is contained in:
parent
f53b6649dd
commit
e2ceb2b1f7
@ -1210,10 +1210,23 @@ impl<'d> embassy_usb_driver::EndpointIn for Endpoint<'d, In> {
|
||||
});
|
||||
|
||||
// Write data to FIFO
|
||||
for chunk in buf.chunks(4) {
|
||||
let chunks = buf.chunks_exact(4);
|
||||
// Stash the last partial chunk
|
||||
let rem = chunks.remainder();
|
||||
let last_chunk = (!rem.is_empty()).then(|| {
|
||||
let mut tmp = [0u8; 4];
|
||||
tmp[0..chunk.len()].copy_from_slice(chunk);
|
||||
self.regs.fifo(index).write_value(regs::Fifo(u32::from_ne_bytes(tmp)));
|
||||
tmp[0..rem.len()].copy_from_slice(rem);
|
||||
u32::from_ne_bytes(tmp)
|
||||
});
|
||||
|
||||
let fifo = self.regs.fifo(index);
|
||||
for chunk in chunks {
|
||||
let val = u32::from_ne_bytes(chunk.try_into().unwrap());
|
||||
fifo.write_value(regs::Fifo(val));
|
||||
}
|
||||
// Write any last chunk
|
||||
if let Some(val) = last_chunk {
|
||||
fifo.write_value(regs::Fifo(val));
|
||||
}
|
||||
});
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user