revamp pool API

This commit is contained in:
Jorge Aparicio 2022-08-12 16:42:08 +02:00
parent 5229dea4e7
commit fb3f34c04a
16 changed files with 2003 additions and 1858 deletions

View File

@ -10,11 +10,18 @@ and this project adheres to [Semantic Versioning](http://semver.org/).
### Added
- Add `Clone` and `PartialEq` implementations to `HistoryBuffer`.
- Added an object pool API; see the `pool::object` module-level documentation for details
### Changed
- [breaking-change] `IndexMap` and `IndexSet` now require that keys implement the `core::hash::Hash`
trait instead of the `hash32::Hash` (v0.2.0) trait
- moved `pool::singleton::Box` to the `pool::boxed` module
- renamed `pool::singleton::Pool` to `BoxPool` and moved it into the `pool::boxed` module
- moved `pool::singleton::arc::Arc` to the `pool::arc` module
- renamed `pool::singleton::arc::Pool` to `ArcPool` and moved it into the `pool::arc` module
- [breaking-change] changed the target support of memory pool API to only support 32-bit x86 and a
subset of ARM targets. See the module level documentation of the `pool` module for details
- [breaking-change] this crate now depends on `atomic-polyfill` v1.0.1, meaning that targets that
require a polyfill need a `critical-section` **v1.x.x** implementation.
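As a quick orientation for the renames above, here is a minimal sketch (illustrative, not part of the changelog) of the new-style API introduced by this commit, using the `box_pool!` macro and the `pool::boxed` module added further down in this diff:

```rust
use heapless::{box_pool, pool::boxed::{Box, BoxBlock}};

// previously: `pool!(P: u128)` plus `pool::singleton::{Pool, Box}`
box_pool!(P: u128);

fn example() {
    // hand a statically allocated block to the pool (replaces the old `grow`)
    P.manage(unsafe {
        static mut B: BoxBlock<u128> = BoxBlock::new();
        &mut B
    });

    // allocation is fallible and returns the value on pool exhaustion
    let boxed: Box<P> = P.alloc(1).unwrap();
    drop(boxed); // the destructor returns the block to the pool
}
```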
@ -26,6 +33,11 @@ and this project adheres to [Semantic Versioning](http://semver.org/).
- [breaking-change] this crate no longer has a Minimum Supported Rust Version (MSRV) guarantee and
should be used with the latest stable version of the Rust toolchain.
- [breaking-change] removed the `Init` and `Uninit` type states from `pool::singleton::Box`
- [breaking-change] removed the following `pool::singleton::Box` methods: `freeze`, `forget` and `init`
- [breaking-change] removed the `pool::singleton::arc::ArcInner` type
- [breaking-change] removed support for attributes from `pool!` and `arc_pool!`
## [v0.7.16] - 2022-08-09
### Added

View File

@ -18,8 +18,6 @@ version = "0.8.0"
default = ["cas"]
cas = ["atomic-polyfill"]
ufmt-impl = ["ufmt-write"]
# read the docs before enabling: makes `Pool` Sync on x86_64
x86-sync-pool = []
# only for tests
__trybuild = []
# Enable larger MPMC sizes.

View File

@ -1,6 +1,12 @@
#![deny(warnings)]
use std::{env, error::Error};
use std::{
env,
error::Error,
fs,
path::Path,
process::{Command, ExitStatus, Stdio},
};
use rustc_version::Channel;
@ -89,5 +95,63 @@ fn main() -> Result<(), Box<dyn Error>> {
println!("cargo:rustc-cfg=unstable_channel");
}
match compile_probe(ARM_LLSC_PROBE) {
Some(status) if status.success() => println!("cargo:rustc-cfg=arm_llsc"),
_ => {}
}
Ok(())
}
const ARM_LLSC_PROBE: &str = r#"
#![no_std]
// `no_mangle` forces codegen, which makes LLVM check the contents of the `asm!` macro
#[no_mangle]
unsafe fn asm() {
core::arch::asm!("clrex");
}
"#;
// this function was taken from anyhow v1.0.63 build script
// https://crates.io/crates/anyhow/1.0.63 (last visited 2022-09-02)
// the code is licensed under 'MIT or APACHE-2.0'
fn compile_probe(source: &str) -> Option<ExitStatus> {
let rustc = env::var_os("RUSTC")?;
let out_dir = env::var_os("OUT_DIR")?;
let probefile = Path::new(&out_dir).join("probe.rs");
fs::write(&probefile, source).ok()?;
// Make sure to pick up Cargo rustc configuration.
let mut cmd = if let Some(wrapper) = env::var_os("RUSTC_WRAPPER") {
let mut cmd = Command::new(wrapper);
// The wrapper's first argument is supposed to be the path to rustc.
cmd.arg(rustc);
cmd
} else {
Command::new(rustc)
};
cmd.stderr(Stdio::null())
.arg("--edition=2018")
.arg("--crate-name=probe")
.arg("--crate-type=lib")
.arg("--out-dir")
.arg(out_dir)
.arg(probefile);
if let Some(target) = env::var_os("TARGET") {
cmd.arg("--target").arg(target);
}
// If Cargo wants to set RUSTFLAGS, use that.
if let Ok(rustflags) = env::var("CARGO_ENCODED_RUSTFLAGS") {
if !rustflags.is_empty() {
for arg in rustflags.split('\x1f') {
cmd.arg(arg);
}
}
}
cmd.status().ok()
}
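For context on the `'\x1f'` split above: Cargo passes extra flags to build-script probes through `CARGO_ENCODED_RUSTFLAGS` as a single string delimited by the 0x1F (unit separator) byte, so the probe is compiled with the same flags as the real build. A small sketch of that encoding (the flag values are made up for illustration):

```rust
fn main() {
    // hypothetical flags, joined the way Cargo encodes them
    let encoded = ["--cfg", "foo", "-C", "target-cpu=native"].join("\x1f");
    let decoded: Vec<&str> = encoded.split('\x1f').collect();
    assert_eq!(vec!["--cfg", "foo", "-C", "target-cpu=native"], decoded);
}
```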

View File

@ -43,12 +43,15 @@
//!
//! List of currently implemented data structures:
//!
//! - [`Arc`](pool/singleton/arc/struct.Arc.html) -- Thread-safe reference-counting pointer backed by a memory pool
//! - [`Arc`](pool/arc/index.html) -- like `std::sync::Arc` but backed by a lock-free memory pool
//! rather than `#[global_allocator]`
//! - [`Box`](pool/boxed/index.html) -- like `std::boxed::Box` but backed by a lock-free memory pool
//! rather than `#[global_allocator]`
//! - [`BinaryHeap`](binary_heap/struct.BinaryHeap.html) -- priority queue
//! - [`IndexMap`](struct.IndexMap.html) -- hash table
//! - [`IndexSet`](struct.IndexSet.html) -- hash set
//! - [`LinearMap`](struct.LinearMap.html)
//! - [`Pool`](pool/struct.Pool.html) -- lock-free memory pool
//! - [`Object`](pool/object/index.html) -- objects managed by an object pool
//! - [`String`](struct.String.html)
//! - [`Vec`](struct.Vec.html)
//! - [`mpmc::Q*`](mpmc/index.html) -- multiple producer multiple consumer lock-free queue
@ -83,8 +86,6 @@ pub use histbuf::{HistoryBuffer, OldestOrdered};
pub use indexmap::{Bucket, Entry, FnvIndexMap, IndexMap, OccupiedEntry, Pos, VacantEntry};
pub use indexset::{FnvIndexSet, IndexSet};
pub use linear_map::LinearMap;
#[cfg(all(has_cas, feature = "cas"))]
pub use pool::singleton::arc::Arc;
pub use string::String;
pub use vec::Vec;
@ -110,7 +111,7 @@ pub mod binary_heap;
mod defmt;
#[cfg(all(has_cas, feature = "cas"))]
pub mod mpmc;
#[cfg(all(has_cas, feature = "cas"))]
#[cfg(any(arm_llsc, target_arch = "x86"))]
pub mod pool;
pub mod sorted_linked_list;
#[cfg(has_atomics)]

59
src/pool.rs Normal file
View File

@ -0,0 +1,59 @@
//! Memory and object pools
//!
//! # Target support
//!
//! This module / API is only available on these compilation targets:
//!
//! - ARM architectures whose instruction set includes the LDREX, CLREX and STREX instructions, e.g.
//! `thumbv7m-none-eabi` but not `thumbv6m-none-eabi`
//! - 32-bit x86, e.g. `i686-unknown-linux-gnu`
//!
//! # Benchmarks
//!
//! - compilation settings
//! - `codegen-units = 1`
//! - `lto = 'fat'`
//! - `opt-level = 'z'`
//! - compilation target: `thumbv7em-none-eabihf`
//! - CPU: ARM Cortex-M4F
//!
//! - test program:
//!
//! ``` no_run
//! use heapless::box_pool;
//!
//! box_pool!(P: ()); // or `arc_pool!` or `object_pool!`
//!
//! bkpt();
//! let res = P.alloc(());
//! bkpt();
//!
//! if let Ok(boxed) = res {
//! bkpt();
//! drop(boxed);
//! bkpt();
//! }
//! # fn bkpt() {}
//! ```
//!
//! - measurement method: the cycle counter (CYCCNT) register was sampled each time a breakpoint
//! (`bkpt`) was hit. The difference between the "after" and the "before" value of CYCCNT yields the
//! execution time in clock cycles.
//!
//! | API | clock cycles |
//! |------------------------------|--------------|
//! | `BoxPool::alloc` | 23 |
//! | `pool::boxed::Box::drop` | 23 |
//! | `ArcPool::alloc` | 28 |
//! | `pool::arc::Arc::drop` | 59 |
//! | `ObjectPool::request` | 23 |
//! | `pool::object::Object::drop` | 23 |
//!
//! Note that the execution time does not include `T`'s initialization or `T`'s destructor, which will
//! be present in the general case for `Box` and `Arc`.
mod treiber;
pub mod arc;
pub mod boxed;
pub mod object;
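For reference, a minimal sketch of the measurement harness described in the module documentation above (not part of this commit; it assumes a Cortex-M target whose DWT cycle counter at `0xE000_1004` has already been enabled by the runtime or debugger):

```rust
// read the free-running DWT cycle counter (CYCCNT) around a region of interest
const DWT_CYCCNT: *const u32 = 0xE000_1004 as *const u32;

fn cycles() -> u32 {
    unsafe { core::ptr::read_volatile(DWT_CYCCNT) }
}

fn measure<R>(f: impl FnOnce() -> R) -> (R, u32) {
    let before = cycles();
    let result = f();
    let after = cycles();
    // wrapping_sub tolerates the 32-bit counter rolling over between samples
    (result, after.wrapping_sub(before))
}
```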

503
src/pool/arc.rs Normal file
View File

@ -0,0 +1,503 @@
//! `std::sync::Arc`-like API on top of a lock-free memory pool
//!
//! # Example usage
//!
//! ```
//! use heapless::{arc_pool, pool::arc::{Arc, ArcBlock}};
//!
//! arc_pool!(P: u128);
//!
//! // cannot allocate without first giving memory blocks to the pool
//! assert!(P.alloc(42).is_err());
//!
//! // (some `no_std` runtimes have safe APIs to create `&'static mut` references)
//! let block: &'static mut ArcBlock<u128> = unsafe {
//! static mut B: ArcBlock<u128> = ArcBlock::new();
//! &mut B
//! };
//!
//! P.manage(block);
//!
//! let arc = P.alloc(1).unwrap();
//!
//! // number of smart pointers is limited to the number of blocks managed by the pool
//! let res = P.alloc(2);
//! assert!(res.is_err());
//!
//! // but cloning does not consume an `ArcBlock`
//! let arc2 = arc.clone();
//!
//! assert_eq!(1, *arc2);
//!
//! // `arc`'s destructor returns the memory block to the pool
//! drop(arc2); // decrease reference counter
//! drop(arc); // release memory
//!
//! // it's now possible to allocate a new `Arc` smart pointer
//! let res = P.alloc(3);
//!
//! assert!(res.is_ok());
//! ```
// reference counting logic is based on version 1.63.0 of the Rust standard library (`alloc` crate)
// which is licensed under 'MIT or APACHE-2.0'
// https://github.com/rust-lang/rust/blob/1.63.0/library/alloc/src/sync.rs#L235 (last visited
// 2022-09-05)
use core::{
fmt,
hash::{Hash, Hasher},
mem::{ManuallyDrop, MaybeUninit},
ops, ptr,
sync::atomic::{self, AtomicUsize, Ordering},
};
use super::treiber::{NonNullPtr, Stack, UnionNode};
/// Creates a new `ArcPool` singleton with the given `$name` that manages the specified `$data_type`
///
/// For more extensive documentation see the [module level documentation](pool/arc/index.html)
#[macro_export]
macro_rules! arc_pool {
($name:ident: $data_type:ty) => {
pub struct $name;
impl $crate::pool::arc::ArcPool for $name {
type Data = $data_type;
fn singleton() -> &'static $crate::pool::arc::ArcPoolImpl<$data_type> {
static $name: $crate::pool::arc::ArcPoolImpl<$data_type> =
$crate::pool::arc::ArcPoolImpl::new();
&$name
}
}
impl $name {
/// Inherent method version of `ArcPool::alloc`
#[allow(dead_code)]
pub fn alloc(
&self,
value: $data_type,
) -> Result<$crate::pool::arc::Arc<$name>, $data_type> {
<$name as $crate::pool::arc::ArcPool>::alloc(value)
}
/// Inherent method version of `ArcPool::manage`
#[allow(dead_code)]
pub fn manage(&self, block: &'static mut $crate::pool::arc::ArcBlock<$data_type>) {
<$name as $crate::pool::arc::ArcPool>::manage(block)
}
}
};
}
/// A singleton that manages `pool::arc::Arc` smart pointers
pub trait ArcPool: Sized {
/// The data type managed by the memory pool
type Data: 'static;
/// `arc_pool!` implementation detail
#[doc(hidden)]
fn singleton() -> &'static ArcPoolImpl<Self::Data>;
/// Allocate a new `Arc` smart pointer initialized to the given `value`
///
/// `manage` should be called at least once before calling `alloc`
///
/// # Errors
///
/// The `Err`or variant is returned when the memory pool has run out of memory blocks
fn alloc(value: Self::Data) -> Result<Arc<Self>, Self::Data> {
Ok(Arc {
node_ptr: Self::singleton().alloc(value)?,
})
}
/// Add a statically allocated memory block to the memory pool
fn manage(block: &'static mut ArcBlock<Self::Data>) {
Self::singleton().manage(block)
}
}
/// `arc_pool!` implementation detail
// newtype to avoid having to make field types public
#[doc(hidden)]
pub struct ArcPoolImpl<T> {
stack: Stack<UnionNode<MaybeUninit<ArcInner<T>>>>,
}
impl<T> ArcPoolImpl<T> {
/// `arc_pool!` implementation detail
#[doc(hidden)]
pub const fn new() -> Self {
Self {
stack: Stack::new(),
}
}
fn alloc(&self, value: T) -> Result<NonNullPtr<UnionNode<MaybeUninit<ArcInner<T>>>>, T> {
if let Some(node_ptr) = self.stack.try_pop() {
let inner = ArcInner {
data: value,
strong: AtomicUsize::new(1),
};
unsafe { node_ptr.as_ptr().cast::<ArcInner<T>>().write(inner) }
Ok(node_ptr)
} else {
Err(value)
}
}
fn manage(&self, block: &'static mut ArcBlock<T>) {
let node: &'static mut _ = &mut block.node;
unsafe { self.stack.push(NonNullPtr::from_static_mut_ref(node)) }
}
}
unsafe impl<T> Sync for ArcPoolImpl<T> {}
/// Like `std::sync::Arc` but managed by memory pool `P`
pub struct Arc<P>
where
P: ArcPool,
{
node_ptr: NonNullPtr<UnionNode<MaybeUninit<ArcInner<P::Data>>>>,
}
impl<P> Arc<P>
where
P: ArcPool,
{
fn inner(&self) -> &ArcInner<P::Data> {
unsafe { &*self.node_ptr.as_ptr().cast::<ArcInner<P::Data>>() }
}
fn from_inner(node_ptr: NonNullPtr<UnionNode<MaybeUninit<ArcInner<P::Data>>>>) -> Self {
Self { node_ptr }
}
unsafe fn get_mut_unchecked(this: &mut Self) -> &mut P::Data {
&mut *ptr::addr_of_mut!((*this.node_ptr.as_ptr().cast::<ArcInner<P::Data>>()).data)
}
#[inline(never)]
unsafe fn drop_slow(&mut self) {
// run `P::Data`'s destructor
ptr::drop_in_place(Self::get_mut_unchecked(self));
// return memory to pool
P::singleton().stack.push(self.node_ptr);
}
}
impl<P> AsRef<P::Data> for Arc<P>
where
P: ArcPool,
{
fn as_ref(&self) -> &P::Data {
&**self
}
}
const MAX_REFCOUNT: usize = (isize::MAX) as usize;
impl<P> Clone for Arc<P>
where
P: ArcPool,
{
fn clone(&self) -> Self {
let old_size = self.inner().strong.fetch_add(1, Ordering::Relaxed);
if old_size > MAX_REFCOUNT {
// XXX original code calls `intrinsics::abort` which is unstable API
panic!();
}
Self::from_inner(self.node_ptr)
}
}
impl<A> fmt::Debug for Arc<A>
where
A: ArcPool,
A::Data: fmt::Debug,
{
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
A::Data::fmt(self, f)
}
}
impl<P> ops::Deref for Arc<P>
where
P: ArcPool,
{
type Target = P::Data;
fn deref(&self) -> &Self::Target {
unsafe { &*ptr::addr_of!((*self.node_ptr.as_ptr().cast::<ArcInner<P::Data>>()).data) }
}
}
impl<A> fmt::Display for Arc<A>
where
A: ArcPool,
A::Data: fmt::Display,
{
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
A::Data::fmt(self, f)
}
}
impl<A> Drop for Arc<A>
where
A: ArcPool,
{
fn drop(&mut self) {
if self.inner().strong.fetch_sub(1, Ordering::Release) != 1 {
return;
}
atomic::fence(Ordering::Acquire);
unsafe { self.drop_slow() }
}
}
impl<A> Eq for Arc<A>
where
A: ArcPool,
A::Data: Eq,
{
}
impl<A> Hash for Arc<A>
where
A: ArcPool,
A::Data: Hash,
{
fn hash<H>(&self, state: &mut H)
where
H: Hasher,
{
(**self).hash(state)
}
}
impl<A> Ord for Arc<A>
where
A: ArcPool,
A::Data: Ord,
{
fn cmp(&self, other: &Self) -> core::cmp::Ordering {
A::Data::cmp(self, other)
}
}
impl<A, B> PartialEq<Arc<B>> for Arc<A>
where
A: ArcPool,
B: ArcPool,
A::Data: PartialEq<B::Data>,
{
fn eq(&self, other: &Arc<B>) -> bool {
A::Data::eq(self, &**other)
}
}
impl<A, B> PartialOrd<Arc<B>> for Arc<A>
where
A: ArcPool,
B: ArcPool,
A::Data: PartialOrd<B::Data>,
{
fn partial_cmp(&self, other: &Arc<B>) -> Option<core::cmp::Ordering> {
A::Data::partial_cmp(self, &**other)
}
}
unsafe impl<A> Send for Arc<A>
where
A: ArcPool,
A::Data: Sync + Send,
{
}
unsafe impl<A> Sync for Arc<A>
where
A: ArcPool,
A::Data: Sync + Send,
{
}
impl<A> Unpin for Arc<A> where A: ArcPool {}
struct ArcInner<T> {
data: T,
strong: AtomicUsize,
}
/// A chunk of memory that an `ArcPool` can manage
pub struct ArcBlock<T> {
node: UnionNode<MaybeUninit<ArcInner<T>>>,
}
impl<T> ArcBlock<T> {
/// Creates a new memory block
pub const fn new() -> Self {
Self {
node: UnionNode {
data: ManuallyDrop::new(MaybeUninit::uninit()),
},
}
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn cannot_alloc_if_empty() {
arc_pool!(P: i32);
assert_eq!(Err(42), P.alloc(42));
}
#[test]
fn can_alloc_if_manages_one_block() {
arc_pool!(P: i32);
let block = unsafe {
static mut B: ArcBlock<i32> = ArcBlock::new();
&mut B
};
P.manage(block);
assert_eq!(42, *P.alloc(42).unwrap());
}
#[test]
fn alloc_drop_alloc() {
arc_pool!(P: i32);
let block = unsafe {
static mut B: ArcBlock<i32> = ArcBlock::new();
&mut B
};
P.manage(block);
let arc = P.alloc(1).unwrap();
drop(arc);
assert_eq!(2, *P.alloc(2).unwrap());
}
#[test]
fn strong_count_starts_at_one() {
arc_pool!(P: i32);
let block = unsafe {
static mut B: ArcBlock<i32> = ArcBlock::new();
&mut B
};
P.manage(block);
let arc = P.alloc(1).ok().unwrap();
assert_eq!(1, arc.inner().strong.load(Ordering::Relaxed));
}
#[test]
fn clone_increases_strong_count() {
arc_pool!(P: i32);
let block = unsafe {
static mut B: ArcBlock<i32> = ArcBlock::new();
&mut B
};
P.manage(block);
let arc = P.alloc(1).ok().unwrap();
let before = arc.inner().strong.load(Ordering::Relaxed);
let arc2 = arc.clone();
let expected = before + 1;
assert_eq!(expected, arc.inner().strong.load(Ordering::Relaxed));
assert_eq!(expected, arc2.inner().strong.load(Ordering::Relaxed));
}
#[test]
fn drop_decreases_strong_count() {
arc_pool!(P: i32);
let block = unsafe {
static mut B: ArcBlock<i32> = ArcBlock::new();
&mut B
};
P.manage(block);
let arc = P.alloc(1).ok().unwrap();
let arc2 = arc.clone();
let before = arc.inner().strong.load(Ordering::Relaxed);
drop(arc);
let expected = before - 1;
assert_eq!(expected, arc2.inner().strong.load(Ordering::Relaxed));
}
#[test]
fn runs_destructor_exactly_once_when_strong_count_reaches_zero() {
static COUNT: AtomicUsize = AtomicUsize::new(0);
pub struct S;
impl Drop for S {
fn drop(&mut self) {
COUNT.fetch_add(1, Ordering::Relaxed);
}
}
arc_pool!(P: S);
let block = unsafe {
static mut B: ArcBlock<S> = ArcBlock::new();
&mut B
};
P.manage(block);
let arc = P.alloc(S).ok().unwrap();
assert_eq!(0, COUNT.load(Ordering::Relaxed));
drop(arc);
assert_eq!(1, COUNT.load(Ordering::Relaxed));
}
#[test]
fn zst_is_well_aligned() {
#[repr(align(4096))]
pub struct Zst4096;
arc_pool!(P: Zst4096);
let block = unsafe {
static mut B: ArcBlock<Zst4096> = ArcBlock::new();
&mut B
};
P.manage(block);
let arc = P.alloc(Zst4096).ok().unwrap();
let raw = &*arc as *const Zst4096;
assert_eq!(0, raw as usize % 4096);
}
}
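A small addition to the module example above (an illustrative sketch, not part of this commit): each `ArcBlock` handed to `manage` allows one more `alloc` to succeed at a time (clones share their block), so a pool is typically seeded with several blocks at start-up:

```rust
use heapless::{arc_pool, pool::arc::ArcBlock};

arc_pool!(P: u128);

fn seed_pool() {
    // two statically allocated blocks -> up to two live allocations at any time
    // (a `no_std` runtime with a safe `&'static mut` API can avoid the `unsafe`)
    P.manage(unsafe {
        static mut B0: ArcBlock<u128> = ArcBlock::new();
        &mut B0
    });
    P.manage(unsafe {
        static mut B1: ArcBlock<u128> = ArcBlock::new();
        &mut B1
    });
}
```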

533
src/pool/boxed.rs Normal file
View File

@ -0,0 +1,533 @@
//! `std::boxed::Box`-like API on top of a lock-free memory pool
//!
//! # Example usage
//!
//! ```
//! use heapless::{box_pool, pool::boxed::{Box, BoxBlock}};
//!
//! box_pool!(P: u128);
//!
//! // cannot allocate without first giving memory blocks to the pool
//! assert!(P.alloc(42).is_err());
//!
//! // (some `no_std` runtimes have safe APIs to create `&'static mut` references)
//! let block: &'static mut BoxBlock<u128> = unsafe {
//! static mut B: BoxBlock<u128> = BoxBlock::new();
//! &mut B
//! };
//!
//! // give block of memory to the pool
//! P.manage(block);
//!
//! // it's now possible to allocate
//! let mut boxed = P.alloc(1).unwrap();
//!
//! // mutation is possible
//! *boxed += 1;
//! assert_eq!(2, *boxed);
//!
//! // number of boxes is limited to the number of blocks managed by the pool
//! let res = P.alloc(3);
//! assert!(res.is_err());
//!
//! // give another memory block to the pool
//! P.manage(unsafe {
//! static mut B: BoxBlock<u128> = BoxBlock::new();
//! &mut B
//! });
//!
//! // cloning also consumes a memory block from the pool
//! let mut separate_box = boxed.clone();
//! *separate_box += 1;
//! assert_eq!(3, *separate_box);
//!
//! // after the clone it's not possible to allocate again
//! let res = P.alloc(4);
//! assert!(res.is_err());
//!
//! // `boxed`'s destructor returns the memory block to the pool
//! drop(boxed);
//!
//! // it's possible to allocate again
//! let res = P.alloc(5);
//!
//! assert!(res.is_ok());
//! ```
use core::{
fmt,
hash::{Hash, Hasher},
mem::{ManuallyDrop, MaybeUninit},
ops, ptr,
};
use super::treiber::{NonNullPtr, Stack, UnionNode};
/// Creates a new `BoxPool` singleton with the given `$name` that manages the specified `$data_type`
///
/// For more extensive documentation see the [module level documentation](pool/boxed/index.html)
#[macro_export]
macro_rules! box_pool {
($name:ident: $data_type:ty) => {
pub struct $name;
impl $crate::pool::boxed::BoxPool for $name {
type Data = $data_type;
fn singleton() -> &'static $crate::pool::boxed::BoxPoolImpl<$data_type> {
static $name: $crate::pool::boxed::BoxPoolImpl<$data_type> =
$crate::pool::boxed::BoxPoolImpl::new();
&$name
}
}
impl $name {
/// Inherent method version of `BoxPool::alloc`
#[allow(dead_code)]
pub fn alloc(
&self,
value: $data_type,
) -> Result<$crate::pool::boxed::Box<$name>, $data_type> {
<$name as $crate::pool::boxed::BoxPool>::alloc(value)
}
/// Inherent method version of `BoxPool::manage`
#[allow(dead_code)]
pub fn manage(&self, block: &'static mut $crate::pool::boxed::BoxBlock<$data_type>) {
<$name as $crate::pool::boxed::BoxPool>::manage(block)
}
}
};
}
/// A singleton that manages `pool::boxed::Box`-es
///
/// # Usage
///
/// Do not implement this trait yourself; instead use the `box_pool!` macro to create a type that
/// implements this trait.
///
/// # Semver guarantees
///
/// *Implementing* this trait is exempt from semver guarantees,
/// i.e. a new patch release is allowed to break downstream `BoxPool` implementations.
///
/// *Using* the trait, e.g. in generic code, does fall under semver guarantees.
pub trait BoxPool: Sized {
/// The data type managed by the memory pool
type Data: 'static;
/// `box_pool!` implementation detail
#[doc(hidden)]
fn singleton() -> &'static BoxPoolImpl<Self::Data>;
/// Allocate a new `Box` initialized to the given `value`
///
/// `manage` should be called at least once before calling `alloc`
///
/// # Errors
///
/// The `Err`or variant is returned when the memory pool has run out of memory blocks
fn alloc(value: Self::Data) -> Result<Box<Self>, Self::Data> {
Ok(Box {
node_ptr: Self::singleton().alloc(value)?,
})
}
/// Add a statically allocated memory block to the memory pool
fn manage(block: &'static mut BoxBlock<Self::Data>) {
Self::singleton().manage(block)
}
}
/// Like `std::boxed::Box` but managed by memory pool `P` rather than `#[global_allocator]`
pub struct Box<P>
where
P: BoxPool,
{
node_ptr: NonNullPtr<UnionNode<MaybeUninit<P::Data>>>,
}
impl<A> Clone for Box<A>
where
A: BoxPool,
A::Data: Clone,
{
fn clone(&self) -> Self {
A::alloc((**self).clone()).ok().expect("OOM")
}
}
impl<A> fmt::Debug for Box<A>
where
A: BoxPool,
A::Data: fmt::Debug,
{
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
A::Data::fmt(self, f)
}
}
impl<P> ops::Deref for Box<P>
where
P: BoxPool,
{
type Target = P::Data;
fn deref(&self) -> &Self::Target {
unsafe { &*self.node_ptr.as_ptr().cast::<P::Data>() }
}
}
impl<P> ops::DerefMut for Box<P>
where
P: BoxPool,
{
fn deref_mut(&mut self) -> &mut Self::Target {
unsafe { &mut *self.node_ptr.as_ptr().cast::<P::Data>() }
}
}
impl<A> fmt::Display for Box<A>
where
A: BoxPool,
A::Data: fmt::Display,
{
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
A::Data::fmt(self, f)
}
}
impl<P> Drop for Box<P>
where
P: BoxPool,
{
fn drop(&mut self) {
let node = self.node_ptr;
unsafe { ptr::drop_in_place(node.as_ptr().cast::<P::Data>()) }
unsafe { P::singleton().stack.push(node) }
}
}
impl<A> Eq for Box<A>
where
A: BoxPool,
A::Data: Eq,
{
}
impl<A> Hash for Box<A>
where
A: BoxPool,
A::Data: Hash,
{
fn hash<H>(&self, state: &mut H)
where
H: Hasher,
{
(**self).hash(state)
}
}
impl<A> Ord for Box<A>
where
A: BoxPool,
A::Data: Ord,
{
fn cmp(&self, other: &Self) -> core::cmp::Ordering {
A::Data::cmp(self, other)
}
}
impl<A, B> PartialEq<Box<B>> for Box<A>
where
A: BoxPool,
B: BoxPool,
A::Data: PartialEq<B::Data>,
{
fn eq(&self, other: &Box<B>) -> bool {
A::Data::eq(self, other)
}
}
impl<A, B> PartialOrd<Box<B>> for Box<A>
where
A: BoxPool,
B: BoxPool,
A::Data: PartialOrd<B::Data>,
{
fn partial_cmp(&self, other: &Box<B>) -> Option<core::cmp::Ordering> {
A::Data::partial_cmp(self, other)
}
}
unsafe impl<P> Send for Box<P>
where
P: BoxPool,
P::Data: Send,
{
}
unsafe impl<P> Sync for Box<P>
where
P: BoxPool,
P::Data: Sync,
{
}
/// `box_pool!` implementation detail
// newtype to avoid having to make field types public
#[doc(hidden)]
pub struct BoxPoolImpl<T> {
stack: Stack<UnionNode<MaybeUninit<T>>>,
}
impl<T> BoxPoolImpl<T> {
pub const fn new() -> Self {
Self {
stack: Stack::new(),
}
}
fn alloc(&self, value: T) -> Result<NonNullPtr<UnionNode<MaybeUninit<T>>>, T> {
if let Some(node_ptr) = self.stack.try_pop() {
unsafe { node_ptr.as_ptr().cast::<T>().write(value) }
Ok(node_ptr)
} else {
Err(value)
}
}
fn manage(&self, block: &'static mut BoxBlock<T>) {
let node: &'static mut _ = &mut block.node;
unsafe { self.stack.push(NonNullPtr::from_static_mut_ref(node)) }
}
}
unsafe impl<T> Sync for BoxPoolImpl<T> {}
/// A chunk of memory that a `BoxPool` singleton can manage
pub struct BoxBlock<T> {
node: UnionNode<MaybeUninit<T>>,
}
impl<T> BoxBlock<T> {
/// Creates a new memory block
pub const fn new() -> Self {
Self {
node: UnionNode {
data: ManuallyDrop::new(MaybeUninit::uninit()),
},
}
}
}
#[cfg(test)]
mod tests {
use core::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
use std::thread;
use super::*;
#[test]
fn cannot_alloc_if_empty() {
box_pool!(P: i32);
assert_eq!(Err(42), P.alloc(42));
}
#[test]
fn can_alloc_if_pool_manages_one_block() {
box_pool!(P: i32);
let block = unsafe {
static mut B: BoxBlock<i32> = BoxBlock::new();
&mut B
};
P.manage(block);
assert_eq!(42, *P.alloc(42).unwrap());
}
#[test]
fn alloc_drop_alloc() {
box_pool!(P: i32);
let block = unsafe {
static mut B: BoxBlock<i32> = BoxBlock::new();
&mut B
};
P.manage(block);
let boxed = P.alloc(1).unwrap();
drop(boxed);
assert_eq!(2, *P.alloc(2).unwrap());
}
#[test]
fn runs_destructor_exactly_once_on_drop() {
static COUNT: AtomicUsize = AtomicUsize::new(0);
pub struct S;
impl Drop for S {
fn drop(&mut self) {
COUNT.fetch_add(1, Ordering::Relaxed);
}
}
box_pool!(P: S);
let block = unsafe {
static mut B: BoxBlock<S> = BoxBlock::new();
&mut B
};
P.manage(block);
let boxed = P.alloc(S).ok().unwrap();
assert_eq!(0, COUNT.load(Ordering::Relaxed));
drop(boxed);
assert_eq!(1, COUNT.load(Ordering::Relaxed));
}
#[test]
fn zst_is_well_aligned() {
#[repr(align(4096))]
pub struct Zst4096;
box_pool!(P: Zst4096);
let block = unsafe {
static mut B: BoxBlock<Zst4096> = BoxBlock::new();
&mut B
};
P.manage(block);
let boxed = P.alloc(Zst4096).ok().unwrap();
let raw = &*boxed as *const Zst4096;
assert_eq!(0, raw as usize % 4096);
}
#[allow(clippy::redundant_clone)]
#[test]
fn can_clone_if_pool_is_not_exhausted() {
static STRUCT_CLONE_WAS_CALLED: AtomicBool = AtomicBool::new(false);
pub struct S;
impl Clone for S {
fn clone(&self) -> Self {
STRUCT_CLONE_WAS_CALLED.store(true, Ordering::Relaxed);
Self
}
}
box_pool!(P: S);
P.manage(unsafe {
static mut B: BoxBlock<S> = BoxBlock::new();
&mut B
});
P.manage(unsafe {
static mut B: BoxBlock<S> = BoxBlock::new();
&mut B
});
let first = P.alloc(S).ok().unwrap();
let _second = first.clone();
assert!(STRUCT_CLONE_WAS_CALLED.load(Ordering::Relaxed));
let is_oom = P.alloc(S).is_err();
assert!(is_oom);
}
#[allow(clippy::redundant_clone)]
#[test]
fn clone_panics_if_pool_exhausted() {
static STRUCT_CLONE_WAS_CALLED: AtomicBool = AtomicBool::new(false);
pub struct S;
impl Clone for S {
fn clone(&self) -> Self {
STRUCT_CLONE_WAS_CALLED.store(true, Ordering::Relaxed);
Self
}
}
box_pool!(P: S);
P.manage(unsafe {
static mut B: BoxBlock<S> = BoxBlock::new();
&mut B
});
let first = P.alloc(S).ok().unwrap();
let thread = thread::spawn(move || {
let _second = first.clone();
});
let thread_panicked = thread.join().is_err();
assert!(thread_panicked);
// we diverge from `alloc::Box<T>` in that we call `T::clone` first and then request
// memory from the allocator whereas `alloc::Box<T>` does it the other way around
// assert!(!STRUCT_CLONE_WAS_CALLED.load(Ordering::Relaxed));
}
#[allow(clippy::redundant_clone)]
#[test]
fn panicking_clone_does_not_leak_memory() {
static STRUCT_CLONE_WAS_CALLED: AtomicBool = AtomicBool::new(false);
pub struct S;
impl Clone for S {
fn clone(&self) -> Self {
STRUCT_CLONE_WAS_CALLED.store(true, Ordering::Relaxed);
panic!()
}
}
box_pool!(P: S);
P.manage(unsafe {
static mut B: BoxBlock<S> = BoxBlock::new();
&mut B
});
P.manage(unsafe {
static mut B: BoxBlock<S> = BoxBlock::new();
&mut B
});
let boxed = P.alloc(S).ok().unwrap();
let thread = thread::spawn(move || {
let _boxed = boxed.clone();
});
let thread_panicked = thread.join().is_err();
assert!(thread_panicked);
assert!(STRUCT_CLONE_WAS_CALLED.load(Ordering::Relaxed));
let once = P.alloc(S);
let twice = P.alloc(S);
assert!(once.is_ok());
assert!(twice.is_ok());
}
}
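As the `Clone` impl and the tests above show, `Box::clone` panics ("OOM") when the pool is exhausted. A caller that wants to handle exhaustion instead can clone the value and go through `alloc` directly; an illustrative sketch (not part of this commit):

```rust
use heapless::{box_pool, pool::boxed::Box};

box_pool!(P: u128);

// fallible alternative to `Box::clone`: surfaces pool exhaustion as an `Err`
// instead of panicking
fn try_clone(boxed: &Box<P>) -> Result<Box<P>, u128> {
    // `Box<P>` derefs to `u128`; a non-`Copy` payload would need `(**boxed).clone()`
    P.alloc(**boxed)
}
```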

View File

@ -1,248 +0,0 @@
//! Stack based on CAS atomics
//!
//! To reduce the chance of hitting the ABA problem we use a 32-bit offset + a 32-bit version tag
//! instead of a 64-bit pointer. The version tag will be bumped on each successful `pop` operation.
use core::{
cell::UnsafeCell,
marker::PhantomData,
num::{NonZeroU32, NonZeroU64},
ptr::NonNull,
sync::atomic::{AtomicU64, Ordering},
};
/// Unfortunate implementation detail required to use the
/// [`Pool.grow_exact`](struct.Pool.html#method.grow_exact) method
pub struct Node<T> {
next: Atomic<Node<T>>,
pub(crate) data: UnsafeCell<T>,
}
impl<T> Node<T> {
fn next(&self) -> &Atomic<Node<T>> {
&self.next
}
}
pub struct Stack<T> {
head: Atomic<Node<T>>,
}
impl<T> Stack<T> {
pub const fn new() -> Self {
Self {
head: Atomic::null(),
}
}
pub fn push(&self, new_head: Ptr<Node<T>>) {
let mut head = self.head.load(Ordering::Relaxed);
loop {
unsafe {
new_head
.as_raw()
.as_ref()
.next()
.store(head, Ordering::Relaxed);
}
if let Err(p) = self.head.compare_and_exchange_weak(
head,
Some(new_head),
Ordering::Release,
Ordering::Relaxed,
) {
head = p;
} else {
return;
}
}
}
pub fn try_pop(&self) -> Option<Ptr<Node<T>>> {
loop {
if let Some(mut head) = self.head.load(Ordering::Acquire) {
let next = unsafe { head.as_raw().as_ref().next().load(Ordering::Relaxed) };
if self
.head
.compare_and_exchange_weak(
Some(head),
next,
Ordering::Release,
Ordering::Relaxed,
)
.is_ok()
{
head.incr_tag();
return Some(head);
}
} else {
// stack observed empty
return None;
}
}
}
}
#[cfg(target_arch = "x86_64")]
fn anchor<T>(init: Option<*mut T>) -> *mut T {
use core::sync::atomic::AtomicU8;
use spin::Once;
static LAZY_ANCHOR: Once<usize> = Once::new();
let likely_unaligned_address = if let Some(init) = init {
*LAZY_ANCHOR.call_once(|| init as usize)
} else {
LAZY_ANCHOR.get().copied().unwrap_or_else(|| {
// we may hit this branch with Pool of ZSTs where `grow` does not need to be called
static BSS_ANCHOR: AtomicU8 = AtomicU8::new(0);
&BSS_ANCHOR as *const _ as usize
})
};
let alignment_mask = !(core::mem::align_of::<T>() - 1);
let well_aligned_address = likely_unaligned_address & alignment_mask;
well_aligned_address as *mut T
}
/// On x86_64, an anchored pointer: a (signed) 32-bit offset from `anchor` plus a 32-bit tag.
/// On x86, this is a pointer plus a 32-bit tag.
pub struct Ptr<T> {
inner: NonZeroU64,
_marker: PhantomData<*mut T>,
}
impl<T> Clone for Ptr<T> {
fn clone(&self) -> Self {
*self
}
}
impl<T> Copy for Ptr<T> {}
fn initial_tag_value() -> NonZeroU32 {
NonZeroU32::new(1).unwrap()
}
impl<T> Ptr<T> {
#[cfg(target_arch = "x86_64")]
pub fn new(p: *mut T) -> Option<Self> {
use core::convert::TryFrom;
i32::try_from((p as isize).wrapping_sub(anchor::<T>(Some(p)) as isize))
.ok()
.map(|offset| unsafe { Ptr::from_parts(initial_tag_value(), offset) })
}
#[cfg(target_arch = "x86")]
pub fn new(p: *mut T) -> Option<Self> {
Some(unsafe { Ptr::from_parts(initial_tag_value(), p as i32) })
}
unsafe fn from_parts(tag: NonZeroU32, offset: i32) -> Self {
Self {
inner: NonZeroU64::new_unchecked((tag.get() as u64) << 32 | (offset as u32 as u64)),
_marker: PhantomData,
}
}
fn from_u64(p: u64) -> Option<Self> {
NonZeroU64::new(p).map(|inner| Self {
inner,
_marker: PhantomData,
})
}
fn into_u64(&self) -> u64 {
self.inner.get()
}
fn tag(&self) -> NonZeroU32 {
let tag = (self.inner.get() >> 32) as u32;
debug_assert_ne!(0, tag, "broken non-zero invariant");
unsafe { NonZeroU32::new_unchecked(tag) }
}
fn incr_tag(&mut self) {
let maybe_zero_tag = self.tag().get().wrapping_add(1);
let tag = NonZeroU32::new(maybe_zero_tag).unwrap_or(initial_tag_value());
let offset = self.offset();
*self = unsafe { Ptr::from_parts(tag, offset) };
}
fn offset(&self) -> i32 {
self.inner.get() as i32
}
#[cfg(target_arch = "x86_64")]
fn as_raw(&self) -> NonNull<T> {
unsafe {
NonNull::new_unchecked(
(anchor::<T>(None) as isize).wrapping_add(self.offset() as isize) as *mut T,
)
}
}
#[cfg(target_arch = "x86")]
fn as_raw(&self) -> NonNull<T> {
unsafe { NonNull::new_unchecked(self.offset() as *mut T) }
}
pub fn dangling() -> Self {
// `anchor()` returns a well-aligned pointer so an offset of 0 will also produce a well-aligned pointer
unsafe { Self::from_parts(initial_tag_value(), 0) }
}
pub unsafe fn as_ref(&self) -> &T {
&*self.as_raw().as_ptr()
}
}
struct Atomic<T> {
inner: AtomicU64,
_marker: PhantomData<*mut T>,
}
impl<T> Atomic<T> {
const fn null() -> Self {
Self {
inner: AtomicU64::new(0),
_marker: PhantomData,
}
}
fn compare_and_exchange_weak(
&self,
current: Option<Ptr<T>>,
new: Option<Ptr<T>>,
succ: Ordering,
fail: Ordering,
) -> Result<(), Option<Ptr<T>>> {
self.inner
.compare_exchange_weak(
current.map(|p| p.into_u64()).unwrap_or(0),
new.map(|p| p.into_u64()).unwrap_or(0),
succ,
fail,
)
.map(drop)
.map_err(Ptr::from_u64)
}
fn load(&self, ord: Ordering) -> Option<Ptr<T>> {
NonZeroU64::new(self.inner.load(ord)).map(|inner| Ptr {
inner,
_marker: PhantomData,
})
}
fn store(&self, val: Option<Ptr<T>>, ord: Ordering) {
self.inner
.store(val.map(|p| p.into_u64()).unwrap_or(0), ord)
}
}

View File

@ -1,80 +0,0 @@
//! Stack based on LL/SC atomics
pub use core::ptr::NonNull as Ptr;
use core::{cell::UnsafeCell, ptr};
#[cfg(cas_atomic_polyfill)]
use atomic_polyfill::{AtomicPtr, Ordering};
#[cfg(not(cas_atomic_polyfill))]
use core::sync::atomic::{AtomicPtr, Ordering};
/// Unfortunate implementation detail required to use the
/// [`Pool.grow_exact`](struct.Pool.html#method.grow_exact) method
pub struct Node<T> {
next: AtomicPtr<Node<T>>,
pub(crate) data: UnsafeCell<T>,
}
impl<T> Node<T> {
fn next(&self) -> &AtomicPtr<Node<T>> {
&self.next
}
}
pub struct Stack<T> {
head: AtomicPtr<Node<T>>,
}
impl<T> Stack<T> {
pub const fn new() -> Self {
Self {
head: AtomicPtr::new(ptr::null_mut()),
}
}
pub fn push(&self, new_head: Ptr<Node<T>>) {
// NOTE `Ordering`s come from crossbeam's (v0.6.0) `TreiberStack`
let mut head = self.head.load(Ordering::Relaxed);
loop {
unsafe { new_head.as_ref().next().store(head, Ordering::Relaxed) }
match self.head.compare_exchange_weak(
head,
new_head.as_ptr(),
Ordering::Release, // success
Ordering::Relaxed, // failure
) {
Ok(_) => return,
// interrupt occurred or other core made a successful STREX op on the head
Err(p) => head = p,
}
}
}
pub fn try_pop(&self) -> Option<Ptr<Node<T>>> {
// NOTE `Ordering`s come from crossbeam's (v0.6.0) `TreiberStack`
loop {
let head = self.head.load(Ordering::Acquire);
if let Some(nn_head) = Ptr::new(head) {
let next = unsafe { nn_head.as_ref().next().load(Ordering::Relaxed) };
match self.head.compare_exchange_weak(
head,
next,
Ordering::Release, // success
Ordering::Relaxed, // failure
) {
Ok(_) => break Some(nn_head),
// interrupt occurred or other core made a successful STREX op on the head
Err(_) => continue,
}
} else {
// stack is observed as empty
break None;
}
}
}
}

View File

@ -1,693 +0,0 @@
//! A heap-less, interrupt-safe, lock-free memory pool (\*)
//!
//! NOTE: This module is not available on targets that do *not* support CAS operations and are not
//! emulated by the [`atomic_polyfill`](https://crates.io/crates/atomic-polyfill) crate (e.g.,
//! MSP430).
//!
//! (\*) Currently, the implementation is only lock-free *and* `Sync` on ARMv6, ARMv7-{A,R,M} & ARMv8-M
//! devices
//!
//! # Examples
//!
//! The most common way of using this pool is as a global singleton; the singleton mode gives you
//! automatic deallocation of memory blocks on `drop`.
//!
//! ``` ignore
//! #![no_main]
//! #![no_std]
//!
//! use cortex_m_rt::{entry, exception};
//! use heapless::{
//! pool,
//! pool::singleton::{Box, Pool},
//! };
//!
//! // instantiate a memory pool of `[u8; 128]` blocks as a global singleton
//! pool!(
//! // attributes can be used here
//! // #[link_section = ".ccram.A"]
//! A: [u8; 128]
//! );
//!
//! #[entry]
//! fn main() -> ! {
//! static mut MEMORY: [u8; 1024] = [0; 1024];
//!
//! // increase the capacity of the pool by ~8 blocks
//! A::grow(MEMORY);
//!
//! // claim a block of memory
//! // note that the type is `Box<A>`, and not `Box<[u8; 128]>`
//! // `A` is the "name" of the pool
//! let x: Box<A, _> = A::alloc().unwrap();
//! loop {
//! // .. do stuff with `x` ..
//! }
//! }
//!
//! #[exception]
//! fn SysTick() {
//! // claim a block of memory
//! let y = A::alloc().unwrap();
//!
//! // .. do stuff with `y` ..
//!
//! // return the memory block to the pool
//! drop(y);
//! }
//! ```
//!
//! # Portability
//!
//! This pool internally uses a Treiber stack which is known to be susceptible to the ABA problem.
//! The only counter measure against the ABA problem that this implementation currently takes is
//! relying on LL/SC (Link-local / Store-conditional) instructions being used to implement CAS loops
//! on the target architecture (see section on ['Soundness'](#soundness) for more information). For
//! this reason, `Pool` only implements `Sync` when compiling for some ARM cores.
//!
//! This module requires CAS atomic instructions which are not available on all architectures (e.g.
//! ARMv6-M (`thumbv6m-none-eabi`) and MSP430 (`msp430-none-elf`)). These atomics can be emulated
//! however with [`atomic_polyfill`](https://crates.io/crates/atomic-polyfill), which is enabled
//! with the `cas` feature and is enabled by default for `thumbv6m-none-eabi` and `riscv32` targets.
//! MSP430 is currently not supported by
//! [`atomic_polyfill`](https://crates.io/crates/atomic-polyfill).
//!
//! # Soundness
//!
//! This pool uses a Treiber stack to keep a list of free memory blocks (nodes). Each of these
//! nodes has a pointer to the next node. To claim a memory block we simply pop a node from the
//! top of the stack and use it as a memory block. The pop operation consists of swapping the
//! current head (top) node with the node below it. The Rust code for the `pop` operation is shown
//! below:
//!
//! ``` ignore
//! fn pop(&self) -> Option<NonNull<Node<T>>> {
//! let fetch_order = ..;
//! let set_order = ..;
//!
//! // `self.head` has type `AtomicPtr<Node<T>>`
//! // where `struct Node<T> { next: AtomicPtr<Node<T>>, data: UnsafeCell<T> }`
//! let mut head = self.head.load(fetch_order);
//! loop {
//! if let Some(nn_head) = NonNull::new(head) {
//! let next = unsafe { (*head).next.load(Ordering::Relaxed) };
//!
//! // <~ preempted
//!
//! match self
//! .head
//! .compare_exchange_weak(head, next, set_order, fetch_order)
//! {
//! Ok(_) => break Some(nn_head),
//! // head was changed by some interrupt handler / thread
//! Err(new_head) => head = new_head,
//! }
//! } else {
//! // stack is observed as empty
//! break None;
//! }
//! }
//! }
//! ```
//!
//! In general, the `pop` operation is susceptible to the ABA problem. If this operation gets
//! preempted by some interrupt handler somewhere between the `head.load` and the
//! `compare_and_exchange_weak`, and that handler modifies the stack in such a way that the head
//! (top) of the stack remains unchanged then resuming the `pop` operation will corrupt the stack.
//!
//! An example: imagine we are doing a `pop` on a stack that contains these nodes: `A -> B -> C`,
//! `A` is the head (top), `B` is next to `A` and `C` is next to `B`. The `pop` operation will do a
//! `CAS(&self.head, A, B)` operation to atomically change the head to `B` iff it currently is `A`.
//! Now, let's say a handler preempts the `pop` operation before the `CAS` operation starts and it
//! `pop`s the stack twice and then `push`es back the `A` node; now the state of the stack is `A ->
//! C`. When the original `pop` operation is resumed it will succeed in doing the `CAS` operation
//! setting `B` as the head of the stack. However, `B` was used by the handler as a memory block and
//! no longer is a valid free node. As a result the stack, and thus the allocator, is in an invalid
//! state.
//!
//! However, not all is lost because ARM devices use LL/SC (Link-local / Store-conditional)
//! operations to implement CAS loops. Let's look at the actual disassembly of `pop` for the ARM
//! Cortex-M.
//!
//! ``` text
//! 08000130 <<heapless::pool::Pool<T>>::pop>:
//! 8000130: 6802 ldr r2, [r0, #0]
//! 8000132: e00c b.n 800014e <<heapless::pool::Pool<T>>::pop+0x1e>
//! 8000134: 4611 mov r1, r2
//! 8000136: f8d2 c000 ldr.w ip, [r2]
//! 800013a: e850 2f00 ldrex r2, [r0]
//! 800013e: 428a cmp r2, r1
//! 8000140: d103 bne.n 800014a <<heapless::pool::Pool<T>>::pop+0x1a>
//! 8000142: e840 c300 strex r3, ip, [r0]
//! 8000146: b913 cbnz r3, 800014e <<heapless::pool::Pool<T>>::pop+0x1e>
//! 8000148: e004 b.n 8000154 <<heapless::pool::Pool<T>>::pop+0x24>
//! 800014a: f3bf 8f2f clrex
//! 800014e: 2a00 cmp r2, #0
//! 8000150: d1f0 bne.n 8000134 <<heapless::pool::Pool<T>>::pop+0x4>
//! 8000152: 2100 movs r1, #0
//! 8000154: 4608 mov r0, r1
//! 8000156: 4770 bx lr
//! ```
//!
//! LDREX ("load exclusive") is the LL instruction, and STREX ("store exclusive") is the SC
//! instruction (see [1](#references)). On the Cortex-M, STREX will always fail if the processor
//! takes an exception between it and its corresponding LDREX operation (see [2](#references)). If
//! STREX fails then the CAS loop is retried (see instruction @ `0x8000146`). On single core
//! systems, preemption is required to run into the ABA problem and on Cortex-M devices preemption
//! always involves taking an exception. Thus the underlying LL/SC operations prevent the ABA
//! problem on Cortex-M.
//!
//! In the case of multi-core systems if any other core successfully does a STREX op on the head
//! while the current core is somewhere between LDREX and STREX then the current core will fail its
//! STREX operation.
//!
//! # x86_64 support / limitations
//!
//! *NOTE* `Pool` is only `Sync` on `x86_64` and `x86` (`i686`) if the Cargo feature "x86-sync-pool"
//! is enabled
//!
//! x86_64 support is a gamble. Yes, a gamble. Do you feel lucky enough to use `Pool` on x86_64?
//!
//! As it's not possible to implement *ideal* LL/SC semantics (\*) on x86_64 the architecture is
//! susceptible to the ABA problem described above. To *reduce the chances* of ABA occurring in
//! practice we use version tags (keyword: IBM ABA-prevention tags). Again, this approach does
//! *not* fix / prevent / avoid the ABA problem; it only reduces the chance of it occurring in
//! practice but the chances of it occurring are not reduced to zero.
//!
//! How we have implemented version tags: instead of using an `AtomicPtr` to link the stack `Node`s
//! we use an `AtomicUsize` where the 64-bit `usize` is always comprised of a monotonically
//! increasing 32-bit tag (higher bits) and a 32-bit signed address offset. The address of a node is
//! computed by adding the 32-bit offset to an "anchor" address (the address of a static variable
//! that lives somewhere in the `.bss` linker section). The tag is increased every time a node is
//! popped (removed) from the stack.
//!
//! To see how version tags can prevent ABA consider the example from the previous section. Let's
//! start with a stack in this state: `(~A, 0) -> (~B, 1) -> (~C, 2)`, where `~A` represents the
//! address of node A as a 32-bit offset from the "anchor" and the second tuple element (e.g. `0`)
//! indicates the version of the node. For simplicity, assume a single core system: thread T1 is
//! performing `pop` and before `CAS(&self.head, (~A, 0), (~B, 1))` is executed a context switch
//! occurs and the core resumes T2. T2 pops the stack twice and pushes A back into the stack;
//! because the `pop` operation increases the version the stack ends in the following state: `(~A,
//! 1) -> (~C, 2)`. Now if T1 is resumed the CAS operation will fail because `self.head` is `(~A,
//! 1)` and not `(~A, 0)`.
//!
//! When can version tags fail to prevent ABA? Using the previous example: if T2 performs a `push`
//! followed by a `pop` `(1 << 32) - 1` times before doing its original `pop` - `pop` - `push`
//! operation then ABA will occur because the version tag of node `A` will wraparound to its
//! original value of `0` and the CAS operation in T1 will succeed and corrupt the stack.
//!
//! It does seem unlikely that (1) a thread will perform the above operation and (2) that the above
//! operation will complete within one time slice, assuming time sliced threads. If you have thread
//! priorities then the above operation could occur during the lifetime of many high-priority
//! threads if T1 is running at low priority.
//!
//! Other implementations of version tags use more than 32 bits in their tags (e.g. "Scalable
//! Lock-Free Dynamic Memory Allocation" uses 42-bit tags in its super blocks). In theory, one could
//! use double-word CAS on x86_64 to pack a 64-bit tag and a 64-bit pointer in a double-word but
//! this CAS operation is not exposed in the standard library (and I think it's not available on
//! older x86_64 processors?)
//!
//! (\*) Apparently one can emulate proper LL/SC semantics on x86_64 using hazard pointers (?) --
//! the technique appears to be documented in "ABA Prevention Using Single-Word Instructions", which
//! is not public AFAICT -- but hazard pointers require Thread Local Storage (TLS), which is a
//! non-starter for a `no_std` library like `heapless`.
//!
//! ## x86_64 Limitations
//!
//! *NOTE* this limitation does not apply to `x86` (32-bit address space). If you run into this
//! issue, on an x86_64 processor try running your code compiled for `x86`, e.g. `cargo run --target
//! i686-unknown-linux-musl`
//!
//! Because stack nodes must be located within +- 2 GB of the hidden `ANCHOR` variable, which
//! lives in the `.bss` section, `Pool` may not be able to manage static references created using
//! `Box::leak` -- these heap allocated chunks of memory may live in a very different address space.
//! When the `Pool` is unable to manage a node because of its address it will simply discard it:
//! `Pool::grow*` methods return the number of new memory blocks added to the pool; if these methods
//! return `0` it means the `Pool` is unable to manage the memory given to them.
//!
//! # References
//!
//! 1. [Cortex-M3 Devices Generic User Guide (DUI 0552A)][0], Section 2.2.7 "Synchronization
//! primitives"
//!
//! [0]: http://infocenter.arm.com/help/topic/com.arm.doc.dui0552a/DUI0552A_cortex_m3_dgug.pdf
//!
//! 2. [ARMv7-M Architecture Reference Manual (DDI 0403E.b)][1], Section A3.4 "Synchronization and
//! semaphores"
//!
//! [1]: https://static.docs.arm.com/ddi0403/eb/DDI0403E_B_armv7m_arm.pdf
//!
//! 3. "Scalable Lock-Free Dynamic Memory Allocation" Michael, Maged M.
//!
//! 4. "Hazard pointers: Safe memory reclamation for lock-free objects." Michael, Maged M.
use core::{any::TypeId, mem};
use core::{
cmp, fmt,
hash::{Hash, Hasher},
marker::PhantomData,
mem::MaybeUninit,
ops::{Deref, DerefMut},
ptr::{self, NonNull},
};
pub use stack::Node;
use stack::{Ptr, Stack};
pub mod singleton;
#[cfg_attr(any(target_arch = "x86_64", target_arch = "x86"), path = "cas.rs")]
#[cfg_attr(
not(any(target_arch = "x86_64", target_arch = "x86")),
path = "llsc.rs"
)]
mod stack;
/// A lock-free memory pool
pub struct Pool<T> {
stack: Stack<T>,
// Current implementation is unsound on architectures that don't have LL/SC semantics so this
// struct is not `Sync` on those platforms
_not_send_or_sync: PhantomData<*const ()>,
}
// NOTE(any(test)) makes testing easier (no need to enable Cargo features for testing)
#[cfg(any(
armv6m,
armv7a,
armv7r,
armv7m,
armv8m_main,
all(
any(target_arch = "x86_64", target_arch = "x86"),
feature = "x86-sync-pool"
),
test
))]
unsafe impl<T> Sync for Pool<T> {}
unsafe impl<T> Send for Pool<T> {}
impl<T> Pool<T> {
/// Creates a new empty pool
pub const fn new() -> Self {
Pool {
stack: Stack::new(),
_not_send_or_sync: PhantomData,
}
}
/// Claims a memory block from the pool
///
/// Returns `None` when the pool is observed as exhausted
///
/// *NOTE:* This method does *not* have bounded execution time because it contains a CAS loop
pub fn alloc(&self) -> Option<Box<T, Uninit>> {
if mem::size_of::<T>() == 0 {
// NOTE because we return a dangling pointer to a NODE, which has non-zero size
// even when T is a ZST, in this case we need to make sure we
// - don't do pointer arithmetic on this pointer
// - dereference that offset-ed pointer as a ZST
// because miri doesn't like that
return Some(Box {
node: Ptr::dangling(),
_state: PhantomData,
});
}
if let Some(node) = self.stack.try_pop() {
Some(Box {
node,
_state: PhantomData,
})
} else {
None
}
}
/// Returns a memory block to the pool
///
/// *NOTE*: `T`'s destructor (if any) will run on `value` iff `S = Init`
///
/// *NOTE:* This method does *not* have bounded execution time because it contains a CAS loop
pub fn free<S>(&self, value: Box<T, S>)
where
S: 'static,
{
if TypeId::of::<S>() == TypeId::of::<Init>() {
let p = if mem::size_of::<T>() == 0 {
// any pointer will do to invoke the destructor of a ZST
NonNull::dangling().as_ptr()
} else {
unsafe { value.node.as_ref().data.get() }
};
unsafe {
ptr::drop_in_place(p);
}
}
// no operation
if mem::size_of::<T>() == 0 {
return;
}
self.stack.push(value.node)
}
/// Increases the capacity of the pool
///
/// This method might *not* fully utilize the given memory block due to alignment requirements.
///
/// This method returns the number of *new* blocks that can be allocated.
pub fn grow(&self, memory: &'static mut [u8]) -> usize {
if mem::size_of::<T>() == 0 {
// ZSTs use no memory so a pool of ZSTs always has maximum capacity
return usize::max_value();
}
let sz = mem::size_of::<Node<T>>();
let mut p = memory.as_mut_ptr();
let mut len = memory.len();
let align = mem::align_of::<Node<T>>();
let rem = (p as usize) % align;
if rem != 0 {
let offset = align - rem;
if offset >= len {
// slice is too small
return 0;
}
p = unsafe { p.add(offset) };
len -= offset;
}
let mut n = 0;
while len >= sz {
match () {
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
() => {
if let Some(p) = Ptr::new(p as *mut _) {
self.stack.push(p);
n += 1;
}
}
#[cfg(not(any(target_arch = "x86_64", target_arch = "x86")))]
() => {
self.stack.push(unsafe { Ptr::new_unchecked(p as *mut _) });
n += 1;
}
}
p = unsafe { p.add(sz) };
len -= sz;
}
n
}
/// Increases the capacity of the pool
///
/// Unlike [`Pool.grow`](struct.Pool.html#method.grow) this method fully utilizes the given
/// memory block
pub fn grow_exact<A>(&self, memory: &'static mut MaybeUninit<A>) -> usize
where
A: AsMut<[Node<T>]>,
{
if mem::size_of::<T>() == 0 {
return usize::max_value();
}
let nodes = unsafe { (*memory.as_mut_ptr()).as_mut() };
let cap = nodes.len();
for p in nodes {
match () {
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
() => {
if let Some(p) = Ptr::new(p) {
self.stack.push(p);
}
}
#[cfg(not(any(target_arch = "x86_64", target_arch = "x86")))]
() => self.stack.push(core::ptr::NonNull::from(p)),
}
}
cap
}
}
/// A memory block
pub struct Box<T, STATE = Init> {
_state: PhantomData<STATE>,
node: Ptr<Node<T>>,
}
impl<T> Box<T, Uninit> {
/// Initializes this memory block
pub fn init(self, val: T) -> Box<T, Init> {
if mem::size_of::<T>() == 0 {
// no memory operation needed for ZST
// BUT we want to avoid calling `val`s destructor
mem::forget(val)
} else {
unsafe {
ptr::write(self.node.as_ref().data.get(), val);
}
}
Box {
node: self.node,
_state: PhantomData,
}
}
}
/// Uninitialized type state
pub enum Uninit {}
/// Initialized type state
pub enum Init {}
unsafe impl<T, S> Send for Box<T, S> where T: Send {}
unsafe impl<T, S> Sync for Box<T, S> where T: Sync {}
unsafe impl<T> stable_deref_trait::StableDeref for Box<T> {}
impl<A, T> AsRef<[T]> for Box<A>
where
A: AsRef<[T]>,
{
fn as_ref(&self) -> &[T] {
self.deref().as_ref()
}
}
impl<A, T> AsMut<[T]> for Box<A>
where
A: AsMut<[T]>,
{
fn as_mut(&mut self) -> &mut [T] {
self.deref_mut().as_mut()
}
}
impl<T> Deref for Box<T> {
type Target = T;
fn deref(&self) -> &T {
if mem::size_of::<T>() == 0 {
// any pointer will do for ZST
unsafe { &*NonNull::dangling().as_ptr() }
} else {
unsafe { &*self.node.as_ref().data.get() }
}
}
}
impl<T> DerefMut for Box<T> {
fn deref_mut(&mut self) -> &mut T {
if mem::size_of::<T>() == 0 {
// any pointer will do for ZST
unsafe { &mut *NonNull::dangling().as_ptr() }
} else {
unsafe { &mut *self.node.as_ref().data.get() }
}
}
}
impl<T> fmt::Debug for Box<T>
where
T: fmt::Debug,
{
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
<T as fmt::Debug>::fmt(self, f)
}
}
impl<T> fmt::Display for Box<T>
where
T: fmt::Display,
{
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
<T as fmt::Display>::fmt(self, f)
}
}
impl<T> PartialEq for Box<T>
where
T: PartialEq,
{
fn eq(&self, rhs: &Box<T>) -> bool {
<T as PartialEq>::eq(self, rhs)
}
}
impl<T> Eq for Box<T> where T: Eq {}
impl<T> PartialOrd for Box<T>
where
T: PartialOrd,
{
fn partial_cmp(&self, rhs: &Box<T>) -> Option<cmp::Ordering> {
<T as PartialOrd>::partial_cmp(self, rhs)
}
}
impl<T> Ord for Box<T>
where
T: Ord,
{
fn cmp(&self, rhs: &Box<T>) -> cmp::Ordering {
<T as Ord>::cmp(self, rhs)
}
}
impl<T> Hash for Box<T>
where
T: Hash,
{
fn hash<H>(&self, state: &mut H)
where
H: Hasher,
{
<T as Hash>::hash(self, state)
}
}
#[cfg(test)]
mod tests {
use core::{
mem::{self, MaybeUninit},
sync::atomic::{AtomicUsize, Ordering},
};
use super::{Node, Pool};
#[test]
fn grow() {
static mut MEMORY: [u8; 1024] = [0; 1024];
static POOL: Pool<[u8; 128]> = Pool::new();
unsafe {
POOL.grow(&mut MEMORY);
}
for _ in 0..7 {
assert!(POOL.alloc().is_some());
}
}
#[test]
fn grow_exact() {
const SZ: usize = 8;
static mut MEMORY: MaybeUninit<[Node<[u8; 128]>; SZ]> = MaybeUninit::uninit();
static POOL: Pool<[u8; 128]> = Pool::new();
unsafe {
POOL.grow_exact(&mut MEMORY);
}
for _ in 0..SZ {
assert!(POOL.alloc().is_some());
}
assert!(POOL.alloc().is_none());
}
#[test]
fn sanity() {
const SZ: usize = 2 * mem::size_of::<Node<u8>>() - 1;
static mut MEMORY: [u8; SZ] = [0; SZ];
static POOL: Pool<u8> = Pool::new();
// empty pool
assert!(POOL.alloc().is_none());
POOL.grow(unsafe { &mut MEMORY });
let x = POOL.alloc().unwrap().init(0);
assert_eq!(*x, 0);
// pool exhausted
assert!(POOL.alloc().is_none());
POOL.free(x);
// should be possible to allocate again
assert_eq!(*POOL.alloc().unwrap().init(1), 1);
}
#[test]
fn destructors() {
static COUNT: AtomicUsize = AtomicUsize::new(0);
struct X;
impl X {
fn new() -> X {
COUNT.fetch_add(1, Ordering::Relaxed);
X
}
}
impl Drop for X {
fn drop(&mut self) {
COUNT.fetch_sub(1, Ordering::Relaxed);
}
}
static mut MEMORY: [u8; 31] = [0; 31];
static POOL: Pool<X> = Pool::new();
POOL.grow(unsafe { &mut MEMORY });
let x = POOL.alloc().unwrap().init(X::new());
let y = POOL.alloc().unwrap().init(X::new());
let z = POOL.alloc().unwrap().init(X::new());
assert_eq!(COUNT.load(Ordering::Relaxed), 3);
// this leaks memory
drop(x);
assert_eq!(COUNT.load(Ordering::Relaxed), 3);
// this leaks memory
mem::forget(y);
assert_eq!(COUNT.load(Ordering::Relaxed), 3);
// this runs `X` destructor
POOL.free(z);
assert_eq!(COUNT.load(Ordering::Relaxed), 2);
}
}

393
src/pool/object.rs Normal file
View File

@ -0,0 +1,393 @@
//! Object pool API
//!
//! # Example usage
//!
//! ```
//! use heapless::{object_pool, pool::object::{Object, ObjectBlock}};
//!
//! object_pool!(P: [u8; 128]);
//!
//! // cannot request objects without first giving object blocks to the pool
//! assert!(P.request().is_none());
//!
//! // (some `no_std` runtimes have safe APIs to create `&'static mut` references)
//! let block: &'static mut ObjectBlock<[u8; 128]> = unsafe {
//! // unlike the memory pool APIs, an initial value must be specified here
//! static mut B: ObjectBlock<[u8; 128]> = ObjectBlock::new([0; 128]);
//! &mut B
//! };
//!
//! // give object block to the pool
//! P.manage(block);
//!
//! // it's now possible to request objects
//! // unlike the memory pool APIs, no initial value is required here
//! let mut object = P.request().unwrap();
//!
//! // mutation is possible
//! object.iter_mut().for_each(|byte| *byte = byte.wrapping_add(1));
//!
//! // the number of live objects is limited to the number of blocks managed by the pool
//! let res = P.request();
//! assert!(res.is_none());
//!
//! // `object`'s destructor returns the object to the pool
//! drop(object);
//!
//! // it's possible to request an `Object` again
//! let res = P.request();
//!
//! assert!(res.is_some());
//! ```
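One consequence of the design shown above, spelled out as an illustrative sketch (not part of this commit; the pool name and data type are arbitrary): `Object`'s destructor returns the object to the pool without resetting it, so a later `request` observes whatever state the object was last left in:

```rust
use heapless::{object_pool, pool::object::ObjectBlock};

object_pool!(P: [u8; 4]);

fn demo() {
    P.manage(unsafe {
        static mut B: ObjectBlock<[u8; 4]> = ObjectBlock::new([0; 4]);
        &mut B
    });

    let mut object = P.request().unwrap();
    object[0] = 42;
    drop(object); // returns the object, state included, to the pool

    // the re-requested object still holds the value written above
    assert_eq!(42, P.request().unwrap()[0]);
}
```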
use core::{
cmp::Ordering,
fmt,
hash::{Hash, Hasher},
mem::ManuallyDrop,
ops, ptr,
};
use super::treiber::{AtomicPtr, NonNullPtr, Stack, StructNode};
/// Creates a new `ObjectPool` singleton with the given `$name` that manages the specified
/// `$data_type`
///
/// For more extensive documentation see the [module level documentation](pool/object/index.html)
#[macro_export]
macro_rules! object_pool {
($name:ident: $data_type:ty) => {
pub struct $name;
impl $crate::pool::object::ObjectPool for $name {
type Data = $data_type;
fn singleton() -> &'static $crate::pool::object::ObjectPoolImpl<$data_type> {
static $name: $crate::pool::object::ObjectPoolImpl<$data_type> =
$crate::pool::object::ObjectPoolImpl::new();
&$name
}
}
impl $name {
/// Inherent method version of `ObjectPool::request`
#[allow(dead_code)]
pub fn request(&self) -> Option<$crate::pool::object::Object<$name>> {
<$name as $crate::pool::object::ObjectPool>::request()
}
/// Inherent method version of `ObjectPool::manage`
#[allow(dead_code)]
pub fn manage(
&self,
block: &'static mut $crate::pool::object::ObjectBlock<$data_type>,
) {
<$name as $crate::pool::object::ObjectPool>::manage(block)
}
}
};
}
/// A singleton that manages `pool::object::Object`s
pub trait ObjectPool: Sized {
/// The data type of the objects managed by the object pool
type Data: 'static;
/// `object_pool!` implementation detail
#[doc(hidden)]
fn singleton() -> &'static ObjectPoolImpl<Self::Data>;
/// Request a new object from the pool
fn request() -> Option<Object<Self>> {
Self::singleton()
.request()
.map(|node_ptr| Object { node_ptr })
}
/// Adds a statically allocated object to the pool
fn manage(block: &'static mut ObjectBlock<Self::Data>) {
Self::singleton().manage(block)
}
}
/// `object_pool!` implementation detail
#[doc(hidden)]
pub struct ObjectPoolImpl<T> {
stack: Stack<StructNode<T>>,
}
impl<T> ObjectPoolImpl<T> {
/// `object_pool!` implementation detail
#[doc(hidden)]
pub const fn new() -> Self {
Self {
stack: Stack::new(),
}
}
fn request(&self) -> Option<NonNullPtr<StructNode<T>>> {
self.stack.try_pop()
}
fn manage(&self, block: &'static mut ObjectBlock<T>) {
let node: &'static mut _ = &mut block.node;
unsafe { self.stack.push(NonNullPtr::from_static_mut_ref(node)) }
}
}
// `T` needs to be `Send` because returning an object from one thread and then
// requesting it from another is effectively a cross-thread 'send' operation
unsafe impl<T> Sync for ObjectPoolImpl<T> where T: Send {}
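// A hypothetical illustration of that requirement (std threads are used purely for the
// sketch; `P` is the pool from the module-level example):
//
// let object = P.request().unwrap();
// std::thread::spawn(move || drop(object)); // the object is sent to another thread here...
// let again = P.request();                  // ...and may later be handed back out on this one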
/// An object managed by object pool `P`
pub struct Object<P>
where
P: ObjectPool,
{
node_ptr: NonNullPtr<StructNode<P::Data>>,
}
impl<A, T, const N: usize> AsMut<[T]> for Object<A>
where
A: ObjectPool<Data = [T; N]>,
{
fn as_mut(&mut self) -> &mut [T] {
&mut **self
}
}
impl<A, T, const N: usize> AsRef<[T]> for Object<A>
where
A: ObjectPool<Data = [T; N]>,
{
fn as_ref(&self) -> &[T] {
&**self
}
}
impl<A> fmt::Debug for Object<A>
where
A: ObjectPool,
A::Data: fmt::Debug,
{
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
A::Data::fmt(self, f)
}
}
impl<A> ops::Deref for Object<A>
where
A: ObjectPool,
{
type Target = A::Data;
fn deref(&self) -> &Self::Target {
unsafe { &*ptr::addr_of!((*self.node_ptr.as_ptr()).data) }
}
}
impl<A> ops::DerefMut for Object<A>
where
A: ObjectPool,
{
fn deref_mut(&mut self) -> &mut Self::Target {
unsafe { &mut *ptr::addr_of_mut!((*self.node_ptr.as_ptr()).data) }
}
}
impl<A> fmt::Display for Object<A>
where
A: ObjectPool,
A::Data: fmt::Display,
{
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
A::Data::fmt(self, f)
}
}
impl<P> Drop for Object<P>
where
P: ObjectPool,
{
fn drop(&mut self) {
unsafe { P::singleton().stack.push(self.node_ptr) }
}
}
impl<A> Eq for Object<A>
where
A: ObjectPool,
A::Data: Eq,
{
}
impl<A> Hash for Object<A>
where
A: ObjectPool,
A::Data: Hash,
{
fn hash<H>(&self, state: &mut H)
where
H: Hasher,
{
(**self).hash(state)
}
}
impl<A> Ord for Object<A>
where
A: ObjectPool,
A::Data: Ord,
{
fn cmp(&self, other: &Self) -> Ordering {
A::Data::cmp(self, other)
}
}
impl<A, B> PartialEq<Object<B>> for Object<A>
where
A: ObjectPool,
B: ObjectPool,
A::Data: PartialEq<B::Data>,
{
fn eq(&self, other: &Object<B>) -> bool {
A::Data::eq(self, other)
}
}
impl<A, B> PartialOrd<Object<B>> for Object<A>
where
A: ObjectPool,
B: ObjectPool,
A::Data: PartialOrd<B::Data>,
{
fn partial_cmp(&self, other: &Object<B>) -> Option<Ordering> {
A::Data::partial_cmp(self, other)
}
}
unsafe impl<P> Send for Object<P>
where
P: ObjectPool,
P::Data: Send,
{
}
unsafe impl<P> Sync for Object<P>
where
P: ObjectPool,
P::Data: Sync,
{
}
/// An object "block" of data type `T` that has not yet been associated to an `ObjectPool`
pub struct ObjectBlock<T> {
node: StructNode<T>,
}
impl<T> ObjectBlock<T> {
/// Creates a new object block with the given `initial_value`
pub const fn new(initial_value: T) -> Self {
Self {
node: StructNode {
next: ManuallyDrop::new(AtomicPtr::null()),
data: ManuallyDrop::new(initial_value),
},
}
}
}
#[cfg(test)]
mod tests {
use core::sync::atomic::{self, AtomicUsize};
use super::*;
#[test]
fn cannot_request_if_empty() {
object_pool!(P: i32);
assert_eq!(None, P.request());
}
#[test]
fn can_request_if_manages_one_block() {
object_pool!(P: i32);
let block = unsafe {
static mut B: ObjectBlock<i32> = ObjectBlock::new(1);
&mut B
};
P.manage(block);
assert_eq!(1, *P.request().unwrap());
}
#[test]
fn request_drop_request() {
object_pool!(P: i32);
let block = unsafe {
static mut B: ObjectBlock<i32> = ObjectBlock::new(1);
&mut B
};
P.manage(block);
let mut object = P.request().unwrap();
*object = 2;
drop(object);
assert_eq!(2, *P.request().unwrap());
}
#[test]
fn destructor_does_not_run_on_drop() {
static COUNT: AtomicUsize = AtomicUsize::new(0);
pub struct S;
impl Drop for S {
fn drop(&mut self) {
COUNT.fetch_add(1, atomic::Ordering::Relaxed);
}
}
object_pool!(P: S);
let block = unsafe {
static mut B: ObjectBlock<S> = ObjectBlock::new(S);
&mut B
};
P.manage(block);
let object = P.request().unwrap();
assert_eq!(0, COUNT.load(atomic::Ordering::Relaxed));
drop(object);
assert_eq!(0, COUNT.load(atomic::Ordering::Relaxed));
}
#[test]
fn zst_is_well_aligned() {
#[repr(align(4096))]
pub struct Zst4096;
object_pool!(P: Zst4096);
let block = unsafe {
static mut B: ObjectBlock<Zst4096> = ObjectBlock::new(Zst4096);
&mut B
};
P.manage(block);
let object = P.request().unwrap();
let raw = &*object as *const Zst4096;
assert_eq!(0, raw as usize % 4096);
}
}

src/pool/singleton.rs

@ -1,437 +0,0 @@
//! `Pool` as a global singleton
use core::{
any::TypeId,
cmp, fmt,
hash::{Hash, Hasher},
marker::PhantomData,
mem::{self, MaybeUninit},
ops::{Deref, DerefMut},
ptr::{self, NonNull},
};
use super::{Init, Node, Uninit};
pub mod arc;
/// Instantiates a pool as a global singleton
// NOTE(any(test)) makes testing easier (no need to enable Cargo features for testing)
#[cfg(any(
armv6m,
armv7a,
armv7r,
armv7m,
armv8m_main,
all(
any(target_arch = "x86_64", target_arch = "x86"),
feature = "x86-sync-pool"
),
test
))]
#[macro_export]
macro_rules! pool {
($(#[$($attr:tt)*])* $ident:ident: $ty:ty) => {
pub struct $ident;
impl $crate::pool::singleton::Pool for $ident {
type Data = $ty;
fn ptr() -> &'static $crate::pool::Pool<$ty> {
$(#[$($attr)*])*
static $ident: $crate::pool::Pool<$ty> = $crate::pool::Pool::new();
&$ident
}
}
};
}
/// A global singleton memory pool
pub trait Pool {
/// The type of data that can be allocated on this pool
type Data: 'static;
#[doc(hidden)]
fn ptr() -> &'static super::Pool<Self::Data>;
/// Claims a memory block from the pool
///
/// Returns `None` when the pool is observed as exhausted
///
/// *NOTE:* This method does *not* have bounded execution time; i.e. it contains a CAS loop
fn alloc() -> Option<Box<Self, Uninit>>
where
Self: Sized,
{
Self::ptr().alloc().map(|inner| Box {
_pool: PhantomData,
inner,
})
}
/// Increases the capacity of the pool
///
/// This method might *not* fully utilize the given memory block due to alignment requirements
///
/// This method returns the number of *new* blocks that can be allocated.
fn grow(memory: &'static mut [u8]) -> usize {
Self::ptr().grow(memory)
}
/// Increases the capacity of the pool
///
/// Unlike [`Pool.grow`](trait.Pool.html#method.grow), this method fully utilizes the given
/// memory block
fn grow_exact<A>(memory: &'static mut MaybeUninit<A>) -> usize
where
A: AsMut<[Node<Self::Data>]>,
{
Self::ptr().grow_exact(memory)
}
}
/// A memory block that belongs to the global memory pool, `POOL`
pub struct Box<POOL, STATE = Init>
where
POOL: Pool,
STATE: 'static,
{
_pool: PhantomData<POOL>,
inner: super::Box<POOL::Data, STATE>,
}
impl<P> Box<P, Uninit>
where
P: Pool,
{
/// Initializes this memory block
pub fn init(self, val: P::Data) -> Box<P, Init> {
let node = self.inner.node;
mem::forget(self);
if mem::size_of::<P::Data>() == 0 {
// no memory operation needed for ZST
// BUT we want to avoid calling `val`s destructor
mem::forget(val)
} else {
unsafe {
ptr::write(node.as_ref().data.get(), val);
}
}
Box {
inner: super::Box {
node,
_state: PhantomData,
},
_pool: PhantomData,
}
}
}
impl<P> Box<P, Uninit>
where
P: Pool,
P::Data: AsRef<[u8]>,
{
#[deprecated(
since = "0.7.3",
note = "This can access uninitialized memory, use `init(..)` instead (https://github.com/japaric/heapless/issues/212)"
)]
/// (DO NOT USE, SEE DEPRECATION) Freezes the contents of this memory block
///
/// See [rust-lang/rust#58363](https://github.com/rust-lang/rust/pull/58363) for details.
pub fn freeze(self) -> Box<P, Init> {
let node = self.inner.node;
mem::forget(self);
// it seems we can get away with not calling `ptr::freeze` here and not run into UB
// because we are dealing with static memory and using fences
// let p: *const u8 = (*node.as_ref().data.get()).as_slice().as_ptr();
// ptr::freeze(p as *mut u8);
Box {
inner: super::Box {
node,
_state: PhantomData,
},
_pool: PhantomData,
}
}
}
impl<P> Box<P, Init>
where
P: Pool,
{
/// Forgets the contents of this memory block without running its destructor.
///
/// Note that this does not return the memory block to the pool. The
/// block can be reused, or returned to the pool by dropping it.
pub fn forget(self) -> Box<P, Uninit> {
let node = self.inner.node;
mem::forget(self);
if mem::size_of::<P::Data>() == 0 {
// no need to do a pointer dereference in this case
} else {
mem::forget(unsafe { ptr::read(node.as_ref().data.get()) });
}
Box {
inner: super::Box {
node,
_state: PhantomData,
},
_pool: PhantomData,
}
}
}
impl<P> Deref for Box<P>
where
P: Pool,
{
type Target = P::Data;
fn deref(&self) -> &P::Data {
self.inner.deref()
}
}
impl<P> DerefMut for Box<P>
where
P: Pool,
{
fn deref_mut(&mut self) -> &mut P::Data {
self.inner.deref_mut()
}
}
unsafe impl<P: Pool> stable_deref_trait::StableDeref for Box<P> {}
impl<P> fmt::Debug for Box<P>
where
P: Pool,
P::Data: fmt::Debug,
{
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
<P::Data as fmt::Debug>::fmt(self, f)
}
}
impl<P> fmt::Display for Box<P>
where
P: Pool,
P::Data: fmt::Display,
{
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
<P::Data as fmt::Display>::fmt(self, f)
}
}
impl<P, S> Drop for Box<P, S>
where
P: Pool,
S: 'static,
{
fn drop(&mut self) {
if TypeId::of::<S>() == TypeId::of::<Init>() {
let p = if mem::size_of::<P::Data>() == 0 {
// any pointer will do to invoke the destructor of a ZST
NonNull::dangling().as_ptr()
} else {
unsafe { self.inner.node.as_ref().data.get() }
};
unsafe {
ptr::drop_in_place(p);
}
}
if mem::size_of::<P::Data>() != 0 {
P::ptr().stack.push(self.inner.node)
}
}
}
unsafe impl<P, S> Send for Box<P, S>
where
P: Pool,
P::Data: Send,
{
}
unsafe impl<P, S> Sync for Box<P, S>
where
P: Pool,
P::Data: Sync,
{
}
impl<P, T> AsRef<[T]> for Box<P>
where
P: Pool,
P::Data: AsRef<[T]>,
{
fn as_ref(&self) -> &[T] {
self.deref().as_ref()
}
}
impl<P, T> AsMut<[T]> for Box<P>
where
P: Pool,
P::Data: AsMut<[T]>,
{
fn as_mut(&mut self) -> &mut [T] {
self.deref_mut().as_mut()
}
}
impl<P> PartialEq for Box<P>
where
P: Pool,
P::Data: PartialEq,
{
fn eq(&self, rhs: &Box<P>) -> bool {
<P::Data as PartialEq>::eq(self, rhs)
}
}
impl<P> Eq for Box<P>
where
P: Pool,
P::Data: Eq,
{
}
impl<P> PartialOrd for Box<P>
where
P: Pool,
P::Data: PartialOrd,
{
fn partial_cmp(&self, rhs: &Box<P>) -> Option<cmp::Ordering> {
<P::Data as PartialOrd>::partial_cmp(self, rhs)
}
}
impl<P> Ord for Box<P>
where
P: Pool,
P::Data: Ord,
{
fn cmp(&self, rhs: &Box<P>) -> cmp::Ordering {
<P::Data as Ord>::cmp(self, rhs)
}
}
impl<P> Hash for Box<P>
where
P: Pool,
P::Data: Hash,
{
fn hash<H>(&self, state: &mut H)
where
H: Hasher,
{
<P::Data as Hash>::hash(self, state)
}
}
#[cfg(test)]
mod tests {
use core::{
mem,
sync::atomic::{AtomicUsize, Ordering},
};
use super::{super::Node, Pool};
#[test]
fn sanity() {
const SZ: usize = 2 * mem::size_of::<Node<u8>>() - 1;
static mut MEMORY: [u8; SZ] = [0; SZ];
pool!(A: u8);
// empty pool
assert!(A::alloc().is_none());
A::grow(unsafe { &mut MEMORY });
let x = A::alloc().unwrap().init(0);
assert_eq!(*x, 0);
// pool exhausted
assert!(A::alloc().is_none());
drop(x);
// should be possible to allocate again
assert_eq!(*A::alloc().unwrap().init(1), 1);
}
#[test]
fn boxed_zst_is_well_aligned() {
#[repr(align(2))]
pub struct Zst2;
pool!(A: Zst2);
let x = A::alloc().unwrap().init(Zst2);
assert_eq!(0, &*x as *const Zst2 as usize % 2);
#[repr(align(4096))]
pub struct Zst4096;
pool!(B: Zst4096);
let x = B::alloc().unwrap().init(Zst4096);
assert_eq!(0, &*x as *const Zst4096 as usize % 4096);
}
#[test]
fn destructors() {
static COUNT: AtomicUsize = AtomicUsize::new(0);
pub struct X;
impl X {
fn new() -> X {
COUNT.fetch_add(1, Ordering::Relaxed);
X
}
}
impl Drop for X {
fn drop(&mut self) {
COUNT.fetch_sub(1, Ordering::Relaxed);
}
}
pool!(A: X);
let x = A::alloc().unwrap().init(X::new());
let y = A::alloc().unwrap().init(X::new());
let z = A::alloc().unwrap().init(X::new());
assert_eq!(COUNT.load(Ordering::Relaxed), 3);
// this runs `X`'s destructor
drop(x);
assert_eq!(COUNT.load(Ordering::Relaxed), 2);
// this leaks memory
mem::forget(y);
assert_eq!(COUNT.load(Ordering::Relaxed), 2);
// this forgets `X` without leaking memory
z.forget();
assert_eq!(COUNT.load(Ordering::Relaxed), 2);
}
}

src/pool/singleton/arc.rs

@ -1,392 +0,0 @@
//! Like [`std::sync::Arc`](https://doc.rust-lang.org/std/sync/struct.Arc.html) but backed by a
//! memory [`Pool`](trait.Pool.html) rather than `#[global_allocator]`
//!
//! Note that the same limitations that apply to ["Box" pool] also apply to the "Arc" pool.
//!
//! ["Box" pool]: ../../index.html
//!
//! # Examples
//!
//! ``` ignore
//! use heapless::{arc_pool, Arc};
//!
//! pub struct BigStruct { // <- does NOT implement Clone
//! data: [u8; 128],
//! // ..
//! }
//!
//! // declare a memory pool
//! arc_pool!(P: BigStruct);
//!
//!
//! #[cortex_m_rt::entry]
//! fn main() -> ! {
//! static mut MEMORY: [u8; 1024] = [0; 1024];
//!
//! // give some static memory to the pool
//! P::grow(MEMORY);
//!
//! let x: Arc<P> = P::alloc(BigStruct::new()).ok().expect("OOM");
//! // ^ NOTE: this is the Pool type, not the data type
//!
//! // cloning is cheap; it increases the refcount
//! let y = x.clone();
//!
//! // same data address
//! assert_eq!(&*x as *const _, &*y as *const _);
//!
//! // auto-deref
//! let data: &[u8] = &x.data;
//!
//! // decrease refcount
//! drop(x);
//!
//! // refcount decreased to 0; memory is returned to the pool
//! drop(y);
//!
//! // ..
//! }
//! ```
//!
//! The `grow_exact` API is also available on the "Arc pool". It requires using
//! `Node<ArcInner<Type>>` as the array element type. Example below:
//!
//! ``` ignore
//! use heapless::pool::{singleton::arc::ArcInner, Node};
//!
//! pub struct BigStruct { /* .. */ }
//!
//! arc_pool!(P: BigStruct);
//!
//! #[cortex_m_rt::entry]
//! fn main() -> ! {
//! static mut MEMORY: MaybeUninit<[Node<ArcInner<BigStruct>>; 2]> = MaybeUninit::uninit();
//!
//! P::grow_exact(MEMORY);
//!
//! // 2 allocations are guaranteed to work
//! let x = P::alloc(BigStruct::new()).ok().expect("OOM");
//! let y = P::alloc(BigStruct::new()).ok().expect("OOM");
//!
//! // ..
//! }
//! ```
use core::{
cmp, fmt,
hash::{Hash, Hasher},
marker::PhantomData,
ops::Deref,
ptr,
sync::atomic,
};
#[cfg(cas_atomic_polyfill)]
use atomic_polyfill::{AtomicUsize, Ordering};
#[cfg(not(cas_atomic_polyfill))]
use core::sync::atomic::{AtomicUsize, Ordering};
use crate::pool::{self, stack::Ptr, Node};
/// Instantiates a pool of Arc pointers as a global singleton
// NOTE(any(test)) makes testing easier (no need to enable Cargo features for testing)
#[cfg(any(
armv6m,
armv7a,
armv7r,
armv7m,
armv8m_main,
all(
any(target_arch = "x86_64", target_arch = "x86"),
feature = "x86-sync-pool"
),
test
))]
#[macro_export]
macro_rules! arc_pool {
($(#[$($attr:tt)*])* $ident:ident: $ty:ty) => {
pub struct $ident;
impl $crate::pool::singleton::arc::Pool for $ident {
type Data = $ty;
fn ptr() -> &'static $crate::pool::Pool<$crate::pool::singleton::arc::ArcInner<$ty>> {
$(#[$($attr)*])*
static POOL: $crate::pool::Pool<$crate::pool::singleton::arc::ArcInner<$ty>> =
$crate::pool::Pool::new();
&POOL
}
}
impl $ident {
/// Allocates a new `Arc` and writes `data` to it
///
/// Returns an `Err`or if the backing memory pool is empty
pub fn alloc(data: $ty) -> Result<$crate::Arc<Self>, $ty>
where
Self: Sized,
{
$crate::Arc::new(data)
}
/// Increases the capacity of the pool
///
/// This method might *not* fully utilize the given memory block due to alignment requirements
///
/// This method returns the number of *new* blocks that can be allocated.
pub fn grow(memory: &'static mut [u8]) -> usize {
<Self as $crate::pool::singleton::arc::Pool>::ptr().grow(memory)
}
/// Increases the capacity of the pool
///
/// Unlike `grow`, this method fully utilizes the given memory block
pub fn grow_exact<A>(memory: &'static mut MaybeUninit<A>) -> usize
where
A: AsMut<[$crate::pool::Node<$crate::pool::singleton::arc::ArcInner<$ty>>]>,
{
<Self as $crate::pool::singleton::arc::Pool>::ptr().grow_exact(memory)
}
}
};
}
/// Pool of Arc pointers
pub trait Pool {
/// The data behind the Arc pointer
type Data: 'static;
#[doc(hidden)]
fn ptr() -> &'static pool::Pool<ArcInner<Self::Data>>;
}
// mostly a verbatim copy of liballoc(/src/sync.rs) as of v1.54.0 minus the `Weak` API
// anything that diverges has been marked with `XXX`
/// `std::sync::Arc` but backed by a memory [`Pool`] rather than `#[global_allocator]`
///
/// [`Pool`]: trait.Pool.html
///
/// An example and more details can be found in the [module level documentation](index.html).
// XXX `Pool::Data` is not `?Sized` -- `Unsize` coercions cannot be implemented on stable
pub struct Arc<P>
where
P: Pool,
{
phantom: PhantomData<ArcInner<P::Data>>,
ptr: Ptr<Node<ArcInner<P::Data>>>,
pool: PhantomData<P>,
}
impl<P> Arc<P>
where
P: Pool,
{
/// Constructs a new `Arc`
///
/// Returns an `Err`or if the backing memory pool is empty
// XXX original API is "infallible"
pub fn new(data: P::Data) -> Result<Self, P::Data> {
if let Some(node) = P::ptr().stack.try_pop() {
unsafe {
ptr::write(
node.as_ref().data.get(),
ArcInner {
strong: AtomicUsize::new(1),
data,
},
)
}
Ok(Self {
phantom: PhantomData,
pool: PhantomData,
ptr: node,
})
} else {
Err(data)
}
}
fn inner(&self) -> &ArcInner<P::Data> {
unsafe { &*self.ptr.as_ref().data.get() }
}
fn from_inner(ptr: Ptr<Node<ArcInner<P::Data>>>) -> Self {
Self {
phantom: PhantomData,
pool: PhantomData,
ptr,
}
}
unsafe fn get_mut_unchecked(this: &mut Self) -> &mut P::Data {
&mut (*this.ptr.as_ref().data.get()).data
// &mut (*this.ptr.as_ptr()).data
}
#[inline(never)]
unsafe fn drop_slow(&mut self) {
// run `P::Data`'s destructor
ptr::drop_in_place(Self::get_mut_unchecked(self));
// XXX memory pool instead of `#[global_allocator]`
// return memory to pool
P::ptr().stack.push(self.ptr);
}
}
const MAX_REFCOUNT: usize = (isize::MAX) as usize;
impl<P> AsRef<P::Data> for Arc<P>
where
P: Pool,
{
fn as_ref(&self) -> &P::Data {
&**self
}
}
// XXX no `Borrow` implementation due to 'conflicting implementations of trait' error
impl<P> Clone for Arc<P>
where
P: Pool,
{
fn clone(&self) -> Self {
let old_size = self.inner().strong.fetch_add(1, Ordering::Relaxed);
if old_size > MAX_REFCOUNT {
// XXX original code calls `intrinsics::abort` which is unstable API
panic!();
}
Self::from_inner(self.ptr)
}
}
impl<P> fmt::Debug for Arc<P>
where
P: Pool,
P::Data: fmt::Debug,
{
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
fmt::Debug::fmt(&**self, f)
}
}
impl<P> Deref for Arc<P>
where
P: Pool,
{
type Target = P::Data;
fn deref(&self) -> &P::Data {
&self.inner().data
}
}
impl<P> fmt::Display for Arc<P>
where
P: Pool,
P::Data: fmt::Display,
{
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
fmt::Display::fmt(&**self, f)
}
}
// XXX original uses `#[may_dangle]` which is an unstable language feature
impl<P> Drop for Arc<P>
where
P: Pool,
{
fn drop(&mut self) {
if self.inner().strong.fetch_sub(1, Ordering::Release) != 1 {
return;
}
atomic::fence(Ordering::Acquire);
unsafe {
self.drop_slow();
}
}
}
impl<P> Eq for Arc<P>
where
P: Pool,
P::Data: Eq,
{
}
impl<P> Hash for Arc<P>
where
P: Pool,
P::Data: Hash,
{
fn hash<H>(&self, state: &mut H)
where
H: Hasher,
{
(**self).hash(state)
}
}
impl<P> Ord for Arc<P>
where
P: Pool,
P::Data: Ord,
{
fn cmp(&self, other: &Self) -> cmp::Ordering {
(**self).cmp(&**other)
}
}
impl<P> PartialEq for Arc<P>
where
P: Pool,
P::Data: PartialEq,
{
fn eq(&self, other: &Self) -> bool {
// XXX missing pointer equality specialization, which uses an unstable language feature
(**self).eq(&**other)
}
}
impl<P> PartialOrd for Arc<P>
where
P: Pool,
P::Data: PartialOrd,
{
fn partial_cmp(&self, other: &Self) -> Option<cmp::Ordering> {
(**self).partial_cmp(&**other)
}
}
unsafe impl<P> Send for Arc<P>
where
P: Pool,
P::Data: Sync + Send,
{
}
unsafe impl<P> Sync for Arc<P>
where
P: Pool,
P::Data: Sync + Send,
{
}
impl<P> Unpin for Arc<P> where P: Pool {}
/// Unfortunate implementation detail required to use the `grow_exact` API
pub struct ArcInner<T> {
data: T,
strong: AtomicUsize,
// XXX `Weak` API not implemented
// weak: AtomicUsize,
}

91
src/pool/treiber.rs Normal file

@ -0,0 +1,91 @@
use core::mem::ManuallyDrop;
#[cfg_attr(target_arch = "x86", path = "treiber/cas.rs")]
#[cfg_attr(arm_llsc, path = "treiber/llsc.rs")]
mod impl_;
pub use impl_::{AtomicPtr, NonNullPtr};
pub struct Stack<N>
where
N: Node,
{
top: AtomicPtr<N>,
}
impl<N> Stack<N>
where
N: Node,
{
pub const fn new() -> Self {
Self {
top: AtomicPtr::null(),
}
}
/// # Safety
/// - `node` must be a valid pointer
/// - aliasing rules must be enforced by the caller, e.g. the same `node` may not be pushed more than once
pub unsafe fn push(&self, node: NonNullPtr<N>) {
impl_::push(self, node)
}
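/// Pops the most recently pushed node, or returns `None` if the stack is observed as empty
///
/// *NOTE:* this method contains a retry loop (CAS or LL/SC), so its execution time is not bounded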
pub fn try_pop(&self) -> Option<NonNullPtr<N>> {
impl_::try_pop(self)
}
}
pub trait Node: Sized {
type Data;
fn next(&self) -> &AtomicPtr<Self>;
fn next_mut(&mut self) -> &mut AtomicPtr<Self>;
}
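/// A node whose `next` link and `data` payload occupy the same storage (a `union`), so a
/// free node needs no memory beyond the payload itself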
pub union UnionNode<T> {
next: ManuallyDrop<AtomicPtr<UnionNode<T>>>,
pub data: ManuallyDrop<T>,
}
impl<T> Node for UnionNode<T> {
type Data = T;
fn next(&self) -> &AtomicPtr<Self> {
unsafe { &self.next }
}
fn next_mut(&mut self) -> &mut AtomicPtr<Self> {
unsafe { &mut self.next }
}
}
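/// A node that stores the `next` link alongside the `data` payload, so the payload is
/// preserved while the node is parked in the free stack (the object pool relies on this)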
pub struct StructNode<T> {
pub next: ManuallyDrop<AtomicPtr<StructNode<T>>>,
pub data: ManuallyDrop<T>,
}
impl<T> Node for StructNode<T> {
type Data = T;
fn next(&self) -> &AtomicPtr<Self> {
&self.next
}
fn next_mut(&mut self) -> &mut AtomicPtr<Self> {
&mut self.next
}
}
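// A hypothetical sketch of how the pieces above fit together; the node is built by hand
// here, which real callers normally leave to the pool types:
//
// static mut NODE: StructNode<u8> = StructNode {
//     next: ManuallyDrop::new(AtomicPtr::null()),
//     data: ManuallyDrop::new(42),
// };
//
// let stack: Stack<StructNode<u8>> = Stack::new();
// // SAFETY: `NODE` is a distinct static and is pushed exactly once
// unsafe { stack.push(NonNullPtr::from_static_mut_ref(&mut NODE)) };
// assert!(stack.try_pop().is_some());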
#[cfg(test)]
mod tests {
use core::mem;
use super::*;
#[test]
fn node_is_never_zero_sized() {
struct Zst;
assert_ne!(mem::size_of::<UnionNode<Zst>>(), 0);
}
}

196
src/pool/treiber/cas.rs Normal file

@ -0,0 +1,196 @@
use core::{
marker::PhantomData,
num::{NonZeroU32, NonZeroU64},
ptr::NonNull,
sync::atomic::{AtomicU64, Ordering},
};
use super::{Node, Stack};
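// NOTE: on 32-bit x86 a pointer fits in the lower half of an `AtomicU64`, so the upper
// 32 bits hold a version tag. `try_pop` bumps the tag of the pointer it hands out (see
// `increase_tag`), which is the usual guard against the ABA problem in a Treiber stack.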
pub struct AtomicPtr<N>
where
N: Node,
{
inner: AtomicU64,
_marker: PhantomData<*mut N>,
}
impl<N> AtomicPtr<N>
where
N: Node,
{
pub const fn null() -> Self {
Self {
inner: AtomicU64::new(0),
_marker: PhantomData,
}
}
fn compare_and_exchange_weak(
&self,
current: Option<NonNullPtr<N>>,
new: Option<NonNullPtr<N>>,
success: Ordering,
failure: Ordering,
) -> Result<(), Option<NonNullPtr<N>>> {
self.inner
.compare_exchange_weak(
current
.map(|pointer| pointer.into_u64())
.unwrap_or_default(),
new.map(|pointer| pointer.into_u64()).unwrap_or_default(),
success,
failure,
)
.map(drop)
.map_err(NonNullPtr::from_u64)
}
fn load(&self, order: Ordering) -> Option<NonNullPtr<N>> {
NonZeroU64::new(self.inner.load(order)).map(|inner| NonNullPtr {
inner,
_marker: PhantomData,
})
}
fn store(&self, value: Option<NonNullPtr<N>>, order: Ordering) {
self.inner.store(
value.map(|pointer| pointer.into_u64()).unwrap_or_default(),
order,
)
}
}
pub struct NonNullPtr<N>
where
N: Node,
{
inner: NonZeroU64,
_marker: PhantomData<*mut N>,
}
impl<N> Clone for NonNullPtr<N>
where
N: Node,
{
fn clone(&self) -> Self {
*self
}
}
impl<N> Copy for NonNullPtr<N> where N: Node {}
impl<N> NonNullPtr<N>
where
N: Node,
{
pub fn as_ptr(&self) -> *mut N {
self.inner.get() as *mut N
}
pub fn from_static_mut_ref(ref_: &'static mut N) -> NonNullPtr<N> {
let non_null = NonNull::from(ref_);
Self::from_non_null(non_null)
}
fn from_non_null(ptr: NonNull<N>) -> Self {
let address = ptr.as_ptr() as u32;
let tag = initial_tag().get();
let value = (u64::from(tag) << 32) | u64::from(address);
Self {
inner: unsafe { NonZeroU64::new_unchecked(value) },
_marker: PhantomData,
}
}
fn from_u64(value: u64) -> Option<Self> {
NonZeroU64::new(value).map(|inner| Self {
inner,
_marker: PhantomData,
})
}
fn non_null(&self) -> NonNull<N> {
unsafe { NonNull::new_unchecked(self.inner.get() as *mut N) }
}
fn tag(&self) -> NonZeroU32 {
unsafe { NonZeroU32::new_unchecked((self.inner.get() >> 32) as u32) }
}
fn into_u64(self) -> u64 {
self.inner.get()
}
fn increase_tag(&mut self) {
let address = self.as_ptr() as u32;
let new_tag = self
.tag()
.get()
.checked_add(1)
.map(|val| unsafe { NonZeroU32::new_unchecked(val) })
.unwrap_or_else(initial_tag)
.get();
let value = (u64::from(new_tag) << 32) | u64::from(address);
self.inner = unsafe { NonZeroU64::new_unchecked(value) };
}
}
fn initial_tag() -> NonZeroU32 {
unsafe { NonZeroU32::new_unchecked(1) }
}
pub unsafe fn push<N>(stack: &Stack<N>, new_top: NonNullPtr<N>)
where
N: Node,
{
let mut top = stack.top.load(Ordering::Relaxed);
loop {
new_top
.non_null()
.as_ref()
.next()
.store(top, Ordering::Relaxed);
if let Err(p) = stack.top.compare_and_exchange_weak(
top,
Some(new_top),
Ordering::Release,
Ordering::Relaxed,
) {
top = p;
} else {
return;
}
}
}
pub fn try_pop<N>(stack: &Stack<N>) -> Option<NonNullPtr<N>>
where
N: Node,
{
loop {
if let Some(mut top) = stack.top.load(Ordering::Acquire) {
let next = unsafe { top.non_null().as_ref().next().load(Ordering::Relaxed) };
if stack
.top
.compare_and_exchange_weak(Some(top), next, Ordering::Release, Ordering::Relaxed)
.is_ok()
{
top.increase_tag();
return Some(top);
}
} else {
// stack observed as empty
return None;
}
}
}

145
src/pool/treiber/llsc.rs Normal file

@ -0,0 +1,145 @@
use core::{
cell::UnsafeCell,
ptr::{self, NonNull},
};
use super::{Node, Stack};
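// This implementation leans on the ARM exclusive monitor: `load_link` (LDREX) opens an
// exclusive access to the stack's `top` word, `store_conditional` (STREX) succeeds only if
// nothing wrote to that word in between, and both loops below simply retry on failure.
// `clear_load_link` (CLREX) drops the reservation when the stack is observed as empty.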
pub struct AtomicPtr<N>
where
N: Node,
{
inner: UnsafeCell<Option<NonNull<N>>>,
}
impl<N> AtomicPtr<N>
where
N: Node,
{
pub const fn null() -> Self {
Self {
inner: UnsafeCell::new(None),
}
}
}
pub struct NonNullPtr<N>
where
N: Node,
{
inner: NonNull<N>,
}
impl<N> NonNullPtr<N>
where
N: Node,
{
pub fn as_ptr(&self) -> *mut N {
self.inner.as_ptr().cast()
}
pub fn from_static_mut_ref(ref_: &'static mut N) -> Self {
Self {
inner: NonNull::from(ref_),
}
}
}
impl<N> Clone for NonNullPtr<N>
where
N: Node,
{
fn clone(&self) -> Self {
Self { inner: self.inner }
}
}
impl<N> Copy for NonNullPtr<N> where N: Node {}
pub unsafe fn push<N>(stack: &Stack<N>, mut node: NonNullPtr<N>)
where
N: Node,
{
let top_addr = ptr::addr_of!(stack.top) as *mut usize;
loop {
let top = arch::load_link(top_addr);
node.inner
.as_mut()
.next_mut()
.inner
.get()
.write(NonNull::new(top as *mut _));
if arch::store_conditional(node.inner.as_ptr() as usize, top_addr).is_ok() {
break;
}
}
}
pub fn try_pop<N>(stack: &Stack<N>) -> Option<NonNullPtr<N>>
where
N: Node,
{
unsafe {
let top_addr = ptr::addr_of!(stack.top) as *mut usize;
loop {
let top = arch::load_link(top_addr);
if let Some(top) = NonNull::new(top as *mut N) {
let next = &top.as_ref().next();
if arch::store_conditional(
next.inner
.get()
.read()
.map(|non_null| non_null.as_ptr() as usize)
.unwrap_or_default(),
top_addr,
)
.is_ok()
{
break Some(NonNullPtr { inner: top });
}
} else {
arch::clear_load_link();
break None;
}
}
}
}
#[cfg(arm_llsc)]
mod arch {
use core::arch::asm;
#[inline(always)]
pub fn clear_load_link() {
unsafe { asm!("clrex", options(nomem, nostack)) }
}
/// # Safety
/// - `addr` must be a valid pointer
#[inline(always)]
pub unsafe fn load_link(addr: *const usize) -> usize {
let value;
asm!("ldrex {}, [{}]", out(reg) value, in(reg) addr, options(nostack));
value
}
/// # Safety
/// - `addr` must be a valid pointer
#[inline(always)]
pub unsafe fn store_conditional(value: usize, addr: *mut usize) -> Result<(), ()> {
let outcome: usize;
asm!("strex {}, {}, [{}]", out(reg) outcome, in(reg) value, in(reg) addr, options(nostack));
if outcome == 0 {
Ok(())
} else {
Err(())
}
}
}