//! Streaming SIMD Extensions 3 (SSE3)

use coresimd::simd::*;
use coresimd::simd_llvm::{simd_shuffle2, simd_shuffle4};
use coresimd::x86::*;
use mem;

#[cfg(test)]
use stdsimd_test::assert_instr;

/// Alternatively add and subtract packed single-precision (32-bit)
/// floating-point elements in `a` to/from packed elements in `b`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_addsub_ps)
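///
/// In pseudocode, writing `a[i]` for the `i`-th 32-bit lane (lane 0 is
/// the lowest), the result is:
///
/// ```text
/// r[0] = a[0] - b[0]
/// r[1] = a[1] + b[1]
/// r[2] = a[2] - b[2]
/// r[3] = a[3] + b[3]
/// ```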
#[inline]
#[target_feature(enable = "sse3")]
#[cfg_attr(test, assert_instr(addsubps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_addsub_ps(a: __m128, b: __m128) -> __m128 {
    addsubps(a, b)
}

/// Alternatively add and subtract packed double-precision (64-bit)
/// floating-point elements in `a` to/from packed elements in `b`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_addsub_pd)
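///
/// In pseudocode, writing `a[i]` for the `i`-th 64-bit lane (lane 0 is
/// the lowest), the result is:
///
/// ```text
/// r[0] = a[0] - b[0]
/// r[1] = a[1] + b[1]
/// ```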
#[inline]
#[target_feature(enable = "sse3")]
#[cfg_attr(test, assert_instr(addsubpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_addsub_pd(a: __m128d, b: __m128d) -> __m128d {
    addsubpd(a, b)
}

/// Horizontally add adjacent pairs of double-precision (64-bit)
/// floating-point elements in `a` and `b`, and pack the results.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_pd)
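///
/// In pseudocode, writing `a[i]` for the `i`-th 64-bit lane, the result is:
///
/// ```text
/// r[0] = a[0] + a[1]
/// r[1] = b[0] + b[1]
/// ```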
#[inline]
#[target_feature(enable = "sse3")]
#[cfg_attr(test, assert_instr(haddpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_hadd_pd(a: __m128d, b: __m128d) -> __m128d {
    haddpd(a, b)
}

/// Horizontally add adjacent pairs of single-precision (32-bit)
/// floating-point elements in `a` and `b`, and pack the results.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_ps)
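///
/// In pseudocode, writing `a[i]` for the `i`-th 32-bit lane, the result is:
///
/// ```text
/// r[0] = a[0] + a[1]
/// r[1] = a[2] + a[3]
/// r[2] = b[0] + b[1]
/// r[3] = b[2] + b[3]
/// ```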
#[inline]
#[target_feature(enable = "sse3")]
#[cfg_attr(test, assert_instr(haddps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_hadd_ps(a: __m128, b: __m128) -> __m128 {
    haddps(a, b)
}

/// Horizontally subtract adjacent pairs of double-precision (64-bit)
/// floating-point elements in `a` and `b`, and pack the results.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsub_pd)
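///
/// In pseudocode, writing `a[i]` for the `i`-th 64-bit lane, the result is:
///
/// ```text
/// r[0] = a[0] - a[1]
/// r[1] = b[0] - b[1]
/// ```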
#[inline]
#[target_feature(enable = "sse3")]
#[cfg_attr(test, assert_instr(hsubpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_hsub_pd(a: __m128d, b: __m128d) -> __m128d {
    hsubpd(a, b)
}

/// Horizontally subtract adjacent pairs of single-precision (32-bit)
/// floating-point elements in `a` and `b`, and pack the results.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsub_ps)
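///
/// In pseudocode, writing `a[i]` for the `i`-th 32-bit lane, the result is:
///
/// ```text
/// r[0] = a[0] - a[1]
/// r[1] = a[2] - a[3]
/// r[2] = b[0] - b[1]
/// r[3] = b[2] - b[3]
/// ```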
#[inline]
#[target_feature(enable = "sse3")]
#[cfg_attr(test, assert_instr(hsubps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_hsub_ps(a: __m128, b: __m128) -> __m128 {
    hsubps(a, b)
}

/// Load 128-bits of integer data from unaligned memory.
/// This intrinsic may perform better than `_mm_loadu_si128`
/// when the data crosses a cache line boundary.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_lddqu_si128)
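///
/// `mem_addr` does not need to be aligned on any particular boundary.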
#[inline]
#[target_feature(enable = "sse3")]
#[cfg_attr(test, assert_instr(lddqu))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_lddqu_si128(mem_addr: *const __m128i) -> __m128i {
    mem::transmute(lddqu(mem_addr as *const _))
}

/// Duplicate the low double-precision (64-bit) floating-point element
/// from `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movedup_pd)
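///
/// In pseudocode, writing `a[i]` for the `i`-th 64-bit lane, the result
/// is `[a[0], a[0]]`.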
#[inline]
#[target_feature(enable = "sse3")]
#[cfg_attr(test, assert_instr(movddup))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_movedup_pd(a: __m128d) -> __m128d {
    simd_shuffle2(a, a, [0, 0])
}

/// Load a double-precision (64-bit) floating-point element from memory
/// into both elements of the return vector.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loaddup_pd)
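///
/// In pseudocode, the result is `[*mem_addr, *mem_addr]`.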
#[inline]
#[target_feature(enable = "sse3")]
#[cfg_attr(test, assert_instr(movddup))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_loaddup_pd(mem_addr: *const f64) -> __m128d {
    _mm_load1_pd(mem_addr)
}

/// Duplicate odd-indexed single-precision (32-bit) floating-point elements
/// from `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movehdup_ps)
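///
/// In pseudocode, writing `a[i]` for the `i`-th 32-bit lane, the result
/// is `[a[1], a[1], a[3], a[3]]`.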
#[inline]
#[target_feature(enable = "sse3")]
#[cfg_attr(test, assert_instr(movshdup))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_movehdup_ps(a: __m128) -> __m128 {
    simd_shuffle4(a, a, [1, 1, 3, 3])
}

/// Duplicate even-indexed single-precision (32-bit) floating-point elements
/// from `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_moveldup_ps)
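///
/// In pseudocode, writing `a[i]` for the `i`-th 32-bit lane, the result
/// is `[a[0], a[0], a[2], a[2]]`.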
#[inline]
#[target_feature(enable = "sse3")]
#[cfg_attr(test, assert_instr(movsldup))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_moveldup_ps(a: __m128) -> __m128 {
    simd_shuffle4(a, a, [0, 0, 2, 2])
}
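
// LLVM intrinsics that lower to the SSE3 instructions used above.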
#[allow(improper_ctypes)]
extern "C" {
    #[link_name = "llvm.x86.sse3.addsub.ps"]
    fn addsubps(a: __m128, b: __m128) -> __m128;
    #[link_name = "llvm.x86.sse3.addsub.pd"]
    fn addsubpd(a: __m128d, b: __m128d) -> __m128d;
    #[link_name = "llvm.x86.sse3.hadd.pd"]
    fn haddpd(a: __m128d, b: __m128d) -> __m128d;
    #[link_name = "llvm.x86.sse3.hadd.ps"]
    fn haddps(a: __m128, b: __m128) -> __m128;
    #[link_name = "llvm.x86.sse3.hsub.pd"]
    fn hsubpd(a: __m128d, b: __m128d) -> __m128d;
    #[link_name = "llvm.x86.sse3.hsub.ps"]
    fn hsubps(a: __m128, b: __m128) -> __m128;
    #[link_name = "llvm.x86.sse3.ldu.dq"]
    fn lddqu(mem_addr: *const i8) -> i8x16;
}

#[cfg(test)]
mod tests {
    use stdsimd_test::simd_test;

    use coresimd::x86::*;

    #[simd_test(enable = "sse3")]
    unsafe fn test_mm_addsub_ps() {
        let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
        let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
        let r = _mm_addsub_ps(a, b);
        assert_eq_m128(r, _mm_setr_ps(99.0, 25.0, 0.0, -15.0));
    }

    #[simd_test(enable = "sse3")]
    unsafe fn test_mm_addsub_pd() {
        let a = _mm_setr_pd(-1.0, 5.0);
        let b = _mm_setr_pd(-100.0, 20.0);
        let r = _mm_addsub_pd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(99.0, 25.0));
    }

    #[simd_test(enable = "sse3")]
    unsafe fn test_mm_hadd_pd() {
        let a = _mm_setr_pd(-1.0, 5.0);
        let b = _mm_setr_pd(-100.0, 20.0);
        let r = _mm_hadd_pd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(4.0, -80.0));
    }

    #[simd_test(enable = "sse3")]
    unsafe fn test_mm_hadd_ps() {
        let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
        let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
        let r = _mm_hadd_ps(a, b);
        assert_eq_m128(r, _mm_setr_ps(4.0, -10.0, -80.0, -5.0));
    }

    #[simd_test(enable = "sse3")]
    unsafe fn test_mm_hsub_pd() {
        let a = _mm_setr_pd(-1.0, 5.0);
        let b = _mm_setr_pd(-100.0, 20.0);
        let r = _mm_hsub_pd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(-6.0, -120.0));
    }

    #[simd_test(enable = "sse3")]
    unsafe fn test_mm_hsub_ps() {
        let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
        let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
        let r = _mm_hsub_ps(a, b);
        assert_eq_m128(r, _mm_setr_ps(-6.0, 10.0, -120.0, 5.0));
    }

    #[simd_test(enable = "sse3")]
    unsafe fn test_mm_lddqu_si128() {
        #[cfg_attr(rustfmt, rustfmt_skip)]
        let a = _mm_setr_epi8(
            1, 2, 3, 4,
            5, 6, 7, 8,
            9, 10, 11, 12,
            13, 14, 15, 16,
        );
        let r = _mm_lddqu_si128(&a);
        assert_eq_m128i(a, r);
    }

    #[simd_test(enable = "sse3")]
    unsafe fn test_mm_movedup_pd() {
        let a = _mm_setr_pd(-1.0, 5.0);
        let r = _mm_movedup_pd(a);
        assert_eq_m128d(r, _mm_setr_pd(-1.0, -1.0));
    }

    #[simd_test(enable = "sse3")]
    unsafe fn test_mm_movehdup_ps() {
        let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
        let r = _mm_movehdup_ps(a);
        assert_eq_m128(r, _mm_setr_ps(5.0, 5.0, -10.0, -10.0));
    }

    #[simd_test(enable = "sse3")]
    unsafe fn test_mm_moveldup_ps() {
        let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
        let r = _mm_moveldup_ps(a);
        assert_eq_m128(r, _mm_setr_ps(-1.0, -1.0, 0.0, 0.0));
    }

    #[simd_test(enable = "sse3")]
    unsafe fn test_mm_loaddup_pd() {
        let d = -5.0;
        let r = _mm_loaddup_pd(&d);
        assert_eq_m128d(r, _mm_setr_pd(d, d));
    }
}