//! Supplemental Streaming SIMD Extensions 3 (SSSE3)

use coresimd::simd::*;
use coresimd::simd_llvm::simd_shuffle16;
use coresimd::x86::*;
use mem;

#[cfg(test)]
use stdsimd_test::assert_instr;

/// Compute the absolute value of packed 8-bit signed integers in `a` and
/// return the unsigned results.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi8)
#[inline]
#[target_feature(enable = "ssse3")]
#[cfg_attr(test, assert_instr(pabsb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_abs_epi8(a: __m128i) -> __m128i {
    mem::transmute(pabsb128(a.as_i8x16()))
}

/// Compute the absolute value of each of the packed 16-bit signed integers in
/// `a` and return the unsigned results.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi16)
#[inline]
#[target_feature(enable = "ssse3")]
#[cfg_attr(test, assert_instr(pabsw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_abs_epi16(a: __m128i) -> __m128i {
    mem::transmute(pabsw128(a.as_i16x8()))
}

/// Compute the absolute value of each of the packed 32-bit signed integers in
/// `a` and return the unsigned results.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi32)
#[inline]
#[target_feature(enable = "ssse3")]
#[cfg_attr(test, assert_instr(pabsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_abs_epi32(a: __m128i) -> __m128i {
    mem::transmute(pabsd128(a.as_i32x4()))
}

/// Shuffle bytes from `a` according to the content of `b`.
///
/// The low 4 bits of each byte of `b` are used as an index
/// into the 16 bytes of `a`.
///
/// In addition, if the most significant bit of a byte of `b`
/// is set, the corresponding destination byte is set to 0.
///
/// Picturing `a` and `b` as `[u8; 16]`, `_mm_shuffle_epi8` is
/// logically equivalent to:
///
/// ```
/// fn mm_shuffle_epi8(a: [u8; 16], b: [u8; 16]) -> [u8; 16] {
///     let mut r = [0u8; 16];
///     for i in 0..16 {
///         // if the most significant bit of b is set,
///         // then the destination byte is set to 0.
///         if b[i] & 0x80 == 0u8 {
///             r[i] = a[(b[i] % 16) as usize];
///         }
///     }
///     r
/// }
/// ```
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_epi8)
#[inline]
#[target_feature(enable = "ssse3")]
#[cfg_attr(test, assert_instr(pshufb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_shuffle_epi8(a: __m128i, b: __m128i) -> __m128i {
    mem::transmute(pshufb128(a.as_u8x16(), b.as_u8x16()))
}

/// Concatenate 16-byte blocks in `a` and `b` into a 32-byte temporary result,
/// shift the result right by `n` bytes, and return the low 16 bytes.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_alignr_epi8)
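///
/// Picturing `a` and `b` as `[u8; 16]`, this is roughly equivalent to the
/// following scalar sketch (illustrative only, not the actual
/// implementation):
///
/// ```
/// fn mm_alignr_epi8(a: [u8; 16], b: [u8; 16], n: usize) -> [u8; 16] {
///     // concatenate with `b` in the low half and `a` in the high half
///     let mut concat = [0u8; 32];
///     concat[..16].copy_from_slice(&b);
///     concat[16..].copy_from_slice(&a);
///     // take 16 bytes starting at offset `n`, shifting in zeroes past the end
///     let mut r = [0u8; 16];
///     for i in 0..16 {
///         if i + n < 32 {
///             r[i] = concat[i + n];
///         }
///     }
///     r
/// }
/// ```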
#[inline]
#[target_feature(enable = "ssse3")]
#[cfg_attr(test, assert_instr(palignr, n = 15))]
#[rustc_args_required_const(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_alignr_epi8(a: __m128i, b: __m128i, n: i32) -> __m128i {
    let n = n as u32;
    // If palignr is shifting the pair of vectors more than the size of two
    // lanes, emit zero.
    if n > 32 {
        return _mm_set1_epi8(0);
    }
    // If palignr is shifting the pair of input vectors more than one lane,
    // but less than two lanes, convert to shifting in zeroes.
    let (a, b, n) = if n > 16 {
        (_mm_set1_epi8(0), a, n - 16)
    } else {
        (a, b, n)
    };
    let a = a.as_i8x16();
    let b = b.as_i8x16();

    macro_rules! shuffle {
        ($shift:expr) => {
            simd_shuffle16(
                b,
                a,
                [
                    0 + $shift,
                    1 + $shift,
                    2 + $shift,
                    3 + $shift,
                    4 + $shift,
                    5 + $shift,
                    6 + $shift,
                    7 + $shift,
                    8 + $shift,
                    9 + $shift,
                    10 + $shift,
                    11 + $shift,
                    12 + $shift,
                    13 + $shift,
                    14 + $shift,
                    15 + $shift,
                ],
            )
        };
    }
    let r: i8x16 = match n {
        0 => shuffle!(0),
        1 => shuffle!(1),
        2 => shuffle!(2),
        3 => shuffle!(3),
        4 => shuffle!(4),
        5 => shuffle!(5),
        6 => shuffle!(6),
        7 => shuffle!(7),
        8 => shuffle!(8),
        9 => shuffle!(9),
        10 => shuffle!(10),
        11 => shuffle!(11),
        12 => shuffle!(12),
        13 => shuffle!(13),
        14 => shuffle!(14),
        15 => shuffle!(15),
        _ => shuffle!(16),
    };
    mem::transmute(r)
}

/// Horizontally add the adjacent pairs of values contained in 2 packed
/// 128-bit vectors of `[8 x i16]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_epi16)
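///
/// A scalar sketch of the operation (illustrative only; the additions wrap on
/// overflow, matching the non-saturating `phaddw` instruction):
///
/// ```
/// fn mm_hadd_epi16(a: [i16; 8], b: [i16; 8]) -> [i16; 8] {
///     let mut r = [0i16; 8];
///     for i in 0..4 {
///         // low half of the result comes from `a`, high half from `b`
///         r[i] = a[2 * i].wrapping_add(a[2 * i + 1]);
///         r[i + 4] = b[2 * i].wrapping_add(b[2 * i + 1]);
///     }
///     r
/// }
/// ```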
#[inline]
#[target_feature(enable = "ssse3")]
#[cfg_attr(test, assert_instr(phaddw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_hadd_epi16(a: __m128i, b: __m128i) -> __m128i {
    mem::transmute(phaddw128(a.as_i16x8(), b.as_i16x8()))
}

/// Horizontally add the adjacent pairs of values contained in 2 packed
/// 128-bit vectors of `[8 x i16]`. Positive sums greater than 7FFFh are
/// saturated to 7FFFh. Negative sums less than 8000h are saturated to 8000h.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadds_epi16)
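///
/// In other words, this behaves like `_mm_hadd_epi16` except that each sum
/// saturates to the `i16` range instead of wrapping: for example, the
/// adjacent pair `(32767, 1)` produces `32767` and `(-32768, -1)` produces
/// `-32768` (see the test below).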
#[inline]
#[target_feature(enable = "ssse3")]
#[cfg_attr(test, assert_instr(phaddsw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_hadds_epi16(a: __m128i, b: __m128i) -> __m128i {
    mem::transmute(phaddsw128(a.as_i16x8(), b.as_i16x8()))
}

/// Horizontally add the adjacent pairs of values contained in 2 packed
/// 128-bit vectors of `[4 x i32]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_epi32)
#[inline]
#[target_feature(enable = "ssse3")]
#[cfg_attr(test, assert_instr(phaddd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_hadd_epi32(a: __m128i, b: __m128i) -> __m128i {
    mem::transmute(phaddd128(a.as_i32x4(), b.as_i32x4()))
}

/// Horizontally subtract the adjacent pairs of values contained in 2
/// packed 128-bit vectors of `[8 x i16]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsub_epi16)
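///
/// A scalar sketch of the operation (illustrative only; the subtractions wrap
/// on overflow):
///
/// ```
/// fn mm_hsub_epi16(a: [i16; 8], b: [i16; 8]) -> [i16; 8] {
///     let mut r = [0i16; 8];
///     for i in 0..4 {
///         // each lane is "left element minus right element" of a pair
///         r[i] = a[2 * i].wrapping_sub(a[2 * i + 1]);
///         r[i + 4] = b[2 * i].wrapping_sub(b[2 * i + 1]);
///     }
///     r
/// }
/// ```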
#[inline]
#[target_feature(enable = "ssse3")]
#[cfg_attr(test, assert_instr(phsubw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_hsub_epi16(a: __m128i, b: __m128i) -> __m128i {
    mem::transmute(phsubw128(a.as_i16x8(), b.as_i16x8()))
}

/// Horizontally subtract the adjacent pairs of values contained in 2
/// packed 128-bit vectors of `[8 x i16]`. Positive differences greater than
/// 7FFFh are saturated to 7FFFh. Negative differences less than 8000h are
/// saturated to 8000h.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsubs_epi16)
#[inline]
#[target_feature(enable = "ssse3")]
#[cfg_attr(test, assert_instr(phsubsw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_hsubs_epi16(a: __m128i, b: __m128i) -> __m128i {
    mem::transmute(phsubsw128(a.as_i16x8(), b.as_i16x8()))
}

/// Horizontally subtract the adjacent pairs of values contained in 2
/// packed 128-bit vectors of `[4 x i32]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsub_epi32)
#[inline]
#[target_feature(enable = "ssse3")]
#[cfg_attr(test, assert_instr(phsubd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_hsub_epi32(a: __m128i, b: __m128i) -> __m128i {
    mem::transmute(phsubd128(a.as_i32x4(), b.as_i32x4()))
}

/// Multiply corresponding pairs of packed 8-bit unsigned integer
/// values contained in the first source operand and packed 8-bit signed
/// integer values contained in the second source operand, add pairs of
/// contiguous products with signed saturation, and write the 16-bit sums to
/// the corresponding bits in the destination.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maddubs_epi16)
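///
/// A scalar sketch of the operation (illustrative only, not the actual
/// implementation):
///
/// ```
/// fn mm_maddubs_epi16(a: [u8; 16], b: [i8; 16]) -> [i16; 8] {
///     let mut r = [0i16; 8];
///     for i in 0..8 {
///         // multiply an unsigned byte of `a` by the signed byte of `b` in
///         // the same position, then add adjacent products with saturation
///         let lo = a[2 * i] as i32 * b[2 * i] as i32;
///         let hi = a[2 * i + 1] as i32 * b[2 * i + 1] as i32;
///         r[i] = (lo + hi).max(-32768).min(32767) as i16;
///     }
///     r
/// }
/// ```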
#[inline]
#[target_feature(enable = "ssse3")]
#[cfg_attr(test, assert_instr(pmaddubsw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_maddubs_epi16(a: __m128i, b: __m128i) -> __m128i {
    mem::transmute(pmaddubsw128(a.as_u8x16(), b.as_i8x16()))
}

/// Multiply packed 16-bit signed integer values, truncate the 32-bit
/// product to the 18 most significant bits by right-shifting, round the
/// truncated value by adding 1, and write bits `[16:1]` to the destination.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mulhrs_epi16)
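///
/// Equivalently, each result lane is `(a * b + 0x4000) >> 15` computed in
/// 32-bit arithmetic. A scalar sketch (illustrative only):
///
/// ```
/// fn mm_mulhrs_epi16(a: [i16; 8], b: [i16; 8]) -> [i16; 8] {
///     let mut r = [0i16; 8];
///     for i in 0..8 {
///         // 32-bit product, shifted right by 14, rounded, then bits [16:1]
///         let tmp = (a[i] as i32 * b[i] as i32) >> 14;
///         r[i] = ((tmp + 1) >> 1) as i16;
///     }
///     r
/// }
/// ```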
#[inline]
#[target_feature(enable = "ssse3")]
#[cfg_attr(test, assert_instr(pmulhrsw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_mulhrs_epi16(a: __m128i, b: __m128i) -> __m128i {
    mem::transmute(pmulhrsw128(a.as_i16x8(), b.as_i16x8()))
}

/// Negate packed 8-bit integers in `a` when the corresponding signed 8-bit
/// integer in `b` is negative, and return the results.
/// Elements in the result are zeroed out when the corresponding element in
/// `b` is zero.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_epi8)
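///
/// A scalar sketch of the operation (illustrative only; negation wraps, so
/// `-128` stays `-128`):
///
/// ```
/// fn mm_sign_epi8(a: [i8; 16], b: [i8; 16]) -> [i8; 16] {
///     let mut r = [0i8; 16];
///     for i in 0..16 {
///         r[i] = if b[i] < 0 {
///             a[i].wrapping_neg()
///         } else if b[i] > 0 {
///             a[i]
///         } else {
///             0
///         };
///     }
///     r
/// }
/// ```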
#[inline]
#[target_feature(enable = "ssse3")]
#[cfg_attr(test, assert_instr(psignb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_sign_epi8(a: __m128i, b: __m128i) -> __m128i {
    mem::transmute(psignb128(a.as_i8x16(), b.as_i8x16()))
}

/// Negate packed 16-bit integers in `a` when the corresponding signed 16-bit
/// integer in `b` is negative, and return the results.
/// Elements in the result are zeroed out when the corresponding element in
/// `b` is zero.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_epi16)
#[inline]
#[target_feature(enable = "ssse3")]
#[cfg_attr(test, assert_instr(psignw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_sign_epi16(a: __m128i, b: __m128i) -> __m128i {
    mem::transmute(psignw128(a.as_i16x8(), b.as_i16x8()))
}

/// Negate packed 32-bit integers in `a` when the corresponding signed 32-bit
/// integer in `b` is negative, and return the results.
/// Elements in the result are zeroed out when the corresponding element in
/// `b` is zero.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_epi32)
#[inline]
#[target_feature(enable = "ssse3")]
#[cfg_attr(test, assert_instr(psignd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_sign_epi32(a: __m128i, b: __m128i) -> __m128i {
    mem::transmute(psignd128(a.as_i32x4(), b.as_i32x4()))
}

/// Compute the absolute value of packed 8-bit integers in `a` and
/// return the unsigned results.
#[inline]
#[target_feature(enable = "ssse3,mmx")]
#[cfg_attr(test, assert_instr(pabsb))]
pub unsafe fn _mm_abs_pi8(a: __m64) -> __m64 {
    pabsb(a)
}

/// Compute the absolute value of packed 16-bit integers in `a`, and return
/// the unsigned results.
#[inline]
#[target_feature(enable = "ssse3,mmx")]
#[cfg_attr(test, assert_instr(pabsw))]
pub unsafe fn _mm_abs_pi16(a: __m64) -> __m64 {
    pabsw(a)
}

/// Compute the absolute value of packed 32-bit integers in `a`, and return
/// the unsigned results.
#[inline]
#[target_feature(enable = "ssse3,mmx")]
#[cfg_attr(test, assert_instr(pabsd))]
pub unsafe fn _mm_abs_pi32(a: __m64) -> __m64 {
    pabsd(a)
}

/// Shuffle packed 8-bit integers in `a` according to the shuffle control mask
/// in the corresponding 8-bit element of `b`, and return the results.
#[inline]
#[target_feature(enable = "ssse3,mmx")]
#[cfg_attr(test, assert_instr(pshufb))]
pub unsafe fn _mm_shuffle_pi8(a: __m64, b: __m64) -> __m64 {
    pshufb(a, b)
}

/// Concatenates the two 64-bit integer vector operands, and right-shifts
/// the result by the number of bytes specified in the immediate operand.
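///
/// Picturing `a` and `b` as `[u8; 8]`, this is roughly equivalent to the
/// following scalar sketch (illustrative only, not the actual
/// implementation):
///
/// ```
/// fn mm_alignr_pi8(a: [u8; 8], b: [u8; 8], n: usize) -> [u8; 8] {
///     // concatenate with `b` in the low half and `a` in the high half
///     let mut concat = [0u8; 16];
///     concat[..8].copy_from_slice(&b);
///     concat[8..].copy_from_slice(&a);
///     // take 8 bytes starting at offset `n`, shifting in zeroes past the end
///     let mut r = [0u8; 8];
///     for i in 0..8 {
///         if i + n < 16 {
///             r[i] = concat[i + n];
///         }
///     }
///     r
/// }
/// ```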
#[inline]
#[target_feature(enable = "ssse3,mmx")]
#[cfg_attr(test, assert_instr(palignr, n = 15))]
#[rustc_args_required_const(2)]
pub unsafe fn _mm_alignr_pi8(a: __m64, b: __m64, n: i32) -> __m64 {
    macro_rules! call {
        ($imm8:expr) => {
            palignrb(a, b, $imm8)
        };
    }
    constify_imm8!(n, call)
}

/// Horizontally add the adjacent pairs of values contained in 2 packed
/// 64-bit vectors of `[4 x i16]`.
#[inline]
#[target_feature(enable = "ssse3,mmx")]
#[cfg_attr(test, assert_instr(phaddw))]
pub unsafe fn _mm_hadd_pi16(a: __m64, b: __m64) -> __m64 {
    phaddw(a, b)
}

/// Horizontally add the adjacent pairs of values contained in 2 packed
/// 64-bit vectors of `[2 x i32]`.
#[inline]
#[target_feature(enable = "ssse3,mmx")]
#[cfg_attr(test, assert_instr(phaddd))]
pub unsafe fn _mm_hadd_pi32(a: __m64, b: __m64) -> __m64 {
    phaddd(a, b)
}

/// Horizontally add the adjacent pairs of values contained in 2 packed
/// 64-bit vectors of `[4 x i16]`. Positive sums greater than 7FFFh are
/// saturated to 7FFFh. Negative sums less than 8000h are saturated to 8000h.
#[inline]
#[target_feature(enable = "ssse3,mmx")]
#[cfg_attr(test, assert_instr(phaddsw))]
pub unsafe fn _mm_hadds_pi16(a: __m64, b: __m64) -> __m64 {
    phaddsw(a, b)
}

/// Horizontally subtracts the adjacent pairs of values contained in 2
/// packed 64-bit vectors of `[4 x i16]`.
#[inline]
#[target_feature(enable = "ssse3,mmx")]
#[cfg_attr(test, assert_instr(phsubw))]
pub unsafe fn _mm_hsub_pi16(a: __m64, b: __m64) -> __m64 {
    phsubw(a, b)
}

/// Horizontally subtracts the adjacent pairs of values contained in 2
/// packed 64-bit vectors of `[2 x i32]`.
#[inline]
#[target_feature(enable = "ssse3,mmx")]
#[cfg_attr(test, assert_instr(phsubd))]
pub unsafe fn _mm_hsub_pi32(a: __m64, b: __m64) -> __m64 {
    phsubd(a, b)
}

/// Horizontally subtracts the adjacent pairs of values contained in 2
/// packed 64-bit vectors of `[4 x i16]`. Positive differences greater than
/// 7FFFh are saturated to 7FFFh. Negative differences less than 8000h are
/// saturated to 8000h.
#[inline]
#[target_feature(enable = "ssse3,mmx")]
#[cfg_attr(test, assert_instr(phsubsw))]
pub unsafe fn _mm_hsubs_pi16(a: __m64, b: __m64) -> __m64 {
    phsubsw(a, b)
}

/// Multiplies corresponding pairs of packed 8-bit unsigned integer
/// values contained in the first source operand and packed 8-bit signed
/// integer values contained in the second source operand, adds pairs of
/// contiguous products with signed saturation, and writes the 16-bit sums to
/// the corresponding bits in the destination.
#[inline]
#[target_feature(enable = "ssse3,mmx")]
#[cfg_attr(test, assert_instr(pmaddubsw))]
pub unsafe fn _mm_maddubs_pi16(a: __m64, b: __m64) -> __m64 {
    pmaddubsw(a, b)
}

/// Multiplies packed 16-bit signed integer values, truncates the 32-bit
/// products to the 18 most significant bits by right-shifting, rounds the
/// truncated value by adding 1, and writes bits `[16:1]` to the destination.
#[inline]
#[target_feature(enable = "ssse3,mmx")]
#[cfg_attr(test, assert_instr(pmulhrsw))]
pub unsafe fn _mm_mulhrs_pi16(a: __m64, b: __m64) -> __m64 {
    pmulhrsw(a, b)
}

/// Negate packed 8-bit integers in `a` when the corresponding signed 8-bit
/// integer in `b` is negative, and return the results.
/// Elements in the result are zeroed out when the corresponding element in
/// `b` is zero.
#[inline]
#[target_feature(enable = "ssse3,mmx")]
#[cfg_attr(test, assert_instr(psignb))]
pub unsafe fn _mm_sign_pi8(a: __m64, b: __m64) -> __m64 {
    psignb(a, b)
}

/// Negate packed 16-bit integers in `a` when the corresponding signed 16-bit
/// integer in `b` is negative, and return the results.
/// Elements in the result are zeroed out when the corresponding element in
/// `b` is zero.
#[inline]
#[target_feature(enable = "ssse3,mmx")]
#[cfg_attr(test, assert_instr(psignw))]
pub unsafe fn _mm_sign_pi16(a: __m64, b: __m64) -> __m64 {
    psignw(a, b)
}

/// Negate packed 32-bit integers in `a` when the corresponding signed 32-bit
/// integer in `b` is negative, and return the results.
/// Elements in the result are zeroed out when the corresponding element in
/// `b` is zero.
#[inline]
#[target_feature(enable = "ssse3,mmx")]
#[cfg_attr(test, assert_instr(psignd))]
pub unsafe fn _mm_sign_pi32(a: __m64, b: __m64) -> __m64 {
    psignd(a, b)
}

#[allow(improper_ctypes)]
extern "C" {
    #[link_name = "llvm.x86.ssse3.pabs.b.128"]
    fn pabsb128(a: i8x16) -> u8x16;

    #[link_name = "llvm.x86.ssse3.pabs.w.128"]
    fn pabsw128(a: i16x8) -> u16x8;

    #[link_name = "llvm.x86.ssse3.pabs.d.128"]
    fn pabsd128(a: i32x4) -> u32x4;

    #[link_name = "llvm.x86.ssse3.pshuf.b.128"]
    fn pshufb128(a: u8x16, b: u8x16) -> u8x16;

    #[link_name = "llvm.x86.ssse3.phadd.w.128"]
    fn phaddw128(a: i16x8, b: i16x8) -> i16x8;

    #[link_name = "llvm.x86.ssse3.phadd.sw.128"]
    fn phaddsw128(a: i16x8, b: i16x8) -> i16x8;

    #[link_name = "llvm.x86.ssse3.phadd.d.128"]
    fn phaddd128(a: i32x4, b: i32x4) -> i32x4;

    #[link_name = "llvm.x86.ssse3.phsub.w.128"]
    fn phsubw128(a: i16x8, b: i16x8) -> i16x8;

    #[link_name = "llvm.x86.ssse3.phsub.sw.128"]
    fn phsubsw128(a: i16x8, b: i16x8) -> i16x8;

    #[link_name = "llvm.x86.ssse3.phsub.d.128"]
    fn phsubd128(a: i32x4, b: i32x4) -> i32x4;

    #[link_name = "llvm.x86.ssse3.pmadd.ub.sw.128"]
    fn pmaddubsw128(a: u8x16, b: i8x16) -> i16x8;

    #[link_name = "llvm.x86.ssse3.pmul.hr.sw.128"]
    fn pmulhrsw128(a: i16x8, b: i16x8) -> i16x8;

    #[link_name = "llvm.x86.ssse3.psign.b.128"]
    fn psignb128(a: i8x16, b: i8x16) -> i8x16;

    #[link_name = "llvm.x86.ssse3.psign.w.128"]
    fn psignw128(a: i16x8, b: i16x8) -> i16x8;

    #[link_name = "llvm.x86.ssse3.psign.d.128"]
    fn psignd128(a: i32x4, b: i32x4) -> i32x4;

    #[link_name = "llvm.x86.ssse3.pabs.b"]
    fn pabsb(a: __m64) -> __m64;

    #[link_name = "llvm.x86.ssse3.pabs.w"]
    fn pabsw(a: __m64) -> __m64;

    #[link_name = "llvm.x86.ssse3.pabs.d"]
    fn pabsd(a: __m64) -> __m64;

    #[link_name = "llvm.x86.ssse3.pshuf.b"]
    fn pshufb(a: __m64, b: __m64) -> __m64;

    #[link_name = "llvm.x86.mmx.palignr.b"]
    fn palignrb(a: __m64, b: __m64, n: u8) -> __m64;

    #[link_name = "llvm.x86.ssse3.phadd.w"]
    fn phaddw(a: __m64, b: __m64) -> __m64;

    #[link_name = "llvm.x86.ssse3.phadd.d"]
    fn phaddd(a: __m64, b: __m64) -> __m64;

    #[link_name = "llvm.x86.ssse3.phadd.sw"]
    fn phaddsw(a: __m64, b: __m64) -> __m64;

    #[link_name = "llvm.x86.ssse3.phsub.w"]
    fn phsubw(a: __m64, b: __m64) -> __m64;

    #[link_name = "llvm.x86.ssse3.phsub.d"]
    fn phsubd(a: __m64, b: __m64) -> __m64;

    #[link_name = "llvm.x86.ssse3.phsub.sw"]
    fn phsubsw(a: __m64, b: __m64) -> __m64;

    #[link_name = "llvm.x86.ssse3.pmadd.ub.sw"]
    fn pmaddubsw(a: __m64, b: __m64) -> __m64;

    #[link_name = "llvm.x86.ssse3.pmul.hr.sw"]
    fn pmulhrsw(a: __m64, b: __m64) -> __m64;

    #[link_name = "llvm.x86.ssse3.psign.b"]
    fn psignb(a: __m64, b: __m64) -> __m64;

    #[link_name = "llvm.x86.ssse3.psign.w"]
    fn psignw(a: __m64, b: __m64) -> __m64;

    #[link_name = "llvm.x86.ssse3.psign.d"]
    fn psignd(a: __m64, b: __m64) -> __m64;
}

#[cfg(test)]
mod tests {
    use stdsimd_test::simd_test;

    use coresimd::x86::*;

    #[simd_test(enable = "ssse3")]
    unsafe fn test_mm_abs_epi8() {
        let r = _mm_abs_epi8(_mm_set1_epi8(-5));
        assert_eq_m128i(r, _mm_set1_epi8(5));
    }

    #[simd_test(enable = "ssse3")]
    unsafe fn test_mm_abs_epi16() {
        let r = _mm_abs_epi16(_mm_set1_epi16(-5));
        assert_eq_m128i(r, _mm_set1_epi16(5));
    }

    #[simd_test(enable = "ssse3")]
    unsafe fn test_mm_abs_epi32() {
        let r = _mm_abs_epi32(_mm_set1_epi32(-5));
        assert_eq_m128i(r, _mm_set1_epi32(5));
    }

    #[simd_test(enable = "ssse3")]
    unsafe fn test_mm_shuffle_epi8() {
        #[cfg_attr(rustfmt, rustfmt_skip)]
        let a = _mm_setr_epi8(
            1, 2, 3, 4, 5, 6, 7, 8,
            9, 10, 11, 12, 13, 14, 15, 16,
        );
        #[cfg_attr(rustfmt, rustfmt_skip)]
        let b = _mm_setr_epi8(
            4, 128_u8 as i8, 4, 3,
            24, 12, 6, 19,
            12, 5, 5, 10,
            4, 1, 8, 0,
        );
        let expected =
            _mm_setr_epi8(5, 0, 5, 4, 9, 13, 7, 4, 13, 6, 6, 11, 5, 2, 9, 1);
        let r = _mm_shuffle_epi8(a, b);
        assert_eq_m128i(r, expected);
    }

    #[simd_test(enable = "ssse3")]
    unsafe fn test_mm_alignr_epi8() {
        #[cfg_attr(rustfmt, rustfmt_skip)]
        let a = _mm_setr_epi8(
            1, 2, 3, 4, 5, 6, 7, 8,
            9, 10, 11, 12, 13, 14, 15, 16,
        );
        #[cfg_attr(rustfmt, rustfmt_skip)]
        let b = _mm_setr_epi8(
            4, 63, 4, 3,
            24, 12, 6, 19,
            12, 5, 5, 10,
            4, 1, 8, 0,
        );
        let r = _mm_alignr_epi8(a, b, 33);
        assert_eq_m128i(r, _mm_set1_epi8(0));

        let r = _mm_alignr_epi8(a, b, 17);
        #[cfg_attr(rustfmt, rustfmt_skip)]
        let expected = _mm_setr_epi8(
            2, 3, 4, 5, 6, 7, 8, 9,
            10, 11, 12, 13, 14, 15, 16, 0,
        );
        assert_eq_m128i(r, expected);

        let r = _mm_alignr_epi8(a, b, 16);
        assert_eq_m128i(r, a);

        let r = _mm_alignr_epi8(a, b, 15);
        #[cfg_attr(rustfmt, rustfmt_skip)]
        let expected = _mm_setr_epi8(
            0, 1, 2, 3, 4, 5, 6, 7,
            8, 9, 10, 11, 12, 13, 14, 15,
        );
        assert_eq_m128i(r, expected);

        let r = _mm_alignr_epi8(a, b, 0);
        assert_eq_m128i(r, b);
    }

    #[simd_test(enable = "ssse3")]
    unsafe fn test_mm_hadd_epi16() {
        let a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
        let b = _mm_setr_epi16(4, 128, 4, 3, 24, 12, 6, 19);
        let expected = _mm_setr_epi16(3, 7, 11, 15, 132, 7, 36, 25);
        let r = _mm_hadd_epi16(a, b);
        assert_eq_m128i(r, expected);
    }

    #[simd_test(enable = "ssse3")]
    unsafe fn test_mm_hadds_epi16() {
        let a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
        let b = _mm_setr_epi16(4, 128, 4, 3, 32767, 1, -32768, -1);
        let expected = _mm_setr_epi16(3, 7, 11, 15, 132, 7, 32767, -32768);
        let r = _mm_hadds_epi16(a, b);
        assert_eq_m128i(r, expected);
    }

    #[simd_test(enable = "ssse3")]
    unsafe fn test_mm_hadd_epi32() {
        let a = _mm_setr_epi32(1, 2, 3, 4);
        let b = _mm_setr_epi32(4, 128, 4, 3);
        let expected = _mm_setr_epi32(3, 7, 132, 7);
        let r = _mm_hadd_epi32(a, b);
        assert_eq_m128i(r, expected);
    }

    #[simd_test(enable = "ssse3")]
    unsafe fn test_mm_hsub_epi16() {
        let a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
        let b = _mm_setr_epi16(4, 128, 4, 3, 24, 12, 6, 19);
        let expected = _mm_setr_epi16(-1, -1, -1, -1, -124, 1, 12, -13);
        let r = _mm_hsub_epi16(a, b);
        assert_eq_m128i(r, expected);
    }

    #[simd_test(enable = "ssse3")]
    unsafe fn test_mm_hsubs_epi16() {
        let a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
        let b = _mm_setr_epi16(4, 128, 4, 3, 32767, -1, -32768, 1);
        let expected = _mm_setr_epi16(-1, -1, -1, -1, -124, 1, 32767, -32768);
        let r = _mm_hsubs_epi16(a, b);
        assert_eq_m128i(r, expected);
    }

    #[simd_test(enable = "ssse3")]
    unsafe fn test_mm_hsub_epi32() {
        let a = _mm_setr_epi32(1, 2, 3, 4);
        let b = _mm_setr_epi32(4, 128, 4, 3);
        let expected = _mm_setr_epi32(-1, -1, -124, 1);
        let r = _mm_hsub_epi32(a, b);
        assert_eq_m128i(r, expected);
    }

    #[simd_test(enable = "ssse3")]
    unsafe fn test_mm_maddubs_epi16() {
        #[cfg_attr(rustfmt, rustfmt_skip)]
        let a = _mm_setr_epi8(
            1, 2, 3, 4, 5, 6, 7, 8,
            9, 10, 11, 12, 13, 14, 15, 16,
        );
        #[cfg_attr(rustfmt, rustfmt_skip)]
        let b = _mm_setr_epi8(
            4, 63, 4, 3,
            24, 12, 6, 19,
            12, 5, 5, 10,
            4, 1, 8, 0,
        );
        let expected = _mm_setr_epi16(130, 24, 192, 194, 158, 175, 66, 120);
        let r = _mm_maddubs_epi16(a, b);
        assert_eq_m128i(r, expected);
    }

    #[simd_test(enable = "ssse3")]
    unsafe fn test_mm_mulhrs_epi16() {
        let a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
        let b = _mm_setr_epi16(4, 128, 4, 3, 32767, -1, -32768, 1);
        let expected = _mm_setr_epi16(0, 0, 0, 0, 5, 0, -7, 0);
        let r = _mm_mulhrs_epi16(a, b);
        assert_eq_m128i(r, expected);
    }

    #[simd_test(enable = "ssse3")]
    unsafe fn test_mm_sign_epi8() {
        #[cfg_attr(rustfmt, rustfmt_skip)]
        let a = _mm_setr_epi8(
            1, 2, 3, 4, 5, 6, 7, 8,
            9, 10, 11, 12, 13, -14, -15, 16,
        );
        #[cfg_attr(rustfmt, rustfmt_skip)]
        let b = _mm_setr_epi8(
            4, 63, -4, 3, 24, 12, -6, -19,
            12, 5, -5, 10, 4, 1, -8, 0,
        );
        #[cfg_attr(rustfmt, rustfmt_skip)]
        let expected = _mm_setr_epi8(
            1, 2, -3, 4, 5, 6, -7, -8,
            9, 10, -11, 12, 13, -14, 15, 0,
        );
        let r = _mm_sign_epi8(a, b);
        assert_eq_m128i(r, expected);
    }

    #[simd_test(enable = "ssse3")]
    unsafe fn test_mm_sign_epi16() {
        let a = _mm_setr_epi16(1, 2, 3, 4, -5, -6, 7, 8);
        let b = _mm_setr_epi16(4, 128, 0, 3, 1, -1, -2, 1);
        let expected = _mm_setr_epi16(1, 2, 0, 4, -5, 6, -7, 8);
        let r = _mm_sign_epi16(a, b);
        assert_eq_m128i(r, expected);
    }

    #[simd_test(enable = "ssse3")]
    unsafe fn test_mm_sign_epi32() {
        let a = _mm_setr_epi32(-1, 2, 3, 4);
        let b = _mm_setr_epi32(1, -1, 1, 0);
        let expected = _mm_setr_epi32(-1, -2, 3, 0);
        let r = _mm_sign_epi32(a, b);
        assert_eq_m128i(r, expected);
    }

    #[simd_test(enable = "ssse3,mmx")]
    unsafe fn test_mm_abs_pi8() {
        let r = _mm_abs_pi8(_mm_set1_pi8(-5));
        assert_eq_m64(r, _mm_set1_pi8(5));
    }

    #[simd_test(enable = "ssse3,mmx")]
    unsafe fn test_mm_abs_pi16() {
        let r = _mm_abs_pi16(_mm_set1_pi16(-5));
        assert_eq_m64(r, _mm_set1_pi16(5));
    }

    #[simd_test(enable = "ssse3,mmx")]
    unsafe fn test_mm_abs_pi32() {
        let r = _mm_abs_pi32(_mm_set1_pi32(-5));
        assert_eq_m64(r, _mm_set1_pi32(5));
    }

    #[simd_test(enable = "ssse3,mmx")]
    unsafe fn test_mm_shuffle_pi8() {
        let a = _mm_setr_pi8(1, 2, 3, 4, 5, 6, 7, 8);
        let b = _mm_setr_pi8(4, 128u8 as i8, 4, 3, 24, 12, 6, 19);
        let expected = _mm_setr_pi8(5, 0, 5, 4, 1, 5, 7, 4);
        let r = _mm_shuffle_pi8(a, b);
        assert_eq_m64(r, expected);
    }

    #[simd_test(enable = "ssse3,mmx")]
    unsafe fn test_mm_alignr_pi8() {
        let a = _mm_setr_pi32(0x89ABCDEF_u32 as i32, 0x01234567_u32 as i32);
        let b = _mm_setr_pi32(0xBBAA9988_u32 as i32, 0xFFDDEECC_u32 as i32);
        let r = _mm_alignr_pi8(a, b, 4);
        assert_eq_m64(r, ::std::mem::transmute(0x89abcdefffddeecc_u64));
    }

    #[simd_test(enable = "ssse3,mmx")]
    unsafe fn test_mm_hadd_pi16() {
        let a = _mm_setr_pi16(1, 2, 3, 4);
        let b = _mm_setr_pi16(4, 128, 4, 3);
        let expected = _mm_setr_pi16(3, 7, 132, 7);
        let r = _mm_hadd_pi16(a, b);
        assert_eq_m64(r, expected);
    }

    #[simd_test(enable = "ssse3,mmx")]
    unsafe fn test_mm_hadd_pi32() {
        let a = _mm_setr_pi32(1, 2);
        let b = _mm_setr_pi32(4, 128);
        let expected = _mm_setr_pi32(3, 132);
        let r = _mm_hadd_pi32(a, b);
        assert_eq_m64(r, expected);
    }

    #[simd_test(enable = "ssse3,mmx")]
    unsafe fn test_mm_hadds_pi16() {
        let a = _mm_setr_pi16(1, 2, 3, 4);
        let b = _mm_setr_pi16(32767, 1, -32768, -1);
        let expected = _mm_setr_pi16(3, 7, 32767, -32768);
        let r = _mm_hadds_pi16(a, b);
        assert_eq_m64(r, expected);
    }

    #[simd_test(enable = "ssse3,mmx")]
    unsafe fn test_mm_hsub_pi16() {
        let a = _mm_setr_pi16(1, 2, 3, 4);
        let b = _mm_setr_pi16(4, 128, 4, 3);
        let expected = _mm_setr_pi16(-1, -1, -124, 1);
        let r = _mm_hsub_pi16(a, b);
        assert_eq_m64(r, expected);
    }

    #[simd_test(enable = "ssse3,mmx")]
    unsafe fn test_mm_hsub_pi32() {
        let a = _mm_setr_pi32(1, 2);
        let b = _mm_setr_pi32(4, 128);
        let expected = _mm_setr_pi32(-1, -124);
        let r = _mm_hsub_pi32(a, b);
        assert_eq_m64(r, expected);
    }

    #[simd_test(enable = "ssse3,mmx")]
    unsafe fn test_mm_hsubs_pi16() {
        let a = _mm_setr_pi16(1, 2, 3, 4);
        let b = _mm_setr_pi16(4, 128, 4, 3);
        let expected = _mm_setr_pi16(-1, -1, -124, 1);
        let r = _mm_hsubs_pi16(a, b);
        assert_eq_m64(r, expected);
    }

    #[simd_test(enable = "ssse3,mmx")]
    unsafe fn test_mm_maddubs_pi16() {
        let a = _mm_setr_pi8(1, 2, 3, 4, 5, 6, 7, 8);
        let b = _mm_setr_pi8(4, 63, 4, 3, 24, 12, 6, 19);
        let expected = _mm_setr_pi16(130, 24, 192, 194);
        let r = _mm_maddubs_pi16(a, b);
        assert_eq_m64(r, expected);
    }

    #[simd_test(enable = "ssse3,mmx")]
    unsafe fn test_mm_mulhrs_pi16() {
        let a = _mm_setr_pi16(1, 2, 3, 4);
        let b = _mm_setr_pi16(4, 32767, -1, -32768);
        let expected = _mm_setr_pi16(0, 2, 0, -4);
        let r = _mm_mulhrs_pi16(a, b);
        assert_eq_m64(r, expected);
    }

    #[simd_test(enable = "ssse3,mmx")]
    unsafe fn test_mm_sign_pi8() {
        let a = _mm_setr_pi8(1, 2, 3, 4, -5, -6, 7, 8);
        let b = _mm_setr_pi8(4, 64, 0, 3, 1, -1, -2, 1);
        let expected = _mm_setr_pi8(1, 2, 0, 4, -5, 6, -7, 8);
        let r = _mm_sign_pi8(a, b);
        assert_eq_m64(r, expected);
    }

    #[simd_test(enable = "ssse3,mmx")]
    unsafe fn test_mm_sign_pi16() {
        let a = _mm_setr_pi16(-1, 2, 3, 4);
        let b = _mm_setr_pi16(1, -1, 1, 0);
        let expected = _mm_setr_pi16(-1, -2, 3, 0);
        let r = _mm_sign_pi16(a, b);
        assert_eq_m64(r, expected);
    }

    #[simd_test(enable = "ssse3,mmx")]
    unsafe fn test_mm_sign_pi32() {
        let a = _mm_setr_pi32(-1, 2);
        let b = _mm_setr_pi32(1, 0);
        let expected = _mm_setr_pi32(-1, 0);
        let r = _mm_sign_pi32(a, b);
        assert_eq_m64(r, expected);
    }
}