From e975e22c20e583cddf4d4d042f30bc7c96e15ab1 Mon Sep 17 00:00:00 2001
From: Andrew Gallant
Date: Mon, 19 Jun 2017 17:03:29 -0400
Subject: [PATCH] refactor

---
 library/stdarch/QUESTIONS.md                  |   5 +
 library/stdarch/TODO.md                       |   2 +-
 library/stdarch/examples/play.rs              |  11 +-
 library/stdarch/src/lib.rs                    |  21 ++-
 library/stdarch/src/macros.rs                 |  16 +-
 library/stdarch/src/{simd.rs => simd_llvm.rs} |   0
 library/stdarch/src/v128.rs                   |   2 +-
 library/stdarch/src/v256.rs                   |   2 +-
 library/stdarch/src/v512.rs                   | 152 ++++++++++++++++++
 library/stdarch/src/v64.rs                    |  12 +-
 library/stdarch/src/x86/avx2.rs               | 104 ++++++------
 library/stdarch/src/x86/sse2.rs               |   2 +-
 library/stdarch/src/x86/sse41.rs              |   1 -
 library/stdarch/src/x86/sse42.rs              |   1 -
 14 files changed, 256 insertions(+), 75 deletions(-)
 rename library/stdarch/src/{simd.rs => simd_llvm.rs} (100%)
 create mode 100644 library/stdarch/src/v512.rs

diff --git a/library/stdarch/QUESTIONS.md b/library/stdarch/QUESTIONS.md
index 7a76d0d66320..3c6492556ca1 100644
--- a/library/stdarch/QUESTIONS.md
+++ b/library/stdarch/QUESTIONS.md
@@ -24,3 +24,8 @@ type is probably `i64x2`.
 What is the deal with the ucomi f64 comparison functions in SSE2? Clang's
 headers indicate some specific behavior with NaN, but I can't seem to
 reproduce it. Intel's official docs are very vague.
+
+---
+
+`_mm256_blendv_pd` takes a mask parameter with type `f64x4`, but the
+documentation seems to indicate that it is a bit vector. What's going on?

diff --git a/library/stdarch/TODO.md b/library/stdarch/TODO.md
index 21fec4831060..3c2417277bff 100644
--- a/library/stdarch/TODO.md
+++ b/library/stdarch/TODO.md
@@ -456,7 +456,7 @@ sse4.1
 * [ ] `_mm_blend_ps`
 * [ ] `_mm_blendv_pd`
 * [ ] `_mm_blendv_ps`
-* [ ] `_mm_blendv_epi8`
+* [x] `_mm_blendv_epi8`
 * [ ] `_mm_blend_epi16`
 * [ ] `_mm_dp_pd`
 * [ ] `_mm_dp_ps`

diff --git a/library/stdarch/examples/play.rs b/library/stdarch/examples/play.rs
index bc4297c2d79f..9bcdde0de9df 100644
--- a/library/stdarch/examples/play.rs
+++ b/library/stdarch/examples/play.rs
@@ -3,7 +3,8 @@
 extern crate stdsimd;
 
 use std::env;
-use stdsimd as s;
+use stdsimd::simd as s;
+use stdsimd::vendor;
 
 #[inline(never)]
 #[target_feature = "+sse4.2"]
@@ -14,15 +15,15 @@ fn index(needle: &str, haystack: &str) -> usize {
     let mut needle = needle.to_string().into_bytes();
     needle.resize(16, 0);
-    let vneedle = s::__m128i::from(s::u8x16::load(&needle, 0));
+    let vneedle = vendor::__m128i::from(s::u8x16::load(&needle, 0));
 
     let mut haystack = haystack.to_string().into_bytes();
     haystack.resize(16, 0);
-    let vhaystack = s::__m128i::from(s::u8x16::load(&haystack, 0));
+    let vhaystack = vendor::__m128i::from(s::u8x16::load(&haystack, 0));
 
-    s::_mm_cmpestri(
+    vendor::_mm_cmpestri(
         vneedle, needle_len as i32, vhaystack, hay_len as i32,
-        s::_SIDD_CMP_EQUAL_ORDERED) as usize
+        vendor::_SIDD_CMP_EQUAL_ORDERED) as usize
 }
 
 fn main() {

diff --git a/library/stdarch/src/lib.rs b/library/stdarch/src/lib.rs
index a3c770d5492c..1aa713742844 100644
--- a/library/stdarch/src/lib.rs
+++ b/library/stdarch/src/lib.rs
@@ -4,17 +4,26 @@
     target_feature,
 )]
 
-pub use v128::*;
-pub use v256::*;
-pub use v64::*;
-#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
-pub use x86::*;
+/// Platform-independent SIMD vector types and operations.
+pub mod simd {
+    pub use v128::*;
+    pub use v256::*;
+    pub use v512::*;
+    pub use v64::*;
+}
+
+/// Platform-dependent vendor intrinsics.
+pub mod vendor {
+    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+    pub use x86::*;
+}
 
 #[macro_use]
 mod macros;
-mod simd;
+mod simd_llvm;
 mod v128;
 mod v256;
+mod v512;
 mod v64;
 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
 mod x86;
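
The lib.rs change above makes the portability boundary explicit: portable
vector types now come from `stdsimd::simd`, while `stdsimd::vendor` carries
the platform-specific intrinsics, as the play.rs hunk already demonstrates.
A minimal sketch of the resulting import pattern (hypothetical downstream
code; the `u8x16::load` signature and the `From` conversion are assumed from
their use in play.rs, not spelled out by this patch):

    extern crate stdsimd;

    use stdsimd::simd::u8x16; // portable vector types (v64/v128/v256/v512)
    use stdsimd::vendor;      // platform-dependent intrinsics

    fn to_vendor(bytes: &[u8]) -> vendor::__m128i {
        // Portable code stays in `simd` types and crosses into `vendor`
        // types only at the intrinsic call boundary.
        vendor::__m128i::from(u8x16::load(bytes, 0))
    }
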
diff --git a/library/stdarch/src/macros.rs b/library/stdarch/src/macros.rs
index aadb986052fd..339736e8694f 100644
--- a/library/stdarch/src/macros.rs
+++ b/library/stdarch/src/macros.rs
@@ -7,6 +7,16 @@ macro_rules! define_ty {
     }
 }
 
+macro_rules! define_ty_doc {
+    ($name:ident, $($elty:ident),+ | $(#[$doc:meta])*) => {
+        $(#[$doc])*
+        #[repr(simd)]
+        #[derive(Clone, Copy, Debug, PartialEq)]
+        #[allow(non_camel_case_types)]
+        pub struct $name($($elty),*);
+    }
+}
+
 macro_rules! define_impl {
     (
         $name:ident, $elemty:ident, $nelems:expr, $boolname:ident,
@@ -246,11 +256,11 @@ macro_rules! define_integer_ops {
 }
 
 macro_rules! define_casts {
-    ($(($ty:ident, $floatty:ident, $floatcast:ident)),+) => {
+    ($(($fromty:ident, $toty:ident, $cast:ident)),+) => {
         $(
-            impl $ty {
+            impl $fromty {
                 #[inline]
-                pub fn $floatcast(self) -> ::$floatty {
+                pub fn $cast(self) -> ::simd::$toty {
                     unsafe { simd_cast(self) }
                 }
             }
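
The new `define_ty_doc!` differs from `define_ty!` only in threading doc
attributes through to the generated struct, and the `define_casts!` rename
generalizes it from float-only casts to arbitrary lane-wise casts. For
orientation, an invocation of the doc variant (taken from the v64.rs hunk
below) and a hand-written approximation of its expansion:

    // Invocation:
    define_ty_doc! {
        f32x2, f32, f32 |
        /// A 64-bit vector with 2 `f32` lanes.
    }

    // Approximate expansion:
    /// A 64-bit vector with 2 `f32` lanes.
    #[repr(simd)]
    #[derive(Clone, Copy, Debug, PartialEq)]
    #[allow(non_camel_case_types)]
    pub struct f32x2(f32, f32);
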
diff --git a/library/stdarch/src/simd.rs b/library/stdarch/src/simd_llvm.rs
similarity index 100%
rename from library/stdarch/src/simd.rs
rename to library/stdarch/src/simd_llvm.rs
diff --git a/library/stdarch/src/v128.rs b/library/stdarch/src/v128.rs
index 4a8bf358da6a..922f983a59ad 100644
--- a/library/stdarch/src/v128.rs
+++ b/library/stdarch/src/v128.rs
@@ -1,4 +1,4 @@
-use simd::*;
+use simd_llvm::*;
 
 define_ty! { f64x2, f64, f64 }
 define_impl! { f64x2, f64, 2, i64x2, x0, x1 }
diff --git a/library/stdarch/src/v256.rs b/library/stdarch/src/v256.rs
index ba09e70ebdee..a5e163e45319 100644
--- a/library/stdarch/src/v256.rs
+++ b/library/stdarch/src/v256.rs
@@ -1,4 +1,4 @@
-use simd::*;
+use simd_llvm::*;
 
 define_ty! { f64x4, f64, f64, f64, f64 }
 define_impl! { f64x4, f64, 4, i64x4, x0, x1, x2, x3 }
diff --git a/library/stdarch/src/v512.rs b/library/stdarch/src/v512.rs
new file mode 100644
index 000000000000..5b1afee639b9
--- /dev/null
+++ b/library/stdarch/src/v512.rs
@@ -0,0 +1,152 @@
+use simd_llvm::*;
+
+define_ty! { f64x8, f64, f64, f64, f64, f64, f64, f64, f64 }
+define_impl! { f64x8, f64, 8, i64x8, x0, x1, x2, x3, x4, x5, x6, x7 }
+
+define_ty! {
+    f32x16,
+    f32, f32, f32, f32, f32, f32, f32, f32,
+    f32, f32, f32, f32, f32, f32, f32, f32
+}
+define_impl! {
+    f32x16, f32, 16, i32x16,
+    x0, x1, x2, x3, x4, x5, x6, x7,
+    x8, x9, x10, x11, x12, x13, x14, x15
+}
+
+define_ty! { u64x8, u64, u64, u64, u64, u64, u64, u64, u64 }
+define_impl! { u64x8, u64, 8, i64x8, x0, x1, x2, x3, x4, x5, x6, x7 }
+
+define_ty! { i64x8, i64, i64, i64, i64, i64, i64, i64, i64 }
+define_impl! { i64x8, i64, 8, i64x8, x0, x1, x2, x3, x4, x5, x6, x7 }
+
+define_ty! {
+    u32x16,
+    u32, u32, u32, u32, u32, u32, u32, u32,
+    u32, u32, u32, u32, u32, u32, u32, u32
+}
+define_impl! {
+    u32x16, u32, 16, i32x16,
+    x0, x1, x2, x3, x4, x5, x6, x7,
+    x8, x9, x10, x11, x12, x13, x14, x15
+}
+
+define_ty! {
+    i32x16,
+    i32, i32, i32, i32, i32, i32, i32, i32,
+    i32, i32, i32, i32, i32, i32, i32, i32
+}
+define_impl! {
+    i32x16, i32, 16, i32x16,
+    x0, x1, x2, x3, x4, x5, x6, x7,
+    x8, x9, x10, x11, x12, x13, x14, x15
+}
+
+define_ty! {
+    u16x32,
+    u16, u16, u16, u16, u16, u16, u16, u16,
+    u16, u16, u16, u16, u16, u16, u16, u16,
+    u16, u16, u16, u16, u16, u16, u16, u16,
+    u16, u16, u16, u16, u16, u16, u16, u16
+}
+define_impl! {
+    u16x32, u16, 32, i16x32,
+    x0, x1, x2, x3, x4, x5, x6, x7,
+    x8, x9, x10, x11, x12, x13, x14, x15,
+    x16, x17, x18, x19, x20, x21, x22, x23,
+    x24, x25, x26, x27, x28, x29, x30, x31
+}
+
+define_ty! {
+    i16x32,
+    i16, i16, i16, i16, i16, i16, i16, i16,
+    i16, i16, i16, i16, i16, i16, i16, i16,
+    i16, i16, i16, i16, i16, i16, i16, i16,
+    i16, i16, i16, i16, i16, i16, i16, i16
+}
+define_impl! {
+    i16x32, i16, 32, i16x32,
+    x0, x1, x2, x3, x4, x5, x6, x7,
+    x8, x9, x10, x11, x12, x13, x14, x15,
+    x16, x17, x18, x19, x20, x21, x22, x23,
+    x24, x25, x26, x27, x28, x29, x30, x31
+}
+
+define_ty! {
+    u8x64,
+    u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8,
+    u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8,
+    u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8,
+    u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8
+}
+define_impl! {
+    u8x64, u8, 64, i8x64,
+    x0, x1, x2, x3, x4, x5, x6, x7,
+    x8, x9, x10, x11, x12, x13, x14, x15,
+    x16, x17, x18, x19, x20, x21, x22, x23,
+    x24, x25, x26, x27, x28, x29, x30, x31,
+    x32, x33, x34, x35, x36, x37, x38, x39,
+    x40, x41, x42, x43, x44, x45, x46, x47,
+    x48, x49, x50, x51, x52, x53, x54, x55,
+    x56, x57, x58, x59, x60, x61, x62, x63
+}
+
+define_ty! {
+    i8x64,
+    i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8,
+    i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8,
+    i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8,
+    i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8
+}
+define_impl! {
+    i8x64, i8, 64, i8x64,
+    x0, x1, x2, x3, x4, x5, x6, x7,
+    x8, x9, x10, x11, x12, x13, x14, x15,
+    x16, x17, x18, x19, x20, x21, x22, x23,
+    x24, x25, x26, x27, x28, x29, x30, x31,
+    x32, x33, x34, x35, x36, x37, x38, x39,
+    x40, x41, x42, x43, x44, x45, x46, x47,
+    x48, x49, x50, x51, x52, x53, x54, x55,
+    x56, x57, x58, x59, x60, x61, x62, x63
+}
+
+define_from!(u64x8, i64x8, u32x16, i32x16, u16x32, i16x32, u8x64, i8x64);
+define_from!(i64x8, u64x8, u32x16, i32x16, u16x32, i16x32, u8x64, i8x64);
+define_from!(u32x16, u64x8, i64x8, i32x16, u16x32, i16x32, u8x64, i8x64);
+define_from!(i32x16, u64x8, i64x8, u32x16, u16x32, i16x32, u8x64, i8x64);
+define_from!(u16x32, u64x8, i64x8, u32x16, i32x16, i16x32, u8x64, i8x64);
+define_from!(i16x32, u64x8, i64x8, u32x16, i32x16, u16x32, u8x64, i8x64);
+define_from!(u8x64, u64x8, i64x8, u32x16, i32x16, u16x32, i16x32, i8x64);
+define_from!(i8x64, u64x8, i64x8, u32x16, i32x16, u16x32, i16x32, u8x64);
+
+define_common_ops!(
+    f64x8, f32x16, u64x8, i64x8, u32x16, i32x16, u16x32, i16x32, u8x64, i8x64);
+define_float_ops!(f64x8, f32x16);
+define_integer_ops!(
+    (u64x8, u64),
+    (i64x8, i64),
+    (u32x16, u32),
+    (i32x16, i32),
+    (u16x32, u16),
+    (i16x32, i16),
+    (u8x64, u8),
+    (i8x64, i8));
+define_casts!(
+    (f64x8, f32x8, as_f32x8),
+    (f64x8, u64x8, as_u64x8),
+    (f64x8, i64x8, as_i64x8),
+    (f32x16, u32x16, as_u32x16),
+    (f32x16, i32x16, as_i32x16),
+    (u64x8, f64x8, as_f64x8),
+    (u64x8, i64x8, as_i64x8),
+    (i64x8, f64x8, as_f64x8),
+    (i64x8, u64x8, as_u64x8),
+    (u32x16, f32x16, as_f32x16),
+    (u32x16, i32x16, as_i32x16),
+    (i32x16, f32x16, as_f32x16),
+    (i32x16, u32x16, as_u32x16),
+    (u16x32, i16x32, as_i16x32),
+    (i16x32, u16x32, as_u16x32),
+    (u8x64, i8x64, as_i8x64),
+    (i8x64, u8x64, as_u8x64));
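
Each `define_casts!` entry generates an inherent method that lowers to a
lane-wise `simd_cast`. A sketch of the intended semantics for one of the new
512-bit entries (my own illustration, not part of the patch; it assumes a
`splat` constructor like the one the avx2 tests below use, and that
float-to-int conversion truncates toward zero as LLVM's does):

    fn cast_demo() {
        let x = f64x8::splat(1.75);
        // `as_i64x8` converts each of the 8 `f64` lanes to an `i64`.
        let y: i64x8 = x.as_i64x8();
        assert_eq!(y, i64x8::splat(1));
    }
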
diff --git a/library/stdarch/src/v64.rs b/library/stdarch/src/v64.rs
index de2c14c356a8..ee856b23b06b 100644
--- a/library/stdarch/src/v64.rs
+++ b/library/stdarch/src/v64.rs
@@ -1,9 +1,15 @@
-use simd::*;
+use simd_llvm::*;
 
-define_ty! { f32x2, f32, f32 }
+define_ty_doc! {
+    f32x2, f32, f32 |
+    /// A 64-bit vector with 2 `f32` lanes.
+}
 define_impl! { f32x2, f32, 2, i32x2, x0, x1 }
 
-define_ty! { u32x2, u32, u32 }
+define_ty_doc! {
+    u32x2, u32, u32 |
+    /// A 64-bit vector with 2 `u32` lanes.
+}
 define_impl! { u32x2, u32, 2, i32x2, x0, x1 }
 
 define_ty! { i32x2, i32, i32 }
diff --git a/library/stdarch/src/x86/avx2.rs b/library/stdarch/src/x86/avx2.rs
index 7cf413e97109..e07f26a67dec 100644
--- a/library/stdarch/src/x86/avx2.rs
+++ b/library/stdarch/src/x86/avx2.rs
@@ -423,12 +423,12 @@ pub fn _mm256_movemask_epi8(a: i8x32) -> i32 {
     unsafe { pmovmskb(a) }
 }
 
-/// Compute the sum of absolute differences (SADs) of quadruplets of unsigned
-/// 8-bit integers in `a` compared to those in `b`, and store the 16-bit
+/// Compute the sum of absolute differences (SADs) of quadruplets of unsigned
+/// 8-bit integers in `a` compared to those in `b`, and store the 16-bit
 /// results in dst. Eight SADs are performed for each 128-bit lane using one
-/// quadruplet from `b` and eight quadruplets from `a`. One quadruplet is
-/// selected from `b` starting at on the offset specified in `imm8`. Eight
-/// quadruplets are formed from sequential 8-bit integers selected from `a`
+/// quadruplet from `b` and eight quadruplets from `a`. One quadruplet is
+/// selected from `b` starting at the offset specified in `imm8`. Eight
+/// quadruplets are formed from sequential 8-bit integers selected from `a`
 /// starting at the offset specified in `imm8`.
 #[inline(always)]
 #[target_feature = "+avx2"]
@@ -438,9 +438,9 @@ pub fn _mm256_mpsadbw_epu8(a: u8x32, b: u8x32, imm8: i32) -> u16x16 {
 ***/
 
-/// Multiply the low 32-bit integers from each packed 64-bit element in
+/// Multiply the low 32-bit integers from each packed 64-bit element in
 /// `a` and `b`
-///
+///
 /// Return the 64-bit results.
 #[inline(always)]
 #[target_feature = "+avx2"]
@@ -448,7 +448,7 @@ pub fn _mm256_mul_epi32(a: i32x8, b: i32x8) -> i64x4 {
     unsafe { pmuldq(a, b) }
 }
 
-/// Multiply the low unsigned 32-bit integers from each packed 64-bit
+/// Multiply the low unsigned 32-bit integers from each packed 64-bit
 /// element in `a` and `b`
 ///
 /// Return the unsigned 64-bit results.
@@ -458,7 +458,7 @@ pub fn _mm256_mul_epu32(a: u32x8, b: u32x8) -> u64x4 {
     unsafe { pmuludq(a, b) }
 }
 
-/// Multiply the packed 16-bit integers in `a` and `b`, producing
+/// Multiply the packed 16-bit integers in `a` and `b`, producing
 /// intermediate 32-bit integers and returning the high 16 bits of the
 /// intermediate integers.
 #[inline(always)]
@@ -476,7 +476,7 @@ pub fn _mm256_mulhi_epu16(a: u16x16, b: u16x16) -> u16x16 {
     unsafe { pmulhuw(a, b) }
 }
 
-/// Multiply the packed 16-bit integers in `a` and `b`, producing
+/// Multiply the packed 16-bit integers in `a` and `b`, producing
 /// intermediate 32-bit integers, and return the low 16 bits of the
 /// intermediate integers
 #[inline(always)]
@@ -486,7 +486,7 @@ pub fn _mm256_mullo_epi16(a: i16x16, b:i16x16) -> i16x16 {
 }
 
 
-/// Multiply the packed 32-bit integers in `a` and `b`, producing
+/// Multiply the packed 32-bit integers in `a` and `b`, producing
 /// intermediate 64-bit integers, and return the low 32 bits of the
 /// intermediate integers
 #[inline(always)]
@@ -495,7 +495,7 @@ pub fn _mm256_mullo_epi32(a: i32x8, b:i32x8) -> i32x8 {
     a * b
 }
 
-/// Multiply packed 16-bit integers in `a` and `b`, producing
+/// Multiply packed 16-bit integers in `a` and `b`, producing
 /// intermediate signed 32-bit integers. Truncate each intermediate
 /// integer to the 18 most significant bits, round by adding 1, and
 /// return bits [16:1]
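
The `_mm256_mulhrs_epi16` wording above ("truncate to the 18 most significant
bits, round by adding 1, and return bits [16:1]") is easier to see as scalar
arithmetic. A per-lane model (my own sketch, not part of the patch):

    // Per-lane model of pmulhrsw: multiply two i16s into a 32-bit product,
    // keep the top 18 bits (>> 14), round by adding 1, then drop bit 0,
    // which returns bits [16:1] of the rounded intermediate.
    fn mulhrs_lane(a: i16, b: i16) -> i16 {
        let wide = (a as i32) * (b as i32); // 32-bit intermediate
        (((wide >> 14) + 1) >> 1) as i16
    }

    // e.g. 0x4000 encodes 0.5 in Q15 fixed point: 0.5 * 0.5 == 0.25.
    // assert_eq!(mulhrs_lane(0x4000, 0x4000), 0x2000);
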
@@ -505,7 +505,7 @@ pub fn _mm256_mulhrs_epi16(a: i16x16, b:i16x16) -> i16x16 {
     unsafe { pmulhrsw(a, b) }
 }
 
-/// Compute the bitwise OR of 256 bits (representing integer data) in `a`
+/// Compute the bitwise OR of 256 bits (representing integer data) in `a`
 /// and `b`
 #[inline(always)]
 #[target_feature = "+avx2"]
@@ -513,7 +513,7 @@ pub fn _mm256_or_si256(a: __m256i, b: __m256i) -> __m256i {
     a | b
 }
 
-/// Convert packed 16-bit integers from `a` and `b` to packed 8-bit integers
+/// Convert packed 16-bit integers from `a` and `b` to packed 8-bit integers
 /// using signed saturation
 #[inline(always)]
 #[target_feature = "+avx2"]
@@ -521,7 +521,7 @@ pub fn _mm256_packs_epi16(a: i16x16, b: i16x16) -> i8x32 {
     unsafe { packsswb(a, b) }
 }
 
-/// Convert packed 32-bit integers from `a` and `b` to packed 16-bit integers
+/// Convert packed 32-bit integers from `a` and `b` to packed 16-bit integers
 /// using signed saturation
 #[inline(always)]
 #[target_feature = "+avx2"]
@@ -552,8 +552,8 @@ pub fn _mm256_packus_epi32(a: i32x8, b: i32x8) -> u16x16 {
 // TODO _mm256_permutevar8x32_ps (__m256 a, __m256i idx)
 
 /// Compute the absolute differences of packed unsigned 8-bit integers in `a`
-/// and `b`, then horizontally sum each consecutive 8 differences to
-/// produce four unsigned 16-bit integers, and pack these unsigned 16-bit
+/// and `b`, then horizontally sum each consecutive 8 differences to
+/// produce four unsigned 16-bit integers, and pack these unsigned 16-bit
 /// integers in the low 16 bits of the 64-bit return value
 #[inline(always)]
 #[target_feature = "+avx2"]
@@ -593,7 +593,7 @@ pub fn _mm256_sign_epi8(a: i8x32, b: i8x32) -> i8x32 {
     unsafe { psignb(a, b) }
 }
 
-/// Shift packed 16-bit integers in `a` left by `count` while
+/// Shift packed 16-bit integers in `a` left by `count` while
 /// shifting in zeros, and return the result
 #[inline(always)]
 #[target_feature = "+avx2"]
@@ -601,7 +601,7 @@ pub fn _mm256_sll_epi16(a: i16x16, count: i16x8) -> i16x16 {
     unsafe { psllw(a, count) }
 }
 
-/// Shift packed 32-bit integers in `a` left by `count` while
+/// Shift packed 32-bit integers in `a` left by `count` while
 /// shifting in zeros, and return the result
 #[inline(always)]
 #[target_feature = "+avx2"]
@@ -609,7 +609,7 @@ pub fn _mm256_sll_epi32(a: i32x8, count: i32x4) -> i32x8 {
     unsafe { pslld(a, count) }
 }
 
-/// Shift packed 64-bit integers in `a` left by `count` while
+/// Shift packed 64-bit integers in `a` left by `count` while
 /// shifting in zeros, and return the result
 #[inline(always)]
 #[target_feature = "+avx2"]
@@ -622,7 +622,7 @@ pub fn _mm256_sll_epi64(a: i64x4, count: i64x2) -> i64x4 {
 /// Shift packed 16-bit integers in `a` left by `imm8` while
 /// shifting in zeros, and return the result
 #[inline(always)]
 #[target_feature = "+avx2"]
 pub fn _mm256_slli_epi16(a: i16x16, imm8: i32) -> i16x16 {
-    unsafe { pslliw(a, imm8) }
+    unsafe { pslliw(a, imm8) }
 }
@@ -630,7 +630,7 @@ pub fn _mm256_slli_epi16(a: i16x16, imm8: i32) -> i16x16 {
 /// Shift packed 32-bit integers in `a` left by `imm8` while
 /// shifting in zeros, and return the result
 #[inline(always)]
 #[target_feature = "+avx2"]
 pub fn _mm256_slli_epi32(a: i32x8, imm8: i32) -> i32x8 {
-    unsafe { psllid(a, imm8) }
+    unsafe { psllid(a, imm8) }
 }
@@ -638,7 +638,7 @@ pub fn _mm256_slli_epi32(a: i32x8, imm8: i32) -> i32x8 {
 /// Shift packed 64-bit integers in `a` left by `imm8` while
 /// shifting in zeros, and return the result
 #[inline(always)]
 #[target_feature = "+avx2"]
 pub fn _mm256_slli_epi64(a: i64x4, imm8: i32) -> i64x4 {
-    unsafe { pslliq(a, imm8) }
+    unsafe { pslliq(a, imm8) }
 }
 
 // TODO _mm256_slli_si256 (__m256i a, const int imm8)
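
The three `_mm256_slli_*` functions above shift every lane left by an
immediate, shifting in zeros; the patch's own test suite pins the semantics
down, e.g. (excerpted from the `_mm256_slli_epi16` test near the end of this
diff):

    let r = avx2::_mm256_slli_epi16(i16x16::splat(0xFF), 4);
    assert_eq!(r, i16x16::splat(0xFF0)); // 0xFF << 4 == 0xFF0 in every lane
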
@@ -695,7 +695,7 @@ pub fn _mm256_sra_epi32(a: i32x8, count: i32x4) -> i32x8 {
     unsafe { psrad(a, count) }
 }
 
-/// Shift packed 16-bit integers in `a` right by `imm8` while
+/// Shift packed 16-bit integers in `a` right by `imm8` while
 /// shifting in sign bits.
 #[inline(always)]
 #[target_feature = "+avx2"]
@@ -703,7 +703,7 @@ pub fn _mm256_srai_epi16(a: i16x16, imm8: i32) -> i16x16 {
     unsafe { psraiw(a, imm8) }
 }
 
-/// Shift packed 32-bit integers in `a` right by `imm8` while
+/// Shift packed 32-bit integers in `a` right by `imm8` while
 /// shifting in sign bits.
 #[inline(always)]
 #[target_feature = "+avx2"]
@@ -733,7 +733,7 @@ pub fn _mm256_srav_epi32(a: i32x8, count: i32x8) -> i32x8 {
 #[inline(always)]
 #[target_feature = "+avx2"]
 pub fn _mm256_srl_epi16(a: i16x16, count: i16x8) -> i16x16 {
-    unsafe { psrlw(a, count) }
+    unsafe { psrlw(a, count) }
 }
 
 /// Shift packed 32-bit integers in `a` right by `count` while shifting in
@@ -741,7 +741,7 @@ pub fn _mm256_srl_epi16(a: i16x16, count: i16x8) -> i16x16 {
 #[inline(always)]
 #[target_feature = "+avx2"]
 pub fn _mm256_srl_epi32(a: i32x8, count: i32x4) -> i32x8 {
-    unsafe { psrld(a, count) }
+    unsafe { psrld(a, count) }
 }
 
 /// Shift packed 64-bit integers in `a` right by `count` while shifting in
@@ -749,10 +749,10 @@ pub fn _mm256_srl_epi32(a: i32x8, count: i32x4) -> i32x8 {
 #[inline(always)]
 #[target_feature = "+avx2"]
 pub fn _mm256_srl_epi64(a: i64x4, count: i64x2) -> i64x4 {
-    unsafe { psrlq(a, count) }
+    unsafe { psrlq(a, count) }
 }
 
-/// Shift packed 16-bit integers in `a` right by `imm8` while shifting in
+/// Shift packed 16-bit integers in `a` right by `imm8` while shifting in
 /// zeros
 #[inline(always)]
 #[target_feature = "+avx2"]
@@ -760,7 +760,7 @@ pub fn _mm256_srli_epi16(a: i16x16, imm8: i32) -> i16x16 {
     unsafe { psrliw(a, imm8) }
 }
 
-/// Shift packed 32-bit integers in `a` right by `imm8` while shifting in
+/// Shift packed 32-bit integers in `a` right by `imm8` while shifting in
 /// zeros
 #[inline(always)]
 #[target_feature = "+avx2"]
@@ -768,7 +768,7 @@ pub fn _mm256_srli_epi32(a: i32x8, imm8: i32) -> i32x8 {
     unsafe { psrlid(a, imm8) }
 }
 
-/// Shift packed 64-bit integers in `a` right by `imm8` while shifting in
+/// Shift packed 64-bit integers in `a` right by `imm8` while shifting in
 /// zeros
 #[inline(always)]
 #[target_feature = "+avx2"]
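
The naming convention in this block distinguishes arithmetic right shifts
(`sra*`, which shift in copies of the sign bit) from logical ones (`srl*`,
which shift in zeros). A scalar analogue for a single 32-bit lane (my own
illustration, not part of the patch):

    let x: i32 = -16;                          // 0xFFFF_FFF0
    assert_eq!(x >> 2, -4);                    // sra: sign bits shifted in
    assert_eq!((x as u32) >> 2, 0x3FFF_FFFC);  // srl: zeros shifted in
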
"llvm.x86.avx2.psrlv.q"] fn psrlvq(a: i64x2, count: i64x2) -> i64x2; #[link_name = "llvm.x86.avx2.psrlv.q.256"] - fn psrlvq256(a: i64x4, count: i64x4) -> i64x4; + fn psrlvq256(a: i64x4, count: i64x4) -> i64x4; #[link_name = "llvm.x86.avx2.psubs.b"] fn psubsb(a: i8x32, b: i8x32) -> i8x32; #[link_name = "llvm.x86.avx2.psubs.w"] @@ -1580,15 +1580,15 @@ mod tests { } -/** +/** // TODO this fails in debug but not release, why? #[test] #[target_feature ="+avx2"] fn _mm256_movemask_epi8() { - let a = i8x32::splat(-1); + let a = i8x32::splat(-1); let r = avx2::_mm256_movemask_epi8(a); let e : i32 = -1; - assert_eq!(r, e); + assert_eq!(r, e); } // TODO This fails in debug but not in release, whhhy? @@ -1604,7 +1604,7 @@ mod tests { **/ #[test] - #[target_feature = "+avx2"] + #[target_feature = "+avx2"] fn _mm256_mul_epi32() { let a = i32x8::new(0, 0, 0, 0, 2, 2, 2, 2); let b = i32x8::new(1, 2, 3, 4, 5, 6, 7, 8); @@ -1693,7 +1693,7 @@ mod tests { 4, 4, 4, 4, 4, 4, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 4, 4, 4, 4, 4); - + assert_eq!(r, e); } @@ -1708,7 +1708,7 @@ mod tests { 4, 4, 4, 4, 2, 2, 2, 2, 4, 4, 4, 4); - + assert_eq!(r, e); } @@ -1723,7 +1723,7 @@ mod tests { 4, 4, 4, 4, 4, 4, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 4, 4, 4, 4, 4); - + assert_eq!(r, e); } @@ -1738,7 +1738,7 @@ mod tests { 4, 4, 4, 4, 2, 2, 2, 2, 4, 4, 4, 4); - + assert_eq!(r, e); } @@ -1756,7 +1756,7 @@ mod tests { #[target_feature = "+avx2"] fn _mm256_sign_epi16() { let a = i16x16::splat(2); - let b = i16x16::splat(-1); + let b = i16x16::splat(-1); let r = avx2::_mm256_sign_epi16(a, b); let e = i16x16::splat(-2); assert_eq!(r, e); @@ -1766,7 +1766,7 @@ mod tests { #[target_feature = "+avx2"] fn _mm256_sign_epi32() { let a = i32x8::splat(2); - let b = i32x8::splat(-1); + let b = i32x8::splat(-1); let r = avx2::_mm256_sign_epi32(a, b); let e = i32x8::splat(-2); assert_eq!(r, e); @@ -1776,7 +1776,7 @@ mod tests { #[target_feature = "+avx2"] fn _mm256_sign_epi8() { let a = i8x32::splat(2); - let b = i8x32::splat(-1); + let b = i8x32::splat(-1); let r = avx2::_mm256_sign_epi8(a, b); let e = i8x32::splat(-2); assert_eq!(r, e); @@ -1816,7 +1816,7 @@ mod tests { avx2::_mm256_slli_epi16(i16x16::splat(0xFF), 4), i16x16::splat(0xFF0)); } - + #[test] #[target_feature = "+avx2"] fn _mm256_slli_epi32() { @@ -1840,7 +1840,7 @@ mod tests { let b = i32x4::splat(1); let r = avx2::_mm_sllv_epi32(a, b); let e = i32x4::splat(4); - assert_eq!(r, e); + assert_eq!(r, e); } #[test] @@ -1850,7 +1850,7 @@ mod tests { let b = i32x8::splat(1); let r = avx2::_mm256_sllv_epi32(a, b); let e = i32x8::splat(4); - assert_eq!(r, e); + assert_eq!(r, e); } #[test] #[target_feature = "+avx2"] @@ -1859,7 +1859,7 @@ mod tests { let b = i64x2::splat(1); let r = avx2::_mm_sllv_epi64(a, b); let e = i64x2::splat(4); - assert_eq!(r, e); + assert_eq!(r, e); } #[test] #[target_feature = "+avx2"] @@ -1868,7 +1868,7 @@ mod tests { let b = i64x4::splat(1); let r = avx2::_mm256_sllv_epi64(a, b); let e = i64x4::splat(4); - assert_eq!(r, e); + assert_eq!(r, e); } #[test] @@ -2097,5 +2097,5 @@ mod tests { __m256i::splat(6)); } - + } diff --git a/library/stdarch/src/x86/sse2.rs b/library/stdarch/src/x86/sse2.rs index 1ecaf9ea43aa..b564677ea786 100644 --- a/library/stdarch/src/x86/sse2.rs +++ b/library/stdarch/src/x86/sse2.rs @@ -2,7 +2,7 @@ use std::mem; use std::os::raw::c_void; use std::ptr; -use simd::{ +use simd_llvm::{ simd_cast, simd_shuffle2, simd_shuffle4, simd_shuffle8, simd_shuffle16, }; use x86::__m128i; diff --git a/library/stdarch/src/x86/sse41.rs 
b/library/stdarch/src/x86/sse41.rs
index 2f4a8a307dd2..61217f26f596 100644
--- a/library/stdarch/src/x86/sse41.rs
+++ b/library/stdarch/src/x86/sse41.rs
@@ -1,4 +1,3 @@
-// use v128::*;
 use x86::__m128i;
 
 #[inline(always)]
diff --git a/library/stdarch/src/x86/sse42.rs b/library/stdarch/src/x86/sse42.rs
index a40aae722548..4789fd32d537 100644
--- a/library/stdarch/src/x86/sse42.rs
+++ b/library/stdarch/src/x86/sse42.rs
@@ -1,4 +1,3 @@
-// use v128::*;
 use x86::__m128i;
 
 pub const _SIDD_UBYTE_OPS: i8 = 0b00000000;