This commit is contained in:
Andrew Gallant 2017-06-19 17:03:29 -04:00
parent 1450e641a5
commit e975e22c20
GPG Key ID: B2E3A4923F8B0D44
14 changed files with 256 additions and 75 deletions

View File

@ -24,3 +24,8 @@ type is probably `i64x2`.
What is the deal with the ucomi f64 comparison functions in SSE2? Clang's
headers indicate some specific behavior with NaN, but I can't seem to reproduce
it. Intel's official docs are very vague.
---
`_mm256_blendv_pd` takes a mask parameter with type `f64x4`, but the
documentation seems to indicate that it is a bit vector. What's going on?
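Intel's pseudocode appears to consult only the most significant bit of each 64-bit mask lane, which would explain the `f64x4` type: a lane whose sign bit is set (any negative value, including -0.0) selects from `b`. A scalar sketch of that reading, using a hypothetical helper that is not part of the crate:
fn blendv_pd_model(a: [f64; 4], b: [f64; 4], mask: [f64; 4]) -> [f64; 4] {
    let mut r = [0.0f64; 4];
    for i in 0..4 {
        // only bit 63 of the mask lane matters
        r[i] = if mask[i].to_bits() >> 63 == 1 { b[i] } else { a[i] };
    }
    r
}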

View File

@ -456,7 +456,7 @@ sse4.1
* [ ] `_mm_blend_ps`
* [ ] `_mm_blendv_pd`
* [ ] `_mm_blendv_ps`
* [ ] `_mm_blendv_epi8`
* [x] `_mm_blendv_epi8`
* [ ] `_mm_blend_epi16`
* [ ] `_mm_dp_pd`
* [ ] `_mm_dp_ps`

View File

@ -3,7 +3,8 @@
extern crate stdsimd;
use std::env;
use stdsimd as s;
use stdsimd::simd as s;
use stdsimd::vendor;
#[inline(never)]
#[target_feature = "+sse4.2"]
@ -14,15 +15,15 @@ fn index(needle: &str, haystack: &str) -> usize {
let mut needle = needle.to_string().into_bytes();
needle.resize(16, 0);
let vneedle = s::__m128i::from(s::u8x16::load(&needle, 0));
let vneedle = vendor::__m128i::from(s::u8x16::load(&needle, 0));
let mut haystack = haystack.to_string().into_bytes();
haystack.resize(16, 0);
let vhaystack = s::__m128i::from(s::u8x16::load(&haystack, 0));
let vhaystack = vendor::__m128i::from(s::u8x16::load(&haystack, 0));
s::_mm_cmpestri(
vendor::_mm_cmpestri(
vneedle, needle_len as i32, vhaystack, hay_len as i32,
s::_SIDD_CMP_EQUAL_ORDERED) as usize
vendor::_SIDD_CMP_EQUAL_ORDERED) as usize
}
fn main() {

View File

@ -4,17 +4,26 @@
target_feature,
)]
pub use v128::*;
pub use v256::*;
pub use v64::*;
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
pub use x86::*;
/// Platform independent SIMD vector types and operations.
pub mod simd {
pub use v128::*;
pub use v256::*;
pub use v512::*;
pub use v64::*;
}
/// Platform dependent vendor intrinsics.
pub mod vendor {
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
pub use x86::*;
}
#[macro_use]
mod macros;
mod simd;
mod simd_llvm;
mod v128;
mod v256;
mod v512;
mod v64;
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
mod x86;
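A minimal sketch of how a caller addresses the reorganized crate, assuming `u8x16`, `splat`, the lane-wise operators, and the `__m128i` conversion keep the behavior they have elsewhere in this commit:
extern crate stdsimd;
use stdsimd::simd::u8x16;     // portable vector types now live under `simd`
use stdsimd::vendor::__m128i; // vendor-specific types and intrinsics under `vendor`
fn main() {
    let a = u8x16::splat(1);    // constructors come from define_impl!
    let b = a + a;              // lane-wise arithmetic from define_common_ops!
    let raw = __m128i::from(b); // reinterpret for use with vendor intrinsics
    let _ = raw;
}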

View File

@ -7,6 +7,16 @@ macro_rules! define_ty {
}
}
macro_rules! define_ty_doc {
($name:ident, $($elty:ident),+ | $(#[$doc:meta])*) => {
$(#[$doc])*
#[repr(simd)]
#[derive(Clone, Copy, Debug, PartialEq)]
#[allow(non_camel_case_types)]
pub struct $name($($elty),*);
}
}
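For reference, the invocation `define_ty_doc! { f32x2, f32, f32 | ... }` shown later in this commit expands to roughly:
/// A 64-bit vector with 2 `f32` lanes.
#[repr(simd)]
#[derive(Clone, Copy, Debug, PartialEq)]
#[allow(non_camel_case_types)]
pub struct f32x2(f32, f32);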
macro_rules! define_impl {
(
$name:ident, $elemty:ident, $nelems:expr, $boolname:ident,
@ -246,11 +256,11 @@ macro_rules! define_integer_ops {
}
macro_rules! define_casts {
($(($ty:ident, $floatty:ident, $floatcast:ident)),+) => {
($(($fromty:ident, $toty:ident, $cast:ident)),+) => {
$(
impl $ty {
impl $fromty {
#[inline]
pub fn $floatcast(self) -> ::$floatty {
pub fn $cast(self) -> ::simd::$toty {
unsafe { simd_cast(self) }
}
}
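With the generalized parameters, an entry such as `(u8x64, i8x64, as_i8x64)` from the new v512.rs below expands to roughly:
impl u8x64 {
    #[inline]
    pub fn as_i8x64(self) -> ::simd::i8x64 {
        unsafe { simd_cast(self) }
    }
}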

View File

@ -1,4 +1,4 @@
use simd::*;
use simd_llvm::*;
define_ty! { f64x2, f64, f64 }
define_impl! { f64x2, f64, 2, i64x2, x0, x1 }

View File

@ -1,4 +1,4 @@
use simd::*;
use simd_llvm::*;
define_ty! { f64x4, f64, f64, f64, f64 }
define_impl! { f64x4, f64, 4, i64x4, x0, x1, x2, x3 }

library/stdarch/src/v512.rs (new file, 152 lines)
View File

@ -0,0 +1,152 @@
use simd_llvm::*;
define_ty! { f64x8, f64, f64, f64, f64, f64, f64, f64, f64 }
define_impl! { f64x8, f64, 8, i64x8, x0, x1, x2, x3, x4, x5, x6, x7 }
define_ty! {
f32x16,
f32, f32, f32, f32, f32, f32, f32, f32,
f32, f32, f32, f32, f32, f32, f32, f32
}
define_impl! {
f32x16, f32, 16, i32x16,
x0, x1, x2, x3, x4, x5, x6, x7,
x8, x9, x10, x11, x12, x13, x14, x15
}
define_ty! { u64x8, u64, u64, u64, u64, u64, u64, u64, u64 }
define_impl! { u64x8, u64, 8, i64x8, x0, x1, x2, x3, x4, x5, x6, x7 }
define_ty! { i64x8, i64, i64, i64, i64, i64, i64, i64, i64 }
define_impl! { i64x8, i64, 8, i64x8, x0, x1, x2, x3, x4, x5, x6, x7 }
define_ty! {
u32x16,
u32, u32, u32, u32, u32, u32, u32, u32,
u32, u32, u32, u32, u32, u32, u32, u32
}
define_impl! {
u32x16, u32, 16, i32x16,
x0, x1, x2, x3, x4, x5, x6, x7,
x8, x9, x10, x11, x12, x13, x14, x15
}
define_ty! {
i32x16,
i32, i32, i32, i32, i32, i32, i32, i32,
i32, i32, i32, i32, i32, i32, i32, i32
}
define_impl! {
i32x16, i32, 16, i32x16,
x0, x1, x2, x3, x4, x5, x6, x7,
x8, x9, x10, x11, x12, x13, x14, x15
}
define_ty! {
u16x32,
u16, u16, u16, u16, u16, u16, u16, u16,
u16, u16, u16, u16, u16, u16, u16, u16,
u16, u16, u16, u16, u16, u16, u16, u16,
u16, u16, u16, u16, u16, u16, u16, u16
}
define_impl! {
u16x32, u16, 32, i16x32,
x0, x1, x2, x3, x4, x5, x6, x7,
x8, x9, x10, x11, x12, x13, x14, x15,
x16, x17, x18, x19, x20, x21, x22, x23,
x24, x25, x26, x27, x28, x29, x30, x31
}
define_ty! {
i16x32,
i16, i16, i16, i16, i16, i16, i16, i16,
i16, i16, i16, i16, i16, i16, i16, i16,
i16, i16, i16, i16, i16, i16, i16, i16,
i16, i16, i16, i16, i16, i16, i16, i16
}
define_impl! {
i16x32, i16, 32, i16x32,
x0, x1, x2, x3, x4, x5, x6, x7,
x8, x9, x10, x11, x12, x13, x14, x15,
x16, x17, x18, x19, x20, x21, x22, x23,
x24, x25, x26, x27, x28, x29, x30, x31
}
define_ty! {
u8x64,
u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8,
u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8,
u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8,
u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8
}
define_impl! {
u8x64, u8, 64, i8x64,
x0, x1, x2, x3, x4, x5, x6, x7,
x8, x9, x10, x11, x12, x13, x14, x15,
x16, x17, x18, x19, x20, x21, x22, x23,
x24, x25, x26, x27, x28, x29, x30, x31,
x32, x33, x34, x35, x36, x37, x38, x39,
x40, x41, x42, x43, x44, x45, x46, x47,
x48, x49, x50, x51, x52, x53, x54, x55,
x56, x57, x58, x59, x60, x61, x62, x63
}
define_ty! {
i8x64,
i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8,
i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8,
i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8,
i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8
}
define_impl! {
i8x64, i8, 64, i8x64,
x0, x1, x2, x3, x4, x5, x6, x7,
x8, x9, x10, x11, x12, x13, x14, x15,
x16, x17, x18, x19, x20, x21, x22, x23,
x24, x25, x26, x27, x28, x29, x30, x31,
x32, x33, x34, x35, x36, x37, x38, x39,
x40, x41, x42, x43, x44, x45, x46, x47,
x48, x49, x50, x51, x52, x53, x54, x55,
x56, x57, x58, x59, x60, x61, x62, x63
}
define_from!(u64x8, i64x8, u32x16, i32x16, u16x32, i16x32, u8x64, i8x64);
define_from!(i64x8, u64x8, u32x16, i32x16, u16x32, i16x32, u8x64, i8x64);
define_from!(u32x16, u64x8, i64x8, i32x16, u16x32, i16x32, u8x64, i8x64);
define_from!(i32x16, u64x8, i64x8, u32x16, u16x32, i16x32, u8x64, i8x64);
define_from!(u16x32, u64x8, i64x8, u32x16, i32x16, i16x32, u8x64, i8x64);
define_from!(i16x32, u64x8, i64x8, u32x16, i32x16, u16x32, u8x64, i8x64);
define_from!(u8x64, u64x8, i64x8, u32x16, i32x16, u16x32, i16x32, i8x64);
define_from!(i8x64, u64x8, i64x8, u32x16, i32x16, u16x32, i16x32, u8x64);
define_common_ops!(
f64x8, f32x16, u64x8, i64x8, u32x16, i32x16, u16x32, i16x32, u8x64, i8x64);
define_float_ops!(f64x8, f32x16);
define_integer_ops!(
(u64x8, u64),
(i64x8, i64),
(u32x16, u32),
(i32x16, i32),
(u16x32, u16),
(i16x32, i16),
(u8x64, u8),
(i8x64, i8));
define_casts!(
(f64x8, f32x8, as_f32x8),
(f64x8, u64x8, as_u64x8),
(f64x8, i64x8, as_i64x8),
(f32x16, u32x16, as_u32x16),
(f32x16, i32x16, as_i32x16),
(u64x8, f64x8, as_f64x8),
(u64x8, i64x8, as_i64x8),
(i64x8, f64x8, as_f64x8),
(i64x8, u64x8, as_u64x8),
(u32x16, f32x16, as_f32x16),
(u32x16, i32x16, as_i32x16),
(i32x16, f32x16, as_f32x16),
(i32x16, u32x16, as_u32x16),
(u16x32, i16x32, as_i16x32),
(i16x32, u16x32, as_u16x32),
(u8x64, i8x64, as_i8x64),
(i8x64, u8x64, as_u8x64));
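A short usage sketch for the new 512-bit types, assuming `splat` and the arithmetic operators behave as they do for the existing 64/128/256-bit widths:
use stdsimd::simd::{f64x8, i64x8};
fn doubled_as_ints(x: f64x8) -> i64x8 {
    let y = x + x; // lane-wise add from define_common_ops!
    y.as_i64x8()   // lane-wise value cast (simd_cast) from the define_casts! list above
}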

View File

@ -1,9 +1,15 @@
use simd::*;
use simd_llvm::*;
define_ty! { f32x2, f32, f32 }
define_ty_doc! {
f32x2, f32, f32 |
/// A 64-bit vector with 2 `f32` lanes.
}
define_impl! { f32x2, f32, 2, i32x2, x0, x1 }
define_ty! { u32x2, u32, u32 }
define_ty_doc! {
u32x2, u32, u32 |
/// A 64-bit vector with 2 `u32` lanes.
}
define_impl! { u32x2, u32, 2, i32x2, x0, x1 }
define_ty! { i32x2, i32, i32 }

View File

@ -423,12 +423,12 @@ pub fn _mm256_movemask_epi8(a: i8x32) -> i32 {
unsafe { pmovmskb(a) }
}
/// Compute the sum of absolute differences (SADs) of quadruplets of unsigned
/// 8-bit integers in `a` compared to those in `b`, and store the 16-bit
/// results in dst. Eight SADs are performed for each 128-bit lane using one
/// quadruplet from `b` and eight quadruplets from `a`. One quadruplet is
/// selected from `b` starting at the offset specified in `imm8`. Eight
/// quadruplets are formed from sequential 8-bit integers selected from `a`
/// starting at the offset specified in `imm8`.
#[inline(always)]
#[target_feature = "+avx2"]
@ -438,9 +438,9 @@ pub fn _mm256_mpsadbw_epu8(a: u8x32, b: u8x32, imm8: i32) -> u16x16 {
***/
/// Multiply the low 32-bit integers from each packed 64-bit element in
/// `a` and `b`
///
/// Return the 64-bit results.
#[inline(always)]
#[target_feature = "+avx2"]
@ -448,7 +448,7 @@ pub fn _mm256_mul_epi32(a: i32x8, b: i32x8) -> i64x4 {
unsafe { pmuldq(a, b) }
}
/// Multiply the low unsigned 32-bit integers from each packed 64-bit
/// element in `a` and `b`
///
/// Return the unsigned 64-bit results.
@ -458,7 +458,7 @@ pub fn _mm256_mul_epu32(a: u32x8, b: u32x8) -> u64x4 {
unsafe { pmuludq(a, b) }
}
/// Multiply the packed 16-bit integers in `a` and `b`, producing
/// intermediate 32-bit integers and returning the high 16 bits of the
/// intermediate integers.
#[inline(always)]
@ -476,7 +476,7 @@ pub fn _mm256_mulhi_epu16(a: u16x16, b: u16x16) -> u16x16 {
unsafe { pmulhuw(a, b) }
}
/// Multiply the packed 16-bit integers in `a` and `b`, producing
/// intermediate 32-bit integers, and return the low 16 bits of the
/// intermediate integers
#[inline(always)]
@ -486,7 +486,7 @@ pub fn _mm256_mullo_epi16(a: i16x16, b:i16x16) -> i16x16 {
}
/// Multiply the packed 32-bit integers in `a` and `b`, producing
/// intermediate 64-bit integers, and return the low 32 bits of the
/// intermediate integers
#[inline(always)]
@ -495,7 +495,7 @@ pub fn _mm256_mullo_epi32(a: i32x8, b:i32x8) -> i32x8 {
a * b
}
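Per lane this is simply a wrapping 32-bit multiply; a scalar model (illustrative only, not part of the crate):
fn mullo_epi32_lane(a: i32, b: i32) -> i32 {
    // keep the low 32 bits of the 64-bit product
    ((a as i64) * (b as i64)) as i32
}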
/// Multiply packed 16-bit integers in `a` and `b`, producing
/// intermediate signed 32-bit integers. Truncate each intermediate
/// integer to the 18 most significant bits, round by adding 1, and
/// return bits [16:1]
@ -505,7 +505,7 @@ pub fn _mm256_mulhrs_epi16(a: i16x16, b:i16x16) -> i16x16 {
unsafe { pmulhrsw(a, b) }
}
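A scalar model of the per-lane rounding described above (illustrative only, not part of the crate):
fn mulhrs_epi16_lane(a: i16, b: i16) -> i16 {
    let prod = (a as i32) * (b as i32); // 32-bit intermediate
    (((prod >> 14) + 1) >> 1) as i16    // keep the 18 MSBs, round by adding 1, take bits [16:1]
}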
/// Compute the bitwise OR of 256 bits (representing integer data) in `a`
/// and `b`
#[inline(always)]
#[target_feature = "+avx2"]
@ -513,7 +513,7 @@ pub fn _mm256_or_si256(a: __m256i, b: __m256i) -> __m256i {
a | b
}
/// Convert packed 16-bit integers from `a` and `b` to packed 8-bit integers
/// using signed saturation
#[inline(always)]
#[target_feature = "+avx2"]
@ -521,7 +521,7 @@ pub fn _mm256_packs_epi16(a: i16x16, b: i16x16) -> i8x32 {
unsafe { packsswb(a, b) }
}
/// Convert packed 32-bit integers from `a` and `b` to packed 16-bit integers
/// using signed saturation
#[inline(always)]
#[target_feature = "+avx2"]
@ -552,8 +552,8 @@ pub fn _mm256_packus_epi32(a: i32x8, b: i32x8) -> u16x16 {
// TODO _mm256_permutevar8x32_ps (__m256 a, __m256i idx)
/// Compute the absolute differences of packed unsigned 8-bit integers in `a`
/// and `b`, then horizontally sum each consecutive 8 differences to
/// produce four unsigned 16-bit integers, and pack these unsigned 16-bit
/// integers in the low 16 bits of the 64-bit return value
#[inline(always)]
#[target_feature = "+avx2"]
@ -593,7 +593,7 @@ pub fn _mm256_sign_epi8(a: i8x32, b: i8x32) -> i8x32 {
unsafe { psignb(a, b) }
}
/// Shift packed 16-bit integers in `a` left by `count` while
/// shifting in zeros, and return the result
#[inline(always)]
#[target_feature = "+avx2"]
@ -601,7 +601,7 @@ pub fn _mm256_sll_epi16(a: i16x16, count: i16x8) -> i16x16 {
unsafe { psllw(a, count) }
}
/// Shift packed 32-bit integers in `a` left by `count` while
/// shifting in zeros, and return the result
#[inline(always)]
#[target_feature = "+avx2"]
@ -609,7 +609,7 @@ pub fn _mm256_sll_epi32(a: i32x8, count: i32x4) -> i32x8 {
unsafe { pslld(a, count) }
}
/// Shift packed 64-bit integers in `a` left by `count` while
/// shifting in zeros, and return the result
#[inline(always)]
#[target_feature = "+avx2"]
@ -622,7 +622,7 @@ pub fn _mm256_sll_epi64(a: i64x4, count: i64x2) -> i64x4 {
#[inline(always)]
#[target_feature = "+avx2"]
pub fn _mm256_slli_epi16(a: i16x16, imm8: i32) -> i16x16 {
unsafe { pslliw(a, imm8) }
}
/// Shift packed 32-bit integers in `a` left by `imm8` while
@ -630,7 +630,7 @@ pub fn _mm256_slli_epi16(a: i16x16, imm8: i32) -> i16x16 {
#[inline(always)]
#[target_feature = "+avx2"]
pub fn _mm256_slli_epi32(a: i32x8, imm8: i32) -> i32x8 {
unsafe { psllid(a, imm8) }
}
/// Shift packed 64-bit integers in `a` left by `imm8` while
@ -638,7 +638,7 @@ pub fn _mm256_slli_epi32(a: i32x8, imm8: i32) -> i32x8 {
#[inline(always)]
#[target_feature = "+avx2"]
pub fn _mm256_slli_epi64(a: i64x4, imm8: i32) -> i64x4 {
unsafe { pslliq(a, imm8) }
}
// TODO _mm256_slli_si256 (__m256i a, const int imm8)
@ -695,7 +695,7 @@ pub fn _mm256_sra_epi32(a: i32x8, count: i32x4) -> i32x8 {
unsafe { psrad(a, count) }
}
/// Shift packed 16-bit integers in `a` right by `imm8` while
/// shifting in sign bits.
#[inline(always)]
#[target_feature = "+avx2"]
@ -703,7 +703,7 @@ pub fn _mm256_srai_epi16(a: i16x16, imm8: i32) -> i16x16 {
unsafe { psraiw(a, imm8) }
}
/// Shift packed 32-bit integers in `a` right by `imm8` while
/// shifting in sign bits.
#[inline(always)]
#[target_feature = "+avx2"]
@ -733,7 +733,7 @@ pub fn _mm256_srav_epi32(a: i32x8, count: i32x8) -> i32x8 {
#[inline(always)]
#[target_feature = "+avx2"]
pub fn _mm256_srl_epi16(a: i16x16, count: i16x8) -> i16x16 {
unsafe { psrlw(a, count) }
}
/// Shift packed 32-bit integers in `a` right by `count` while shifting in
@ -741,7 +741,7 @@ pub fn _mm256_srl_epi16(a: i16x16, count: i16x8) -> i16x16 {
#[inline(always)]
#[target_feature = "+avx2"]
pub fn _mm256_srl_epi32(a: i32x8, count: i32x4) -> i32x8 {
unsafe { psrld(a, count) }
}
/// Shift packed 64-bit integers in `a` right by `count` while shifting in
@ -749,10 +749,10 @@ pub fn _mm256_srl_epi32(a: i32x8, count: i32x4) -> i32x8 {
#[inline(always)]
#[target_feature = "+avx2"]
pub fn _mm256_srl_epi64(a: i64x4, count: i64x2) -> i64x4 {
unsafe { psrlq(a, count) }
}
/// Shift packed 16-bit integers in `a` right by `imm8` while shifting in
/// zeros
#[inline(always)]
#[target_feature = "+avx2"]
@ -760,7 +760,7 @@ pub fn _mm256_srli_epi16(a: i16x16, imm8: i32) -> i16x16 {
unsafe { psrliw(a, imm8) }
}
/// Shift packed 32-bit integers in `a` right by `imm8` while shifting in
/// zeros
#[inline(always)]
#[target_feature = "+avx2"]
@ -768,7 +768,7 @@ pub fn _mm256_srli_epi32(a: i32x8, imm8: i32) -> i32x8 {
unsafe { psrlid(a, imm8) }
}
/// Shift packed 64-bit integers in `a` right by `imm8` while shifting in
/// zeros
#[inline(always)]
#[target_feature = "+avx2"]
@ -879,7 +879,7 @@ pub fn _mm256_subs_epu8(a: u8x32, b: u8x32) -> u8x32 {
// TODO __m256i _mm256_unpacklo_epi64 (__m256i a, __m256i b)
// TODO __m256i _mm256_unpacklo_epi8 (__m256i a, __m256i b)
/// Compute the bitwise XOR of 256 bits (representing integer data)
/// in `a` and `b`
#[inline(always)]
#[target_feature = "+avx2"]
@ -903,7 +903,7 @@ extern "C" {
#[link_name = "llvm.x86.avx2.paddus.b"]
fn paddusb(a: u8x32, b: u8x32) -> u8x32;
#[link_name = "llvm.x86.avx2.paddus.w"]
fn paddusw(a: u16x16, b: u16x16) -> u16x16;
#[link_name = "llvm.x86.avx2.pavg.b"]
fn pavgb(a: u8x32, b: u8x32) -> u8x32;
#[link_name = "llvm.x86.avx2.pavg.w"]
@ -949,7 +949,7 @@ extern "C" {
#[link_name = "llvm.x86.avx2.pminu.d"]
fn pminud(a: u32x8, b: u32x8) -> u32x8;
#[link_name = "llvm.x86.avx2.pminu.b"]
fn pminub(a: u8x32, b: u8x32) -> u8x32;
#[link_name = "llvm.x86.avx2.pmovmskb"] //fails in debug
fn pmovmskb(a: i8x32) -> i32;
#[link_name = "llvm.x86.avx2.mpsadbw"] //fails in debug
@ -1031,7 +1031,7 @@ extern "C" {
#[link_name = "llvm.x86.avx2.psrlv.q"]
fn psrlvq(a: i64x2, count: i64x2) -> i64x2;
#[link_name = "llvm.x86.avx2.psrlv.q.256"]
fn psrlvq256(a: i64x4, count: i64x4) -> i64x4;
#[link_name = "llvm.x86.avx2.psubs.b"]
fn psubsb(a: i8x32, b: i8x32) -> i8x32;
#[link_name = "llvm.x86.avx2.psubs.w"]
@ -1580,15 +1580,15 @@ mod tests {
}
/**
// TODO this fails in debug but not release, why?
#[test]
#[target_feature ="+avx2"]
fn _mm256_movemask_epi8() {
let a = i8x32::splat(-1);
let r = avx2::_mm256_movemask_epi8(a);
let e : i32 = -1;
assert_eq!(r, e);
}
// TODO This fails in debug but not in release, why?
@ -1604,7 +1604,7 @@ mod tests {
**/
#[test]
#[target_feature = "+avx2"]
fn _mm256_mul_epi32() {
let a = i32x8::new(0, 0, 0, 0, 2, 2, 2, 2);
let b = i32x8::new(1, 2, 3, 4, 5, 6, 7, 8);
@ -1693,7 +1693,7 @@ mod tests {
4, 4, 4, 4, 4, 4, 4, 4,
2, 2, 2, 2, 2, 2, 2, 2,
4, 4, 4, 4, 4, 4, 4, 4);
assert_eq!(r, e);
}
@ -1708,7 +1708,7 @@ mod tests {
4, 4, 4, 4,
2, 2, 2, 2,
4, 4, 4, 4);
assert_eq!(r, e);
}
@ -1723,7 +1723,7 @@ mod tests {
4, 4, 4, 4, 4, 4, 4, 4,
2, 2, 2, 2, 2, 2, 2, 2,
4, 4, 4, 4, 4, 4, 4, 4);
assert_eq!(r, e);
}
@ -1738,7 +1738,7 @@ mod tests {
4, 4, 4, 4,
2, 2, 2, 2,
4, 4, 4, 4);
assert_eq!(r, e);
}
@ -1756,7 +1756,7 @@ mod tests {
#[target_feature = "+avx2"]
fn _mm256_sign_epi16() {
let a = i16x16::splat(2);
let b = i16x16::splat(-1);
let r = avx2::_mm256_sign_epi16(a, b);
let e = i16x16::splat(-2);
assert_eq!(r, e);
@ -1766,7 +1766,7 @@ mod tests {
#[target_feature = "+avx2"]
fn _mm256_sign_epi32() {
let a = i32x8::splat(2);
let b = i32x8::splat(-1);
let r = avx2::_mm256_sign_epi32(a, b);
let e = i32x8::splat(-2);
assert_eq!(r, e);
@ -1776,7 +1776,7 @@ mod tests {
#[target_feature = "+avx2"]
fn _mm256_sign_epi8() {
let a = i8x32::splat(2);
let b = i8x32::splat(-1);
let r = avx2::_mm256_sign_epi8(a, b);
let e = i8x32::splat(-2);
assert_eq!(r, e);
@ -1816,7 +1816,7 @@ mod tests {
avx2::_mm256_slli_epi16(i16x16::splat(0xFF), 4),
i16x16::splat(0xFF0));
}
#[test]
#[target_feature = "+avx2"]
fn _mm256_slli_epi32() {
@ -1840,7 +1840,7 @@ mod tests {
let b = i32x4::splat(1);
let r = avx2::_mm_sllv_epi32(a, b);
let e = i32x4::splat(4);
assert_eq!(r, e);
}
#[test]
@ -1850,7 +1850,7 @@ mod tests {
let b = i32x8::splat(1);
let r = avx2::_mm256_sllv_epi32(a, b);
let e = i32x8::splat(4);
assert_eq!(r, e);
}
#[test]
#[target_feature = "+avx2"]
@ -1859,7 +1859,7 @@ mod tests {
let b = i64x2::splat(1);
let r = avx2::_mm_sllv_epi64(a, b);
let e = i64x2::splat(4);
assert_eq!(r, e);
}
#[test]
#[target_feature = "+avx2"]
@ -1868,7 +1868,7 @@ mod tests {
let b = i64x4::splat(1);
let r = avx2::_mm256_sllv_epi64(a, b);
let e = i64x4::splat(4);
assert_eq!(r, e);
}
#[test]
@ -2097,5 +2097,5 @@ mod tests {
__m256i::splat(6));
}
}

View File

@ -2,7 +2,7 @@ use std::mem;
use std::os::raw::c_void;
use std::ptr;
use simd::{
use simd_llvm::{
simd_cast, simd_shuffle2, simd_shuffle4, simd_shuffle8, simd_shuffle16,
};
use x86::__m128i;

View File

@ -1,4 +1,3 @@
// use v128::*;
use x86::__m128i;
#[inline(always)]

View File

@ -1,4 +1,3 @@
// use v128::*;
use x86::__m128i;
pub const _SIDD_UBYTE_OPS: i8 = 0b00000000;