From e975e22c20e583cddf4d4d042f30bc7c96e15ab1 Mon Sep 17 00:00:00 2001
From: Andrew Gallant
Date: Mon, 19 Jun 2017 17:03:29 -0400
Subject: [PATCH] refactor

---
 library/stdarch/QUESTIONS.md                  |   5 +
 library/stdarch/TODO.md                       |   2 +-
 library/stdarch/examples/play.rs              |  11 +-
 library/stdarch/src/lib.rs                    |  21 ++-
 library/stdarch/src/macros.rs                 |  16 +-
 library/stdarch/src/{simd.rs => simd_llvm.rs} |   0
 library/stdarch/src/v128.rs                   |   2 +-
 library/stdarch/src/v256.rs                   |   2 +-
 library/stdarch/src/v512.rs                   | 152 ++++++++++++++++++
 library/stdarch/src/v64.rs                    |  12 +-
 library/stdarch/src/x86/avx2.rs               | 104 ++++++------
 library/stdarch/src/x86/sse2.rs               |   2 +-
 library/stdarch/src/x86/sse41.rs              |   1 -
 library/stdarch/src/x86/sse42.rs              |   1 -
 14 files changed, 256 insertions(+), 75 deletions(-)
 rename library/stdarch/src/{simd.rs => simd_llvm.rs} (100%)
 create mode 100644 library/stdarch/src/v512.rs

diff --git a/library/stdarch/QUESTIONS.md b/library/stdarch/QUESTIONS.md
index 7a76d0d66320..3c6492556ca1 100644
--- a/library/stdarch/QUESTIONS.md
+++ b/library/stdarch/QUESTIONS.md
@@ -24,3 +24,8 @@ type is probably `i64x2`.
 What is the deal with the ucomi f64 comparison functions in SSE2? Clang's
 headers indicate some specific behavior with NaN, but I can't seem to
 reproduce it. Intel's official docs are very vague.
+
+---
+
+`_mm256_blendv_pd` takes a mask parameter with type `f64x4`, but the
+documentation seems to indicate that it is a bit vector. What's going on?

diff --git a/library/stdarch/TODO.md b/library/stdarch/TODO.md
index 21fec4831060..3c2417277bff 100644
--- a/library/stdarch/TODO.md
+++ b/library/stdarch/TODO.md
@@ -456,7 +456,7 @@ sse4.1
 * [ ] `_mm_blend_ps`
 * [ ] `_mm_blendv_pd`
 * [ ] `_mm_blendv_ps`
-* [ ] `_mm_blendv_epi8`
+* [x] `_mm_blendv_epi8`
 * [ ] `_mm_blend_epi16`
 * [ ] `_mm_dp_pd`
 * [ ] `_mm_dp_ps`

diff --git a/library/stdarch/examples/play.rs b/library/stdarch/examples/play.rs
index bc4297c2d79f..9bcdde0de9df 100644
--- a/library/stdarch/examples/play.rs
+++ b/library/stdarch/examples/play.rs
@@ -3,7 +3,8 @@
 extern crate stdsimd;
 
 use std::env;
-use stdsimd as s;
+use stdsimd::simd as s;
+use stdsimd::vendor;
 
 #[inline(never)]
 #[target_feature = "+sse4.2"]
@@ -14,15 +15,15 @@ fn index(needle: &str, haystack: &str) -> usize {
     let mut needle = needle.to_string().into_bytes();
     needle.resize(16, 0);
-    let vneedle = s::__m128i::from(s::u8x16::load(&needle, 0));
+    let vneedle = vendor::__m128i::from(s::u8x16::load(&needle, 0));
 
     let mut haystack = haystack.to_string().into_bytes();
     haystack.resize(16, 0);
-    let vhaystack = s::__m128i::from(s::u8x16::load(&haystack, 0));
+    let vhaystack = vendor::__m128i::from(s::u8x16::load(&haystack, 0));
 
-    s::_mm_cmpestri(
+    vendor::_mm_cmpestri(
         vneedle, needle_len as i32, vhaystack, hay_len as i32,
-        s::_SIDD_CMP_EQUAL_ORDERED) as usize
+        vendor::_SIDD_CMP_EQUAL_ORDERED) as usize
 }
 
 fn main() {

diff --git a/library/stdarch/src/lib.rs b/library/stdarch/src/lib.rs
index a3c770d5492c..1aa713742844 100644
--- a/library/stdarch/src/lib.rs
+++ b/library/stdarch/src/lib.rs
@@ -4,17 +4,26 @@
     target_feature,
 )]
 
-pub use v128::*;
-pub use v256::*;
-pub use v64::*;
-#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
-pub use x86::*;
+/// Platform-independent SIMD vector types and operations.
+pub mod simd {
+    pub use v128::*;
+    pub use v256::*;
+    pub use v512::*;
+    pub use v64::*;
+}
+
+/// Platform-dependent vendor intrinsics.
+pub mod vendor {
+    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+    pub use x86::*;
+}
 
 #[macro_use]
 mod macros;
-mod simd;
+mod simd_llvm;
 mod v128;
 mod v256;
+mod v512;
 mod v64;
 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
 mod x86;
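
The lib.rs change above makes the portability boundary explicit: portable
vector types now come from `stdsimd::simd`, while `stdsimd::vendor` carries
the platform-specific intrinsics, as the play.rs hunk already demonstrates.
A minimal sketch of the resulting import pattern (hypothetical downstream
code; the `u8x16::load` signature and the `From` conversion are assumed from
their use in play.rs, not spelled out by this patch):

    extern crate stdsimd;

    use stdsimd::simd::u8x16; // portable vector types (v64/v128/v256/v512)
    use stdsimd::vendor;      // platform-dependent intrinsics

    fn to_vendor(bytes: &[u8]) -> vendor::__m128i {
        // Portable code stays in `simd` types and crosses into `vendor`
        // types only at the intrinsic call boundary.
        vendor::__m128i::from(u8x16::load(bytes, 0))
    }
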
diff --git a/library/stdarch/src/macros.rs b/library/stdarch/src/macros.rs
index aadb986052fd..339736e8694f 100644
--- a/library/stdarch/src/macros.rs
+++ b/library/stdarch/src/macros.rs
@@ -7,6 +7,16 @@ macro_rules! define_ty {
     }
 }
 
+macro_rules! define_ty_doc {
+    ($name:ident, $($elty:ident),+ | $(#[$doc:meta])*) => {
+        $(#[$doc])*
+        #[repr(simd)]
+        #[derive(Clone, Copy, Debug, PartialEq)]
+        #[allow(non_camel_case_types)]
+        pub struct $name($($elty),*);
+    }
+}
+
 macro_rules! define_impl {
     (
         $name:ident, $elemty:ident, $nelems:expr, $boolname:ident,
@@ -246,11 +256,11 @@ macro_rules! define_integer_ops {
 }
 
 macro_rules! define_casts {
-    ($(($ty:ident, $floatty:ident, $floatcast:ident)),+) => {
+    ($(($fromty:ident, $toty:ident, $cast:ident)),+) => {
         $(
-            impl $ty {
+            impl $fromty {
                 #[inline]
-                pub fn $floatcast(self) -> ::$floatty {
+                pub fn $cast(self) -> ::simd::$toty {
                     unsafe { simd_cast(self) }
                 }
             }
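
The new `define_ty_doc!` differs from `define_ty!` only in threading doc
attributes through to the generated struct, and the `define_casts!` rename
generalizes it from float-only casts to arbitrary lane-wise casts. For
orientation, an invocation of the doc variant (taken from the v64.rs hunk
below) and a hand-written approximation of its expansion:

    // Invocation:
    define_ty_doc! {
        f32x2, f32, f32 |
        /// A 64-bit vector with 2 `f32` lanes.
    }

    // Approximate expansion:
    /// A 64-bit vector with 2 `f32` lanes.
    #[repr(simd)]
    #[derive(Clone, Copy, Debug, PartialEq)]
    #[allow(non_camel_case_types)]
    pub struct f32x2(f32, f32);
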
diff --git a/library/stdarch/src/simd.rs b/library/stdarch/src/simd_llvm.rs
similarity index 100%
rename from library/stdarch/src/simd.rs
rename to library/stdarch/src/simd_llvm.rs
diff --git a/library/stdarch/src/v128.rs b/library/stdarch/src/v128.rs
index 4a8bf358da6a..922f983a59ad 100644
--- a/library/stdarch/src/v128.rs
+++ b/library/stdarch/src/v128.rs
@@ -1,4 +1,4 @@
-use simd::*;
+use simd_llvm::*;
 
 define_ty! { f64x2, f64, f64 }
 define_impl! { f64x2, f64, 2, i64x2, x0, x1 }
diff --git a/library/stdarch/src/v256.rs b/library/stdarch/src/v256.rs
index ba09e70ebdee..a5e163e45319 100644
--- a/library/stdarch/src/v256.rs
+++ b/library/stdarch/src/v256.rs
@@ -1,4 +1,4 @@
-use simd::*;
+use simd_llvm::*;
 
 define_ty! { f64x4, f64, f64, f64, f64 }
 define_impl! { f64x4, f64, 4, i64x4, x0, x1, x2, x3 }
diff --git a/library/stdarch/src/v512.rs b/library/stdarch/src/v512.rs
new file mode 100644
index 000000000000..5b1afee639b9
--- /dev/null
+++ b/library/stdarch/src/v512.rs
@@ -0,0 +1,152 @@
+use simd_llvm::*;
+
+define_ty! { f64x8, f64, f64, f64, f64, f64, f64, f64, f64 }
+define_impl! { f64x8, f64, 8, i64x8, x0, x1, x2, x3, x4, x5, x6, x7 }
+
+define_ty! {
+    f32x16,
+    f32, f32, f32, f32, f32, f32, f32, f32,
+    f32, f32, f32, f32, f32, f32, f32, f32
+}
+define_impl! {
+    f32x16, f32, 16, i32x16,
+    x0, x1, x2, x3, x4, x5, x6, x7,
+    x8, x9, x10, x11, x12, x13, x14, x15
+}
+
+define_ty! { u64x8, u64, u64, u64, u64, u64, u64, u64, u64 }
+define_impl! { u64x8, u64, 8, i64x8, x0, x1, x2, x3, x4, x5, x6, x7 }
+
+define_ty! { i64x8, i64, i64, i64, i64, i64, i64, i64, i64 }
+define_impl! { i64x8, i64, 8, i64x8, x0, x1, x2, x3, x4, x5, x6, x7 }
+
+define_ty! {
+    u32x16,
+    u32, u32, u32, u32, u32, u32, u32, u32,
+    u32, u32, u32, u32, u32, u32, u32, u32
+}
+define_impl! {
+    u32x16, u32, 16, i32x16,
+    x0, x1, x2, x3, x4, x5, x6, x7,
+    x8, x9, x10, x11, x12, x13, x14, x15
+}
+
+define_ty! {
+    i32x16,
+    i32, i32, i32, i32, i32, i32, i32, i32,
+    i32, i32, i32, i32, i32, i32, i32, i32
+}
+define_impl! {
+    i32x16, i32, 16, i32x16,
+    x0, x1, x2, x3, x4, x5, x6, x7,
+    x8, x9, x10, x11, x12, x13, x14, x15
+}
+
+define_ty! {
+    u16x32,
+    u16, u16, u16, u16, u16, u16, u16, u16,
+    u16, u16, u16, u16, u16, u16, u16, u16,
+    u16, u16, u16, u16, u16, u16, u16, u16,
+    u16, u16, u16, u16, u16, u16, u16, u16
+}
+define_impl! {
+    u16x32, u16, 32, i16x32,
+    x0, x1, x2, x3, x4, x5, x6, x7,
+    x8, x9, x10, x11, x12, x13, x14, x15,
+    x16, x17, x18, x19, x20, x21, x22, x23,
+    x24, x25, x26, x27, x28, x29, x30, x31
+}
+
+define_ty! {
+    i16x32,
+    i16, i16, i16, i16, i16, i16, i16, i16,
+    i16, i16, i16, i16, i16, i16, i16, i16,
+    i16, i16, i16, i16, i16, i16, i16, i16,
+    i16, i16, i16, i16, i16, i16, i16, i16
+}
+define_impl! {
+    i16x32, i16, 32, i16x32,
+    x0, x1, x2, x3, x4, x5, x6, x7,
+    x8, x9, x10, x11, x12, x13, x14, x15,
+    x16, x17, x18, x19, x20, x21, x22, x23,
+    x24, x25, x26, x27, x28, x29, x30, x31
+}
+
+define_ty! {
+    u8x64,
+    u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8,
+    u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8,
+    u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8,
+    u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8
+}
+define_impl! {
+    u8x64, u8, 64, i8x64,
+    x0, x1, x2, x3, x4, x5, x6, x7,
+    x8, x9, x10, x11, x12, x13, x14, x15,
+    x16, x17, x18, x19, x20, x21, x22, x23,
+    x24, x25, x26, x27, x28, x29, x30, x31,
+    x32, x33, x34, x35, x36, x37, x38, x39,
+    x40, x41, x42, x43, x44, x45, x46, x47,
+    x48, x49, x50, x51, x52, x53, x54, x55,
+    x56, x57, x58, x59, x60, x61, x62, x63
+}
+
+define_ty! {
+    i8x64,
+    i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8,
+    i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8,
+    i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8,
+    i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8
+}
+define_impl! {
+    i8x64, i8, 64, i8x64,
+    x0, x1, x2, x3, x4, x5, x6, x7,
+    x8, x9, x10, x11, x12, x13, x14, x15,
+    x16, x17, x18, x19, x20, x21, x22, x23,
+    x24, x25, x26, x27, x28, x29, x30, x31,
+    x32, x33, x34, x35, x36, x37, x38, x39,
+    x40, x41, x42, x43, x44, x45, x46, x47,
+    x48, x49, x50, x51, x52, x53, x54, x55,
+    x56, x57, x58, x59, x60, x61, x62, x63
+}
+
+define_from!(u64x8, i64x8, u32x16, i32x16, u16x32, i16x32, u8x64, i8x64);
+define_from!(i64x8, u64x8, u32x16, i32x16, u16x32, i16x32, u8x64, i8x64);
+define_from!(u32x16, u64x8, i64x8, i32x16, u16x32, i16x32, u8x64, i8x64);
+define_from!(i32x16, u64x8, i64x8, u32x16, u16x32, i16x32, u8x64, i8x64);
+define_from!(u16x32, u64x8, i64x8, u32x16, i32x16, i16x32, u8x64, i8x64);
+define_from!(i16x32, u64x8, i64x8, u32x16, i32x16, u16x32, u8x64, i8x64);
+define_from!(u8x64, u64x8, i64x8, u32x16, i32x16, u16x32, i16x32, i8x64);
+define_from!(i8x64, u64x8, i64x8, u32x16, i32x16, u16x32, i16x32, u8x64);
+
+define_common_ops!(
+    f64x8, f32x16, u64x8, i64x8, u32x16, i32x16, u16x32, i16x32, u8x64, i8x64);
+define_float_ops!(f64x8, f32x16);
+define_integer_ops!(
+    (u64x8, u64),
+    (i64x8, i64),
+    (u32x16, u32),
+    (i32x16, i32),
+    (u16x32, u16),
+    (i16x32, i16),
+    (u8x64, u8),
+    (i8x64, i8));
+define_casts!(
+    (f64x8, f32x8, as_f32x8),
+    (f64x8, u64x8, as_u64x8),
+    (f64x8, i64x8, as_i64x8),
+    (f32x16, u32x16, as_u32x16),
+    (f32x16, i32x16, as_i32x16),
+    (u64x8, f64x8, as_f64x8),
+    (u64x8, i64x8, as_i64x8),
+    (i64x8, f64x8, as_f64x8),
+    (i64x8, u64x8, as_u64x8),
+    (u32x16, f32x16, as_f32x16),
+    (u32x16, i32x16, as_i32x16),
+    (i32x16, f32x16, as_f32x16),
+    (i32x16, u32x16, as_u32x16),
+    (u16x32, i16x32, as_i16x32),
+    (i16x32, u16x32, as_u16x32),
+    (u8x64, i8x64, as_i8x64),
+    (i8x64, u8x64, as_u8x64));
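
Each `define_casts!` entry generates an inherent method that lowers to a
lane-wise `simd_cast`. A sketch of the intended semantics for one of the new
512-bit entries (my own illustration, not part of the patch; it assumes a
`splat` constructor like the one the avx2 tests below use, and that
float-to-int conversion truncates toward zero as LLVM's does):

    fn cast_demo() {
        let x = f64x8::splat(1.75);
        // `as_i64x8` converts each of the 8 `f64` lanes to an `i64`.
        let y: i64x8 = x.as_i64x8();
        assert_eq!(y, i64x8::splat(1));
    }
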
diff --git a/library/stdarch/src/v64.rs b/library/stdarch/src/v64.rs
index de2c14c356a8..ee856b23b06b 100644
--- a/library/stdarch/src/v64.rs
+++ b/library/stdarch/src/v64.rs
@@ -1,9 +1,15 @@
-use simd::*;
+use simd_llvm::*;
 
-define_ty! { f32x2, f32, f32 }
+define_ty_doc! {
+    f32x2, f32, f32 |
+    /// A 64-bit vector with 2 `f32` lanes.
+}
 define_impl! { f32x2, f32, 2, i32x2, x0, x1 }
 
-define_ty! { u32x2, u32, u32 }
+define_ty_doc! {
+    u32x2, u32, u32 |
+    /// A 64-bit vector with 2 `u32` lanes.
+}
 define_impl! { u32x2, u32, 2, i32x2, x0, x1 }
 
 define_ty! { i32x2, i32, i32 }
diff --git a/library/stdarch/src/x86/avx2.rs b/library/stdarch/src/x86/avx2.rs
index 7cf413e97109..e07f26a67dec 100644
--- a/library/stdarch/src/x86/avx2.rs
+++ b/library/stdarch/src/x86/avx2.rs
@@ -423,12 +423,12 @@ pub fn _mm256_movemask_epi8(a: i8x32) -> i32 {
     unsafe { pmovmskb(a) }
 }
 
-/// Compute the sum of absolute differences (SADs) of quadruplets of unsigned
-/// 8-bit integers in `a` compared to those in `b`, and store the 16-bit
+/// Compute the sum of absolute differences (SADs) of quadruplets of unsigned
+/// 8-bit integers in `a` compared to those in `b`, and store the 16-bit
 /// results in dst. Eight SADs are performed for each 128-bit lane using one
-/// quadruplet from `b` and eight quadruplets from `a`. One quadruplet is
-/// selected from `b` starting at on the offset specified in `imm8`. Eight
-/// quadruplets are formed from sequential 8-bit integers selected from `a`
+/// quadruplet from `b` and eight quadruplets from `a`. One quadruplet is
+/// selected from `b` starting at the offset specified in `imm8`. Eight
+/// quadruplets are formed from sequential 8-bit integers selected from `a`
 /// starting at the offset specified in `imm8`.
 #[inline(always)]
 #[target_feature = "+avx2"]
@@ -438,9 +438,9 @@ pub fn _mm256_mpsadbw_epu8(a: u8x32, b: u8x32, imm8: i32) -> u16x16 {
 ***/
 
-/// Multiply the low 32-bit integers from each packed 64-bit element in
+/// Multiply the low 32-bit integers from each packed 64-bit element in
 /// `a` and `b`
-///
+///
 /// Return the 64-bit results.
 #[inline(always)]
 #[target_feature = "+avx2"]
@@ -448,7 +448,7 @@ pub fn _mm256_mul_epi32(a: i32x8, b: i32x8) -> i64x4 {
     unsafe { pmuldq(a, b) }
 }
 
-/// Multiply the low unsigned 32-bit integers from each packed 64-bit
+/// Multiply the low unsigned 32-bit integers from each packed 64-bit
 /// element in `a` and `b`
 ///
 /// Return the unsigned 64-bit results.
@@ -458,7 +458,7 @@ pub fn _mm256_mul_epu32(a: u32x8, b: u32x8) -> u64x4 {
     unsafe { pmuludq(a, b) }
 }
 
-/// Multiply the packed 16-bit integers in `a` and `b`, producing
+/// Multiply the packed 16-bit integers in `a` and `b`, producing
 /// intermediate 32-bit integers and returning the high 16 bits of the
 /// intermediate integers.
 #[inline(always)]
@@ -476,7 +476,7 @@ pub fn _mm256_mulhi_epu16(a: u16x16, b: u16x16) -> u16x16 {
     unsafe { pmulhuw(a, b) }
 }
 
-/// Multiply the packed 16-bit integers in `a` and `b`, producing
+/// Multiply the packed 16-bit integers in `a` and `b`, producing
 /// intermediate 32-bit integers, and return the low 16 bits of the
 /// intermediate integers
 #[inline(always)]
@@ -486,7 +486,7 @@ pub fn _mm256_mullo_epi16(a: i16x16, b:i16x16) -> i16x16 {
 }
 
 
-/// Multiply the packed 32-bit integers in `a` and `b`, producing
+/// Multiply the packed 32-bit integers in `a` and `b`, producing
 /// intermediate 64-bit integers, and return the low 32 bits of the
 /// intermediate integers
 #[inline(always)]
@@ -495,7 +495,7 @@ pub fn _mm256_mullo_epi32(a: i32x8, b:i32x8) -> i32x8 {
     a * b
 }
 
-/// Multiply packed 16-bit integers in `a` and `b`, producing
+/// Multiply packed 16-bit integers in `a` and `b`, producing
 /// intermediate signed 32-bit integers. Truncate each intermediate
 /// integer to the 18 most significant bits, round by adding 1, and
 /// return bits [16:1]
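
The `_mm256_mulhrs_epi16` wording above ("truncate to the 18 most significant
bits, round by adding 1, and return bits [16:1]") is easier to see as scalar
arithmetic. A per-lane model (my own sketch, not part of the patch):

    // Per-lane model of pmulhrsw: multiply two i16s into a 32-bit product,
    // keep the top 18 bits (>> 14), round by adding 1, then drop bit 0,
    // which returns bits [16:1] of the rounded intermediate.
    fn mulhrs_lane(a: i16, b: i16) -> i16 {
        let wide = (a as i32) * (b as i32); // 32-bit intermediate
        (((wide >> 14) + 1) >> 1) as i16
    }

    // e.g. 0x4000 encodes 0.5 in Q15 fixed point: 0.5 * 0.5 == 0.25.
    // assert_eq!(mulhrs_lane(0x4000, 0x4000), 0x2000);
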
@@ -505,7 +505,7 @@ pub fn _mm256_mulhrs_epi16(a: i16x16, b:i16x16) -> i16x16 {
     unsafe { pmulhrsw(a, b) }
 }
 
-/// Compute the bitwise OR of 256 bits (representing integer data) in `a`
+/// Compute the bitwise OR of 256 bits (representing integer data) in `a`
 /// and `b`
 #[inline(always)]
 #[target_feature = "+avx2"]
@@ -513,7 +513,7 @@ pub fn _mm256_or_si256(a: __m256i, b: __m256i) -> __m256i {
     a | b
 }
 
-/// Convert packed 16-bit integers from `a` and `b` to packed 8-bit integers
+/// Convert packed 16-bit integers from `a` and `b` to packed 8-bit integers
 /// using signed saturation
 #[inline(always)]
 #[target_feature = "+avx2"]
@@ -521,7 +521,7 @@ pub fn _mm256_packs_epi16(a: i16x16, b: i16x16) -> i8x32 {
     unsafe { packsswb(a, b) }
 }
 
-/// Convert packed 32-bit integers from `a` and `b` to packed 16-bit integers
+/// Convert packed 32-bit integers from `a` and `b` to packed 16-bit integers
 /// using signed saturation
 #[inline(always)]
 #[target_feature = "+avx2"]
@@ -552,8 +552,8 @@ pub fn _mm256_packus_epi32(a: i32x8, b: i32x8) -> u16x16 {
 // TODO _mm256_permutevar8x32_ps (__m256 a, __m256i idx)
 
 /// Compute the absolute differences of packed unsigned 8-bit integers in `a`
-/// and `b`, then horizontally sum each consecutive 8 differences to
-/// produce four unsigned 16-bit integers, and pack these unsigned 16-bit
+/// and `b`, then horizontally sum each consecutive 8 differences to
+/// produce four unsigned 16-bit integers, and pack these unsigned 16-bit
 /// integers in the low 16 bits of the 64-bit return value
 #[inline(always)]
 #[target_feature = "+avx2"]
@@ -593,7 +593,7 @@ pub fn _mm256_sign_epi8(a: i8x32, b: i8x32) -> i8x32 {
     unsafe { psignb(a, b) }
 }
 
-/// Shift packed 16-bit integers in `a` left by `count` while
+/// Shift packed 16-bit integers in `a` left by `count` while
 /// shifting in zeros, and return the result
 #[inline(always)]
 #[target_feature = "+avx2"]
@@ -601,7 +601,7 @@ pub fn _mm256_sll_epi16(a: i16x16, count: i16x8) -> i16x16 {
     unsafe { psllw(a, count) }
 }
 
-/// Shift packed 32-bit integers in `a` left by `count` while
+/// Shift packed 32-bit integers in `a` left by `count` while
 /// shifting in zeros, and return the result
 #[inline(always)]
 #[target_feature = "+avx2"]
@@ -609,7 +609,7 @@ pub fn _mm256_sll_epi32(a: i32x8, count: i32x4) -> i32x8 {
     unsafe { pslld(a, count) }
 }
 
-/// Shift packed 64-bit integers in `a` left by `count` while
+/// Shift packed 64-bit integers in `a` left by `count` while
 /// shifting in zeros, and return the result
 #[inline(always)]
 #[target_feature = "+avx2"]
@@ -622,7 +622,7 @@ pub fn _mm256_sll_epi64(a: i64x4, count: i64x2) -> i64x4 {
 /// Shift packed 16-bit integers in `a` left by `imm8` while
 /// shifting in zeros, and return the result
 #[inline(always)]
 #[target_feature = "+avx2"]
 pub fn _mm256_slli_epi16(a: i16x16, imm8: i32) -> i16x16 {
-    unsafe { pslliw(a, imm8) }
+    unsafe { pslliw(a, imm8) }
 }
@@ -630,7 +630,7 @@ pub fn _mm256_slli_epi16(a: i16x16, imm8: i32) -> i16x16 {
 /// Shift packed 32-bit integers in `a` left by `imm8` while
 /// shifting in zeros, and return the result
 #[inline(always)]
 #[target_feature = "+avx2"]
 pub fn _mm256_slli_epi32(a: i32x8, imm8: i32) -> i32x8 {
-    unsafe { psllid(a, imm8) }
+    unsafe { psllid(a, imm8) }
 }
@@ -638,7 +638,7 @@ pub fn _mm256_slli_epi32(a: i32x8, imm8: i32) -> i32x8 {
 /// Shift packed 64-bit integers in `a` left by `imm8` while
 /// shifting in zeros, and return the result
 #[inline(always)]
 #[target_feature = "+avx2"]
 pub fn _mm256_slli_epi64(a: i64x4, imm8: i32) -> i64x4 {
-    unsafe { pslliq(a, imm8) }
+    unsafe { pslliq(a, imm8) }
 }
 
 // TODO _mm256_slli_si256 (__m256i a, const int imm8)
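
The three `_mm256_slli_*` functions above shift every lane left by an
immediate, shifting in zeros; the patch's own test suite pins the semantics
down, e.g. (excerpted from the `_mm256_slli_epi16` test near the end of this
diff):

    let r = avx2::_mm256_slli_epi16(i16x16::splat(0xFF), 4);
    assert_eq!(r, i16x16::splat(0xFF0)); // 0xFF << 4 == 0xFF0 in every lane
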
@@ -695,7 +695,7 @@ pub fn _mm256_sra_epi32(a: i32x8, count: i32x4) -> i32x8 {
     unsafe { psrad(a, count) }
 }
 
-/// Shift packed 16-bit integers in `a` right by `imm8` while
+/// Shift packed 16-bit integers in `a` right by `imm8` while
 /// shifting in sign bits.
 #[inline(always)]
 #[target_feature = "+avx2"]
@@ -703,7 +703,7 @@ pub fn _mm256_srai_epi16(a: i16x16, imm8: i32) -> i16x16 {
     unsafe { psraiw(a, imm8) }
 }
 
-/// Shift packed 32-bit integers in `a` right by `imm8` while
+/// Shift packed 32-bit integers in `a` right by `imm8` while
 /// shifting in sign bits.
 #[inline(always)]
 #[target_feature = "+avx2"]
@@ -733,7 +733,7 @@ pub fn _mm256_srav_epi32(a: i32x8, count: i32x8) -> i32x8 {
 #[inline(always)]
 #[target_feature = "+avx2"]
 pub fn _mm256_srl_epi16(a: i16x16, count: i16x8) -> i16x16 {
-    unsafe { psrlw(a, count) }
+    unsafe { psrlw(a, count) }
 }
 
 /// Shift packed 32-bit integers in `a` right by `count` while shifting in
@@ -741,7 +741,7 @@ pub fn _mm256_srl_epi16(a: i16x16, count: i16x8) -> i16x16 {
 #[inline(always)]
 #[target_feature = "+avx2"]
 pub fn _mm256_srl_epi32(a: i32x8, count: i32x4) -> i32x8 {
-    unsafe { psrld(a, count) }
+    unsafe { psrld(a, count) }
 }
 
 /// Shift packed 64-bit integers in `a` right by `count` while shifting in
@@ -749,10 +749,10 @@ pub fn _mm256_srl_epi32(a: i32x8, count: i32x4) -> i32x8 {
 #[inline(always)]
 #[target_feature = "+avx2"]
 pub fn _mm256_srl_epi64(a: i64x4, count: i64x2) -> i64x4 {
-    unsafe { psrlq(a, count) }
+    unsafe { psrlq(a, count) }
 }
 
-/// Shift packed 16-bit integers in `a` right by `imm8` while shifting in
+/// Shift packed 16-bit integers in `a` right by `imm8` while shifting in
 /// zeros
 #[inline(always)]
 #[target_feature = "+avx2"]
@@ -760,7 +760,7 @@ pub fn _mm256_srli_epi16(a: i16x16, imm8: i32) -> i16x16 {
     unsafe { psrliw(a, imm8) }
 }
 
-/// Shift packed 32-bit integers in `a` right by `imm8` while shifting in
+/// Shift packed 32-bit integers in `a` right by `imm8` while shifting in
 /// zeros
 #[inline(always)]
 #[target_feature = "+avx2"]
@@ -768,7 +768,7 @@ pub fn _mm256_srli_epi32(a: i32x8, imm8: i32) -> i32x8 {
     unsafe { psrlid(a, imm8) }
 }
 
-/// Shift packed 64-bit integers in `a` right by `imm8` while shifting in
+/// Shift packed 64-bit integers in `a` right by `imm8` while shifting in
 /// zeros
 #[inline(always)]
 #[target_feature = "+avx2"]
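
The naming convention in this block distinguishes arithmetic right shifts
(`sra*`, which shift in copies of the sign bit) from logical ones (`srl*`,
which shift in zeros). A scalar analogue for a single 32-bit lane (my own
illustration, not part of the patch):

    let x: i32 = -16;                          // 0xFFFF_FFF0
    assert_eq!(x >> 2, -4);                    // sra: sign bits shifted in
    assert_eq!((x as u32) >> 2, 0x3FFF_FFFC);  // srl: zeros shifted in
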
"llvm.x86.avx2.psrlv.q"] fn psrlvq(a: i64x2, count: i64x2) -> i64x2; #[link_name = "llvm.x86.avx2.psrlv.q.256"] - fn psrlvq256(a: i64x4, count: i64x4) -> i64x4; + fn psrlvq256(a: i64x4, count: i64x4) -> i64x4; #[link_name = "llvm.x86.avx2.psubs.b"] fn psubsb(a: i8x32, b: i8x32) -> i8x32; #[link_name = "llvm.x86.avx2.psubs.w"] @@ -1580,15 +1580,15 @@ mod tests { } -/** +/** // TODO this fails in debug but not release, why? #[test] #[target_feature ="+avx2"] fn _mm256_movemask_epi8() { - let a = i8x32::splat(-1); + let a = i8x32::splat(-1); let r = avx2::_mm256_movemask_epi8(a); let e : i32 = -1; - assert_eq!(r, e); + assert_eq!(r, e); } // TODO This fails in debug but not in release, whhhy? @@ -1604,7 +1604,7 @@ mod tests { **/ #[test] - #[target_feature = "+avx2"] + #[target_feature = "+avx2"] fn _mm256_mul_epi32() { let a = i32x8::new(0, 0, 0, 0, 2, 2, 2, 2); let b = i32x8::new(1, 2, 3, 4, 5, 6, 7, 8); @@ -1693,7 +1693,7 @@ mod tests { 4, 4, 4, 4, 4, 4, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 4, 4, 4, 4, 4); - + assert_eq!(r, e); } @@ -1708,7 +1708,7 @@ mod tests { 4, 4, 4, 4, 2, 2, 2, 2, 4, 4, 4, 4); - + assert_eq!(r, e); } @@ -1723,7 +1723,7 @@ mod tests { 4, 4, 4, 4, 4, 4, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 4, 4, 4, 4, 4); - + assert_eq!(r, e); } @@ -1738,7 +1738,7 @@ mod tests { 4, 4, 4, 4, 2, 2, 2, 2, 4, 4, 4, 4); - + assert_eq!(r, e); } @@ -1756,7 +1756,7 @@ mod tests { #[target_feature = "+avx2"] fn _mm256_sign_epi16() { let a = i16x16::splat(2); - let b = i16x16::splat(-1); + let b = i16x16::splat(-1); let r = avx2::_mm256_sign_epi16(a, b); let e = i16x16::splat(-2); assert_eq!(r, e); @@ -1766,7 +1766,7 @@ mod tests { #[target_feature = "+avx2"] fn _mm256_sign_epi32() { let a = i32x8::splat(2); - let b = i32x8::splat(-1); + let b = i32x8::splat(-1); let r = avx2::_mm256_sign_epi32(a, b); let e = i32x8::splat(-2); assert_eq!(r, e); @@ -1776,7 +1776,7 @@ mod tests { #[target_feature = "+avx2"] fn _mm256_sign_epi8() { let a = i8x32::splat(2); - let b = i8x32::splat(-1); + let b = i8x32::splat(-1); let r = avx2::_mm256_sign_epi8(a, b); let e = i8x32::splat(-2); assert_eq!(r, e); @@ -1816,7 +1816,7 @@ mod tests { avx2::_mm256_slli_epi16(i16x16::splat(0xFF), 4), i16x16::splat(0xFF0)); } - + #[test] #[target_feature = "+avx2"] fn _mm256_slli_epi32() { @@ -1840,7 +1840,7 @@ mod tests { let b = i32x4::splat(1); let r = avx2::_mm_sllv_epi32(a, b); let e = i32x4::splat(4); - assert_eq!(r, e); + assert_eq!(r, e); } #[test] @@ -1850,7 +1850,7 @@ mod tests { let b = i32x8::splat(1); let r = avx2::_mm256_sllv_epi32(a, b); let e = i32x8::splat(4); - assert_eq!(r, e); + assert_eq!(r, e); } #[test] #[target_feature = "+avx2"] @@ -1859,7 +1859,7 @@ mod tests { let b = i64x2::splat(1); let r = avx2::_mm_sllv_epi64(a, b); let e = i64x2::splat(4); - assert_eq!(r, e); + assert_eq!(r, e); } #[test] #[target_feature = "+avx2"] @@ -1868,7 +1868,7 @@ mod tests { let b = i64x4::splat(1); let r = avx2::_mm256_sllv_epi64(a, b); let e = i64x4::splat(4); - assert_eq!(r, e); + assert_eq!(r, e); } #[test] @@ -2097,5 +2097,5 @@ mod tests { __m256i::splat(6)); } - + } diff --git a/library/stdarch/src/x86/sse2.rs b/library/stdarch/src/x86/sse2.rs index 1ecaf9ea43aa..b564677ea786 100644 --- a/library/stdarch/src/x86/sse2.rs +++ b/library/stdarch/src/x86/sse2.rs @@ -2,7 +2,7 @@ use std::mem; use std::os::raw::c_void; use std::ptr; -use simd::{ +use simd_llvm::{ simd_cast, simd_shuffle2, simd_shuffle4, simd_shuffle8, simd_shuffle16, }; use x86::__m128i; diff --git a/library/stdarch/src/x86/sse41.rs 
b/library/stdarch/src/x86/sse41.rs
index 2f4a8a307dd2..61217f26f596 100644
--- a/library/stdarch/src/x86/sse41.rs
+++ b/library/stdarch/src/x86/sse41.rs
@@ -1,4 +1,3 @@
-// use v128::*;
 use x86::__m128i;
 
 #[inline(always)]
diff --git a/library/stdarch/src/x86/sse42.rs b/library/stdarch/src/x86/sse42.rs
index a40aae722548..4789fd32d537 100644
--- a/library/stdarch/src/x86/sse42.rs
+++ b/library/stdarch/src/x86/sse42.rs
@@ -1,4 +1,3 @@
-// use v128::*;
 use x86::__m128i;
 
 pub const _SIDD_UBYTE_OPS: i8 = 0b00000000;