mirror of
https://github.com/rust-lang/rust.git
synced 2025-11-17 16:26:46 +00:00
807 lines
32 KiB
Rust
807 lines
32 KiB
Rust
//! Fused Multiply-Add instruction set (FMA)
|
||
//!
|
||
//! The FMA instruction set is an extension to the 128 and 256-bit SSE
|
||
//! instructions in the x86 microprocessor instruction set to perform fused
|
||
//! multiply–add (FMA) operations.
|
||
//!
|
||
//! The references are:
|
||
//!
|
||
//! - [Intel 64 and IA-32 Architectures Software Developer's Manual Volume 2:
|
||
//! Instruction Set Reference, A-Z][intel64_ref].
|
||
//! - [AMD64 Architecture Programmer's Manual, Volume 3: General-Purpose and
|
||
//! System Instructions][amd64_ref].
|
||
//!
|
||
//! Wikipedia's [FMA][wiki_fma] page provides a quick overview of the
|
||
//! instructions available.
|
||
//!
|
||
//! [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf
|
||
//! [amd64_ref]: http://support.amd.com/TechDocs/24594.pdf
|
||
//! [wiki_fma]: https://en.wikipedia.org/wiki/Fused_multiply-accumulate
|
||
|
||
use coresimd::x86::*;
|
||
|
||
#[cfg(test)]
|
||
use stdsimd_test::assert_instr;
|
||
|
||
/// Multiply packed double-precision (64-bit) floating-point elements in `a`
|
||
/// and `b`, and add the intermediate result to packed elements in `c`.
|
||
///
|
||
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmadd_pd)
|
||
#[inline]
|
||
#[target_feature(enable = "fma")]
|
||
#[cfg_attr(test, assert_instr(vfmadd213pd))]
|
||
#[stable(feature = "simd_x86", since = "1.27.0")]
|
||
pub unsafe fn _mm_fmadd_pd(a: __m128d, b: __m128d, c: __m128d) -> __m128d {
|
||
vfmaddpd(a, b, c)
|
||
}
|
||
|
||
/// Multiply packed double-precision (64-bit) floating-point elements in `a`
|
||
/// and `b`, and add the intermediate result to packed elements in `c`.
|
||
///
|
||
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fmadd_pd)
|
||
#[inline]
|
||
#[target_feature(enable = "fma")]
|
||
#[cfg_attr(test, assert_instr(vfmadd213pd))]
|
||
#[stable(feature = "simd_x86", since = "1.27.0")]
|
||
pub unsafe fn _mm256_fmadd_pd(a: __m256d, b: __m256d, c: __m256d) -> __m256d {
|
||
vfmaddpd256(a, b, c)
|
||
}
|
||
|
||
/// Multiply packed single-precision (32-bit) floating-point elements in `a`
|
||
/// and `b`, and add the intermediate result to packed elements in `c`.
|
||
///
|
||
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmadd_ps)
|
||
#[inline]
|
||
#[target_feature(enable = "fma")]
|
||
#[cfg_attr(test, assert_instr(vfmadd213ps))]
|
||
#[stable(feature = "simd_x86", since = "1.27.0")]
|
||
pub unsafe fn _mm_fmadd_ps(a: __m128, b: __m128, c: __m128) -> __m128 {
|
||
vfmaddps(a, b, c)
|
||
}
|
||
|
||
/// Multiply packed single-precision (32-bit) floating-point elements in `a`
|
||
/// and `b`, and add the intermediate result to packed elements in `c`.
|
||
///
|
||
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fmadd_ps)
|
||
#[inline]
|
||
#[target_feature(enable = "fma")]
|
||
#[cfg_attr(test, assert_instr(vfmadd213ps))]
|
||
#[stable(feature = "simd_x86", since = "1.27.0")]
|
||
pub unsafe fn _mm256_fmadd_ps(a: __m256, b: __m256, c: __m256) -> __m256 {
|
||
vfmaddps256(a, b, c)
|
||
}
|
||
|
||
/// Multiply the lower double-precision (64-bit) floating-point elements in
|
||
/// `a` and `b`, and add the intermediate result to the lower element in `c`.
|
||
/// Store the result in the lower element of the returned value, and copy the
|
||
/// upper element from `a` to the upper elements of the result.
|
||
///
|
||
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmadd_sd)
|
||
#[inline]
|
||
#[target_feature(enable = "fma")]
|
||
#[cfg_attr(test, assert_instr(vfmadd213sd))]
|
||
#[stable(feature = "simd_x86", since = "1.27.0")]
|
||
pub unsafe fn _mm_fmadd_sd(a: __m128d, b: __m128d, c: __m128d) -> __m128d {
|
||
vfmaddsd(a, b, c)
|
||
}
|
||
|
||
/// Multiply the lower single-precision (32-bit) floating-point elements in
|
||
/// `a` and `b`, and add the intermediate result to the lower element in `c`.
|
||
/// Store the result in the lower element of the returned value, and copy the
|
||
/// 3 upper elements from `a` to the upper elements of the result.
|
||
///
|
||
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmadd_ss)
|
||
#[inline]
|
||
#[target_feature(enable = "fma")]
|
||
#[cfg_attr(test, assert_instr(vfmadd213ss))]
|
||
#[stable(feature = "simd_x86", since = "1.27.0")]
|
||
pub unsafe fn _mm_fmadd_ss(a: __m128, b: __m128, c: __m128) -> __m128 {
|
||
vfmaddss(a, b, c)
|
||
}
|
||
|
||
/// Multiply packed double-precision (64-bit) floating-point elements in `a`
|
||
/// and `b`, and alternatively add and subtract packed elements in `c` to/from
|
||
/// the intermediate result.
|
||
///
|
||
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmaddsub_pd)
|
||
#[inline]
|
||
#[target_feature(enable = "fma")]
|
||
#[cfg_attr(test, assert_instr(vfmaddsub213pd))]
|
||
#[stable(feature = "simd_x86", since = "1.27.0")]
|
||
pub unsafe fn _mm_fmaddsub_pd(a: __m128d, b: __m128d, c: __m128d) -> __m128d {
|
||
vfmaddsubpd(a, b, c)
|
||
}
|
||
|
||
/// Multiply packed double-precision (64-bit) floating-point elements in `a`
|
||
/// and `b`, and alternatively add and subtract packed elements in `c` to/from
|
||
/// the intermediate result.
|
||
///
|
||
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fmaddsub_pd)
|
||
#[inline]
|
||
#[target_feature(enable = "fma")]
|
||
#[cfg_attr(test, assert_instr(vfmaddsub213pd))]
|
||
#[stable(feature = "simd_x86", since = "1.27.0")]
|
||
pub unsafe fn _mm256_fmaddsub_pd(
|
||
a: __m256d, b: __m256d, c: __m256d,
|
||
) -> __m256d {
|
||
vfmaddsubpd256(a, b, c)
|
||
}
|
||
|
||
/// Multiply packed single-precision (32-bit) floating-point elements in `a`
|
||
/// and `b`, and alternatively add and subtract packed elements in `c` to/from
|
||
/// the intermediate result.
|
||
///
|
||
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmaddsub_ps)
|
||
#[inline]
|
||
#[target_feature(enable = "fma")]
|
||
#[cfg_attr(test, assert_instr(vfmaddsub213ps))]
|
||
#[stable(feature = "simd_x86", since = "1.27.0")]
|
||
pub unsafe fn _mm_fmaddsub_ps(a: __m128, b: __m128, c: __m128) -> __m128 {
|
||
vfmaddsubps(a, b, c)
|
||
}
|
||
|
||
/// Multiply packed single-precision (32-bit) floating-point elements in `a`
|
||
/// and `b`, and alternatively add and subtract packed elements in `c` to/from
|
||
/// the intermediate result.
|
||
///
|
||
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fmaddsub_ps)
|
||
#[inline]
|
||
#[target_feature(enable = "fma")]
|
||
#[cfg_attr(test, assert_instr(vfmaddsub213ps))]
|
||
#[stable(feature = "simd_x86", since = "1.27.0")]
|
||
pub unsafe fn _mm256_fmaddsub_ps(a: __m256, b: __m256, c: __m256) -> __m256 {
|
||
vfmaddsubps256(a, b, c)
|
||
}
|
||
|
||
/// Multiply packed double-precision (64-bit) floating-point elements in `a`
|
||
/// and `b`, and subtract packed elements in `c` from the intermediate result.
|
||
///
|
||
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmsub_pd)
|
||
#[inline]
|
||
#[target_feature(enable = "fma")]
|
||
#[cfg_attr(test, assert_instr(vfmsub213pd))]
|
||
#[stable(feature = "simd_x86", since = "1.27.0")]
|
||
pub unsafe fn _mm_fmsub_pd(a: __m128d, b: __m128d, c: __m128d) -> __m128d {
|
||
vfmsubpd(a, b, c)
|
||
}
|
||
|
||
/// Multiply packed double-precision (64-bit) floating-point elements in `a`
|
||
/// and `b`, and subtract packed elements in `c` from the intermediate result.
|
||
///
|
||
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fmsub_pd)
|
||
#[inline]
|
||
#[target_feature(enable = "fma")]
|
||
#[cfg_attr(test, assert_instr(vfmsub213pd))]
|
||
#[stable(feature = "simd_x86", since = "1.27.0")]
|
||
pub unsafe fn _mm256_fmsub_pd(a: __m256d, b: __m256d, c: __m256d) -> __m256d {
|
||
vfmsubpd256(a, b, c)
|
||
}
|
||
|
||
/// Multiply packed single-precision (32-bit) floating-point elements in `a`
|
||
/// and `b`, and subtract packed elements in `c` from the intermediate result.
|
||
///
|
||
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmsub_ps)
|
||
#[inline]
|
||
#[target_feature(enable = "fma")]
|
||
#[cfg_attr(test, assert_instr(vfmsub213ps))]
|
||
#[stable(feature = "simd_x86", since = "1.27.0")]
|
||
pub unsafe fn _mm_fmsub_ps(a: __m128, b: __m128, c: __m128) -> __m128 {
|
||
vfmsubps(a, b, c)
|
||
}
|
||
|
||
/// Multiply packed single-precision (32-bit) floating-point elements in `a`
|
||
/// and `b`, and subtract packed elements in `c` from the intermediate result.
|
||
///
|
||
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fmsub_ps)
|
||
#[inline]
|
||
#[target_feature(enable = "fma")]
|
||
#[cfg_attr(test, assert_instr(vfmsub213ps))]
|
||
#[stable(feature = "simd_x86", since = "1.27.0")]
|
||
pub unsafe fn _mm256_fmsub_ps(a: __m256, b: __m256, c: __m256) -> __m256 {
|
||
vfmsubps256(a, b, c)
|
||
}
|
||
|
||
/// Multiply the lower double-precision (64-bit) floating-point elements in
|
||
/// `a` and `b`, and subtract the lower element in `c` from the intermediate
|
||
/// result. Store the result in the lower element of the returned value, and
|
||
/// copy the upper element from `a` to the upper elements of the result.
|
||
///
|
||
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmsub_sd)
|
||
#[inline]
|
||
#[target_feature(enable = "fma")]
|
||
#[cfg_attr(test, assert_instr(vfmsub213sd))]
|
||
#[stable(feature = "simd_x86", since = "1.27.0")]
|
||
pub unsafe fn _mm_fmsub_sd(a: __m128d, b: __m128d, c: __m128d) -> __m128d {
|
||
vfmsubsd(a, b, c)
|
||
}
|
||
|
||
/// Multiply the lower single-precision (32-bit) floating-point elements in
|
||
/// `a` and `b`, and subtract the lower element in `c` from the intermediate
|
||
/// result. Store the result in the lower element of the returned value, and
|
||
/// copy the 3 upper elements from `a` to the upper elements of the result.
|
||
///
|
||
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmsub_ss)
|
||
#[inline]
|
||
#[target_feature(enable = "fma")]
|
||
#[cfg_attr(test, assert_instr(vfmsub213ss))]
|
||
#[stable(feature = "simd_x86", since = "1.27.0")]
|
||
pub unsafe fn _mm_fmsub_ss(a: __m128, b: __m128, c: __m128) -> __m128 {
|
||
vfmsubss(a, b, c)
|
||
}
|
||
|
||
/// Multiply packed double-precision (64-bit) floating-point elements in `a`
|
||
/// and `b`, and alternatively subtract and add packed elements in `c` from/to
|
||
/// the intermediate result.
|
||
///
|
||
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmsubadd_pd)
|
||
#[inline]
|
||
#[target_feature(enable = "fma")]
|
||
#[cfg_attr(test, assert_instr(vfmsubadd213pd))]
|
||
#[stable(feature = "simd_x86", since = "1.27.0")]
|
||
pub unsafe fn _mm_fmsubadd_pd(a: __m128d, b: __m128d, c: __m128d) -> __m128d {
|
||
vfmsubaddpd(a, b, c)
|
||
}
|
||
|
||
/// Multiply packed double-precision (64-bit) floating-point elements in `a`
|
||
/// and `b`, and alternatively subtract and add packed elements in `c` from/to
|
||
/// the intermediate result.
|
||
///
|
||
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fmsubadd_pd)
|
||
#[inline]
|
||
#[target_feature(enable = "fma")]
|
||
#[cfg_attr(test, assert_instr(vfmsubadd213pd))]
|
||
#[stable(feature = "simd_x86", since = "1.27.0")]
|
||
pub unsafe fn _mm256_fmsubadd_pd(
|
||
a: __m256d, b: __m256d, c: __m256d,
|
||
) -> __m256d {
|
||
vfmsubaddpd256(a, b, c)
|
||
}
|
||
|
||
/// Multiply packed single-precision (32-bit) floating-point elements in `a`
|
||
/// and `b`, and alternatively subtract and add packed elements in `c` from/to
|
||
/// the intermediate result.
|
||
///
|
||
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmsubadd_ps)
|
||
#[inline]
|
||
#[target_feature(enable = "fma")]
|
||
#[cfg_attr(test, assert_instr(vfmsubadd213ps))]
|
||
#[stable(feature = "simd_x86", since = "1.27.0")]
|
||
pub unsafe fn _mm_fmsubadd_ps(a: __m128, b: __m128, c: __m128) -> __m128 {
|
||
vfmsubaddps(a, b, c)
|
||
}
|
||
|
||
/// Multiply packed single-precision (32-bit) floating-point elements in `a`
|
||
/// and `b`, and alternatively subtract and add packed elements in `c` from/to
|
||
/// the intermediate result.
|
||
///
|
||
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fmsubadd_ps)
|
||
#[inline]
|
||
#[target_feature(enable = "fma")]
|
||
#[cfg_attr(test, assert_instr(vfmsubadd213ps))]
|
||
#[stable(feature = "simd_x86", since = "1.27.0")]
|
||
pub unsafe fn _mm256_fmsubadd_ps(a: __m256, b: __m256, c: __m256) -> __m256 {
|
||
vfmsubaddps256(a, b, c)
|
||
}
|
||
|
||
/// Multiply packed double-precision (64-bit) floating-point elements in `a`
|
||
/// and `b`, and add the negated intermediate result to packed elements in `c`.
|
||
///
|
||
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fnmadd_pd)
|
||
#[inline]
|
||
#[target_feature(enable = "fma")]
|
||
#[cfg_attr(test, assert_instr(vfnmadd213pd))]
|
||
#[stable(feature = "simd_x86", since = "1.27.0")]
|
||
pub unsafe fn _mm_fnmadd_pd(a: __m128d, b: __m128d, c: __m128d) -> __m128d {
|
||
vfnmaddpd(a, b, c)
|
||
}
|
||
|
||
/// Multiply packed double-precision (64-bit) floating-point elements in `a`
|
||
/// and `b`, and add the negated intermediate result to packed elements in `c`.
|
||
///
|
||
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fnmadd_pd)
|
||
#[inline]
|
||
#[target_feature(enable = "fma")]
|
||
#[cfg_attr(test, assert_instr(vfnmadd213pd))]
|
||
#[stable(feature = "simd_x86", since = "1.27.0")]
|
||
pub unsafe fn _mm256_fnmadd_pd(a: __m256d, b: __m256d, c: __m256d) -> __m256d {
|
||
vfnmaddpd256(a, b, c)
|
||
}
|
||
|
||
/// Multiply packed single-precision (32-bit) floating-point elements in `a`
|
||
/// and `b`, and add the negated intermediate result to packed elements in `c`.
|
||
///
|
||
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fnmadd_ps)
|
||
#[inline]
|
||
#[target_feature(enable = "fma")]
|
||
#[cfg_attr(test, assert_instr(vfnmadd213ps))]
|
||
#[stable(feature = "simd_x86", since = "1.27.0")]
|
||
pub unsafe fn _mm_fnmadd_ps(a: __m128, b: __m128, c: __m128) -> __m128 {
|
||
vfnmaddps(a, b, c)
|
||
}
|
||
|
||
/// Multiply packed single-precision (32-bit) floating-point elements in `a`
|
||
/// and `b`, and add the negated intermediate result to packed elements in `c`.
|
||
///
|
||
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fnmadd_ps)
|
||
#[inline]
|
||
#[target_feature(enable = "fma")]
|
||
#[cfg_attr(test, assert_instr(vfnmadd213ps))]
|
||
#[stable(feature = "simd_x86", since = "1.27.0")]
|
||
pub unsafe fn _mm256_fnmadd_ps(a: __m256, b: __m256, c: __m256) -> __m256 {
|
||
vfnmaddps256(a, b, c)
|
||
}
|
||
|
||
/// Multiply the lower double-precision (64-bit) floating-point elements in
|
||
/// `a` and `b`, and add the negated intermediate result to the lower element
|
||
/// in `c`. Store the result in the lower element of the returned value, and
|
||
/// copy the upper element from `a` to the upper elements of the result.
|
||
///
|
||
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fnmadd_sd)
|
||
#[inline]
|
||
#[target_feature(enable = "fma")]
|
||
#[cfg_attr(test, assert_instr(vfnmadd213sd))]
|
||
#[stable(feature = "simd_x86", since = "1.27.0")]
|
||
pub unsafe fn _mm_fnmadd_sd(a: __m128d, b: __m128d, c: __m128d) -> __m128d {
|
||
vfnmaddsd(a, b, c)
|
||
}
|
||
|
||
/// Multiply the lower single-precision (32-bit) floating-point elements in
|
||
/// `a` and `b`, and add the negated intermediate result to the lower element
|
||
/// in `c`. Store the result in the lower element of the returned value, and
|
||
/// copy the 3 upper elements from `a` to the upper elements of the result.
|
||
///
|
||
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fnmadd_ss)
|
||
#[inline]
|
||
#[target_feature(enable = "fma")]
|
||
#[cfg_attr(test, assert_instr(vfnmadd213ss))]
|
||
#[stable(feature = "simd_x86", since = "1.27.0")]
|
||
pub unsafe fn _mm_fnmadd_ss(a: __m128, b: __m128, c: __m128) -> __m128 {
|
||
vfnmaddss(a, b, c)
|
||
}
|
||
|
||
/// Multiply packed double-precision (64-bit) floating-point elements in `a`
|
||
/// and `b`, and subtract packed elements in `c` from the negated intermediate
|
||
/// result.
|
||
///
|
||
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fnmsub_pd)
|
||
#[inline]
|
||
#[target_feature(enable = "fma")]
|
||
#[cfg_attr(test, assert_instr(vfnmsub213pd))]
|
||
#[stable(feature = "simd_x86", since = "1.27.0")]
|
||
pub unsafe fn _mm_fnmsub_pd(a: __m128d, b: __m128d, c: __m128d) -> __m128d {
|
||
vfnmsubpd(a, b, c)
|
||
}
|
||
|
||
/// Multiply packed double-precision (64-bit) floating-point elements in `a`
|
||
/// and `b`, and subtract packed elements in `c` from the negated intermediate
|
||
/// result.
|
||
///
|
||
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fnmsub_pd)
|
||
#[inline]
|
||
#[target_feature(enable = "fma")]
|
||
#[cfg_attr(test, assert_instr(vfnmsub213pd))]
|
||
#[stable(feature = "simd_x86", since = "1.27.0")]
|
||
pub unsafe fn _mm256_fnmsub_pd(a: __m256d, b: __m256d, c: __m256d) -> __m256d {
|
||
vfnmsubpd256(a, b, c)
|
||
}
|
||
|
||
/// Multiply packed single-precision (32-bit) floating-point elements in `a`
|
||
/// and `b`, and subtract packed elements in `c` from the negated intermediate
|
||
/// result.
|
||
///
|
||
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fnmsub_ps)
|
||
#[inline]
|
||
#[target_feature(enable = "fma")]
|
||
#[cfg_attr(test, assert_instr(vfnmsub213ps))]
|
||
#[stable(feature = "simd_x86", since = "1.27.0")]
|
||
pub unsafe fn _mm_fnmsub_ps(a: __m128, b: __m128, c: __m128) -> __m128 {
|
||
vfnmsubps(a, b, c)
|
||
}
|
||
|
||
/// Multiply packed single-precision (32-bit) floating-point elements in `a`
|
||
/// and `b`, and subtract packed elements in `c` from the negated intermediate
|
||
/// result.
|
||
///
|
||
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fnmsub_ps)
|
||
#[inline]
|
||
#[target_feature(enable = "fma")]
|
||
#[cfg_attr(test, assert_instr(vfnmsub213ps))]
|
||
#[stable(feature = "simd_x86", since = "1.27.0")]
|
||
pub unsafe fn _mm256_fnmsub_ps(a: __m256, b: __m256, c: __m256) -> __m256 {
|
||
vfnmsubps256(a, b, c)
|
||
}
|
||
|
||
/// Multiply the lower double-precision (64-bit) floating-point elements in
|
||
/// `a` and `b`, and subtract packed elements in `c` from the negated
|
||
/// intermediate result. Store the result in the lower element of the returned
|
||
/// value, and copy the upper element from `a` to the upper elements of the
|
||
/// result.
|
||
///
|
||
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fnmsub_sd)
|
||
#[inline]
|
||
#[target_feature(enable = "fma")]
|
||
#[cfg_attr(test, assert_instr(vfnmsub213sd))]
|
||
#[stable(feature = "simd_x86", since = "1.27.0")]
|
||
pub unsafe fn _mm_fnmsub_sd(a: __m128d, b: __m128d, c: __m128d) -> __m128d {
|
||
vfnmsubsd(a, b, c)
|
||
}
|
||
|
||
/// Multiply the lower single-precision (32-bit) floating-point elements in
|
||
/// `a` and `b`, and subtract packed elements in `c` from the negated
|
||
/// intermediate result. Store the result in the lower element of the
|
||
/// returned value, and copy the 3 upper elements from `a` to the upper
|
||
/// elements of the result.
|
||
///
|
||
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fnmsub_ss)
|
||
#[inline]
|
||
#[target_feature(enable = "fma")]
|
||
#[cfg_attr(test, assert_instr(vfnmsub213ss))]
|
||
#[stable(feature = "simd_x86", since = "1.27.0")]
|
||
pub unsafe fn _mm_fnmsub_ss(a: __m128, b: __m128, c: __m128) -> __m128 {
|
||
vfnmsubss(a, b, c)
|
||
}
|
||
|
||
#[allow(improper_ctypes)]
|
||
extern "C" {
|
||
#[link_name = "llvm.x86.fma.vfmadd.pd"]
|
||
fn vfmaddpd(a: __m128d, b: __m128d, c: __m128d) -> __m128d;
|
||
#[link_name = "llvm.x86.fma.vfmadd.pd.256"]
|
||
fn vfmaddpd256(a: __m256d, b: __m256d, c: __m256d) -> __m256d;
|
||
#[link_name = "llvm.x86.fma.vfmadd.ps"]
|
||
fn vfmaddps(a: __m128, b: __m128, c: __m128) -> __m128;
|
||
#[link_name = "llvm.x86.fma.vfmadd.ps.256"]
|
||
fn vfmaddps256(a: __m256, b: __m256, c: __m256) -> __m256;
|
||
#[link_name = "llvm.x86.fma.vfmadd.sd"]
|
||
fn vfmaddsd(a: __m128d, b: __m128d, c: __m128d) -> __m128d;
|
||
#[link_name = "llvm.x86.fma.vfmadd.ss"]
|
||
fn vfmaddss(a: __m128, b: __m128, c: __m128) -> __m128;
|
||
#[link_name = "llvm.x86.fma.vfmaddsub.pd"]
|
||
fn vfmaddsubpd(a: __m128d, b: __m128d, c: __m128d) -> __m128d;
|
||
#[link_name = "llvm.x86.fma.vfmaddsub.pd.256"]
|
||
fn vfmaddsubpd256(a: __m256d, b: __m256d, c: __m256d) -> __m256d;
|
||
#[link_name = "llvm.x86.fma.vfmaddsub.ps"]
|
||
fn vfmaddsubps(a: __m128, b: __m128, c: __m128) -> __m128;
|
||
#[link_name = "llvm.x86.fma.vfmaddsub.ps.256"]
|
||
fn vfmaddsubps256(a: __m256, b: __m256, c: __m256) -> __m256;
|
||
#[link_name = "llvm.x86.fma.vfmsub.pd"]
|
||
fn vfmsubpd(a: __m128d, b: __m128d, c: __m128d) -> __m128d;
|
||
#[link_name = "llvm.x86.fma.vfmsub.pd.256"]
|
||
fn vfmsubpd256(a: __m256d, b: __m256d, c: __m256d) -> __m256d;
|
||
#[link_name = "llvm.x86.fma.vfmsub.ps"]
|
||
fn vfmsubps(a: __m128, b: __m128, c: __m128) -> __m128;
|
||
#[link_name = "llvm.x86.fma.vfmsub.ps.256"]
|
||
fn vfmsubps256(a: __m256, b: __m256, c: __m256) -> __m256;
|
||
#[link_name = "llvm.x86.fma.vfmsub.sd"]
|
||
fn vfmsubsd(a: __m128d, b: __m128d, c: __m128d) -> __m128d;
|
||
#[link_name = "llvm.x86.fma.vfmsub.ss"]
|
||
fn vfmsubss(a: __m128, b: __m128, c: __m128) -> __m128;
|
||
#[link_name = "llvm.x86.fma.vfmsubadd.pd"]
|
||
fn vfmsubaddpd(a: __m128d, b: __m128d, c: __m128d) -> __m128d;
|
||
#[link_name = "llvm.x86.fma.vfmsubadd.pd.256"]
|
||
fn vfmsubaddpd256(a: __m256d, b: __m256d, c: __m256d) -> __m256d;
|
||
#[link_name = "llvm.x86.fma.vfmsubadd.ps"]
|
||
fn vfmsubaddps(a: __m128, b: __m128, c: __m128) -> __m128;
|
||
#[link_name = "llvm.x86.fma.vfmsubadd.ps.256"]
|
||
fn vfmsubaddps256(a: __m256, b: __m256, c: __m256) -> __m256;
|
||
#[link_name = "llvm.x86.fma.vfnmadd.pd"]
|
||
fn vfnmaddpd(a: __m128d, b: __m128d, c: __m128d) -> __m128d;
|
||
#[link_name = "llvm.x86.fma.vfnmadd.pd.256"]
|
||
fn vfnmaddpd256(a: __m256d, b: __m256d, c: __m256d) -> __m256d;
|
||
#[link_name = "llvm.x86.fma.vfnmadd.ps"]
|
||
fn vfnmaddps(a: __m128, b: __m128, c: __m128) -> __m128;
|
||
#[link_name = "llvm.x86.fma.vfnmadd.ps.256"]
|
||
fn vfnmaddps256(a: __m256, b: __m256, c: __m256) -> __m256;
|
||
#[link_name = "llvm.x86.fma.vfnmadd.sd"]
|
||
fn vfnmaddsd(a: __m128d, b: __m128d, c: __m128d) -> __m128d;
|
||
#[link_name = "llvm.x86.fma.vfnmadd.ss"]
|
||
fn vfnmaddss(a: __m128, b: __m128, c: __m128) -> __m128;
|
||
#[link_name = "llvm.x86.fma.vfnmsub.pd"]
|
||
fn vfnmsubpd(a: __m128d, b: __m128d, c: __m128d) -> __m128d;
|
||
#[link_name = "llvm.x86.fma.vfnmsub.pd.256"]
|
||
fn vfnmsubpd256(a: __m256d, b: __m256d, c: __m256d) -> __m256d;
|
||
#[link_name = "llvm.x86.fma.vfnmsub.ps"]
|
||
fn vfnmsubps(a: __m128, b: __m128, c: __m128) -> __m128;
|
||
#[link_name = "llvm.x86.fma.vfnmsub.ps.256"]
|
||
fn vfnmsubps256(a: __m256, b: __m256, c: __m256) -> __m256;
|
||
#[link_name = "llvm.x86.fma.vfnmsub.sd"]
|
||
fn vfnmsubsd(a: __m128d, b: __m128d, c: __m128d) -> __m128d;
|
||
#[link_name = "llvm.x86.fma.vfnmsub.ss"]
|
||
fn vfnmsubss(a: __m128, b: __m128, c: __m128) -> __m128;
|
||
}
|
||
|
||
// Unit tests for the FMA intrinsics. Every test feeds small integer-valued
// inputs through one intrinsic and compares the result against a vector
// worked out by hand; the formula behind each expected value is noted
// beside it.
#[cfg(test)]
mod tests {
    use std;
    use stdsimd_test::simd_test;

    use coresimd::x86::*;

    #[simd_test(enable = "fma")]
    unsafe fn test_mm_fmadd_pd() {
        let a = _mm_setr_pd(1., 2.);
        let b = _mm_setr_pd(5., 3.);
        let c = _mm_setr_pd(4., 9.);
        // a * b + c in every lane.
        let expected = _mm_setr_pd(9., 15.);
        let actual = _mm_fmadd_pd(a, b, c);
        assert_eq_m128d(actual, expected);
    }

    #[simd_test(enable = "fma")]
    unsafe fn test_mm256_fmadd_pd() {
        let a = _mm256_setr_pd(1., 2., 3., 4.);
        let b = _mm256_setr_pd(5., 3., 7., 2.);
        let c = _mm256_setr_pd(4., 9., 1., 7.);
        // a * b + c in every lane.
        let expected = _mm256_setr_pd(9., 15., 22., 15.);
        let actual = _mm256_fmadd_pd(a, b, c);
        assert_eq_m256d(actual, expected);
    }

    #[simd_test(enable = "fma")]
    unsafe fn test_mm_fmadd_ps() {
        let a = _mm_setr_ps(1., 2., 3., 4.);
        let b = _mm_setr_ps(5., 3., 7., 2.);
        let c = _mm_setr_ps(4., 9., 1., 7.);
        // a * b + c in every lane.
        let expected = _mm_setr_ps(9., 15., 22., 15.);
        let actual = _mm_fmadd_ps(a, b, c);
        assert_eq_m128(actual, expected);
    }

    #[simd_test(enable = "fma")]
    unsafe fn test_mm256_fmadd_ps() {
        let a = _mm256_setr_ps(1., 2., 3., 4., 0., 10., -1., -2.);
        let b = _mm256_setr_ps(5., 3., 7., 2., 4., -6., 0., 14.);
        let c = _mm256_setr_ps(4., 9., 1., 7., -5., 11., -2., -3.);
        // a * b + c in every lane.
        let expected = _mm256_setr_ps(9., 15., 22., 15., -5., -49., -2., -31.);
        let actual = _mm256_fmadd_ps(a, b, c);
        assert_eq_m256(actual, expected);
    }

    #[simd_test(enable = "fma")]
    unsafe fn test_mm_fmadd_sd() {
        let a = _mm_setr_pd(1., 2.);
        let b = _mm_setr_pd(5., 3.);
        let c = _mm_setr_pd(4., 9.);
        // Lane 0: a * b + c; upper lane copied from `a`.
        let expected = _mm_setr_pd(9., 2.);
        let actual = _mm_fmadd_sd(a, b, c);
        assert_eq_m128d(actual, expected);
    }

    #[simd_test(enable = "fma")]
    unsafe fn test_mm_fmadd_ss() {
        let a = _mm_setr_ps(1., 2., 3., 4.);
        let b = _mm_setr_ps(5., 3., 7., 2.);
        let c = _mm_setr_ps(4., 9., 1., 7.);
        // Lane 0: a * b + c; upper lanes copied from `a`.
        let expected = _mm_setr_ps(9., 2., 3., 4.);
        let actual = _mm_fmadd_ss(a, b, c);
        assert_eq_m128(actual, expected);
    }

    #[simd_test(enable = "fma")]
    unsafe fn test_mm_fmaddsub_pd() {
        let a = _mm_setr_pd(1., 2.);
        let b = _mm_setr_pd(5., 3.);
        let c = _mm_setr_pd(4., 9.);
        // Even lanes: a * b - c; odd lanes: a * b + c.
        let expected = _mm_setr_pd(1., 15.);
        let actual = _mm_fmaddsub_pd(a, b, c);
        assert_eq_m128d(actual, expected);
    }

    #[simd_test(enable = "fma")]
    unsafe fn test_mm256_fmaddsub_pd() {
        let a = _mm256_setr_pd(1., 2., 3., 4.);
        let b = _mm256_setr_pd(5., 3., 7., 2.);
        let c = _mm256_setr_pd(4., 9., 1., 7.);
        // Even lanes: a * b - c; odd lanes: a * b + c.
        let expected = _mm256_setr_pd(1., 15., 20., 15.);
        let actual = _mm256_fmaddsub_pd(a, b, c);
        assert_eq_m256d(actual, expected);
    }

    #[simd_test(enable = "fma")]
    unsafe fn test_mm_fmaddsub_ps() {
        let a = _mm_setr_ps(1., 2., 3., 4.);
        let b = _mm_setr_ps(5., 3., 7., 2.);
        let c = _mm_setr_ps(4., 9., 1., 7.);
        // Even lanes: a * b - c; odd lanes: a * b + c.
        let expected = _mm_setr_ps(1., 15., 20., 15.);
        let actual = _mm_fmaddsub_ps(a, b, c);
        assert_eq_m128(actual, expected);
    }

    #[simd_test(enable = "fma")]
    unsafe fn test_mm256_fmaddsub_ps() {
        let a = _mm256_setr_ps(1., 2., 3., 4., 0., 10., -1., -2.);
        let b = _mm256_setr_ps(5., 3., 7., 2., 4., -6., 0., 14.);
        let c = _mm256_setr_ps(4., 9., 1., 7., -5., 11., -2., -3.);
        // Even lanes: a * b - c; odd lanes: a * b + c.
        let expected = _mm256_setr_ps(1., 15., 20., 15., 5., -49., 2., -31.);
        let actual = _mm256_fmaddsub_ps(a, b, c);
        assert_eq_m256(actual, expected);
    }

    #[simd_test(enable = "fma")]
    unsafe fn test_mm_fmsub_pd() {
        let a = _mm_setr_pd(1., 2.);
        let b = _mm_setr_pd(5., 3.);
        let c = _mm_setr_pd(4., 9.);
        // a * b - c in every lane.
        let expected = _mm_setr_pd(1., -3.);
        let actual = _mm_fmsub_pd(a, b, c);
        assert_eq_m128d(actual, expected);
    }

    #[simd_test(enable = "fma")]
    unsafe fn test_mm256_fmsub_pd() {
        let a = _mm256_setr_pd(1., 2., 3., 4.);
        let b = _mm256_setr_pd(5., 3., 7., 2.);
        let c = _mm256_setr_pd(4., 9., 1., 7.);
        // a * b - c in every lane.
        let expected = _mm256_setr_pd(1., -3., 20., 1.);
        let actual = _mm256_fmsub_pd(a, b, c);
        assert_eq_m256d(actual, expected);
    }

    #[simd_test(enable = "fma")]
    unsafe fn test_mm_fmsub_ps() {
        let a = _mm_setr_ps(1., 2., 3., 4.);
        let b = _mm_setr_ps(5., 3., 7., 2.);
        let c = _mm_setr_ps(4., 9., 1., 7.);
        // a * b - c in every lane.
        let expected = _mm_setr_ps(1., -3., 20., 1.);
        let actual = _mm_fmsub_ps(a, b, c);
        assert_eq_m128(actual, expected);
    }

    #[simd_test(enable = "fma")]
    unsafe fn test_mm256_fmsub_ps() {
        let a = _mm256_setr_ps(1., 2., 3., 4., 0., 10., -1., -2.);
        let b = _mm256_setr_ps(5., 3., 7., 2., 4., -6., 0., 14.);
        let c = _mm256_setr_ps(4., 9., 1., 7., -5., 11., -2., -3.);
        // a * b - c in every lane.
        let expected = _mm256_setr_ps(1., -3., 20., 1., 5., -71., 2., -25.);
        let actual = _mm256_fmsub_ps(a, b, c);
        assert_eq_m256(actual, expected);
    }

    #[simd_test(enable = "fma")]
    unsafe fn test_mm_fmsub_sd() {
        let a = _mm_setr_pd(1., 2.);
        let b = _mm_setr_pd(5., 3.);
        let c = _mm_setr_pd(4., 9.);
        // Lane 0: a * b - c; upper lane copied from `a`.
        let expected = _mm_setr_pd(1., 2.);
        let actual = _mm_fmsub_sd(a, b, c);
        assert_eq_m128d(actual, expected);
    }

    #[simd_test(enable = "fma")]
    unsafe fn test_mm_fmsub_ss() {
        let a = _mm_setr_ps(1., 2., 3., 4.);
        let b = _mm_setr_ps(5., 3., 7., 2.);
        let c = _mm_setr_ps(4., 9., 1., 7.);
        // Lane 0: a * b - c; upper lanes copied from `a`.
        let expected = _mm_setr_ps(1., 2., 3., 4.);
        let actual = _mm_fmsub_ss(a, b, c);
        assert_eq_m128(actual, expected);
    }

    #[simd_test(enable = "fma")]
    unsafe fn test_mm_fmsubadd_pd() {
        let a = _mm_setr_pd(1., 2.);
        let b = _mm_setr_pd(5., 3.);
        let c = _mm_setr_pd(4., 9.);
        // Even lanes: a * b + c; odd lanes: a * b - c.
        let expected = _mm_setr_pd(9., -3.);
        let actual = _mm_fmsubadd_pd(a, b, c);
        assert_eq_m128d(actual, expected);
    }

    #[simd_test(enable = "fma")]
    unsafe fn test_mm256_fmsubadd_pd() {
        let a = _mm256_setr_pd(1., 2., 3., 4.);
        let b = _mm256_setr_pd(5., 3., 7., 2.);
        let c = _mm256_setr_pd(4., 9., 1., 7.);
        // Even lanes: a * b + c; odd lanes: a * b - c.
        let expected = _mm256_setr_pd(9., -3., 22., 1.);
        let actual = _mm256_fmsubadd_pd(a, b, c);
        assert_eq_m256d(actual, expected);
    }

    #[simd_test(enable = "fma")]
    unsafe fn test_mm_fmsubadd_ps() {
        let a = _mm_setr_ps(1., 2., 3., 4.);
        let b = _mm_setr_ps(5., 3., 7., 2.);
        let c = _mm_setr_ps(4., 9., 1., 7.);
        // Even lanes: a * b + c; odd lanes: a * b - c.
        let expected = _mm_setr_ps(9., -3., 22., 1.);
        let actual = _mm_fmsubadd_ps(a, b, c);
        assert_eq_m128(actual, expected);
    }

    #[simd_test(enable = "fma")]
    unsafe fn test_mm256_fmsubadd_ps() {
        let a = _mm256_setr_ps(1., 2., 3., 4., 0., 10., -1., -2.);
        let b = _mm256_setr_ps(5., 3., 7., 2., 4., -6., 0., 14.);
        let c = _mm256_setr_ps(4., 9., 1., 7., -5., 11., -2., -3.);
        // Even lanes: a * b + c; odd lanes: a * b - c.
        let expected = _mm256_setr_ps(9., -3., 22., 1., -5., -71., -2., -25.);
        let actual = _mm256_fmsubadd_ps(a, b, c);
        assert_eq_m256(actual, expected);
    }

    #[simd_test(enable = "fma")]
    unsafe fn test_mm_fnmadd_pd() {
        let a = _mm_setr_pd(1., 2.);
        let b = _mm_setr_pd(5., 3.);
        let c = _mm_setr_pd(4., 9.);
        // -(a * b) + c in every lane.
        let expected = _mm_setr_pd(-1., 3.);
        let actual = _mm_fnmadd_pd(a, b, c);
        assert_eq_m128d(actual, expected);
    }

    #[simd_test(enable = "fma")]
    unsafe fn test_mm256_fnmadd_pd() {
        let a = _mm256_setr_pd(1., 2., 3., 4.);
        let b = _mm256_setr_pd(5., 3., 7., 2.);
        let c = _mm256_setr_pd(4., 9., 1., 7.);
        // -(a * b) + c in every lane.
        let expected = _mm256_setr_pd(-1., 3., -20., -1.);
        let actual = _mm256_fnmadd_pd(a, b, c);
        assert_eq_m256d(actual, expected);
    }

    #[simd_test(enable = "fma")]
    unsafe fn test_mm_fnmadd_ps() {
        let a = _mm_setr_ps(1., 2., 3., 4.);
        let b = _mm_setr_ps(5., 3., 7., 2.);
        let c = _mm_setr_ps(4., 9., 1., 7.);
        // -(a * b) + c in every lane.
        let expected = _mm_setr_ps(-1., 3., -20., -1.);
        let actual = _mm_fnmadd_ps(a, b, c);
        assert_eq_m128(actual, expected);
    }

    #[simd_test(enable = "fma")]
    unsafe fn test_mm256_fnmadd_ps() {
        let a = _mm256_setr_ps(1., 2., 3., 4., 0., 10., -1., -2.);
        let b = _mm256_setr_ps(5., 3., 7., 2., 4., -6., 0., 14.);
        let c = _mm256_setr_ps(4., 9., 1., 7., -5., 11., -2., -3.);
        // -(a * b) + c in every lane.
        let expected = _mm256_setr_ps(-1., 3., -20., -1., -5., 71., -2., 25.);
        let actual = _mm256_fnmadd_ps(a, b, c);
        assert_eq_m256(actual, expected);
    }

    #[simd_test(enable = "fma")]
    unsafe fn test_mm_fnmadd_sd() {
        let a = _mm_setr_pd(1., 2.);
        let b = _mm_setr_pd(5., 3.);
        let c = _mm_setr_pd(4., 9.);
        // Lane 0: -(a * b) + c; upper lane copied from `a`.
        let expected = _mm_setr_pd(-1., 2.);
        let actual = _mm_fnmadd_sd(a, b, c);
        assert_eq_m128d(actual, expected);
    }

    #[simd_test(enable = "fma")]
    unsafe fn test_mm_fnmadd_ss() {
        let a = _mm_setr_ps(1., 2., 3., 4.);
        let b = _mm_setr_ps(5., 3., 7., 2.);
        let c = _mm_setr_ps(4., 9., 1., 7.);
        // Lane 0: -(a * b) + c; upper lanes copied from `a`.
        let expected = _mm_setr_ps(-1., 2., 3., 4.);
        let actual = _mm_fnmadd_ss(a, b, c);
        assert_eq_m128(actual, expected);
    }

    #[simd_test(enable = "fma")]
    unsafe fn test_mm_fnmsub_pd() {
        let a = _mm_setr_pd(1., 2.);
        let b = _mm_setr_pd(5., 3.);
        let c = _mm_setr_pd(4., 9.);
        // -(a * b) - c in every lane.
        let expected = _mm_setr_pd(-9., -15.);
        let actual = _mm_fnmsub_pd(a, b, c);
        assert_eq_m128d(actual, expected);
    }

    #[simd_test(enable = "fma")]
    unsafe fn test_mm256_fnmsub_pd() {
        let a = _mm256_setr_pd(1., 2., 3., 4.);
        let b = _mm256_setr_pd(5., 3., 7., 2.);
        let c = _mm256_setr_pd(4., 9., 1., 7.);
        // -(a * b) - c in every lane.
        let expected = _mm256_setr_pd(-9., -15., -22., -15.);
        let actual = _mm256_fnmsub_pd(a, b, c);
        assert_eq_m256d(actual, expected);
    }

    #[simd_test(enable = "fma")]
    unsafe fn test_mm_fnmsub_ps() {
        let a = _mm_setr_ps(1., 2., 3., 4.);
        let b = _mm_setr_ps(5., 3., 7., 2.);
        let c = _mm_setr_ps(4., 9., 1., 7.);
        // -(a * b) - c in every lane.
        let expected = _mm_setr_ps(-9., -15., -22., -15.);
        let actual = _mm_fnmsub_ps(a, b, c);
        assert_eq_m128(actual, expected);
    }

    #[simd_test(enable = "fma")]
    unsafe fn test_mm256_fnmsub_ps() {
        let a = _mm256_setr_ps(1., 2., 3., 4., 0., 10., -1., -2.);
        let b = _mm256_setr_ps(5., 3., 7., 2., 4., -6., 0., 14.);
        let c = _mm256_setr_ps(4., 9., 1., 7., -5., 11., -2., -3.);
        // -(a * b) - c in every lane.
        let expected = _mm256_setr_ps(-9., -15., -22., -15., 5., 49., 2., 31.);
        let actual = _mm256_fnmsub_ps(a, b, c);
        assert_eq_m256(actual, expected);
    }

    #[simd_test(enable = "fma")]
    unsafe fn test_mm_fnmsub_sd() {
        let a = _mm_setr_pd(1., 2.);
        let b = _mm_setr_pd(5., 3.);
        let c = _mm_setr_pd(4., 9.);
        // Lane 0: -(a * b) - c; upper lane copied from `a`.
        let expected = _mm_setr_pd(-9., 2.);
        let actual = _mm_fnmsub_sd(a, b, c);
        assert_eq_m128d(actual, expected);
    }

    #[simd_test(enable = "fma")]
    unsafe fn test_mm_fnmsub_ss() {
        let a = _mm_setr_ps(1., 2., 3., 4.);
        let b = _mm_setr_ps(5., 3., 7., 2.);
        let c = _mm_setr_ps(4., 9., 1., 7.);
        // Lane 0: -(a * b) - c; upper lanes copied from `a`.
        let expected = _mm_setr_ps(-9., 2., 3., 4.);
        let actual = _mm_fnmsub_ss(a, b, c);
        assert_eq_m128(actual, expected);
    }
}
|