mirror of
https://github.com/rust-lang/rust.git
synced 2025-10-18 02:41:39 +00:00
1290 lines
40 KiB
Python
1290 lines
40 KiB
Python
// ARM Neon intrinsic specification.
|
|
//
|
|
// This file contains the specification for a number of
|
|
// intrinsics that allows us to generate them along with
|
|
// their test cases.
|
|
//
|
|
// To the syntax of the file - it's not very intelligently parsed!
|
|
//
|
|
// # Comments
|
|
// start with AT LEAST two, or four or more slashes so // is a
|
|
// comment /////// is too.
|
|
//
|
|
// # Sections
|
|
// Sections start with EXACTLY three slashes followed
|
|
// by AT LEAST one space. Sections are used for two things:
|
|
//
|
|
// 1) they serve as the doc comment for the given intrinsics.
|
|
// 2) they reset all variables (name, fn, etc.)
|
|
//
|
|
// # Variables
|
|
//
|
|
// name - The prefix of the function, suffixes are auto
|
|
// generated by the type they get passed.
|
|
//
|
|
// fn - The function to call in rust-land.
|
|
//
|
|
// aarch64 - The intrinsic to check on aarch64 architecture.
|
|
// If this is given but no arm intrinsic is provided,
|
|
// the function will exclusively be generated for
|
|
// aarch64.
|
|
// This is used to generate both aarch64 specific and
|
|
// shared intrinsics by first only specifying the aarch64
|
|
// variant then the arm variant.
|
|
//
|
|
// arm - The arm v7 intrinsics used to check for arm code
|
|
// generation. All neon functions available in arm are
|
|
// also available in aarch64. If no aarch64 intrinsic was
|
|
// set they are assumed to be the same.
|
|
// Intrinsics ending with a `.` will have a size suffix
|
|
// added (such as `i8` or `i64`) that is not sign specific
|
|
// Intrinsics ending with a `.s` will have a size suffix
|
|
// added (such as `s8` or `u64`) that is sign specific
|
|
//
|
|
// a - First input for tests, it gets scaled to the size of
|
|
// the type.
|
|
//
|
|
// b - Second input for tests, it gets scaled to the size of
|
|
// the type.
|
|
//
|
|
// # special values
|
|
//
|
|
// TRUE - 'true' all bits are set to 1
|
|
// FALSE - 'false' all bits are set to 0
|
|
// FF - same as 'true'
|
|
// MIN - minimal value (either 0 or the lowest negative number)
|
|
// MAX - maximal value prior to overflow
|
|
//
|
|
// # validate <values>
|
|
// Validates a and b against the expected result of the test.
|
|
// The special values 'TRUE' and 'FALSE' can be used to
|
|
// represent the correct NEON representation of true or
|
|
// false values. It too gets scaled to the type.
|
|
//
|
|
// Validate needs to be called before generate as it sets
|
|
// up the rules for validation that get generated for each
|
|
// type.
|
|
// # generate <types>
|
|
// The generate command generates the intrinsics, it uses the
|
|
// Variables set and can be called multiple times while overwriting
|
|
// some of the variables.
|
|
|
|
/// Vector bitwise and
|
|
name = vand
|
|
fn = simd_and
|
|
arm = vand
|
|
aarch64 = and
|
|
a = 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x00
|
|
b = 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F
|
|
validate 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x00
|
|
b = 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
|
|
validate 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
|
|
generate int*_t, uint*_t, int64x*_t, uint64x*_t
|
|
|
|
/// Vector bitwise or (immediate, inclusive)
|
|
name = vorr
|
|
fn = simd_or
|
|
arm = vorr
|
|
aarch64 = orr
|
|
a = 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F
|
|
b = 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
|
|
validate 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F
|
|
generate int*_t, uint*_t, int64x*_t, uint64x*_t
|
|
|
|
|
|
/// Vector bitwise exclusive or (vector)
|
|
name = veor
|
|
fn = simd_xor
|
|
arm = veor
|
|
aarch64 = eor
|
|
a = 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F
|
|
b = 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
|
|
validate 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F
|
|
generate int*_t, uint*_t, int64x*_t, uint64x*_t
|
|
|
|
////////////////////
|
|
// Absolute difference between the arguments
|
|
////////////////////
|
|
|
|
/// Absolute difference between the arguments
|
|
name = vabd
|
|
a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
|
|
b = 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1
|
|
validate 15, 13, 11, 9, 7, 5, 3, 1, 1, 3, 5, 7, 9, 11, 13, 15
|
|
|
|
arm = vabd.s
|
|
aarch64 = sabd
|
|
link-arm = vabds._EXT_
|
|
link-aarch64 = sabd._EXT_
|
|
generate int*_t
|
|
|
|
arm = vabd.s
|
|
aarch64 = uabd
|
|
link-arm = vabdu._EXT_
|
|
link-aarch64 = uabd._EXT_
|
|
generate uint*_t
|
|
|
|
/// Floating-point absolute difference between the arguments
|
|
name = vabd
|
|
a = 1.0, 2.0, 5.0, -4.0
|
|
b = 9.0, 3.0, 2.0, 8.0
|
|
validate 8.0, 1.0, 3.0, 12.0
|
|
|
|
aarch64 = fabd
|
|
link-aarch64 = fabd._EXT_
|
|
generate float64x*_t
|
|
|
|
arm = vabd.s
|
|
aarch64 = fabd
|
|
link-arm = vabds._EXT_
|
|
link-aarch64 = fabd._EXT_
|
|
generate float*_t
|
|
|
|
////////////////////
|
|
// equality
|
|
////////////////////
|
|
|
|
/// Compare bitwise Equal (vector)
|
|
name = vceq
|
|
fn = simd_eq
|
|
a = MIN, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, MAX
|
|
b = MIN, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, MAX
|
|
validate TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE
|
|
a = MIN, MIN, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0xCC, 0x0D, 0xEE, MAX
|
|
b = MIN, MAX, 0x02, 0x04, 0x04, 0x00, 0x06, 0x08, 0x08, 0x00, 0x0A, 0x0A, 0xCC, 0xD0, 0xEE, MIN
|
|
validate TRUE, FALSE, TRUE, FALSE, TRUE, FALSE, TRUE, FALSE, TRUE, FALSE, TRUE, FALSE, TRUE, FALSE, TRUE, FALSE
|
|
|
|
aarch64 = cmeq
|
|
generate uint64x*_t, int64x1_t:uint64x1_t, int64x2_t:uint64x2_t, poly64x1_t:uint64x1_t, poly64x2_t:uint64x2_t
|
|
|
|
arm = vceq.
|
|
generate uint*_t, int8x8_t:uint8x8_t, int8x16_t:uint8x16_t, int16x4_t:uint16x4_t, int16x8_t:uint16x8_t, int32x2_t:uint32x2_t, int32x4_t:uint32x4_t, poly8x8_t:uint8x8_t, poly8x16_t:uint8x16_t
|
|
|
|
/// Floating-point compare equal
|
|
name = vceq
|
|
fn = simd_eq
|
|
a = 1.2, 3.4, 5.6, 7.8
|
|
b = 1.2, 3.4, 5.6, 7.8
|
|
validate TRUE, TRUE, TRUE, TRUE, TRUE, TRUE
|
|
|
|
aarch64 = fcmeq
|
|
generate float64x1_t:uint64x1_t, float64x2_t:uint64x2_t
|
|
|
|
arm = vceq.
|
|
// we are missing float16x4_t:uint16x4_t, float16x8_t:uint16x8_t
|
|
generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t
|
|
|
|
/// Signed compare bitwise equal to zero
|
|
name = vceqz
|
|
fn = simd_eq
|
|
a = MIN, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, MAX
|
|
fixed = 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
|
|
validate FALSE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE
|
|
|
|
aarch64 = cmeq
|
|
generate int8x8_t:uint8x8_t, int8x16_t:uint8x16_t, int16x4_t:uint16x4_t, int16x8_t:uint16x8_t, int32x2_t:uint32x2_t, int32x4_t:uint32x4_t, int64x1_t:uint64x1_t, int64x2_t:uint64x2_t, poly8x8_t:uint8x8_t, poly8x16_t:uint8x16_t, poly64x1_t:uint64x1_t, poly64x2_t:uint64x2_t
|
|
|
|
/// Unsigned compare bitwise equal to zero
|
|
name = vceqz
|
|
fn = simd_eq
|
|
a = MIN, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, MAX
|
|
fixed = 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
|
|
validate TRUE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE
|
|
|
|
aarch64 = cmeq
|
|
generate uint*_t, uint64x*_t
|
|
|
|
/// Floating-point compare bitwise equal to zero
|
|
name = vceqz
|
|
fn = simd_eq
|
|
a = 0.0, 1.2, 3.4, 5.6
|
|
fixed = 0.0, 0.0, 0.0, 0.0
|
|
validate TRUE, FALSE, FALSE, FALSE
|
|
|
|
aarch64 = fcmeq
|
|
generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t, float64x1_t:uint64x1_t, float64x2_t:uint64x2_t
|
|
|
|
/// Signed compare bitwise Test bits nonzero
|
|
name = vtst
|
|
multi_fn = simd_and, c:in_t
|
|
multi_fn = fixed, d:in_t
|
|
multi_fn = simd_ne, c, transmute(d)
|
|
a = MIN, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, MAX
|
|
b = MIN, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, MAX
|
|
fixed = 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
|
|
validate TRUE, FALSE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE
|
|
|
|
aarch64 = cmtst
|
|
generate int64x1_t:uint64x1_t, int64x2_t:uint64x2_t, poly64x1_t:uint64x1_t, poly64x2_t:uint64x2_t
|
|
|
|
arm = vtst
|
|
generate int8x8_t:uint8x8_t, int8x16_t:uint8x16_t, int16x4_t:uint16x4_t, int16x8_t:uint16x8_t, int32x2_t:uint32x2_t, int32x4_t:uint32x4_t, poly8x8_t:uint8x8_t, poly8x16_t:uint8x16_t
|
|
|
|
/// Unsigned compare bitwise Test bits nonzero
|
|
name = vtst
|
|
multi_fn = simd_and, c:in_t
|
|
multi_fn = fixed, d:in_t
|
|
multi_fn = simd_ne, c, transmute(d)
|
|
a = MIN, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, MAX
|
|
b = MIN, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, MAX
|
|
fixed = 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
|
|
validate FALSE, FALSE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE
|
|
|
|
aarch64 = cmtst
|
|
generate uint64x*_t
|
|
|
|
arm = vtst
|
|
generate uint*_t
|
|
|
|
////////////////////
|
|
// Floating-point absolute value
|
|
////////////////////
|
|
|
|
/// Floating-point absolute value
|
|
name = vabs
|
|
fn = simd_fabs
|
|
a = -0.1, -2.2, -3.3, -6.6
|
|
validate 0.1, 2.2, 3.3, 6.6
|
|
aarch64 = fabs
|
|
generate float64x1_t:float64x1_t, float64x2_t:float64x2_t
|
|
|
|
arm = vabs
|
|
generate float32x2_t:float32x2_t, float32x4_t:float32x4_t
|
|
|
|
////////////////////
|
|
// greater than
|
|
////////////////////
|
|
|
|
/// Compare signed greater than
|
|
name = vcgt
|
|
fn = simd_gt
|
|
a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
|
|
b = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
|
|
validate TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE
|
|
aarch64 = cmgt
|
|
generate int64x1_t:uint64x1_t, int64x2_t:uint64x2_t
|
|
|
|
arm = vcgt.s
|
|
generate int8x8_t:uint8x8_t, int8x16_t:uint8x16_t, int16x4_t:uint16x4_t, int16x8_t:uint16x8_t, int32x2_t:uint32x2_t, int32x4_t:uint32x4_t
|
|
|
|
/// Compare unsigned higher
|
|
name = vcgt
|
|
fn = simd_gt
|
|
a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
|
|
b = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
|
|
validate TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE
|
|
|
|
aarch64 = cmhi
|
|
generate uint64x*_t
|
|
|
|
arm = vcgt.s
|
|
generate uint*_t
|
|
|
|
/// Floating-point compare greater than
|
|
name = vcgt
|
|
fn = simd_gt
|
|
a = 1.2, 2.3, 3.4, 4.5, 5.6, 6.7, 7.8, 8.9
|
|
b = 0.1, 1.2, 2.3, 3.4, 4.5, 5.6, 6.7, 7.8
|
|
validate TRUE, TRUE, TRUE, TRUE, TRUE, TRUE
|
|
|
|
aarch64 = fcmgt
|
|
generate float64x1_t:uint64x1_t, float64x2_t:uint64x2_t
|
|
|
|
arm = vcgt.s
|
|
// we are missing float16x4_t:uint16x4_t, float16x8_t:uint16x8_t
|
|
generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t
|
|
|
|
////////////////////
|
|
// less than
|
|
////////////////////
|
|
|
|
/// Compare signed less than
|
|
name = vclt
|
|
fn = simd_lt
|
|
a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
|
|
b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
|
|
validate TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE
|
|
aarch64 = cmgt
|
|
generate int64x1_t:uint64x1_t, int64x2_t:uint64x2_t
|
|
|
|
arm = vcgt.s
|
|
generate int8x8_t:uint8x8_t, int8x16_t:uint8x16_t, int16x4_t:uint16x4_t, int16x8_t:uint16x8_t, int32x2_t:uint32x2_t, int32x4_t:uint32x4_t
|
|
|
|
/// Compare unsigned less than
|
|
name = vclt
|
|
fn = simd_lt
|
|
a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
|
|
b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
|
|
validate TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE
|
|
|
|
aarch64 = cmhi
|
|
generate uint64x*_t
|
|
|
|
arm = vcgt.s
|
|
generate uint*_t
|
|
|
|
/// Floating-point compare less than
|
|
name = vclt
|
|
fn = simd_lt
|
|
a = 0.1, 1.2, 2.3, 3.4, 4.5, 5.6, 6.7, 7.8
|
|
b = 1.2, 2.3, 3.4, 4.5, 5.6, 6.7, 7.8, 8.9
|
|
validate TRUE, TRUE, TRUE, TRUE, TRUE, TRUE
|
|
|
|
aarch64 = fcmgt
|
|
generate float64x1_t:uint64x1_t, float64x2_t:uint64x2_t
|
|
|
|
arm = vcgt.s
|
|
// we are missing float16x4_t:uint16x4_t, float16x8_t:uint16x8_t
|
|
generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t
|
|
|
|
////////////////////
|
|
// less than or equal
|
|
////////////////////
|
|
|
|
/// Compare signed less than or equal
|
|
name = vcle
|
|
fn = simd_le
|
|
a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
|
|
b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
|
|
validate TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE
|
|
|
|
aarch64 = cmge
|
|
generate int64x1_t:uint64x1_t, int64x2_t:uint64x2_t
|
|
|
|
arm = vcge.s
|
|
generate int8x8_t:uint8x8_t, int8x16_t:uint8x16_t, int16x4_t:uint16x4_t, int16x8_t:uint16x8_t, int32x2_t:uint32x2_t, int32x4_t:uint32x4_t
|
|
|
|
/// Compare unsigned less than or equal
|
|
name = vcle
|
|
fn = simd_le
|
|
a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
|
|
b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
|
|
validate TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE
|
|
|
|
aarch64 = cmhs
|
|
generate uint64x*_t
|
|
|
|
arm = vcge.s
|
|
generate uint*_t
|
|
|
|
/// Floating-point compare less than or equal
|
|
name = vcle
|
|
fn = simd_le
|
|
a = 0.1, 1.2, 2.3, 3.4, 4.5, 5.6, 6.7, 7.8
|
|
b = 1.2, 2.3, 3.4, 4.5, 5.6, 6.7, 7.8, 8.9
|
|
validate TRUE, TRUE, TRUE, TRUE, TRUE, TRUE
|
|
aarch64 = fcmge
|
|
generate float64x1_t:uint64x1_t, float64x2_t:uint64x2_t
|
|
|
|
// we are missing float16x4_t:uint16x4_t, float16x8_t:uint16x8_t
|
|
arm = vcge.s
|
|
generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t
|
|
|
|
////////////////////
|
|
// greater than or equal
|
|
////////////////////
|
|
|
|
/// Compare signed greater than or equal
|
|
name = vcge
|
|
fn = simd_ge
|
|
a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
|
|
b = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
|
|
validate TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE
|
|
|
|
aarch64 = cmge
|
|
generate int64x1_t:uint64x1_t, int64x2_t:uint64x2_t
|
|
|
|
arm = vcge.s
|
|
generate int8x8_t:uint8x8_t, int8x16_t:uint8x16_t, int16x4_t:uint16x4_t, int16x8_t:uint16x8_t, int32x2_t:uint32x2_t, int32x4_t:uint32x4_t
|
|
|
|
/// Compare unsigned greater than or equal
|
|
name = vcge
|
|
fn = simd_ge
|
|
a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
|
|
b = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
|
|
validate TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE
|
|
|
|
aarch64 = cmhs
|
|
generate uint64x*_t
|
|
|
|
arm = vcge.s
|
|
generate uint*_t
|
|
|
|
/// Floating-point compare greater than or equal
|
|
name = vcge
|
|
fn = simd_ge
|
|
a = 1.2, 2.3, 3.4, 4.5, 5.6, 6.7, 7.8, 8.9
|
|
b = 0.1, 1.2, 2.3, 3.4, 4.5, 5.6, 6.7, 7.8
|
|
validate TRUE, TRUE, TRUE, TRUE, TRUE, TRUE
|
|
|
|
aarch64 = fcmge
|
|
generate float64x1_t:uint64x1_t, float64x2_t:uint64x2_t
|
|
|
|
arm = vcge.s
|
|
// we are missing float16x4_t:uint16x4_t, float16x8_t:uint16x8_t
|
|
generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t
|
|
|
|
/// Compare signed greater than or equal to zero
|
|
name = vcgez
|
|
fn = simd_ge
|
|
a = MIN, -1, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, MAX
|
|
fixed = 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
|
|
validate FALSE, FALSE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE
|
|
|
|
aarch64 = cmge
|
|
generate int8x8_t:uint8x8_t, int8x16_t:uint8x16_t, int16x4_t:uint16x4_t, int16x8_t:uint16x8_t, int32x2_t:uint32x2_t, int32x4_t:uint32x4_t, int64x1_t:uint64x1_t, int64x2_t:uint64x2_t
|
|
|
|
/// Floating-point compare greater than or equal to zero
|
|
name = vcgez
|
|
fn = simd_ge
|
|
a = -1.2, 0.0, 1.2, 2.3, 3.4, 4.5, 5.6, 6.7
|
|
fixed = 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0
|
|
validate FALSE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE
|
|
|
|
aarch64 = fcmge
|
|
generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t, float64x1_t:uint64x1_t, float64x2_t:uint64x2_t
|
|
|
|
/// Compare signed greater than zero
|
|
name = vcgtz
|
|
fn = simd_gt
|
|
a = MIN, -1, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, MAX
|
|
fixed = 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
|
|
validate FALSE, FALSE, FALSE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE
|
|
|
|
aarch64 = cmgt
|
|
generate int8x8_t:uint8x8_t, int8x16_t:uint8x16_t, int16x4_t:uint16x4_t, int16x8_t:uint16x8_t, int32x2_t:uint32x2_t, int32x4_t:uint32x4_t, int64x1_t:uint64x1_t, int64x2_t:uint64x2_t
|
|
|
|
/// Floating-point compare greater than zero
|
|
name = vcgtz
|
|
fn = simd_gt
|
|
a = -1.2, 0.0, 1.2, 2.3, 3.4, 4.5, 5.6, 6.7
|
|
fixed = 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0
|
|
validate FALSE, FALSE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE
|
|
|
|
aarch64 = fcmgt
|
|
generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t, float64x1_t:uint64x1_t, float64x2_t:uint64x2_t
|
|
|
|
/// Compare signed less than or equal to zero
|
|
name = vclez
|
|
fn = simd_le
|
|
a = MIN, -1, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, MAX
|
|
fixed = 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
|
|
validate TRUE, TRUE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE
|
|
|
|
aarch64 = cmgt
|
|
generate int8x8_t:uint8x8_t, int8x16_t:uint8x16_t, int16x4_t:uint16x4_t, int16x8_t:uint16x8_t, int32x2_t:uint32x2_t, int32x4_t:uint32x4_t, int64x1_t:uint64x1_t, int64x2_t:uint64x2_t
|
|
|
|
/// Floating-point compare less than or equal to zero
|
|
name = vclez
|
|
fn = simd_le
|
|
a = -1.2, 0.0, 1.2, 2.3, 3.4, 4.5, 5.6, 6.7
|
|
fixed = 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0
|
|
validate TRUE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE
|
|
|
|
aarch64 = fcmle
|
|
generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t, float64x1_t:uint64x1_t, float64x2_t:uint64x2_t
|
|
|
|
/// Compare signed less than zero
|
|
name = vcltz
|
|
fn = simd_lt
|
|
a = MIN, -1, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, MAX
|
|
fixed = 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
|
|
validate TRUE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE
|
|
|
|
aarch64 = sshr
|
|
generate int8x8_t:uint8x8_t, int8x16_t:uint8x16_t, int16x4_t:uint16x4_t, int16x8_t:uint16x8_t, int32x2_t:uint32x2_t, int32x4_t:uint32x4_t, int64x1_t:uint64x1_t, int64x2_t:uint64x2_t
|
|
|
|
/// Floating-point compare less than zero
|
|
name = vcltz
|
|
fn = simd_lt
|
|
a = -1.2, 0.0, 1.2, 2.3, 3.4, 4.5, 5.6, 6.7
|
|
fixed = 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0
|
|
validate TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE
|
|
|
|
aarch64 = fcmlt
|
|
generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t, float64x1_t:uint64x1_t, float64x2_t:uint64x2_t
|
|
|
|
/// Count leading sign bits
|
|
name = vcls
|
|
a = MIN, -1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, MAX
|
|
validate 0, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, 0
|
|
|
|
arm = vcls.s
|
|
aarch64 = cls
|
|
link-arm = vcls._EXT_
|
|
link-aarch64 = cls._EXT_
|
|
generate int*_t
|
|
|
|
/// Signed count leading zero bits
|
|
name = vclz
|
|
multi_fn = self-signed-ext, a
|
|
a = MIN, -1, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, MAX
|
|
validate 0, 0, BITS, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, 1
|
|
|
|
arm = vclz.
|
|
aarch64 = clz
|
|
generate int*_t
|
|
|
|
/// Unsigned count leading zero bits
|
|
name = vclz
|
|
multi_fn = transmute, {self-signed-ext, transmute(a)}
|
|
a = MIN, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, MAX
|
|
validate BITS, BITS, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, 0
|
|
|
|
arm = vclz.
|
|
aarch64 = clz
|
|
generate uint*_t
|
|
|
|
/// Floating-point absolute compare greater than
|
|
name = vcagt
|
|
a = -1.2, 0.0, 1.2, 2.3, 3.4, 4.5, 5.6, 6.7
|
|
b = -1.1, 0.0, 1.1, 2.4, 3.3, 4.6, 5.5, 6.8
|
|
validate TRUE, FALSE, TRUE, FALSE, TRUE, FALSE, TRUE, FALSE
|
|
|
|
aarch64 = facgt
|
|
link-aarch64 = facgt._EXT2_._EXT_
|
|
generate float64x1_t:uint64x1_t, float64x2_t:uint64x2_t
|
|
|
|
arm = vacgt.s
|
|
link-arm = vacgt._EXT2_._EXT_
|
|
generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t
|
|
|
|
/// Floating-point absolute compare greater than or equal
|
|
name = vcage
|
|
a = -1.2, 0.0, 1.2, 2.3, 3.4, 4.5, 5.6, 6.7
|
|
b = -1.1, 0.0, 1.1, 2.4, 3.3, 4.6, 5.5, 6.8
|
|
validate TRUE, TRUE, TRUE, FALSE, TRUE, FALSE, TRUE, FALSE
|
|
|
|
aarch64 = facge
|
|
link-aarch64 = facge._EXT2_._EXT_
|
|
generate float64x1_t:uint64x1_t, float64x2_t:uint64x2_t
|
|
|
|
arm = vacge.s
|
|
link-arm = vacge._EXT2_._EXT_
|
|
generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t
|
|
|
|
/// Floating-point absolute compare less than
|
|
name = vcalt
|
|
multi_fn = vcagt-self-noext, b, a
|
|
a = -1.2, 0.0, 1.2, 2.3, 3.4, 4.5, 5.6, 6.7
|
|
b = -1.1, 0.0, 1.1, 2.4, 3.3, 4.6, 5.5, 6.8
|
|
validate FALSE, FALSE, FALSE, TRUE, FALSE, TRUE, FALSE, TRUE
|
|
|
|
aarch64 = facgt
|
|
generate float64x1_t:uint64x1_t, float64x2_t:uint64x2_t
|
|
|
|
arm = vacgt.s
|
|
generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t
|
|
|
|
/// Floating-point absolute compare less than or equal
|
|
name = vcale
|
|
multi_fn = vcage-self-noext , b, a
|
|
a = -1.2, 0.0, 1.2, 2.3, 3.4, 4.5, 5.6, 6.7
|
|
b = -1.1, 0.0, 1.1, 2.4, 3.3, 4.6, 5.5, 6.8
|
|
validate FALSE, TRUE, FALSE, TRUE, FALSE, TRUE, FALSE, TRUE
|
|
|
|
aarch64 = facge
|
|
generate float64x1_t:uint64x1_t, float64x2_t:uint64x2_t
|
|
|
|
arm = vacge.s
|
|
generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t
|
|
|
|
/// Floating-point convert to higher precision long
|
|
name = vcvt
|
|
double-suffixes
|
|
fn = simd_cast
|
|
a = -1.2, 1.2
|
|
validate -1.2f32 as f64, 1.2f32 as f64
|
|
|
|
aarch64 = fcvtl
|
|
generate float32x2_t:float64x2_t
|
|
|
|
/// Floating-point convert to higher precision long
|
|
name = vcvt_high
|
|
double-suffixes
|
|
multi_fn = simd_shuffle2, b:float32x2_t, a, a, [2, 3]
|
|
multi_fn = simd_cast, b
|
|
a = -1.2, 1.2, 2.3, 3.4
|
|
validate 2.3f32 as f64, 3.4f32 as f64
|
|
|
|
aarch64 = fcvtl
|
|
generate float32x4_t:float64x2_t
|
|
|
|
/// Floating-point convert to lower precision narrow
|
|
name = vcvt
|
|
double-suffixes
|
|
fn = simd_cast
|
|
a = -1.2, 1.2
|
|
validate -1.2f64 as f32, 1.2f64 as f32
|
|
|
|
aarch64 = fcvtn
|
|
generate float64x2_t:float32x2_t
|
|
|
|
/// Floating-point convert to lower precision narrow
|
|
name = vcvt_high
|
|
double-suffixes
|
|
multi_fn = simd_shuffle4, a, {simd_cast, b}, [0, 1, 2, 3]
|
|
a = -1.2, 1.2
|
|
b = -2.3, 3.4
|
|
validate -1.2, 1.2, -2.3f64 as f32, 3.4f64 as f32
|
|
|
|
aarch64 = fcvtn
|
|
generate float32x2_t:float64x2_t:float32x4_t
|
|
|
|
/// Floating-point convert to lower precision narrow, rounding to odd
|
|
name = vcvtx
|
|
double-suffixes
|
|
a = -1.0, 2.0
|
|
validate -1.0, 2.0
|
|
|
|
aarch64 = fcvtxn
|
|
link-aarch64 = fcvtxn._EXT2_._EXT_
|
|
generate float64x2_t:float32x2_t
|
|
|
|
/// Floating-point convert to lower precision narrow, rounding to odd
|
|
name = vcvtx_high
|
|
double-suffixes
|
|
multi_fn = simd_shuffle4, a, {vcvtx-doubleself-noext, b}, [0, 1, 2, 3]
|
|
a = -1.0, 2.0
|
|
b = -3.0, 4.0
|
|
validate -1.0, 2.0, -3.0, 4.0
|
|
|
|
aarch64 = fcvtxn
|
|
generate float32x2_t:float64x2_t:float32x4_t
|
|
|
|
/// Floating-point convert to signed fixed-point, rounding toward zero
|
|
name = vcvt
|
|
double-suffixes
|
|
fn = simd_cast
|
|
a = -1.1, 2.1, -2.9, 3.9
|
|
validate -1, 2, -2, 3
|
|
|
|
aarch64 = fcvtzs
|
|
generate float64x1_t:int64x1_t, float64x2_t:int64x2_t
|
|
|
|
arm = vcvt
|
|
generate float32x2_t:int32x2_t, float32x4_t:int32x4_t
|
|
|
|
/// Floating-point convert to unsigned fixed-point, rounding toward zero
|
|
name = vcvt
|
|
double-suffixes
|
|
fn = simd_cast
|
|
a = 1.1, 2.1, 2.9, 3.9
|
|
validate 1, 2, 2, 3
|
|
|
|
aarch64 = fcvtzu
|
|
generate float64x1_t:uint64x1_t, float64x2_t:uint64x2_t
|
|
|
|
arm = vcvt
|
|
generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t
|
|
|
|
/// Floating-point convert to signed integer, rounding to nearest with ties to away
|
|
name = vcvta
|
|
double-suffixes
|
|
a = -1.1, 2.1, -2.9, 3.9
|
|
validate -1, 2, -3, 4
|
|
|
|
aarch64 = fcvtas
|
|
link-aarch64 = fcvtas._EXT2_._EXT_
|
|
generate float32x2_t:int32x2_t, float32x4_t:int32x4_t, float64x1_t:int64x1_t, float64x2_t:int64x2_t
|
|
|
|
/// Floating-point convert to signed integer, rounding to nearest with ties to even
|
|
name = vcvtn
|
|
double-suffixes
|
|
a = -1.5, 2.1, -2.9, 3.9
|
|
validate -2, 2, -3, 4
|
|
|
|
aarch64 = fcvtns
|
|
link-aarch64 = fcvtns._EXT2_._EXT_
|
|
generate float32x2_t:int32x2_t, float32x4_t:int32x4_t, float64x1_t:int64x1_t, float64x2_t:int64x2_t
|
|
|
|
/// Floating-point convert to signed integer, rounding toward minus infinity
|
|
name = vcvtm
|
|
double-suffixes
|
|
a = -1.1, 2.1, -2.9, 3.9
|
|
validate -2, 2, -3, 3
|
|
|
|
aarch64 = fcvtms
|
|
link-aarch64 = fcvtms._EXT2_._EXT_
|
|
generate float32x2_t:int32x2_t, float32x4_t:int32x4_t, float64x1_t:int64x1_t, float64x2_t:int64x2_t
|
|
|
|
/// Floating-point convert to signed integer, rounding toward plus infinity
|
|
name = vcvtp
|
|
double-suffixes
|
|
a = -1.1, 2.1, -2.9, 3.9
|
|
validate -1, 3, -2, 4
|
|
|
|
aarch64 = fcvtps
|
|
link-aarch64 = fcvtps._EXT2_._EXT_
|
|
generate float32x2_t:int32x2_t, float32x4_t:int32x4_t, float64x1_t:int64x1_t, float64x2_t:int64x2_t
|
|
|
|
/// Floating-point convert to unsigned integer, rounding to nearest with ties to away
|
|
name = vcvta
|
|
double-suffixes
|
|
a = 1.1, 2.1, 2.9, 3.9
|
|
validate 1, 2, 3, 4
|
|
|
|
aarch64 = fcvtau
|
|
link-aarch64 = fcvtau._EXT2_._EXT_
|
|
generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t, float64x1_t:uint64x1_t, float64x2_t:uint64x2_t
|
|
|
|
/// Floating-point convert to unsigned integer, rounding to nearest with ties to even
|
|
name = vcvtn
|
|
double-suffixes
|
|
a = 1.5, 2.1, 2.9, 3.9
|
|
validate 2, 2, 3, 4
|
|
|
|
aarch64 = fcvtnu
|
|
link-aarch64 = fcvtnu._EXT2_._EXT_
|
|
generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t, float64x1_t:uint64x1_t, float64x2_t:uint64x2_t
|
|
|
|
/// Floating-point convert to unsigned integer, rounding toward minus infinity
|
|
name = vcvtm
|
|
double-suffixes
|
|
a = 1.1, 2.1, 2.9, 3.9
|
|
validate 1, 2, 2, 3
|
|
|
|
aarch64 = fcvtmu
|
|
link-aarch64 = fcvtmu._EXT2_._EXT_
|
|
generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t, float64x1_t:uint64x1_t, float64x2_t:uint64x2_t
|
|
|
|
/// Floating-point convert to unsigned integer, rounding toward plus infinity
|
|
name = vcvtp
|
|
double-suffixes
|
|
a = 1.1, 2.1, 2.9, 3.9
|
|
validate 2, 3, 3, 4
|
|
|
|
aarch64 = fcvtpu
|
|
link-aarch64 = fcvtpu._EXT2_._EXT_
|
|
generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t, float64x1_t:uint64x1_t, float64x2_t:uint64x2_t
|
|
|
|
/// Multiply-add to accumulator
|
|
name = vmla
|
|
multi_fn = simd_add, a, {simd_mul, b, c}
|
|
a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
|
|
b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
|
|
c = 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3
|
|
validate 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21
|
|
|
|
arm = vmla.
|
|
aarch64 = mla
|
|
generate int*_t, uint*_t
|
|
|
|
/// Floating-point multiply-add to accumulator
|
|
name = vmla
|
|
multi_fn = simd_add, a, {simd_mul, b, c}
|
|
a = 0., 1., 2., 3.
|
|
b = 2., 2., 2., 2.
|
|
c = 3., 3., 3., 3.
|
|
validate 6., 7., 8., 9.
|
|
|
|
aarch64 = fmul
|
|
generate float64x*_t
|
|
|
|
arm = vmla.
|
|
generate float*_t
|
|
|
|
/// Signed multiply-add long
|
|
name = vmlal
|
|
multi_fn = simd_add, a, {vmull-self-noext, b, c}
|
|
a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
|
|
b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
|
|
c = 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3
|
|
validate 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21
|
|
|
|
arm = vmlal.s
|
|
aarch64 = smlal
|
|
generate int16x8_t:int8x8_t:int8x8_t:int16x8_t, int32x4_t:int16x4_t:int16x4_t:int32x4_t, int64x2_t:int32x2_t:int32x2_t:int64x2_t
|
|
|
|
/// Unsigned multiply-add long
|
|
name = vmlal
|
|
multi_fn = simd_add, a, {vmull-self-noext, b, c}
|
|
a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
|
|
b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
|
|
c = 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3
|
|
validate 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21
|
|
|
|
arm = vmlal.s
|
|
aarch64 = umlal
|
|
generate uint16x8_t:uint8x8_t:uint8x8_t:uint16x8_t, uint32x4_t:uint16x4_t:uint16x4_t:uint32x4_t, uint64x2_t:uint32x2_t:uint32x2_t:uint64x2_t
|
|
|
|
/// Signed multiply-add long
|
|
name = vmlal_high
|
|
no-q
|
|
multi_fn = simd_shuffle-out_len-noext, b:half, b, b, {fixed-half-right}
|
|
multi_fn = simd_shuffle-out_len-noext, c:half, c, c, {fixed-half-right}
|
|
multi_fn = vmlal-noqself-noext, a, b, c
|
|
a = 8, 7, 6, 5, 4, 3, 2, 1
|
|
b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
|
|
c = 3, 3, 0, 1, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7
|
|
fixed = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
|
|
validate 8, 9, 10, 11, 12, 13, 14, 15
|
|
|
|
aarch64 = smlal2
|
|
generate int16x8_t:int8x16_t:int8x16_t:int16x8_t, int32x4_t:int16x8_t:int16x8_t:int32x4_t, int64x2_t:int32x4_t:int32x4_t:int64x2_t
|
|
|
|
/// Unsigned multiply-add long
|
|
name = vmlal_high
|
|
no-q
|
|
multi_fn = simd_shuffle-out_len-noext, b:half, b, b, {fixed-half-right}
|
|
multi_fn = simd_shuffle-out_len-noext, c:half, c, c, {fixed-half-right}
|
|
multi_fn = vmlal-noqself-noext, a, b, c
|
|
a = 8, 7, 6, 5, 4, 3, 2, 1
|
|
b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
|
|
c = 3, 3, 0, 1, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7
|
|
fixed = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
|
|
validate 8, 9, 10, 11, 12, 13, 14, 15
|
|
|
|
aarch64 = umlal2
|
|
generate uint16x8_t:uint8x16_t:uint8x16_t:uint16x8_t, uint32x4_t:uint16x8_t:uint16x8_t:uint32x4_t, uint64x2_t:uint32x4_t:uint32x4_t:uint64x2_t
|
|
|
|
/// Multiply-subtract from accumulator
|
|
name = vmls
|
|
multi_fn = simd_sub, a, {simd_mul, b, c}
|
|
a = 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21
|
|
b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
|
|
c = 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3
|
|
validate 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
|
|
|
|
arm = vmls.
|
|
aarch64 = mls
|
|
generate int*_t, uint*_t
|
|
|
|
/// Floating-point multiply-subtract from accumulator
// Composed as simd_sub(a, simd_mul(b, c)); test vectors: a[i] - 2. * 3. turns 6..9 into 0..3.
// The first group (aarch64 check only) generates float64x*_t; the second adds the arm check for float*_t.
// NOTE(review): the aarch64 check is `fmul` rather than a fused multiply-subtract - presumably the multiply and subtract are emitted separately; confirm.
|
|
name = vmls
|
|
multi_fn = simd_sub, a, {simd_mul, b, c}
|
|
a = 6., 7., 8., 9.
|
|
b = 2., 2., 2., 2.
|
|
c = 3., 3., 3., 3.
|
|
validate 0., 1., 2., 3.
|
|
|
|
aarch64 = fmul
|
|
generate float64x*_t
|
|
|
|
arm = vmls.
|
|
generate float*_t
|
|
|
|
/// Signed multiply-subtract long
// Composed as simd_sub(a, vmull(b, c)): the `vmull` product is subtracted from the accumulator `a`.
// Per the generate list, `a` and the result have lanes twice as wide as `b` and `c`.
// Test vectors: a[i] - 2 * 3 turns 6..21 into 0..15.
|
|
name = vmlsl
|
|
multi_fn = simd_sub, a, {vmull-self-noext, b, c}
|
|
a = 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21
|
|
b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
|
|
c = 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3
|
|
validate 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
|
|
|
|
arm = vmlsl.s
|
|
aarch64 = smlsl
|
|
generate int16x8_t:int8x8_t:int8x8_t:int16x8_t, int32x4_t:int16x4_t:int16x4_t:int32x4_t, int64x2_t:int32x2_t:int32x2_t:int64x2_t
|
|
|
|
/// Unsigned multiply-subtract long
// Composed as simd_sub(a, vmull(b, c)): the `vmull` product is subtracted from the accumulator `a`.
// Per the generate list, `a` and the result have lanes twice as wide as `b` and `c`.
// Test vectors: a[i] - 2 * 3 turns 6..21 into 0..15.
|
|
name = vmlsl
|
|
multi_fn = simd_sub, a, {vmull-self-noext, b, c}
|
|
a = 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21
|
|
b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
|
|
c = 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3
|
|
validate 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
|
|
|
|
arm = vmlsl.s
|
|
aarch64 = umlsl
|
|
generate uint16x8_t:uint8x8_t:uint8x8_t:uint16x8_t, uint32x4_t:uint16x4_t:uint16x4_t:uint32x4_t, uint64x2_t:uint32x2_t:uint32x2_t:uint64x2_t
|
|
|
|
/// Signed multiply-subtract long
// `b` and `c` are each shuffled down to half length, then `vmlsl` is applied to a, b, c.
// Test vectors: a (14..21) - 2 * (lanes 8..15 of c, i.e. 0..7) = 14 down to 7.
// NOTE(review): `{fixed-half-right}` presumably expands to the upper half of `fixed` (8..15) - confirm against the generator.
|
|
name = vmlsl_high
|
|
no-q
|
|
multi_fn = simd_shuffle-out_len-noext, b:half, b, b, {fixed-half-right}
|
|
multi_fn = simd_shuffle-out_len-noext, c:half, c, c, {fixed-half-right}
|
|
multi_fn = vmlsl-noqself-noext, a, b, c
|
|
a = 14, 15, 16, 17, 18, 19, 20, 21
|
|
b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
|
|
c = 3, 3, 0, 1, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7
|
|
fixed = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
|
|
validate 14, 13, 12, 11, 10, 9, 8, 7
|
|
|
|
aarch64 = smlsl2
|
|
generate int16x8_t:int8x16_t:int8x16_t:int16x8_t, int32x4_t:int16x8_t:int16x8_t:int32x4_t, int64x2_t:int32x4_t:int32x4_t:int64x2_t
|
|
|
|
/// Unsigned multiply-subtract long
// `b` and `c` are each shuffled down to half length, then `vmlsl` is applied to a, b, c.
// Test vectors: a (14..21) - 2 * (lanes 8..15 of c, i.e. 0..7) = 14 down to 7.
// NOTE(review): `{fixed-half-right}` presumably expands to the upper half of `fixed` (8..15) - confirm against the generator.
|
|
name = vmlsl_high
|
|
no-q
|
|
multi_fn = simd_shuffle-out_len-noext, b:half, b, b, {fixed-half-right}
|
|
multi_fn = simd_shuffle-out_len-noext, c:half, c, c, {fixed-half-right}
|
|
multi_fn = vmlsl-noqself-noext, a, b, c
|
|
a = 14, 15, 16, 17, 18, 19, 20, 21
|
|
b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
|
|
c = 3, 3, 0, 1, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7
|
|
fixed = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
|
|
validate 14, 13, 12, 11, 10, 9, 8, 7
|
|
|
|
aarch64 = umlsl2
|
|
generate uint16x8_t:uint8x16_t:uint8x16_t:uint16x8_t, uint32x4_t:uint16x8_t:uint16x8_t:uint32x4_t, uint64x2_t:uint32x4_t:uint32x4_t:uint64x2_t
|
|
|
|
/// Saturating subtract
// No `fn`/`multi_fn` here: each group names `link-arm` / `link-aarch64` targets instead.
// One shared set of test vectors feeds two groups: unsigned (uqsub / vqsubu) and signed (sqsub / vqsubs).
// Test vectors equal plain a - b (42 - 1 = 41 ... 42 - 16 = 26); no lane reaches a saturation bound.
|
|
name = vqsub
|
|
a = 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42
|
|
b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
|
|
validate 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26
|
|
|
|
arm = vqsub.s
|
|
aarch64 = uqsub
|
|
link-arm = vqsubu._EXT_
|
|
link-aarch64 = uqsub._EXT_
|
|
generate uint*_t, uint64x*_t
|
|
|
|
arm = vqsub.s
|
|
aarch64 = sqsub
|
|
link-arm = vqsubs._EXT_
|
|
link-aarch64 = sqsub._EXT_
|
|
generate int*_t, int64x*_t
|
|
|
|
/// Halving add
// One shared set of test vectors feeds two groups: unsigned (uhadd / vhaddu) and signed (shadd / vhadds).
// Test vectors: (42 + b[i]) / 2 with the fraction dropped, e.g. (42 + 1) / 2 -> 21 and (42 + 2) / 2 -> 22.
|
|
name = vhadd
|
|
a = 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42
|
|
b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
|
|
validate 21, 22, 22, 23, 23, 24, 24, 25, 25, 26, 26, 27, 27, 28, 28, 29
|
|
|
|
|
|
|
arm = vhadd.s
|
|
aarch64 = uhadd
|
|
link-aarch64 = uhadd._EXT_
|
|
link-arm = vhaddu._EXT_
|
|
generate uint*_t
|
|
|
|
|
|
arm = vhadd.s
|
|
aarch64 = shadd
|
|
link-aarch64 = shadd._EXT_
|
|
link-arm = vhadds._EXT_
|
|
generate int*_t
|
|
|
|
/// Rounding halving add
// One shared set of test vectors feeds two groups: unsigned (urhadd / vrhaddu) and signed (srhadd / vrhadds).
// Test vectors: (42 + b[i]) / 2 with halves rounded up, e.g. (42 + 1) / 2 -> 22 and (42 + 2) / 2 -> 22.
|
|
name = vrhadd
|
|
a = 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42
|
|
b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
|
|
validate 22, 22, 23, 23, 24, 24, 25, 25, 26, 26, 27, 27, 28, 28, 29, 29
|
|
|
|
arm = vrhadd.s
|
|
aarch64 = urhadd
|
|
link-arm = vrhaddu._EXT_
|
|
link-aarch64 = urhadd._EXT_
|
|
generate uint*_t
|
|
|
|
arm = vrhadd.s
|
|
aarch64 = srhadd
|
|
link-arm = vrhadds._EXT_
|
|
link-aarch64 = srhadd._EXT_
|
|
generate int*_t
|
|
|
|
/// Saturating add
// No `fn`/`multi_fn` here: each group names `link-arm` / `link-aarch64` targets instead.
// One shared set of test vectors feeds two groups: unsigned (uqadd / vqaddu) and signed (sqadd / vqadds).
// Test vectors equal plain a + b (42 + 1 = 43 ... 42 + 16 = 58); no lane reaches a saturation bound.
|
|
name = vqadd
|
|
a = 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42
|
|
b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
|
|
validate 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58
|
|
|
|
arm = vqadd.s
|
|
aarch64 = uqadd
|
|
link-arm = vqaddu._EXT_
|
|
link-aarch64 = uqadd._EXT_
|
|
generate uint*_t, uint64x*_t
|
|
|
|
arm = vqadd.s
|
|
aarch64 = sqadd
|
|
link-arm = vqadds._EXT_
|
|
link-aarch64 = sqadd._EXT_
|
|
generate int*_t, int64x*_t
|
|
|
|
/// Multiply
// Integer variant: calls simd_mul directly (`fn`), no link targets.
// Test vectors: validate[i] = a[i] * b[i], e.g. 2 * 2 = 4 and 2 * 16 = 32.
|
|
name = vmul
|
|
a = 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2
|
|
b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
|
|
validate 1, 4, 3, 8, 5, 12, 7, 16, 9, 20, 11, 24, 13, 28, 15, 32
|
|
arm = vmul.
|
|
aarch64 = mul
|
|
fn = simd_mul
|
|
generate int*_t, uint*_t
|
|
|
|
/// Multiply
// Floating-point variant: calls simd_mul directly (`fn`).
// The first group (aarch64 check only) generates float64x*_t; the second adds the arm check for float*_t.
// Test vectors: validate[i] = a[i] * b[i], e.g. 2.0 * 3.0 = 6.0.
|
|
name = vmul
|
|
fn = simd_mul
|
|
a = 1.0, 2.0, 1.0, 2.0
|
|
b = 2.0, 3.0, 4.0, 5.0
|
|
validate 2.0, 6.0, 4.0, 10.0
|
|
|
|
aarch64 = fmul
|
|
generate float64x*_t
|
|
|
|
arm = vmul.
|
|
generate float*_t
|
|
|
|
/// Signed multiply long
// Per the generate list, each result lane is twice as wide as the input lanes (e.g. int8x8_t -> int16x8_t).
// Test vectors: validate[i] = a[i] * b[i], e.g. 16 * 2 = 32.
|
|
name = vmull
|
|
a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
|
|
b = 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2
|
|
validate 1, 4, 3, 8, 5, 12, 7, 16, 9, 20, 11, 24, 13, 28, 15, 32
|
|
|
|
arm = vmull.s
|
|
aarch64 = smull
|
|
link-arm = vmulls._EXT_
|
|
link-aarch64 = smull._EXT_
|
|
generate int8x8_t:int8x8_t:int16x8_t, int16x4_t:int16x4_t:int32x4_t, int32x2_t:int32x2_t:int64x2_t
|
|
|
|
/// Signed multiply long
// `a` and `b` are each shuffled down to half length, then `vmull` is applied to the halves.
// Test vectors: lanes 8..15 of a (9..16) times lanes 8..15 of b (1, 2, ...) = 9, 20, 11, 24, ...
// NOTE(review): `{fixed-half-right}` presumably expands to the upper half of `fixed` (8..15) - confirm against the generator.
|
|
name = vmull_high
|
|
no-q
|
|
multi_fn = simd_shuffle-out_len-noext, a:half, a, a, {fixed-half-right}
|
|
multi_fn = simd_shuffle-out_len-noext, b:half, b, b, {fixed-half-right}
|
|
multi_fn = vmull-noqself-noext, a, b
|
|
a = 1, 2, 9, 10, 9, 10, 11, 12, 9, 10, 11, 12, 13, 14, 15, 16
|
|
b = 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2
|
|
fixed = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
|
|
validate 9, 20, 11, 24, 13, 28, 15, 32
|
|
|
|
aarch64 = smull2
|
|
generate int8x16_t:int8x16_t:int16x8_t, int16x8_t:int16x8_t:int32x4_t, int32x4_t:int32x4_t:int64x2_t
|
|
|
|
/// Unsigned multiply long
// Per the generate list, each result lane is twice as wide as the input lanes (e.g. uint8x8_t -> uint16x8_t).
// Test vectors: validate[i] = a[i] * b[i], e.g. 8 * 2 = 16.
|
|
name = vmull
|
|
a = 1, 2, 3, 4, 5, 6, 7, 8
|
|
b = 1, 2, 1, 2, 1, 2, 1, 2
|
|
validate 1, 4, 3, 8, 5, 12, 7, 16
|
|
|
|
arm = vmull.s
|
|
aarch64 = umull
|
|
link-arm = vmullu._EXT_
|
|
link-aarch64 = umull._EXT_
|
|
generate uint8x8_t:uint8x8_t:uint16x8_t, uint16x4_t:uint16x4_t:uint32x4_t, uint32x2_t:uint32x2_t:uint64x2_t
|
|
|
|
/// Unsigned multiply long
// `a` and `b` are each shuffled down to half length, then `vmull` is applied to the halves.
// Test vectors: lanes 8..15 of a (9..16) times lanes 8..15 of b (1, 2, ...) = 9, 20, 11, 24, ...
// NOTE(review): `{fixed-half-right}` presumably expands to the upper half of `fixed` (8..15) - confirm against the generator.
|
|
name = vmull_high
|
|
no-q
|
|
multi_fn = simd_shuffle-out_len-noext, a:half, a, a, {fixed-half-right}
|
|
multi_fn = simd_shuffle-out_len-noext, b:half, b, b, {fixed-half-right}
|
|
multi_fn = vmull-noqself-noext, a, b
|
|
a = 1, 2, 9, 10, 9, 10, 11, 12, 9, 10, 11, 12, 13, 14, 15, 16
|
|
b = 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2
|
|
fixed = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
|
|
validate 9, 20, 11, 24, 13, 28, 15, 32
|
|
|
|
aarch64 = umull2
|
|
generate uint8x16_t:uint8x16_t:uint16x8_t, uint16x8_t:uint16x8_t:uint32x4_t, uint32x4_t:uint32x4_t:uint64x2_t
|
|
|
|
/// Polynomial multiply long
// Expected values are carry-less products, so they differ from integer multiplication:
// e.g. 6 * 3 -> 10 (0b110 ^ 0b1100) rather than 18, while 2 * 3 -> 6 and 8 * 3 -> 24 happen to coincide.
|
|
name = vmull
|
|
a = 1, 2, 3, 4, 5, 6, 7, 8
|
|
b = 1, 3, 1, 3, 1, 3, 1, 3
|
|
validate 1, 6, 3, 12, 5, 10, 7, 24
|
|
|
|
arm = vmull.s
|
|
aarch64 = pmull
|
|
link-arm = vmullp._EXT_
|
|
link-aarch64 = pmull._EXT_
|
|
generate poly8x8_t:poly8x8_t:poly16x8_t
|
|
|
|
/// Polynomial multiply long
// `a` and `b` are each shuffled down to half length, then `vmull` is applied to the halves.
// Test vectors are carry-less products of lanes 8..15: e.g. 10 * 3 -> 30 (0b1010 ^ 0b10100), 14 * 3 -> 18.
// NOTE(review): `{fixed-half-right}` presumably expands to the upper half of `fixed` (8..15) - confirm against the generator.
// NOTE(review): the aarch64 check is `pmull`, whereas the signed/unsigned vmull_high sections check `smull2`/`umull2` - confirm this is intended.
|
|
name = vmull_high
|
|
no-q
|
|
multi_fn = simd_shuffle-out_len-noext, a:half, a, a, {fixed-half-right}
|
|
multi_fn = simd_shuffle-out_len-noext, b:half, b, b, {fixed-half-right}
|
|
multi_fn = vmull-noqself-noext, a, b
|
|
a = 1, 2, 9, 10, 9, 10, 11, 12, 9, 10, 11, 12, 13, 14, 15, 16
|
|
b = 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3
|
|
fixed = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
|
|
validate 9, 30, 11, 20, 13, 18, 15, 48
|
|
|
|
aarch64 = pmull
|
|
generate poly8x16_t:poly8x16_t:poly16x8_t
|
|
|
|
/// Divide
// Calls simd_div directly. Only an aarch64 check is given (no `arm` line), which per the
// file header means the intrinsic is generated for aarch64 only.
// Test vectors: validate[i] = a[i] / b[i], e.g. 6.0 / 2.0 = 3.0.
|
|
name = vdiv
|
|
fn = simd_div
|
|
a = 2.0, 6.0, 4.0, 10.0
|
|
b = 1.0, 2.0, 1.0, 2.0
|
|
validate 2.0, 3.0, 4.0, 5.0
|
|
|
|
aarch64 = fdiv
|
|
generate float*_t, float64x*_t
|
|
|
|
/// Subtract
// Integer variant: calls simd_sub directly (`fn`), no link targets.
// Test vectors: validate[i] = a[i] - b[i], e.g. 3 - 1 = 2 and 16 - 2 = 14.
|
|
name = vsub
|
|
a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
|
|
b = 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2
|
|
validate 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14
|
|
arm = vsub.
|
|
aarch64 = sub
|
|
fn = simd_sub
|
|
generate int*_t, uint*_t, int64x*_t, uint64x*_t
|
|
|
|
/// Subtract
// Floating-point variant: calls simd_sub directly (`fn`).
// The first group (aarch64 check only) generates float64x*_t; the second adds the arm check for float*_t.
// Test vectors: validate[i] = a[i] - b[i], e.g. 4.0 - 2.0 = 2.0.
|
|
name = vsub
|
|
fn = simd_sub
|
|
a = 1.0, 4.0, 3.0, 8.0
|
|
b = 1.0, 2.0, 3.0, 4.0
|
|
validate 0.0, 2.0, 0.0, 4.0
|
|
|
|
aarch64 = fsub
|
|
generate float64x*_t
|
|
|
|
arm = vsub.
|
|
generate float*_t
|
|
|
|
|
|
/// Halving subtract
// This header is shared by both groups below: unsigned (uhsub / vhsubu) and signed (shsub / vhsubs),
// so it must not name a signedness.
// Test vectors: (a[i] - b[i]) / 2, e.g. (3 - 1) / 2 = 1 and (16 - 2) / 2 = 7.
|
|
name = vhsub
|
|
a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
|
|
b = 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2
|
|
validate 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7
|
|
|
|
arm = vhsub.s
|
|
aarch64 = uhsub
|
|
link-arm = vhsubu._EXT_
|
|
link-aarch64 = uhsub._EXT_
|
|
generate uint*_t
|
|
|
|
arm = vhsub.s
|
|
aarch64 = shsub
|
|
link-arm = vhsubs._EXT_
|
|
link-aarch64 = shsub._EXT_
|
|
generate int*_t
|
|
|
|
/// Maximum (vector)
// Integer variant; one shared set of test vectors feeds two groups: signed (smax / vmaxs) and unsigned (umax / vmaxu).
// Test vectors: a ascends 1..16 and b descends 16..1, so the lane-wise maximum is 16..9 followed by 9..16.
|
|
name = vmax
|
|
a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
|
|
b = 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1
|
|
validate 16, 15, 14, 13, 12, 11, 10, 9, 9, 10, 11, 12, 13, 14, 15, 16
|
|
|
|
arm = vmax
|
|
aarch64 = smax
|
|
link-arm = vmaxs._EXT_
|
|
link-aarch64 = smax._EXT_
|
|
generate int*_t
|
|
|
|
arm = vmax
|
|
aarch64 = umax
|
|
link-arm = vmaxu._EXT_
|
|
link-aarch64 = umax._EXT_
|
|
generate uint*_t
|
|
|
|
/// Maximum (vector)
// Floating-point variant. The first group (aarch64 only) generates float64x*_t;
// the second adds the arm check and link target for float*_t.
// Test vectors mix signs: max(1.0, 0.0) = 1.0, max(-2.0, 3.0) = 3.0, max(-4.0, 8.0) = 8.0.
|
|
name = vmax
|
|
a = 1.0, -2.0, 3.0, -4.0
|
|
b = 0.0, 3.0, 2.0, 8.0
|
|
validate 1.0, 3.0, 3.0, 8.0
|
|
|
|
aarch64 = fmax
|
|
link-aarch64 = fmax._EXT_
|
|
generate float64x*_t
|
|
|
|
arm = vmax
|
|
aarch64 = fmax
|
|
link-arm = vmaxs._EXT_
|
|
link-aarch64 = fmax._EXT_
|
|
generate float*_t
|
|
|
|
/// Minimum (vector)
// Integer variant; one shared set of test vectors feeds two groups: signed (smin / vmins) and unsigned (umin / vminu).
// Test vectors: a ascends 1..16 and b descends 16..1, so the lane-wise minimum is 1..8 followed by 8..1.
|
|
name = vmin
|
|
a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
|
|
b = 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1
|
|
validate 1, 2, 3, 4, 5, 6, 7, 8, 8, 7, 6, 5, 4, 3, 2, 1
|
|
|
|
arm = vmin
|
|
aarch64 = smin
|
|
link-arm = vmins._EXT_
|
|
link-aarch64 = smin._EXT_
|
|
generate int*_t
|
|
|
|
arm = vmin
|
|
aarch64 = umin
|
|
link-arm = vminu._EXT_
|
|
link-aarch64 = umin._EXT_
|
|
generate uint*_t
|
|
|
|
/// Minimum (vector)
// Floating-point variant. The first group (aarch64 only) generates float64x*_t;
// the second adds the arm check and link target for float*_t.
// Test vectors mix signs: min(1.0, 0.0) = 0.0, min(-2.0, 3.0) = -2.0, min(-4.0, 8.0) = -4.0.
|
|
name = vmin
|
|
a = 1.0, -2.0, 3.0, -4.0
|
|
b = 0.0, 3.0, 2.0, 8.0
|
|
validate 0.0, -2.0, 2.0, -4.0
|
|
|
|
aarch64 = fmin
|
|
link-aarch64 = fmin._EXT_
|
|
generate float64x*_t
|
|
|
|
arm = vmin
|
|
aarch64 = fmin
|
|
link-arm = vmins._EXT_
|
|
link-aarch64 = fmin._EXT_
|
|
generate float*_t
|
|
|
|
/// Calculates the square root of each lane.
// Single-input intrinsic calling simd_fsqrt. Only an aarch64 check is given (no `arm` line),
// which per the file header means the intrinsic is generated for aarch64 only.
// Test vectors are perfect squares, so the expected roots 2.0, 3.0, 4.0, 5.0 are exact.
|
|
name = vsqrt
|
|
fn = simd_fsqrt
|
|
a = 4.0, 9.0, 16.0, 25.0
|
|
validate 2.0, 3.0, 4.0, 5.0
|
|
|
|
aarch64 = fsqrt
|
|
generate float*_t, float64x*_t
|
|
|
|
/// Reciprocal square-root estimate.
// Single-input intrinsic with link targets only (no `fn`).
// The expected values are estimates rather than exact 1/sqrt(a): e.g. 0.998046875 for a = 1.0
// and 0.4990234375 for a = 4.0 (exact results would be 1.0 and 0.5).
|
|
name = vrsqrte
|
|
a = 1.0, 2.0, 3.0, 4.0
|
|
validate 0.998046875, 0.705078125, 0.576171875, 0.4990234375
|
|
|
|
aarch64 = frsqrte
|
|
link-aarch64 = frsqrte._EXT_
|
|
generate float64x*_t
|
|
|
|
arm = vrsqrte
|
|
link-arm = vrsqrte._EXT_
|
|
generate float*_t
|
|
|
|
/// Reciprocal estimate.
// Single-input intrinsic with link targets only (no `fn`).
// The expected values are estimates rather than exact 1/a: e.g. 0.24951171875 for a = 4.0
// and 0.998046875 for a = 1.0 (exact results would be 0.25 and 1.0).
|
|
name = vrecpe
|
|
a = 4.0, 3.0, 2.0, 1.0
|
|
validate 0.24951171875, 0.3330078125, 0.4990234375, 0.998046875
|
|
|
|
aarch64 = frecpe
|
|
link-aarch64 = frecpe._EXT_
|
|
generate float64x*_t
|
|
|
|
arm = vrecpe
|
|
link-arm = vrecpe._EXT_
|
|
generate float*_t
|
|
|
|
/// Transpose vectors
// Implemented as a single simd_shuffle of a and b with the `{transpose-1-in_len}` index pattern.
// Test vectors: the result interleaves the even-indexed lanes of a and b
// (a[0], b[0], a[2], b[2], ...), giving 0, 1, 4, 5, 8, 9, ...
|
|
name = vtrn1
|
|
multi_fn = simd_shuffle-in_len-noext, a, b, {transpose-1-in_len}
|
|
a = 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
|
|
b = 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
|
|
validate 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29
|
|
|
|
aarch64 = trn1
|
|
generate int*_t, int64x2_t, uint*_t, uint64x2_t, poly8x8_t:poly8x8_t, poly8x16_t:poly8x16_t, poly16x4_t:poly16x4_t, poly16x8_t:poly16x8_t, poly64x2_t:poly64x2_t
|
|
|
|
/// Transpose vectors
// Floating-point variant of the same simd_shuffle with the `{transpose-1-in_len}` index pattern.
// Test vectors: the result interleaves the even-indexed lanes of a and b
// (a[0], b[0], a[2], b[2], ...), giving 0., 1., 4., 5., ...
|
|
name = vtrn1
|
|
multi_fn = simd_shuffle-in_len-noext, a, b, {transpose-1-in_len}
|
|
a = 0., 2., 4., 6., 8., 10., 12., 14.
|
|
b = 1., 3., 5., 7., 9., 11., 13., 15.
|
|
validate 0., 1., 4., 5., 8., 9., 12., 13.
|
|
|
|
aarch64 = trn1
|
|
generate float32x2_t:float32x2_t, float32x4_t:float32x4_t, float64x2_t:float64x2_t
|
|
|
|
/// Transpose vectors
// Implemented as a single simd_shuffle of a and b with the `{transpose-2-in_len}` index pattern.
// Test vectors: the result interleaves the odd-indexed lanes of a and b
// (a[1], b[1], a[3], b[3], ...), giving 2, 3, 6, 7, 10, 11, ...
|
|
name = vtrn2
|
|
multi_fn = simd_shuffle-in_len-noext, a, b, {transpose-2-in_len}
|
|
a = 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
|
|
b = 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
|
|
validate 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31
|
|
|
|
aarch64 = trn2
|
|
generate int*_t, int64x2_t, uint*_t, uint64x2_t, poly8x8_t:poly8x8_t, poly8x16_t:poly8x16_t, poly16x4_t:poly16x4_t, poly16x8_t:poly16x8_t, poly64x2_t:poly64x2_t
|
|
|
|
/// Transpose vectors
|
|
name = vtrn2
|
|
multi_fn = simd_shuffle-in_len-noext, a, b, {transpose-2-in_len}
|
|
a = 0., 2., 4., 6., 8., 10., 12., 14.
|
|
b = 1., 3., 5., 7., 9., 11., 13., 15.
|
|
validate 2., 3., 6., 7., 10., 11., 14., 15.
|
|
|
|
aarch64 = trn2
|
|
generate float32x2_t:float32x2_t, float32x4_t:float32x4_t, float64x2_t:float64x2_t
|