rsimd

Make SIMD instruction sets easier to use
git clone git://git.meso-star.fr/rsimd.git
Log | Files | Refs | README | LICENSE

commit 9870c031f34427d5d4a01e34fd5f72a42a8e0f28
parent 18a76e8311f942a4c4128e892a0d96912560fca5
Author: vaplv <vaplv@free.fr>
Date:   Mon, 21 May 2018 12:14:55 +0200

Add and test the v8f_T API

Diffstat:
Mcmake/CMakeLists.txt | 5+++++
Asrc/avx/avx.h | 27+++++++++++++++++++++++++++
Asrc/avx/avxf.h | 330+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Msrc/rsimd.h | 5+++--
Msrc/sse/ssef.h | 2+-
Asrc/test_v8f.c | 450+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
6 files changed, 816 insertions(+), 3 deletions(-)

diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt @@ -37,6 +37,7 @@ include(rcmake) if(CMAKE_COMPILER_IS_GNUCC) include(CheckCCompilerFlag) CHECK_C_COMPILER_FLAG("-msse4.1" SSE4_1) + CHECK_C_COMPILER_FLAG("-mavx" AVX) endif(CMAKE_COMPILER_IS_GNUCC) ################################################################################ @@ -112,6 +113,10 @@ if(NOT NO_TEST) new_test_named(test_v4i_sse4_1 test_v4i "-msse4.1") endif(SSE4_1 AND CMAKE_COMPILER_IS_GNUCC) + if(AVX AND CMAKE_COMPILER_IS_GNUCC) + new_test(test_v8f "-mavx") + endif(AVX AND CMAKE_COMPILER_IS_GNUCC) + endif(NOT NO_TEST) ################################################################################ diff --git a/src/avx/avx.h b/src/avx/avx.h @@ -0,0 +1,27 @@ +/* Copyright (C) 2014-2018 Vincent Forest (vaplv@free.fr) + * + * The RSIMD library is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * The RSIMD library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with the RSIMD library. If not, see <http://www.gnu.org/licenses/>. 
*/ + +#ifndef RSIMD_AVX_H +#define RSIMD_AVX_H + +#include "avxf.h" + +typedef __m256i v8i_T; + +/* Reinterpret cast */ +static FINLINE v8i_T v8f_rcast_v8i(const v8f_T v) {return _mm256_castps_si256(v);} +static FINLINE v8f_T v8i_rcast_v8f(const v8i_T v) {return _mm256_castsi256_ps(v);} + +#endif /* RSIMD_AVX_H */ diff --git a/src/avx/avxf.h b/src/avx/avxf.h @@ -0,0 +1,330 @@ +/* Copyright (C) 2014-2018 Vincent Forest (vaplv@free.fr) + * + * The RSIMD library is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * The RSIMD library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with the RSIMD library. If not, see <http://www.gnu.org/licenses/>. 
*/ + +#ifndef RSIMD_AVXF_H +#define RSIMD_AVXF_H + +/* + * 8 packed single precision floating-point values + */ + +#include "avx.h" + +#include <rsys/math.h> +#include <immintrin.h> + +typedef __m256 v8f_T; + +/******************************************************************************* + * Set operations + ******************************************************************************/ +static FINLINE float* +v8f_store(float dst[8], v8f_T v) +{ + ASSERT(dst && IS_ALIGNED(dst, 32)); + _mm256_store_ps(dst, v); + return dst; +} + +static FINLINE v8f_T +v8f_load(const float src[8]) +{ + ASSERT(src && IS_ALIGNED(src, 32)); + return _mm256_load_ps(src); +} + +static FINLINE v8f_T +v8f_loadu(const float f[8]) +{ + ASSERT(f); + return _mm256_set_ps(f[7], f[6], f[5], f[4], f[3],f[2], f[1], f[0]); +} + +static FINLINE v8f_T +v8f_set1(const float x) +{ + return _mm256_set1_ps(x); +} + +static FINLINE v8f_T +v8f_set + (const float a, const float b, const float c, const float d, + const float e, const float f, const float g, const float h) +{ + return _mm256_set_ps(h, g, f, e, d, c, b, a); +} + +static FINLINE v8f_T +v8f_zero(void) +{ + return _mm256_setzero_ps(); +} + +static FINLINE v8f_T +v8f_mask + (const int32_t a, const int32_t b, const int32_t c, const int32_t d, + const int32_t e, const int32_t f, const int32_t g, const int32_t h) +{ + return _mm256_castsi256_ps(_mm256_set_epi32(h, g, f, e, d, c, b, a)); +} + +static FINLINE v8f_T +v8f_mask1(const int32_t x) +{ + return _mm256_castsi256_ps(_mm256_set1_epi32(x)); +} + +static FINLINE v8f_T +v8f_true(void) +{ + return _mm256_castsi256_ps(_mm256_set1_epi32(~0)); +} + +static FINLINE v8f_T +v8f_false(void) +{ + return v8f_zero(); +} + +/******************************************************************************* + * Extract components + ******************************************************************************/ +static FINLINE v4f_T +v8f_abcd(const v8f_T v) +{ + return _mm256_extractf128_ps(v, 0); +} + +static 
FINLINE v4f_T +v8f_efgh(const v8f_T v) +{ + return _mm256_extractf128_ps(v, 1); +} + +static FINLINE int +v8f_movemask(const v8f_T v) +{ + return _mm256_movemask_ps(v); +} + +/******************************************************************************* + * Bitwise operations + ******************************************************************************/ +static FINLINE v8f_T +v8f_or(const v8f_T v0, const v8f_T v1) +{ + return _mm256_or_ps(v0, v1); +} + +static FINLINE v8f_T +v8f_and(const v8f_T v0, const v8f_T v1) +{ + return _mm256_and_ps(v0, v1); +} + +static FINLINE v8f_T +v8f_andnot(const v8f_T v0, const v8f_T v1) +{ + return _mm256_andnot_ps(v0, v1); +} + +static FINLINE v8f_T +v8f_xor(const v8f_T v0, const v8f_T v1) +{ + return _mm256_xor_ps(v0, v1); +} + +static FINLINE v8f_T +v8f_sel(const v8f_T vfalse, const v8f_T vtrue, const v8f_T vcond) +{ + return _mm256_blendv_ps(vfalse, vtrue, vcond); +} + +/******************************************************************************* + * Arithmetic operations + ******************************************************************************/ +static FINLINE v8f_T +v8f_minus(const v8f_T v) +{ + return v8f_xor(v8f_set1(-0.f), v); +} + +static FINLINE v8f_T +v8f_add(const v8f_T v0, const v8f_T v1) +{ + return _mm256_add_ps(v0, v1); +} + +static FINLINE v8f_T +v8f_sub(const v8f_T v0, const v8f_T v1) +{ + return _mm256_sub_ps(v0, v1); +} + +static FINLINE v8f_T +v8f_mul(const v8f_T v0, const v8f_T v1) +{ + return _mm256_mul_ps(v0, v1); +} + +static FINLINE v8f_T +v8f_div(const v8f_T v0, const v8f_T v1) +{ + return _mm256_div_ps(v0, v1); +} + +static FINLINE v8f_T +v8f_madd(const v8f_T v0, const v8f_T v1, const v8f_T v2) +{ + return _mm256_add_ps(_mm256_mul_ps(v0, v1), v2); +} + +static FINLINE v8f_T +v8f_abs(const v8f_T v) +{ + const union { int32_t i; float f; } mask = { 0x7fffffff }; + return v8f_and(v, v8f_set1(mask.f)); +} + +static FINLINE v8f_T +v8f_sqrt(const v8f_T v) +{ + return _mm256_sqrt_ps(v); +} + 
+static FINLINE v8f_T +v8f_rsqrte(const v8f_T v) +{ + return _mm256_rsqrt_ps(v); +} + +static FINLINE v8f_T +v8f_rsqrt(const v8f_T v) +{ + const v8f_T y = v8f_rsqrte(v); + const v8f_T yyv = v8f_mul(v8f_mul(y, y), v); + const v8f_T tmp = v8f_sub(v8f_set1(1.5f), v8f_mul(yyv, v8f_set1(0.5f))); + return v8f_mul(tmp, y); +} + +static FINLINE v8f_T +v8f_rcpe(const v8f_T v) +{ + return _mm256_rcp_ps(v); +} + +static FINLINE v8f_T +v8f_rcp(const v8f_T v) +{ + const v8f_T y = v8f_rcpe(v); + const v8f_T tmp = v8f_sub(v8f_set1(2.f), v8f_mul(y, v)); + return v8f_mul(tmp, y); +} + +static FINLINE v8f_T +v8f_lerp(const v8f_T from, const v8f_T to, const v8f_T param) +{ + return v8f_madd(v8f_sub(to, from), param, from); +} + +/******************************************************************************* + * Comparators + ******************************************************************************/ +static FINLINE v8f_T +v8f_eq(const v8f_T v0, const v8f_T v1) +{ + return _mm256_cmp_ps(v0, v1, _CMP_EQ_OS); +} + +static FINLINE v8f_T +v8f_neq(const v8f_T v0, const v8f_T v1) +{ + return _mm256_cmp_ps(v0, v1, _CMP_NEQ_OS); +} + +static FINLINE v8f_T +v8f_ge(const v8f_T v0, const v8f_T v1) +{ + return _mm256_cmp_ps(v0, v1, _CMP_GE_OS); +} + +static FINLINE v8f_T +v8f_le(const v8f_T v0, const v8f_T v1) +{ + return _mm256_cmp_ps(v0, v1, _CMP_LE_OS); +} + +static FINLINE v8f_T +v8f_gt(const v8f_T v0, const v8f_T v1) +{ + return _mm256_cmp_ps(v0, v1, _CMP_GT_OS); +} + +static FINLINE v8f_T +v8f_lt(const v8f_T v0, const v8f_T v1) +{ + return _mm256_cmp_ps(v0, v1, _CMP_LT_OS); +} + +static FINLINE v8f_T +v8f_eq_eps(const v8f_T v0, const v8f_T v1, const v8f_T eps) +{ + return v8f_le(v8f_abs(v8f_sub(v0, v1)), eps); +} + +static FINLINE v8f_T +v8f_min(const v8f_T v0, const v8f_T v1) +{ + return _mm256_min_ps(v0, v1); +} + +static FINLINE v8f_T +v8f_max(const v8f_T v0, const v8f_T v1) +{ + return _mm256_max_ps(v0, v1); +} + +static FINLINE float +v8f_reduce_min(const v8f_T v0) +{ + ALIGN(32) 
float tmp[8]; + const v8f_T v1 = _mm256_permute_ps(v0, _MM_SHUFFLE(1, 0, 3, 2)); + const v8f_T v2 = _mm256_min_ps(v0, v1); + const v8f_T v3 = _mm256_permute_ps(v2, _MM_SHUFFLE(2, 3, 0, 1)); + const v8f_T v4 = _mm256_min_ps(v2, v3); + _mm256_store_ps(tmp, v4); + return MMIN(tmp[0], tmp[4]); +} + +static FINLINE float +v8f_reduce_max(const v8f_T v0) +{ + ALIGN(32) float tmp[8]; + const v8f_T v1 = _mm256_permute_ps(v0, _MM_SHUFFLE(1, 0, 3, 2)); + const v8f_T v2 = _mm256_max_ps(v0, v1); + const v8f_T v3 = _mm256_permute_ps(v2, _MM_SHUFFLE(2, 3, 0, 1)); + const v8f_T v4 = _mm256_max_ps(v2, v3); + _mm256_store_ps(tmp, v4); + return MMAX(tmp[0], tmp[4]); +} + +static FINLINE v8f_T +v8f_clamp(const v8f_T v, const v8f_T vmin, const v8f_T vmax) +{ + return v8f_min(v8f_max(v, vmin), vmax); +} + +#endif /* RSIMD_AVXF_H */ + diff --git a/src/rsimd.h b/src/rsimd.h @@ -28,8 +28,9 @@ #ifdef SIMD_SSE2 #include "sse/sse.h" -#else - #error Unsupported_Platform +#endif +#ifdef SIMD_AVX + #include "avx/avx.h" #endif #endif /* RSIMD_H */ diff --git a/src/sse/ssef.h b/src/sse/ssef.h @@ -69,7 +69,7 @@ v4f_loadu3(const float src[3]) } static FINLINE v4f_T -v4f_set1(float x) +v4f_set1(const float x) { return _mm_set1_ps(x); } diff --git a/src/test_v8f.c b/src/test_v8f.c @@ -0,0 +1,450 @@ +/* Copyright (C) 2014-2018 Vincent Forest (vaplv@free.fr) + * + * The RSIMD library is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * The RSIMD library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with the RSIMD library. 
If not, see <http://www.gnu.org/licenses/>. */ + +#include "rsimd.h" + +int +main(int argc, char** argv) +{ + v8f_T i, j, k; + ALIGN(32) union { int32_t i[8]; float f[8]; } cast; + ALIGN(32) float tmp[9] = {0.f,1.f,2.f,3.f,4.f,5.f,6.f,7.f,8.f}; + (void)argc, (void)argv; + + i = v8f_loadu(tmp+1); + CHK(v4f_x(v8f_abcd(i)) == 1.f); + CHK(v4f_y(v8f_abcd(i)) == 2.f); + CHK(v4f_z(v8f_abcd(i)) == 3.f); + CHK(v4f_w(v8f_abcd(i)) == 4.f); + CHK(v4f_x(v8f_efgh(i)) == 5.f); + CHK(v4f_y(v8f_efgh(i)) == 6.f); + CHK(v4f_z(v8f_efgh(i)) == 7.f); + CHK(v4f_w(v8f_efgh(i)) == 8.f); + + i = v8f_load(tmp); + CHK(v4f_x(v8f_abcd(i)) == 0.f); + CHK(v4f_y(v8f_abcd(i)) == 1.f); + CHK(v4f_z(v8f_abcd(i)) == 2.f); + CHK(v4f_w(v8f_abcd(i)) == 3.f); + CHK(v4f_x(v8f_efgh(i)) == 4.f); + CHK(v4f_y(v8f_efgh(i)) == 5.f); + CHK(v4f_z(v8f_efgh(i)) == 6.f); + CHK(v4f_w(v8f_efgh(i)) == 7.f); + + tmp[0] = tmp[1] = tmp[2] = tmp[3] = 0.f; + tmp[4] = tmp[5] = tmp[6] = tmp[7] = 0.f; + CHK(v8f_store(tmp, i) == tmp); + CHK(tmp[0] == 0.f); + CHK(tmp[1] == 1.f); + CHK(tmp[2] == 2.f); + CHK(tmp[3] == 3.f); + CHK(tmp[4] == 4.f); + CHK(tmp[5] == 5.f); + CHK(tmp[6] == 6.f); + CHK(tmp[7] == 7.f); + CHK(tmp[8] == 8.f); + + i = v8f_set1(-2.f); + CHK(v4f_x(v8f_abcd(i)) == -2.f); + CHK(v4f_y(v8f_abcd(i)) == -2.f); + CHK(v4f_z(v8f_abcd(i)) == -2.f); + CHK(v4f_w(v8f_abcd(i)) == -2.f); + CHK(v4f_x(v8f_efgh(i)) == -2.f); + CHK(v4f_y(v8f_efgh(i)) == -2.f); + CHK(v4f_z(v8f_efgh(i)) == -2.f); + CHK(v4f_w(v8f_efgh(i)) == -2.f); + + i = v8f_set(0.f,1.f,2.f,3.f,4.f,5.f,6.f,7.f); + CHK(v4f_x(v8f_abcd(i)) == 0.f); + CHK(v4f_y(v8f_abcd(i)) == 1.f); + CHK(v4f_z(v8f_abcd(i)) == 2.f); + CHK(v4f_w(v8f_abcd(i)) == 3.f); + CHK(v4f_x(v8f_efgh(i)) == 4.f); + CHK(v4f_y(v8f_efgh(i)) == 5.f); + CHK(v4f_z(v8f_efgh(i)) == 6.f); + CHK(v4f_w(v8f_efgh(i)) == 7.f); + + i = v8f_zero(); + CHK(v4f_x(v8f_abcd(i)) == 0.f); + CHK(v4f_y(v8f_abcd(i)) == 0.f); + CHK(v4f_z(v8f_abcd(i)) == 0.f); + CHK(v4f_w(v8f_abcd(i)) == 0.f); + CHK(v4f_x(v8f_efgh(i)) == 0.f); 
+ CHK(v4f_y(v8f_efgh(i)) == 0.f); + CHK(v4f_z(v8f_efgh(i)) == 0.f); + CHK(v4f_w(v8f_efgh(i)) == 0.f); + + i = v8f_mask(~0,~0,0,0,0,~0,~0,0); + cast.f[0] = v4f_x(v8f_abcd(i)); CHK(cast.i[0] == (int32_t)0xFFFFFFFF); + cast.f[1] = v4f_y(v8f_abcd(i)); CHK(cast.i[1] == (int32_t)0xFFFFFFFF); + cast.f[2] = v4f_z(v8f_abcd(i)); CHK(cast.i[2] == (int32_t)0x00000000); + cast.f[3] = v4f_w(v8f_abcd(i)); CHK(cast.i[3] == (int32_t)0x00000000); + cast.f[4] = v4f_x(v8f_efgh(i)); CHK(cast.i[4] == (int32_t)0x00000000); + cast.f[5] = v4f_y(v8f_efgh(i)); CHK(cast.i[5] == (int32_t)0xFFFFFFFF); + cast.f[6] = v4f_z(v8f_efgh(i)); CHK(cast.i[6] == (int32_t)0xFFFFFFFF); + cast.f[7] = v4f_w(v8f_efgh(i)); CHK(cast.i[7] == (int32_t)0x00000000); + + i = v8f_mask1(~0); + cast.f[0] = v4f_x(v8f_abcd(i)); CHK(cast.i[0] == (int32_t)0xFFFFFFFF); + cast.f[1] = v4f_y(v8f_abcd(i)); CHK(cast.i[1] == (int32_t)0xFFFFFFFF); + cast.f[2] = v4f_z(v8f_abcd(i)); CHK(cast.i[2] == (int32_t)0xFFFFFFFF); + cast.f[3] = v4f_w(v8f_abcd(i)); CHK(cast.i[3] == (int32_t)0xFFFFFFFF); + cast.f[4] = v4f_x(v8f_efgh(i)); CHK(cast.i[4] == (int32_t)0xFFFFFFFF); + cast.f[5] = v4f_y(v8f_efgh(i)); CHK(cast.i[5] == (int32_t)0xFFFFFFFF); + cast.f[6] = v4f_z(v8f_efgh(i)); CHK(cast.i[6] == (int32_t)0xFFFFFFFF); + cast.f[7] = v4f_w(v8f_efgh(i)); CHK(cast.i[7] == (int32_t)0xFFFFFFFF); + + i = v8f_true(); + cast.f[0] = v4f_x(v8f_abcd(i)); CHK(cast.i[0] == (int32_t)0xFFFFFFFF); + cast.f[1] = v4f_y(v8f_abcd(i)); CHK(cast.i[1] == (int32_t)0xFFFFFFFF); + cast.f[2] = v4f_z(v8f_abcd(i)); CHK(cast.i[2] == (int32_t)0xFFFFFFFF); + cast.f[3] = v4f_w(v8f_abcd(i)); CHK(cast.i[3] == (int32_t)0xFFFFFFFF); + cast.f[4] = v4f_x(v8f_efgh(i)); CHK(cast.i[4] == (int32_t)0xFFFFFFFF); + cast.f[5] = v4f_y(v8f_efgh(i)); CHK(cast.i[5] == (int32_t)0xFFFFFFFF); + cast.f[6] = v4f_z(v8f_efgh(i)); CHK(cast.i[6] == (int32_t)0xFFFFFFFF); + cast.f[7] = v4f_w(v8f_efgh(i)); CHK(cast.i[7] == (int32_t)0xFFFFFFFF); + + i = v8f_false(); + cast.f[0] = v4f_x(v8f_abcd(i)); 
CHK(cast.i[0] == (int32_t)0x00000000); + cast.f[1] = v4f_y(v8f_abcd(i)); CHK(cast.i[1] == (int32_t)0x00000000); + cast.f[2] = v4f_z(v8f_abcd(i)); CHK(cast.i[2] == (int32_t)0x00000000); + cast.f[3] = v4f_w(v8f_abcd(i)); CHK(cast.i[3] == (int32_t)0x00000000); + cast.f[4] = v4f_x(v8f_efgh(i)); CHK(cast.i[4] == (int32_t)0x00000000); + cast.f[5] = v4f_y(v8f_efgh(i)); CHK(cast.i[5] == (int32_t)0x00000000); + cast.f[6] = v4f_z(v8f_efgh(i)); CHK(cast.i[6] == (int32_t)0x00000000); + cast.f[7] = v4f_w(v8f_efgh(i)); CHK(cast.i[7] == (int32_t)0x00000000); + + i = v8f_mask(~0,~0,0,0,0,~0,~0,0); + j = v8f_mask(~0,0,~0,0,0,~0,0,~0); + k = v8f_or(i, j); + cast.f[0] = v4f_x(v8f_abcd(k)); CHK(cast.i[0] == (int32_t)0xFFFFFFFF); + cast.f[1] = v4f_y(v8f_abcd(k)); CHK(cast.i[1] == (int32_t)0xFFFFFFFF); + cast.f[2] = v4f_z(v8f_abcd(k)); CHK(cast.i[2] == (int32_t)0xFFFFFFFF); + cast.f[3] = v4f_w(v8f_abcd(k)); CHK(cast.i[3] == (int32_t)0x00000000); + cast.f[4] = v4f_x(v8f_efgh(k)); CHK(cast.i[4] == (int32_t)0x00000000); + cast.f[5] = v4f_y(v8f_efgh(k)); CHK(cast.i[5] == (int32_t)0xFFFFFFFF); + cast.f[6] = v4f_z(v8f_efgh(k)); CHK(cast.i[6] == (int32_t)0xFFFFFFFF); + cast.f[7] = v4f_w(v8f_efgh(k)); CHK(cast.i[7] == (int32_t)0xFFFFFFFF); + + k = v8f_and(i, j); + cast.f[0] = v4f_x(v8f_abcd(k)); CHK(cast.i[0] == (int32_t)0xFFFFFFFF); + cast.f[1] = v4f_y(v8f_abcd(k)); CHK(cast.i[1] == (int32_t)0x00000000); + cast.f[2] = v4f_z(v8f_abcd(k)); CHK(cast.i[2] == (int32_t)0x00000000); + cast.f[3] = v4f_w(v8f_abcd(k)); CHK(cast.i[3] == (int32_t)0x00000000); + cast.f[4] = v4f_x(v8f_efgh(k)); CHK(cast.i[4] == (int32_t)0x00000000); + cast.f[5] = v4f_y(v8f_efgh(k)); CHK(cast.i[5] == (int32_t)0xFFFFFFFF); + cast.f[6] = v4f_z(v8f_efgh(k)); CHK(cast.i[6] == (int32_t)0x00000000); + cast.f[7] = v4f_w(v8f_efgh(k)); CHK(cast.i[7] == (int32_t)0x00000000); + + k = v8f_andnot(i, j); + cast.f[0] = v4f_x(v8f_abcd(k)); CHK(cast.i[0] == (int32_t)0x00000000); + cast.f[1] = v4f_y(v8f_abcd(k)); CHK(cast.i[1] == 
(int32_t)0x00000000); + cast.f[2] = v4f_z(v8f_abcd(k)); CHK(cast.i[2] == (int32_t)0xFFFFFFFF); + cast.f[3] = v4f_w(v8f_abcd(k)); CHK(cast.i[3] == (int32_t)0x00000000); + cast.f[4] = v4f_x(v8f_efgh(k)); CHK(cast.i[4] == (int32_t)0x00000000); + cast.f[5] = v4f_y(v8f_efgh(k)); CHK(cast.i[5] == (int32_t)0x00000000); + cast.f[6] = v4f_z(v8f_efgh(k)); CHK(cast.i[6] == (int32_t)0x00000000); + cast.f[7] = v4f_w(v8f_efgh(k)); CHK(cast.i[7] == (int32_t)0xFFFFFFFF); + + k = v8f_xor(i, j); + cast.f[0] = v4f_x(v8f_abcd(k)); CHK(cast.i[0] == (int32_t)0x00000000); + cast.f[1] = v4f_y(v8f_abcd(k)); CHK(cast.i[1] == (int32_t)0xFFFFFFFF); + cast.f[2] = v4f_z(v8f_abcd(k)); CHK(cast.i[2] == (int32_t)0xFFFFFFFF); + cast.f[3] = v4f_w(v8f_abcd(k)); CHK(cast.i[3] == (int32_t)0x00000000); + cast.f[4] = v4f_x(v8f_efgh(k)); CHK(cast.i[4] == (int32_t)0x00000000); + cast.f[5] = v4f_y(v8f_efgh(k)); CHK(cast.i[5] == (int32_t)0x00000000); + cast.f[6] = v4f_z(v8f_efgh(k)); CHK(cast.i[6] == (int32_t)0xFFFFFFFF); + cast.f[7] = v4f_w(v8f_efgh(k)); CHK(cast.i[7] == (int32_t)0xFFFFFFFF); + + CHK(v8f_movemask(k) == 0xC6); + i = v8f_mask + ((int32_t)0x01020401, (int32_t)0x80605040, (int32_t)0x7F1F2F3F, (int32_t)0, + (int32_t)0xF0000000, (int32_t)0xFFFFFFFF, (int32_t)0x7FFFFFFF, (int32_t)~0); + CHK(v8f_movemask(i) == 0xB2); + + i = v8f_set(0.f,1.f,2.f,3.f,4.f,5.f,6.f,7.f); + j = v8f_set(8.f,9.f,10.f,11.f,12.f,13.f,14.f,15.f); + k = v8f_sel(i, j, v8f_mask(~0,~0,0,0,0,~0,~0,0)); + CHK(v4f_x(v8f_abcd(k)) == 8.f); + CHK(v4f_y(v8f_abcd(k)) == 9.f); + CHK(v4f_z(v8f_abcd(k)) == 2.f); + CHK(v4f_w(v8f_abcd(k)) == 3.f); + CHK(v4f_x(v8f_efgh(k)) == 4.f); + CHK(v4f_y(v8f_efgh(k)) == 13.f); + CHK(v4f_z(v8f_efgh(k)) == 14.f); + CHK(v4f_w(v8f_efgh(k)) == 7.f); + + k = v8f_minus(i); + CHK(v4f_x(v8f_abcd(k)) == -0.f); + CHK(v4f_y(v8f_abcd(k)) == -1.f); + CHK(v4f_z(v8f_abcd(k)) == -2.f); + CHK(v4f_w(v8f_abcd(k)) == -3.f); + CHK(v4f_x(v8f_efgh(k)) == -4.f); + CHK(v4f_y(v8f_efgh(k)) == -5.f); + CHK(v4f_z(v8f_efgh(k)) == 
-6.f); + CHK(v4f_w(v8f_efgh(k)) == -7.f); + + k = v8f_add(i, j); + CHK(v4f_x(v8f_abcd(k)) == 8.f); + CHK(v4f_y(v8f_abcd(k)) == 10.f); + CHK(v4f_z(v8f_abcd(k)) == 12.f); + CHK(v4f_w(v8f_abcd(k)) == 14.f); + CHK(v4f_x(v8f_efgh(k)) == 16.f); + CHK(v4f_y(v8f_efgh(k)) == 18.f); + CHK(v4f_z(v8f_efgh(k)) == 20.f); + CHK(v4f_w(v8f_efgh(k)) == 22.f); + + k = v8f_sub(i, j); + CHK(v4f_x(v8f_abcd(k)) == -8.f); + CHK(v4f_y(v8f_abcd(k)) == -8.f); + CHK(v4f_z(v8f_abcd(k)) == -8.f); + CHK(v4f_w(v8f_abcd(k)) == -8.f); + CHK(v4f_x(v8f_efgh(k)) == -8.f); + CHK(v4f_y(v8f_efgh(k)) == -8.f); + CHK(v4f_z(v8f_efgh(k)) == -8.f); + CHK(v4f_w(v8f_efgh(k)) == -8.f); + + k = v8f_mul(i, j); + CHK(v4f_x(v8f_abcd(k)) == 0.f); + CHK(v4f_y(v8f_abcd(k)) == 9.f); + CHK(v4f_z(v8f_abcd(k)) == 20.f); + CHK(v4f_w(v8f_abcd(k)) == 33.f); + CHK(v4f_x(v8f_efgh(k)) == 48.f); + CHK(v4f_y(v8f_efgh(k)) == 65.f); + CHK(v4f_z(v8f_efgh(k)) == 84.f); + CHK(v4f_w(v8f_efgh(k)) == 105.f); + + k = v8f_div(i, j); + CHK(v4f_x(v8f_abcd(k)) == 0.f); + CHK(v4f_y(v8f_abcd(k)) == 1.f/9.f); + CHK(v4f_z(v8f_abcd(k)) == 0.2f); + CHK(v4f_w(v8f_abcd(k)) == 3.f/11.f); + CHK(v4f_x(v8f_efgh(k)) == 1.f/3.f); + CHK(v4f_y(v8f_efgh(k)) == 5.f/13.f); + CHK(v4f_z(v8f_efgh(k)) == 3.f/7.f); + CHK(v4f_w(v8f_efgh(k)) == 7.f/15.f); + + k = v8f_set(0.1f,0.2f,0.3f,0.4f,0.5f,0.6f,0.7f,0.8f); + k = v8f_madd(i, j, k); + CHK(v4f_x(v8f_abcd(k)) == 0.1f); + CHK(v4f_y(v8f_abcd(k)) == 9.2f); + CHK(v4f_z(v8f_abcd(k)) == 20.3f); + CHK(v4f_w(v8f_abcd(k)) == 33.4f); + CHK(v4f_x(v8f_efgh(k)) == 48.5f); + CHK(v4f_y(v8f_efgh(k)) == 65.6f); + CHK(v4f_z(v8f_efgh(k)) == 84.7f); + CHK(v4f_w(v8f_efgh(k)) == 105.8f); + + k = v8f_abs(v8f_minus(i)); + CHK(v4f_x(v8f_abcd(k)) == 0.f); + CHK(v4f_y(v8f_abcd(k)) == 1.f); + CHK(v4f_z(v8f_abcd(k)) == 2.f); + CHK(v4f_w(v8f_abcd(k)) == 3.f); + CHK(v4f_x(v8f_efgh(k)) == 4.f); + CHK(v4f_y(v8f_efgh(k)) == 5.f); + CHK(v4f_z(v8f_efgh(k)) == 6.f); + CHK(v4f_w(v8f_efgh(k)) == 7.f); + + i = v8f_set(1.f, 4.f, 9.f, 16.f, 25.f, 36.f, 49.f, 
64.f); + k = v8f_sqrt(i); + CHK(v4f_x(v8f_abcd(k)) == 1.f); + CHK(v4f_y(v8f_abcd(k)) == 2.f); + CHK(v4f_z(v8f_abcd(k)) == 3.f); + CHK(v4f_w(v8f_abcd(k)) == 4.f); + CHK(v4f_x(v8f_efgh(k)) == 5.f); + CHK(v4f_y(v8f_efgh(k)) == 6.f); + CHK(v4f_z(v8f_efgh(k)) == 7.f); + CHK(v4f_w(v8f_efgh(k)) == 8.f); + + k = v8f_rsqrte(i); + CHK(eq_epsf(v4f_x(v8f_abcd(k)), 1.f/1.f, 1.e-3f)); + CHK(eq_epsf(v4f_y(v8f_abcd(k)), 1.f/2.f, 1.e-3f)); + CHK(eq_epsf(v4f_z(v8f_abcd(k)), 1.f/3.f, 1.e-3f)); + CHK(eq_epsf(v4f_w(v8f_abcd(k)), 1.f/4.f, 1.e-3f)); + CHK(eq_epsf(v4f_x(v8f_efgh(k)), 1.f/5.f, 1.e-3f)); + CHK(eq_epsf(v4f_y(v8f_efgh(k)), 1.f/6.f, 1.e-3f)); + CHK(eq_epsf(v4f_z(v8f_efgh(k)), 1.f/7.f, 1.e-3f)); + CHK(eq_epsf(v4f_w(v8f_efgh(k)), 1.f/8.f, 1.e-3f)); + + k = v8f_rsqrt(i); + CHK(eq_epsf(v4f_x(v8f_abcd(k)), 1.f/1.f, 1.e-6f)); + CHK(eq_epsf(v4f_y(v8f_abcd(k)), 1.f/2.f, 1.e-6f)); + CHK(eq_epsf(v4f_z(v8f_abcd(k)), 1.f/3.f, 1.e-6f)); + CHK(eq_epsf(v4f_w(v8f_abcd(k)), 1.f/4.f, 1.e-6f)); + CHK(eq_epsf(v4f_x(v8f_efgh(k)), 1.f/5.f, 1.e-6f)); + CHK(eq_epsf(v4f_y(v8f_efgh(k)), 1.f/6.f, 1.e-6f)); + CHK(eq_epsf(v4f_z(v8f_efgh(k)), 1.f/7.f, 1.e-6f)); + CHK(eq_epsf(v4f_w(v8f_efgh(k)), 1.f/8.f, 1.e-6f)); + + i = v8f_set(1.f,2.f,3.f,4.f,5.f,6.f,7.f,8.f); + k = v8f_rcpe(i); + CHK(eq_epsf(v4f_x(v8f_abcd(k)), 1.f/1.f, 1.e-3f)); + CHK(eq_epsf(v4f_y(v8f_abcd(k)), 1.f/2.f, 1.e-3f)); + CHK(eq_epsf(v4f_z(v8f_abcd(k)), 1.f/3.f, 1.e-3f)); + CHK(eq_epsf(v4f_w(v8f_abcd(k)), 1.f/4.f, 1.e-3f)); + CHK(eq_epsf(v4f_x(v8f_efgh(k)), 1.f/5.f, 1.e-3f)); + CHK(eq_epsf(v4f_y(v8f_efgh(k)), 1.f/6.f, 1.e-3f)); + CHK(eq_epsf(v4f_z(v8f_efgh(k)), 1.f/7.f, 1.e-3f)); + CHK(eq_epsf(v4f_w(v8f_efgh(k)), 1.f/8.f, 1.e-3f)); + + k = v8f_rcp(i); + CHK(eq_epsf(v4f_x(v8f_abcd(k)), 1.f/1.f, 1.e-6f)); + CHK(eq_epsf(v4f_y(v8f_abcd(k)), 1.f/2.f, 1.e-6f)); + CHK(eq_epsf(v4f_z(v8f_abcd(k)), 1.f/3.f, 1.e-6f)); + CHK(eq_epsf(v4f_w(v8f_abcd(k)), 1.f/4.f, 1.e-6f)); + CHK(eq_epsf(v4f_x(v8f_efgh(k)), 1.f/5.f, 1.e-6f)); + 
CHK(eq_epsf(v4f_y(v8f_efgh(k)), 1.f/6.f, 1.e-6f)); + CHK(eq_epsf(v4f_z(v8f_efgh(k)), 1.f/7.f, 1.e-6f)); + CHK(eq_epsf(v4f_w(v8f_efgh(k)), 1.f/8.f, 1.e-6f)); + + j = v8f_set(2.f,3.f,4.f,5.f,6.f,7.f,8.f,9.f); + k = v8f_lerp(i, j, v8f_set1(0.5f)); + CHK(v4f_x(v8f_abcd(k)) == 1.5f); + CHK(v4f_y(v8f_abcd(k)) == 2.5f); + CHK(v4f_z(v8f_abcd(k)) == 3.5f); + CHK(v4f_w(v8f_abcd(k)) == 4.5f); + CHK(v4f_x(v8f_efgh(k)) == 5.5f); + CHK(v4f_y(v8f_efgh(k)) == 6.5f); + CHK(v4f_z(v8f_efgh(k)) == 7.5f); + CHK(v4f_w(v8f_efgh(k)) == 8.5f); + + i = v8f_set(0.f, 1.f,2.f,3.f, 4.f,5.f,6.f,7.f); + j = v8f_set(0.f,-1.f,4.f,4.f,-2.f,6.f,6.f,8.f); + + k = v8f_eq(i, j); + cast.f[0] = v4f_x(v8f_abcd(k)); CHK(cast.i[0] ==~0); + cast.f[1] = v4f_y(v8f_abcd(k)); CHK(cast.i[1] == 0); + cast.f[2] = v4f_z(v8f_abcd(k)); CHK(cast.i[2] == 0); + cast.f[3] = v4f_w(v8f_abcd(k)); CHK(cast.i[3] == 0); + cast.f[4] = v4f_x(v8f_efgh(k)); CHK(cast.i[4] == 0); + cast.f[5] = v4f_y(v8f_efgh(k)); CHK(cast.i[5] == 0); + cast.f[6] = v4f_z(v8f_efgh(k)); CHK(cast.i[6] ==~0); + cast.f[7] = v4f_w(v8f_efgh(k)); CHK(cast.i[7] == 0); + + k = v8f_neq(i, j); + cast.f[0] = v4f_x(v8f_abcd(k)); CHK(cast.i[0] == 0); + cast.f[1] = v4f_y(v8f_abcd(k)); CHK(cast.i[1] ==~0); + cast.f[2] = v4f_z(v8f_abcd(k)); CHK(cast.i[2] ==~0); + cast.f[3] = v4f_w(v8f_abcd(k)); CHK(cast.i[3] ==~0); + cast.f[4] = v4f_x(v8f_efgh(k)); CHK(cast.i[4] ==~0); + cast.f[5] = v4f_y(v8f_efgh(k)); CHK(cast.i[5] ==~0); + cast.f[6] = v4f_z(v8f_efgh(k)); CHK(cast.i[6] == 0); + cast.f[7] = v4f_w(v8f_efgh(k)); CHK(cast.i[7] ==~0); + + k = v8f_ge(i, j); + cast.f[0] = v4f_x(v8f_abcd(k)); CHK(cast.i[0] ==~0); + cast.f[1] = v4f_y(v8f_abcd(k)); CHK(cast.i[1] ==~0); + cast.f[2] = v4f_z(v8f_abcd(k)); CHK(cast.i[2] == 0); + cast.f[3] = v4f_w(v8f_abcd(k)); CHK(cast.i[3] == 0); + cast.f[4] = v4f_x(v8f_efgh(k)); CHK(cast.i[4] ==~0); + cast.f[5] = v4f_y(v8f_efgh(k)); CHK(cast.i[5] == 0); + cast.f[6] = v4f_z(v8f_efgh(k)); CHK(cast.i[6] ==~0); + cast.f[7] = v4f_w(v8f_efgh(k)); 
CHK(cast.i[7] == 0); + + k = v8f_le(i, j); + cast.f[0] = v4f_x(v8f_abcd(k)); CHK(cast.i[0] ==~0); + cast.f[1] = v4f_y(v8f_abcd(k)); CHK(cast.i[1] == 0); + cast.f[2] = v4f_z(v8f_abcd(k)); CHK(cast.i[2] ==~0); + cast.f[3] = v4f_w(v8f_abcd(k)); CHK(cast.i[3] ==~0); + cast.f[4] = v4f_x(v8f_efgh(k)); CHK(cast.i[4] == 0); + cast.f[5] = v4f_y(v8f_efgh(k)); CHK(cast.i[5] ==~0); + cast.f[6] = v4f_z(v8f_efgh(k)); CHK(cast.i[6] ==~0); + cast.f[7] = v4f_w(v8f_efgh(k)); CHK(cast.i[7] ==~0); + + k = v8f_gt(i, j); + cast.f[0] = v4f_x(v8f_abcd(k)); CHK(cast.i[0] == 0); + cast.f[1] = v4f_y(v8f_abcd(k)); CHK(cast.i[1] ==~0); + cast.f[2] = v4f_z(v8f_abcd(k)); CHK(cast.i[2] == 0); + cast.f[3] = v4f_w(v8f_abcd(k)); CHK(cast.i[3] == 0); + cast.f[4] = v4f_x(v8f_efgh(k)); CHK(cast.i[4] ==~0); + cast.f[5] = v4f_y(v8f_efgh(k)); CHK(cast.i[5] == 0); + cast.f[6] = v4f_z(v8f_efgh(k)); CHK(cast.i[6] == 0); + cast.f[7] = v4f_w(v8f_efgh(k)); CHK(cast.i[7] == 0); + + k = v8f_lt(i, j); + cast.f[0] = v4f_x(v8f_abcd(k)); CHK(cast.i[0] == 0); + cast.f[1] = v4f_y(v8f_abcd(k)); CHK(cast.i[1] == 0); + cast.f[2] = v4f_z(v8f_abcd(k)); CHK(cast.i[2] ==~0); + cast.f[3] = v4f_w(v8f_abcd(k)); CHK(cast.i[3] ==~0); + cast.f[4] = v4f_x(v8f_efgh(k)); CHK(cast.i[4] == 0); + cast.f[5] = v4f_y(v8f_efgh(k)); CHK(cast.i[5] ==~0); + cast.f[6] = v4f_z(v8f_efgh(k)); CHK(cast.i[6] == 0); + cast.f[7] = v4f_w(v8f_efgh(k)); CHK(cast.i[7] ==~0); + + j = v8f_set(0.0001f, 0.99999f, 2.f, 3.1f, 4.001f, 5.0002f, 6.f, 6.999999f); + k = v8f_eq_eps(i, j, v8f_set1(1.e-4f)); + cast.f[0] = v4f_x(v8f_abcd(k)); CHK(cast.i[0] ==~0); + cast.f[1] = v4f_y(v8f_abcd(k)); CHK(cast.i[1] ==~0); + cast.f[2] = v4f_z(v8f_abcd(k)); CHK(cast.i[2] ==~0); + cast.f[3] = v4f_w(v8f_abcd(k)); CHK(cast.i[3] == 0); + cast.f[4] = v4f_x(v8f_efgh(k)); CHK(cast.i[4] == 0); + cast.f[5] = v4f_y(v8f_efgh(k)); CHK(cast.i[5] == 0); + cast.f[6] = v4f_z(v8f_efgh(k)); CHK(cast.i[6] ==~0); + cast.f[7] = v4f_w(v8f_efgh(k)); CHK(cast.i[7] ==~0); + + k = v8f_eq_eps(i, j, 
v8f_set(1.e-4f, 1.e-4f, 0.f, 0.1f, 1.e-3f, 2.e-4f, 0.f, 1.e-5f)); + cast.f[0] = v4f_x(v8f_abcd(k)); CHK(cast.i[0] ==~0); + cast.f[1] = v4f_y(v8f_abcd(k)); CHK(cast.i[1] ==~0); + cast.f[2] = v4f_z(v8f_abcd(k)); CHK(cast.i[2] ==~0); + cast.f[3] = v4f_w(v8f_abcd(k)); CHK(cast.i[3] ==~0); + cast.f[4] = v4f_x(v8f_efgh(k)); CHK(cast.i[4] ==~0); + cast.f[5] = v4f_y(v8f_efgh(k)); CHK(cast.i[5] ==~0); + cast.f[6] = v4f_z(v8f_efgh(k)); CHK(cast.i[6] ==~0); + cast.f[7] = v4f_w(v8f_efgh(k)); CHK(cast.i[7] ==~0); + + i = v8f_set(0.f, 1.f,2.f,3.f, 4.f,5.f,6.f,7.f); + j = v8f_set(0.f,-1.f,4.f,4.f,-2.f,6.f,6.f,8.f); + + k = v8f_min(i, j); + CHK(v4f_x(v8f_abcd(k)) == 0.f); + CHK(v4f_y(v8f_abcd(k)) ==-1.f); + CHK(v4f_z(v8f_abcd(k)) == 2.f); + CHK(v4f_w(v8f_abcd(k)) == 3.f); + CHK(v4f_x(v8f_efgh(k)) ==-2.f); + CHK(v4f_y(v8f_efgh(k)) == 5.f); + CHK(v4f_z(v8f_efgh(k)) == 6.f); + CHK(v4f_w(v8f_efgh(k)) == 7.f); + + k = v8f_max(i, j); + CHK(v4f_x(v8f_abcd(k)) == 0.f); + CHK(v4f_y(v8f_abcd(k)) == 1.f); + CHK(v4f_z(v8f_abcd(k)) == 4.f); + CHK(v4f_w(v8f_abcd(k)) == 4.f); + CHK(v4f_x(v8f_efgh(k)) == 4.f); + CHK(v4f_y(v8f_efgh(k)) == 6.f); + CHK(v4f_z(v8f_efgh(k)) == 6.f); + CHK(v4f_w(v8f_efgh(k)) == 8.f); + + CHK(v8f_reduce_min(i) == 0.f); + CHK(v8f_reduce_min(j) ==-2.f); + CHK(v8f_reduce_max(i) == 7.f); + CHK(v8f_reduce_max(j) == 8.f); + + k = v8f_clamp(i, + v8f_set(1.f, 1.f, 3.1f, 5.f, 4.f, 0.f, 0.f, -1.f), + v8f_set(1.f, 1.f, 4.f, 6.f, 4.f, 1.f, 6.f, 5.f)); + + CHK(v4f_x(v8f_abcd(k)) == 1.f); + CHK(v4f_y(v8f_abcd(k)) == 1.f); + CHK(v4f_z(v8f_abcd(k)) == 3.1f); + CHK(v4f_w(v8f_abcd(k)) == 5.f); + CHK(v4f_x(v8f_efgh(k)) == 4.f); + CHK(v4f_y(v8f_efgh(k)) == 1.f); + CHK(v4f_z(v8f_efgh(k)) == 6.f); + CHK(v4f_w(v8f_efgh(k)) == 5.f); + + return 0; +} +