commit 9870c031f34427d5d4a01e34fd5f72a42a8e0f28
parent 18a76e8311f942a4c4128e892a0d96912560fca5
Author: vaplv <vaplv@free.fr>
Date: Mon, 21 May 2018 12:14:55 +0200
Add and test the v8f_T API
Diffstat:
6 files changed, 816 insertions(+), 3 deletions(-)
diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt
@@ -37,6 +37,7 @@ include(rcmake)
if(CMAKE_COMPILER_IS_GNUCC)
include(CheckCCompilerFlag)
CHECK_C_COMPILER_FLAG("-msse4.1" SSE4_1)
+ CHECK_C_COMPILER_FLAG("-mavx" AVX)
endif(CMAKE_COMPILER_IS_GNUCC)
################################################################################
@@ -112,6 +113,10 @@ if(NOT NO_TEST)
new_test_named(test_v4i_sse4_1 test_v4i "-msse4.1")
endif(SSE4_1 AND CMAKE_COMPILER_IS_GNUCC)
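+ # Register the AVX test only when the compiler accepts -mavx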
+ if(AVX AND CMAKE_COMPILER_IS_GNUCC)
+ new_test(test_v8f "-mavx")
+ endif(AVX AND CMAKE_COMPILER_IS_GNUCC)
+
endif(NOT NO_TEST)
################################################################################
diff --git a/src/avx/avx.h b/src/avx/avx.h
@@ -0,0 +1,27 @@
+/* Copyright (C) 2014-2018 Vincent Forest (vaplv@free.fr)
+ *
+ * The RSIMD library is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * The RSIMD library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with the RSIMD library. If not, see <http://www.gnu.org/licenses/>. */
+
+#ifndef RSIMD_AVX_H
+#define RSIMD_AVX_H
+
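+/* avx.h and avxf.h include each other; include avx.h first in a translation
+ * unit so that v8f_T, defined in avxf.h, is visible to the casts below. */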
+#include "avxf.h"
+
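+/* 8 packed 32-bit integers */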
+typedef __m256i v8i_T;
+
+/* Reinterpret cast */
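+/* These casts reinterpret the 256 bits in place; they emit no instructions. */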
+static FINLINE v8i_T v8f_rcast_v8i(const v8f_T v) {return _mm256_castps_si256(v);}
+static FINLINE v8f_T v8i_rcast_v8f(const v8i_T v) {return _mm256_castsi256_ps(v);}
+
+#endif /* RSIMD_AVX_H */
diff --git a/src/avx/avxf.h b/src/avx/avxf.h
@@ -0,0 +1,330 @@
+/* Copyright (C) 2014-2018 Vincent Forest (vaplv@free.fr)
+ *
+ * The RSIMD library is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * The RSIMD library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with the RSIMD library. If not, see <http://www.gnu.org/licenses/>. */
+
+#ifndef RSIMD_AVXF_H
+#define RSIMD_AVXF_H
+
+/*
+ * 8 packed single-precision floating-point values
+ */
+
+#include "avx.h"
+
+#include <rsys/math.h>
+#include <immintrin.h>
+
+typedef __m256 v8f_T;
+
+/*******************************************************************************
+ * Set operations
+ ******************************************************************************/
+static FINLINE float*
+v8f_store(float dst[8], v8f_T v)
+{
+ ASSERT(dst && IS_ALIGNED(dst, 32));
+ _mm256_store_ps(dst, v);
+ return dst;
+}
+
+static FINLINE v8f_T
+v8f_load(const float src[8])
+{
+ ASSERT(src && IS_ALIGNED(src, 32));
+ return _mm256_load_ps(src);
+}
+
+static FINLINE v8f_T
+v8f_loadu(const float f[8])
+{
+ ASSERT(f);
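+ /* Unaligned load: _mm256_set_ps lists lanes from most to least significant,
+  * hence the reversed argument order */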
+ return _mm256_set_ps(f[7], f[6], f[5], f[4], f[3], f[2], f[1], f[0]);
+}
+
+static FINLINE v8f_T
+v8f_set1(const float x)
+{
+ return _mm256_set1_ps(x);
+}
+
+static FINLINE v8f_T
+v8f_set
+ (const float a, const float b, const float c, const float d,
+ const float e, const float f, const float g, const float h)
+{
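+ /* _mm256_set_ps expects its operands from lane h down to lane a */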
+ return _mm256_set_ps(h, g, f, e, d, c, b, a);
+}
+
+static FINLINE v8f_T
+v8f_zero(void)
+{
+ return _mm256_setzero_ps();
+}
+
+static FINLINE v8f_T
+v8f_mask
+ (const int32_t a, const int32_t b, const int32_t c, const int32_t d,
+ const int32_t e, const int32_t f, const int32_t g, const int32_t h)
+{
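+ /* Build a lane-wise bit mask; pass 0 or ~0 for each lane */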
+ return _mm256_castsi256_ps(_mm256_set_epi32(h, g, f, e, d, c, b, a));
+}
+
+static FINLINE v8f_T
+v8f_mask1(const int32_t x)
+{
+ return _mm256_castsi256_ps(_mm256_set1_epi32(x));
+}
+
+static FINLINE v8f_T
+v8f_true(void)
+{
+ return _mm256_castsi256_ps(_mm256_set1_epi32(~0));
+}
+
+static FINLINE v8f_T
+v8f_false(void)
+{
+ return v8f_zero();
+}
+
+/*******************************************************************************
+ * Extract components
+ ******************************************************************************/
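+/* Each 128-bit half is returned as a v4f_T, the vector type of the SSE API */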
+static FINLINE v4f_T
+v8f_abcd(const v8f_T v)
+{
+ return _mm256_extractf128_ps(v, 0);
+}
+
+static FINLINE v4f_T
+v8f_efgh(const v8f_T v)
+{
+ return _mm256_extractf128_ps(v, 1);
+}
+
+static FINLINE int
+v8f_movemask(const v8f_T v)
+{
+ return _mm256_movemask_ps(v);
+}
+
+/*******************************************************************************
+ * Bitwise operations
+ ******************************************************************************/
+static FINLINE v8f_T
+v8f_or(const v8f_T v0, const v8f_T v1)
+{
+ return _mm256_or_ps(v0, v1);
+}
+
+static FINLINE v8f_T
+v8f_and(const v8f_T v0, const v8f_T v1)
+{
+ return _mm256_and_ps(v0, v1);
+}
+
+static FINLINE v8f_T
+v8f_andnot(const v8f_T v0, const v8f_T v1)
+{
+ return _mm256_andnot_ps(v0, v1);
+}
+
+static FINLINE v8f_T
+v8f_xor(const v8f_T v0, const v8f_T v1)
+{
+ return _mm256_xor_ps(v0, v1);
+}
+
+static FINLINE v8f_T
+v8f_sel(const v8f_T vfalse, const v8f_T vtrue, const v8f_T vcond)
+{
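+ /* Lane-wise select on the sign bit of vcond: vtrue where set, else vfalse */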
+ return _mm256_blendv_ps(vfalse, vtrue, vcond);
+}
+
+/*******************************************************************************
+ * Arithmetic operations
+ ******************************************************************************/
+static FINLINE v8f_T
+v8f_minus(const v8f_T v)
+{
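+ /* Negate by flipping the sign bit of each lane */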
+ return v8f_xor(v8f_set1(-0.f), v);
+}
+
+static FINLINE v8f_T
+v8f_add(const v8f_T v0, const v8f_T v1)
+{
+ return _mm256_add_ps(v0, v1);
+}
+
+static FINLINE v8f_T
+v8f_sub(const v8f_T v0, const v8f_T v1)
+{
+ return _mm256_sub_ps(v0, v1);
+}
+
+static FINLINE v8f_T
+v8f_mul(const v8f_T v0, const v8f_T v1)
+{
+ return _mm256_mul_ps(v0, v1);
+}
+
+static FINLINE v8f_T
+v8f_div(const v8f_T v0, const v8f_T v1)
+{
+ return _mm256_div_ps(v0, v1);
+}
+
+static FINLINE v8f_T
+v8f_madd(const v8f_T v0, const v8f_T v1, const v8f_T v2)
+{
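+ /* Two instructions: plain AVX has no fused multiply-add */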
+ return _mm256_add_ps(_mm256_mul_ps(v0, v1), v2);
+}
+
+static FINLINE v8f_T
+v8f_abs(const v8f_T v)
+{
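+ /* Clear the sign bit of each lane */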
+ const union { int32_t i; float f; } mask = { 0x7fffffff };
+ return v8f_and(v, v8f_set1(mask.f));
+}
+
+static FINLINE v8f_T
+v8f_sqrt(const v8f_T v)
+{
+ return _mm256_sqrt_ps(v);
+}
+
+static FINLINE v8f_T
+v8f_rsqrte(const v8f_T v)
+{
+ return _mm256_rsqrt_ps(v);
+}
+
+static FINLINE v8f_T
+v8f_rsqrt(const v8f_T v)
+{
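+ /* Refine the estimate with one Newton-Raphson step: y' = y*(1.5 - 0.5*v*y*y) */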
+ const v8f_T y = v8f_rsqrte(v);
+ const v8f_T yyv = v8f_mul(v8f_mul(y, y), v);
+ const v8f_T tmp = v8f_sub(v8f_set1(1.5f), v8f_mul(yyv, v8f_set1(0.5f)));
+ return v8f_mul(tmp, y);
+}
+
+static FINLINE v8f_T
+v8f_rcpe(const v8f_T v)
+{
+ return _mm256_rcp_ps(v);
+}
+
+static FINLINE v8f_T
+v8f_rcp(const v8f_T v)
+{
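+ /* Refine the estimate with one Newton-Raphson step: y' = y*(2 - v*y) */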
+ const v8f_T y = v8f_rcpe(v);
+ const v8f_T tmp = v8f_sub(v8f_set1(2.f), v8f_mul(y, v));
+ return v8f_mul(tmp, y);
+}
+
+static FINLINE v8f_T
+v8f_lerp(const v8f_T from, const v8f_T to, const v8f_T param)
+{
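+ /* from + (to - from)*param */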
+ return v8f_madd(v8f_sub(to, from), param, from);
+}
+
+/*******************************************************************************
+ * Comparators
+ ******************************************************************************/
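+/* Ordered, signaling predicates: any comparison against a NaN yields false
+ * (an all-zero lane) and raises the invalid-operation exception. */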
+static FINLINE v8f_T
+v8f_eq(const v8f_T v0, const v8f_T v1)
+{
+ return _mm256_cmp_ps(v0, v1, _CMP_EQ_OS);
+}
+
+static FINLINE v8f_T
+v8f_neq(const v8f_T v0, const v8f_T v1)
+{
+ return _mm256_cmp_ps(v0, v1, _CMP_NEQ_OS);
+}
+
+static FINLINE v8f_T
+v8f_ge(const v8f_T v0, const v8f_T v1)
+{
+ return _mm256_cmp_ps(v0, v1, _CMP_GE_OS);
+}
+
+static FINLINE v8f_T
+v8f_le(const v8f_T v0, const v8f_T v1)
+{
+ return _mm256_cmp_ps(v0, v1, _CMP_LE_OS);
+}
+
+static FINLINE v8f_T
+v8f_gt(const v8f_T v0, const v8f_T v1)
+{
+ return _mm256_cmp_ps(v0, v1, _CMP_GT_OS);
+}
+
+static FINLINE v8f_T
+v8f_lt(const v8f_T v0, const v8f_T v1)
+{
+ return _mm256_cmp_ps(v0, v1, _CMP_LT_OS);
+}
+
+static FINLINE v8f_T
+v8f_eq_eps(const v8f_T v0, const v8f_T v1, const v8f_T eps)
+{
+ return v8f_le(v8f_abs(v8f_sub(v0, v1)), eps);
+}
+
+static FINLINE v8f_T
+v8f_min(const v8f_T v0, const v8f_T v1)
+{
+ return _mm256_min_ps(v0, v1);
+}
+
+static FINLINE v8f_T
+v8f_max(const v8f_T v0, const v8f_T v1)
+{
+ return _mm256_max_ps(v0, v1);
+}
+
+static FINLINE float
+v8f_reduce_min(const v8f_T v0)
+{
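+ /* Reduce each 128-bit lane with two permute/min steps, then combine the
+  * two lane minima through memory */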
+ ALIGN(32) float tmp[8];
+ const v8f_T v1 = _mm256_permute_ps(v0, _MM_SHUFFLE(1, 0, 3, 2));
+ const v8f_T v2 = _mm256_min_ps(v0, v1);
+ const v8f_T v3 = _mm256_permute_ps(v2, _MM_SHUFFLE(2, 3, 0, 1));
+ const v8f_T v4 = _mm256_min_ps(v2, v3);
+ _mm256_store_ps(tmp, v4);
+ return MMIN(tmp[0], tmp[4]);
+}
+
+static FINLINE float
+v8f_reduce_max(const v8f_T v0)
+{
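+ /* Same reduction scheme as v8f_reduce_min */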
+ ALIGN(32) float tmp[8];
+ const v8f_T v1 = _mm256_permute_ps(v0, _MM_SHUFFLE(1, 0, 3, 2));
+ const v8f_T v2 = _mm256_max_ps(v0, v1);
+ const v8f_T v3 = _mm256_permute_ps(v2, _MM_SHUFFLE(2, 3, 0, 1));
+ const v8f_T v4 = _mm256_max_ps(v2, v3);
+ _mm256_store_ps(tmp, v4);
+ return MMAX(tmp[0], tmp[4]);
+}
+
+static FINLINE v8f_T
+v8f_clamp(const v8f_T v, const v8f_T vmin, const v8f_T vmax)
+{
+ return v8f_min(v8f_max(v, vmin), vmax);
+}
+
+#endif /* RSIMD_AVXF_H */
+
diff --git a/src/rsimd.h b/src/rsimd.h
@@ -28,8 +28,9 @@
#ifdef SIMD_SSE2
#include "sse/sse.h"
-#else
- #error Unsupported_Platform
+#endif
+#ifdef SIMD_AVX
+ #include "avx/avx.h"
#endif
#endif /* RSIMD_H */
diff --git a/src/sse/ssef.h b/src/sse/ssef.h
@@ -69,7 +69,7 @@ v4f_loadu3(const float src[3])
}
static FINLINE v4f_T
-v4f_set1(float x)
+v4f_set1(const float x)
{
return _mm_set1_ps(x);
}
diff --git a/src/test_v8f.c b/src/test_v8f.c
@@ -0,0 +1,450 @@
+/* Copyright (C) 2014-2018 Vincent Forest (vaplv@free.fr)
+ *
+ * The RSIMD library is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * The RSIMD library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with the RSIMD library. If not, see <http://www.gnu.org/licenses/>. */
+
+#include "rsimd.h"
+
+int
+main(int argc, char** argv)
+{
+ v8f_T i, j, k;
+ ALIGN(32) union { int32_t i[8]; float f[8]; } cast;
+ ALIGN(32) float tmp[9] = {0.f,1.f,2.f,3.f,4.f,5.f,6.f,7.f,8.f};
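+ /* tmp is 32-byte aligned, so tmp+1 exercises the unaligned load path */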
+ (void)argc, (void)argv;
+
+ i = v8f_loadu(tmp+1);
+ CHK(v4f_x(v8f_abcd(i)) == 1.f);
+ CHK(v4f_y(v8f_abcd(i)) == 2.f);
+ CHK(v4f_z(v8f_abcd(i)) == 3.f);
+ CHK(v4f_w(v8f_abcd(i)) == 4.f);
+ CHK(v4f_x(v8f_efgh(i)) == 5.f);
+ CHK(v4f_y(v8f_efgh(i)) == 6.f);
+ CHK(v4f_z(v8f_efgh(i)) == 7.f);
+ CHK(v4f_w(v8f_efgh(i)) == 8.f);
+
+ i = v8f_load(tmp);
+ CHK(v4f_x(v8f_abcd(i)) == 0.f);
+ CHK(v4f_y(v8f_abcd(i)) == 1.f);
+ CHK(v4f_z(v8f_abcd(i)) == 2.f);
+ CHK(v4f_w(v8f_abcd(i)) == 3.f);
+ CHK(v4f_x(v8f_efgh(i)) == 4.f);
+ CHK(v4f_y(v8f_efgh(i)) == 5.f);
+ CHK(v4f_z(v8f_efgh(i)) == 6.f);
+ CHK(v4f_w(v8f_efgh(i)) == 7.f);
+
+ tmp[0] = tmp[1] = tmp[2] = tmp[3] = 0.f;
+ tmp[4] = tmp[5] = tmp[6] = tmp[7] = 0.f;
+ CHK(v8f_store(tmp, i) == tmp);
+ CHK(tmp[0] == 0.f);
+ CHK(tmp[1] == 1.f);
+ CHK(tmp[2] == 2.f);
+ CHK(tmp[3] == 3.f);
+ CHK(tmp[4] == 4.f);
+ CHK(tmp[5] == 5.f);
+ CHK(tmp[6] == 6.f);
+ CHK(tmp[7] == 7.f);
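+ /* guard element: v8f_store must write exactly 8 floats */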
+ CHK(tmp[8] == 8.f);
+
+ i = v8f_set1(-2.f);
+ CHK(v4f_x(v8f_abcd(i)) == -2.f);
+ CHK(v4f_y(v8f_abcd(i)) == -2.f);
+ CHK(v4f_z(v8f_abcd(i)) == -2.f);
+ CHK(v4f_w(v8f_abcd(i)) == -2.f);
+ CHK(v4f_x(v8f_efgh(i)) == -2.f);
+ CHK(v4f_y(v8f_efgh(i)) == -2.f);
+ CHK(v4f_z(v8f_efgh(i)) == -2.f);
+ CHK(v4f_w(v8f_efgh(i)) == -2.f);
+
+ i = v8f_set(0.f,1.f,2.f,3.f,4.f,5.f,6.f,7.f);
+ CHK(v4f_x(v8f_abcd(i)) == 0.f);
+ CHK(v4f_y(v8f_abcd(i)) == 1.f);
+ CHK(v4f_z(v8f_abcd(i)) == 2.f);
+ CHK(v4f_w(v8f_abcd(i)) == 3.f);
+ CHK(v4f_x(v8f_efgh(i)) == 4.f);
+ CHK(v4f_y(v8f_efgh(i)) == 5.f);
+ CHK(v4f_z(v8f_efgh(i)) == 6.f);
+ CHK(v4f_w(v8f_efgh(i)) == 7.f);
+
+ i = v8f_zero();
+ CHK(v4f_x(v8f_abcd(i)) == 0.f);
+ CHK(v4f_y(v8f_abcd(i)) == 0.f);
+ CHK(v4f_z(v8f_abcd(i)) == 0.f);
+ CHK(v4f_w(v8f_abcd(i)) == 0.f);
+ CHK(v4f_x(v8f_efgh(i)) == 0.f);
+ CHK(v4f_y(v8f_efgh(i)) == 0.f);
+ CHK(v4f_z(v8f_efgh(i)) == 0.f);
+ CHK(v4f_w(v8f_efgh(i)) == 0.f);
+
+ i = v8f_mask(~0,~0,0,0,0,~0,~0,0);
+ cast.f[0] = v4f_x(v8f_abcd(i)); CHK(cast.i[0] == (int32_t)0xFFFFFFFF);
+ cast.f[1] = v4f_y(v8f_abcd(i)); CHK(cast.i[1] == (int32_t)0xFFFFFFFF);
+ cast.f[2] = v4f_z(v8f_abcd(i)); CHK(cast.i[2] == (int32_t)0x00000000);
+ cast.f[3] = v4f_w(v8f_abcd(i)); CHK(cast.i[3] == (int32_t)0x00000000);
+ cast.f[4] = v4f_x(v8f_efgh(i)); CHK(cast.i[4] == (int32_t)0x00000000);
+ cast.f[5] = v4f_y(v8f_efgh(i)); CHK(cast.i[5] == (int32_t)0xFFFFFFFF);
+ cast.f[6] = v4f_z(v8f_efgh(i)); CHK(cast.i[6] == (int32_t)0xFFFFFFFF);
+ cast.f[7] = v4f_w(v8f_efgh(i)); CHK(cast.i[7] == (int32_t)0x00000000);
+
+ i = v8f_mask1(~0);
+ cast.f[0] = v4f_x(v8f_abcd(i)); CHK(cast.i[0] == (int32_t)0xFFFFFFFF);
+ cast.f[1] = v4f_y(v8f_abcd(i)); CHK(cast.i[1] == (int32_t)0xFFFFFFFF);
+ cast.f[2] = v4f_z(v8f_abcd(i)); CHK(cast.i[2] == (int32_t)0xFFFFFFFF);
+ cast.f[3] = v4f_w(v8f_abcd(i)); CHK(cast.i[3] == (int32_t)0xFFFFFFFF);
+ cast.f[4] = v4f_x(v8f_efgh(i)); CHK(cast.i[4] == (int32_t)0xFFFFFFFF);
+ cast.f[5] = v4f_y(v8f_efgh(i)); CHK(cast.i[5] == (int32_t)0xFFFFFFFF);
+ cast.f[6] = v4f_z(v8f_efgh(i)); CHK(cast.i[6] == (int32_t)0xFFFFFFFF);
+ cast.f[7] = v4f_w(v8f_efgh(i)); CHK(cast.i[7] == (int32_t)0xFFFFFFFF);
+
+ i = v8f_true();
+ cast.f[0] = v4f_x(v8f_abcd(i)); CHK(cast.i[0] == (int32_t)0xFFFFFFFF);
+ cast.f[1] = v4f_y(v8f_abcd(i)); CHK(cast.i[1] == (int32_t)0xFFFFFFFF);
+ cast.f[2] = v4f_z(v8f_abcd(i)); CHK(cast.i[2] == (int32_t)0xFFFFFFFF);
+ cast.f[3] = v4f_w(v8f_abcd(i)); CHK(cast.i[3] == (int32_t)0xFFFFFFFF);
+ cast.f[4] = v4f_x(v8f_efgh(i)); CHK(cast.i[4] == (int32_t)0xFFFFFFFF);
+ cast.f[5] = v4f_y(v8f_efgh(i)); CHK(cast.i[5] == (int32_t)0xFFFFFFFF);
+ cast.f[6] = v4f_z(v8f_efgh(i)); CHK(cast.i[6] == (int32_t)0xFFFFFFFF);
+ cast.f[7] = v4f_w(v8f_efgh(i)); CHK(cast.i[7] == (int32_t)0xFFFFFFFF);
+
+ i = v8f_false();
+ cast.f[0] = v4f_x(v8f_abcd(i)); CHK(cast.i[0] == (int32_t)0x00000000);
+ cast.f[1] = v4f_y(v8f_abcd(i)); CHK(cast.i[1] == (int32_t)0x00000000);
+ cast.f[2] = v4f_z(v8f_abcd(i)); CHK(cast.i[2] == (int32_t)0x00000000);
+ cast.f[3] = v4f_w(v8f_abcd(i)); CHK(cast.i[3] == (int32_t)0x00000000);
+ cast.f[4] = v4f_x(v8f_efgh(i)); CHK(cast.i[4] == (int32_t)0x00000000);
+ cast.f[5] = v4f_y(v8f_efgh(i)); CHK(cast.i[5] == (int32_t)0x00000000);
+ cast.f[6] = v4f_z(v8f_efgh(i)); CHK(cast.i[6] == (int32_t)0x00000000);
+ cast.f[7] = v4f_w(v8f_efgh(i)); CHK(cast.i[7] == (int32_t)0x00000000);
+
+ i = v8f_mask(~0,~0,0,0,0,~0,~0,0);
+ j = v8f_mask(~0,0,~0,0,0,~0,0,~0);
+ k = v8f_or(i, j);
+ cast.f[0] = v4f_x(v8f_abcd(k)); CHK(cast.i[0] == (int32_t)0xFFFFFFFF);
+ cast.f[1] = v4f_y(v8f_abcd(k)); CHK(cast.i[1] == (int32_t)0xFFFFFFFF);
+ cast.f[2] = v4f_z(v8f_abcd(k)); CHK(cast.i[2] == (int32_t)0xFFFFFFFF);
+ cast.f[3] = v4f_w(v8f_abcd(k)); CHK(cast.i[3] == (int32_t)0x00000000);
+ cast.f[4] = v4f_x(v8f_efgh(k)); CHK(cast.i[4] == (int32_t)0x00000000);
+ cast.f[5] = v4f_y(v8f_efgh(k)); CHK(cast.i[5] == (int32_t)0xFFFFFFFF);
+ cast.f[6] = v4f_z(v8f_efgh(k)); CHK(cast.i[6] == (int32_t)0xFFFFFFFF);
+ cast.f[7] = v4f_w(v8f_efgh(k)); CHK(cast.i[7] == (int32_t)0xFFFFFFFF);
+
+ k = v8f_and(i, j);
+ cast.f[0] = v4f_x(v8f_abcd(k)); CHK(cast.i[0] == (int32_t)0xFFFFFFFF);
+ cast.f[1] = v4f_y(v8f_abcd(k)); CHK(cast.i[1] == (int32_t)0x00000000);
+ cast.f[2] = v4f_z(v8f_abcd(k)); CHK(cast.i[2] == (int32_t)0x00000000);
+ cast.f[3] = v4f_w(v8f_abcd(k)); CHK(cast.i[3] == (int32_t)0x00000000);
+ cast.f[4] = v4f_x(v8f_efgh(k)); CHK(cast.i[4] == (int32_t)0x00000000);
+ cast.f[5] = v4f_y(v8f_efgh(k)); CHK(cast.i[5] == (int32_t)0xFFFFFFFF);
+ cast.f[6] = v4f_z(v8f_efgh(k)); CHK(cast.i[6] == (int32_t)0x00000000);
+ cast.f[7] = v4f_w(v8f_efgh(k)); CHK(cast.i[7] == (int32_t)0x00000000);
+
+ k = v8f_andnot(i, j);
+ cast.f[0] = v4f_x(v8f_abcd(k)); CHK(cast.i[0] == (int32_t)0x00000000);
+ cast.f[1] = v4f_y(v8f_abcd(k)); CHK(cast.i[1] == (int32_t)0x00000000);
+ cast.f[2] = v4f_z(v8f_abcd(k)); CHK(cast.i[2] == (int32_t)0xFFFFFFFF);
+ cast.f[3] = v4f_w(v8f_abcd(k)); CHK(cast.i[3] == (int32_t)0x00000000);
+ cast.f[4] = v4f_x(v8f_efgh(k)); CHK(cast.i[4] == (int32_t)0x00000000);
+ cast.f[5] = v4f_y(v8f_efgh(k)); CHK(cast.i[5] == (int32_t)0x00000000);
+ cast.f[6] = v4f_z(v8f_efgh(k)); CHK(cast.i[6] == (int32_t)0x00000000);
+ cast.f[7] = v4f_w(v8f_efgh(k)); CHK(cast.i[7] == (int32_t)0xFFFFFFFF);
+
+ k = v8f_xor(i, j);
+ cast.f[0] = v4f_x(v8f_abcd(k)); CHK(cast.i[0] == (int32_t)0x00000000);
+ cast.f[1] = v4f_y(v8f_abcd(k)); CHK(cast.i[1] == (int32_t)0xFFFFFFFF);
+ cast.f[2] = v4f_z(v8f_abcd(k)); CHK(cast.i[2] == (int32_t)0xFFFFFFFF);
+ cast.f[3] = v4f_w(v8f_abcd(k)); CHK(cast.i[3] == (int32_t)0x00000000);
+ cast.f[4] = v4f_x(v8f_efgh(k)); CHK(cast.i[4] == (int32_t)0x00000000);
+ cast.f[5] = v4f_y(v8f_efgh(k)); CHK(cast.i[5] == (int32_t)0x00000000);
+ cast.f[6] = v4f_z(v8f_efgh(k)); CHK(cast.i[6] == (int32_t)0xFFFFFFFF);
+ cast.f[7] = v4f_w(v8f_efgh(k)); CHK(cast.i[7] == (int32_t)0xFFFFFFFF);
+
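+ /* the XOR above sets the sign bit in lanes 1, 2, 6 and 7: 0b11000110 */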
+ CHK(v8f_movemask(k) == 0xC6);
+ i = v8f_mask
+ ((int32_t)0x01020401, (int32_t)0x80605040, (int32_t)0x7F1F2F3F, (int32_t)0,
+ (int32_t)0xF0000000, (int32_t)0xFFFFFFFF, (int32_t)0x7FFFFFFF, (int32_t)~0);
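+ /* sign bits are set in lanes 1, 4, 5 and 7: 0b10110010 */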
+ CHK(v8f_movemask(i) == 0xB2);
+
+ i = v8f_set(0.f,1.f,2.f,3.f,4.f,5.f,6.f,7.f);
+ j = v8f_set(8.f,9.f,10.f,11.f,12.f,13.f,14.f,15.f);
+ k = v8f_sel(i, j, v8f_mask(~0,~0,0,0,0,~0,~0,0));
+ CHK(v4f_x(v8f_abcd(k)) == 8.f);
+ CHK(v4f_y(v8f_abcd(k)) == 9.f);
+ CHK(v4f_z(v8f_abcd(k)) == 2.f);
+ CHK(v4f_w(v8f_abcd(k)) == 3.f);
+ CHK(v4f_x(v8f_efgh(k)) == 4.f);
+ CHK(v4f_y(v8f_efgh(k)) == 13.f);
+ CHK(v4f_z(v8f_efgh(k)) == 14.f);
+ CHK(v4f_w(v8f_efgh(k)) == 7.f);
+
+ k = v8f_minus(i);
+ CHK(v4f_x(v8f_abcd(k)) == -0.f);
+ CHK(v4f_y(v8f_abcd(k)) == -1.f);
+ CHK(v4f_z(v8f_abcd(k)) == -2.f);
+ CHK(v4f_w(v8f_abcd(k)) == -3.f);
+ CHK(v4f_x(v8f_efgh(k)) == -4.f);
+ CHK(v4f_y(v8f_efgh(k)) == -5.f);
+ CHK(v4f_z(v8f_efgh(k)) == -6.f);
+ CHK(v4f_w(v8f_efgh(k)) == -7.f);
+
+ k = v8f_add(i, j);
+ CHK(v4f_x(v8f_abcd(k)) == 8.f);
+ CHK(v4f_y(v8f_abcd(k)) == 10.f);
+ CHK(v4f_z(v8f_abcd(k)) == 12.f);
+ CHK(v4f_w(v8f_abcd(k)) == 14.f);
+ CHK(v4f_x(v8f_efgh(k)) == 16.f);
+ CHK(v4f_y(v8f_efgh(k)) == 18.f);
+ CHK(v4f_z(v8f_efgh(k)) == 20.f);
+ CHK(v4f_w(v8f_efgh(k)) == 22.f);
+
+ k = v8f_sub(i, j);
+ CHK(v4f_x(v8f_abcd(k)) == -8.f);
+ CHK(v4f_y(v8f_abcd(k)) == -8.f);
+ CHK(v4f_z(v8f_abcd(k)) == -8.f);
+ CHK(v4f_w(v8f_abcd(k)) == -8.f);
+ CHK(v4f_x(v8f_efgh(k)) == -8.f);
+ CHK(v4f_y(v8f_efgh(k)) == -8.f);
+ CHK(v4f_z(v8f_efgh(k)) == -8.f);
+ CHK(v4f_w(v8f_efgh(k)) == -8.f);
+
+ k = v8f_mul(i, j);
+ CHK(v4f_x(v8f_abcd(k)) == 0.f);
+ CHK(v4f_y(v8f_abcd(k)) == 9.f);
+ CHK(v4f_z(v8f_abcd(k)) == 20.f);
+ CHK(v4f_w(v8f_abcd(k)) == 33.f);
+ CHK(v4f_x(v8f_efgh(k)) == 48.f);
+ CHK(v4f_y(v8f_efgh(k)) == 65.f);
+ CHK(v4f_z(v8f_efgh(k)) == 84.f);
+ CHK(v4f_w(v8f_efgh(k)) == 105.f);
+
+ k = v8f_div(i, j);
+ CHK(v4f_x(v8f_abcd(k)) == 0.f);
+ CHK(v4f_y(v8f_abcd(k)) == 1.f/9.f);
+ CHK(v4f_z(v8f_abcd(k)) == 0.2f);
+ CHK(v4f_w(v8f_abcd(k)) == 3.f/11.f);
+ CHK(v4f_x(v8f_efgh(k)) == 1.f/3.f);
+ CHK(v4f_y(v8f_efgh(k)) == 5.f/13.f);
+ CHK(v4f_z(v8f_efgh(k)) == 3.f/7.f);
+ CHK(v4f_w(v8f_efgh(k)) == 7.f/15.f);
+
+ k = v8f_set(0.1f,0.2f,0.3f,0.4f,0.5f,0.6f,0.7f,0.8f);
+ k = v8f_madd(i, j, k);
+ CHK(v4f_x(v8f_abcd(k)) == 0.1f);
+ CHK(v4f_y(v8f_abcd(k)) == 9.2f);
+ CHK(v4f_z(v8f_abcd(k)) == 20.3f);
+ CHK(v4f_w(v8f_abcd(k)) == 33.4f);
+ CHK(v4f_x(v8f_efgh(k)) == 48.5f);
+ CHK(v4f_y(v8f_efgh(k)) == 65.6f);
+ CHK(v4f_z(v8f_efgh(k)) == 84.7f);
+ CHK(v4f_w(v8f_efgh(k)) == 105.8f);
+
+ k = v8f_abs(v8f_minus(i));
+ CHK(v4f_x(v8f_abcd(k)) == 0.f);
+ CHK(v4f_y(v8f_abcd(k)) == 1.f);
+ CHK(v4f_z(v8f_abcd(k)) == 2.f);
+ CHK(v4f_w(v8f_abcd(k)) == 3.f);
+ CHK(v4f_x(v8f_efgh(k)) == 4.f);
+ CHK(v4f_y(v8f_efgh(k)) == 5.f);
+ CHK(v4f_z(v8f_efgh(k)) == 6.f);
+ CHK(v4f_w(v8f_efgh(k)) == 7.f);
+
+ i = v8f_set(1.f, 4.f, 9.f, 16.f, 25.f, 36.f, 49.f, 64.f);
+ k = v8f_sqrt(i);
+ CHK(v4f_x(v8f_abcd(k)) == 1.f);
+ CHK(v4f_y(v8f_abcd(k)) == 2.f);
+ CHK(v4f_z(v8f_abcd(k)) == 3.f);
+ CHK(v4f_w(v8f_abcd(k)) == 4.f);
+ CHK(v4f_x(v8f_efgh(k)) == 5.f);
+ CHK(v4f_y(v8f_efgh(k)) == 6.f);
+ CHK(v4f_z(v8f_efgh(k)) == 7.f);
+ CHK(v4f_w(v8f_efgh(k)) == 8.f);
+
+ k = v8f_rsqrte(i);
+ CHK(eq_epsf(v4f_x(v8f_abcd(k)), 1.f/1.f, 1.e-3f));
+ CHK(eq_epsf(v4f_y(v8f_abcd(k)), 1.f/2.f, 1.e-3f));
+ CHK(eq_epsf(v4f_z(v8f_abcd(k)), 1.f/3.f, 1.e-3f));
+ CHK(eq_epsf(v4f_w(v8f_abcd(k)), 1.f/4.f, 1.e-3f));
+ CHK(eq_epsf(v4f_x(v8f_efgh(k)), 1.f/5.f, 1.e-3f));
+ CHK(eq_epsf(v4f_y(v8f_efgh(k)), 1.f/6.f, 1.e-3f));
+ CHK(eq_epsf(v4f_z(v8f_efgh(k)), 1.f/7.f, 1.e-3f));
+ CHK(eq_epsf(v4f_w(v8f_efgh(k)), 1.f/8.f, 1.e-3f));
+
+ k = v8f_rsqrt(i);
+ CHK(eq_epsf(v4f_x(v8f_abcd(k)), 1.f/1.f, 1.e-6f));
+ CHK(eq_epsf(v4f_y(v8f_abcd(k)), 1.f/2.f, 1.e-6f));
+ CHK(eq_epsf(v4f_z(v8f_abcd(k)), 1.f/3.f, 1.e-6f));
+ CHK(eq_epsf(v4f_w(v8f_abcd(k)), 1.f/4.f, 1.e-6f));
+ CHK(eq_epsf(v4f_x(v8f_efgh(k)), 1.f/5.f, 1.e-6f));
+ CHK(eq_epsf(v4f_y(v8f_efgh(k)), 1.f/6.f, 1.e-6f));
+ CHK(eq_epsf(v4f_z(v8f_efgh(k)), 1.f/7.f, 1.e-6f));
+ CHK(eq_epsf(v4f_w(v8f_efgh(k)), 1.f/8.f, 1.e-6f));
+
+ i = v8f_set(1.f,2.f,3.f,4.f,5.f,6.f,7.f,8.f);
+ k = v8f_rcpe(i);
+ CHK(eq_epsf(v4f_x(v8f_abcd(k)), 1.f/1.f, 1.e-3f));
+ CHK(eq_epsf(v4f_y(v8f_abcd(k)), 1.f/2.f, 1.e-3f));
+ CHK(eq_epsf(v4f_z(v8f_abcd(k)), 1.f/3.f, 1.e-3f));
+ CHK(eq_epsf(v4f_w(v8f_abcd(k)), 1.f/4.f, 1.e-3f));
+ CHK(eq_epsf(v4f_x(v8f_efgh(k)), 1.f/5.f, 1.e-3f));
+ CHK(eq_epsf(v4f_y(v8f_efgh(k)), 1.f/6.f, 1.e-3f));
+ CHK(eq_epsf(v4f_z(v8f_efgh(k)), 1.f/7.f, 1.e-3f));
+ CHK(eq_epsf(v4f_w(v8f_efgh(k)), 1.f/8.f, 1.e-3f));
+
+ k = v8f_rcp(i);
+ CHK(eq_epsf(v4f_x(v8f_abcd(k)), 1.f/1.f, 1.e-6f));
+ CHK(eq_epsf(v4f_y(v8f_abcd(k)), 1.f/2.f, 1.e-6f));
+ CHK(eq_epsf(v4f_z(v8f_abcd(k)), 1.f/3.f, 1.e-6f));
+ CHK(eq_epsf(v4f_w(v8f_abcd(k)), 1.f/4.f, 1.e-6f));
+ CHK(eq_epsf(v4f_x(v8f_efgh(k)), 1.f/5.f, 1.e-6f));
+ CHK(eq_epsf(v4f_y(v8f_efgh(k)), 1.f/6.f, 1.e-6f));
+ CHK(eq_epsf(v4f_z(v8f_efgh(k)), 1.f/7.f, 1.e-6f));
+ CHK(eq_epsf(v4f_w(v8f_efgh(k)), 1.f/8.f, 1.e-6f));
+
+ j = v8f_set(2.f,3.f,4.f,5.f,6.f,7.f,8.f,9.f);
+ k = v8f_lerp(i, j, v8f_set1(0.5f));
+ CHK(v4f_x(v8f_abcd(k)) == 1.5f);
+ CHK(v4f_y(v8f_abcd(k)) == 2.5f);
+ CHK(v4f_z(v8f_abcd(k)) == 3.5f);
+ CHK(v4f_w(v8f_abcd(k)) == 4.5f);
+ CHK(v4f_x(v8f_efgh(k)) == 5.5f);
+ CHK(v4f_y(v8f_efgh(k)) == 6.5f);
+ CHK(v4f_z(v8f_efgh(k)) == 7.5f);
+ CHK(v4f_w(v8f_efgh(k)) == 8.5f);
+
+ i = v8f_set(0.f, 1.f,2.f,3.f, 4.f,5.f,6.f,7.f);
+ j = v8f_set(0.f,-1.f,4.f,4.f,-2.f,6.f,6.f,8.f);
+
+ k = v8f_eq(i, j);
+ cast.f[0] = v4f_x(v8f_abcd(k)); CHK(cast.i[0] ==~0);
+ cast.f[1] = v4f_y(v8f_abcd(k)); CHK(cast.i[1] == 0);
+ cast.f[2] = v4f_z(v8f_abcd(k)); CHK(cast.i[2] == 0);
+ cast.f[3] = v4f_w(v8f_abcd(k)); CHK(cast.i[3] == 0);
+ cast.f[4] = v4f_x(v8f_efgh(k)); CHK(cast.i[4] == 0);
+ cast.f[5] = v4f_y(v8f_efgh(k)); CHK(cast.i[5] == 0);
+ cast.f[6] = v4f_z(v8f_efgh(k)); CHK(cast.i[6] ==~0);
+ cast.f[7] = v4f_w(v8f_efgh(k)); CHK(cast.i[7] == 0);
+
+ k = v8f_neq(i, j);
+ cast.f[0] = v4f_x(v8f_abcd(k)); CHK(cast.i[0] == 0);
+ cast.f[1] = v4f_y(v8f_abcd(k)); CHK(cast.i[1] ==~0);
+ cast.f[2] = v4f_z(v8f_abcd(k)); CHK(cast.i[2] ==~0);
+ cast.f[3] = v4f_w(v8f_abcd(k)); CHK(cast.i[3] ==~0);
+ cast.f[4] = v4f_x(v8f_efgh(k)); CHK(cast.i[4] ==~0);
+ cast.f[5] = v4f_y(v8f_efgh(k)); CHK(cast.i[5] ==~0);
+ cast.f[6] = v4f_z(v8f_efgh(k)); CHK(cast.i[6] == 0);
+ cast.f[7] = v4f_w(v8f_efgh(k)); CHK(cast.i[7] ==~0);
+
+ k = v8f_ge(i, j);
+ cast.f[0] = v4f_x(v8f_abcd(k)); CHK(cast.i[0] ==~0);
+ cast.f[1] = v4f_y(v8f_abcd(k)); CHK(cast.i[1] ==~0);
+ cast.f[2] = v4f_z(v8f_abcd(k)); CHK(cast.i[2] == 0);
+ cast.f[3] = v4f_w(v8f_abcd(k)); CHK(cast.i[3] == 0);
+ cast.f[4] = v4f_x(v8f_efgh(k)); CHK(cast.i[4] ==~0);
+ cast.f[5] = v4f_y(v8f_efgh(k)); CHK(cast.i[5] == 0);
+ cast.f[6] = v4f_z(v8f_efgh(k)); CHK(cast.i[6] ==~0);
+ cast.f[7] = v4f_w(v8f_efgh(k)); CHK(cast.i[7] == 0);
+
+ k = v8f_le(i, j);
+ cast.f[0] = v4f_x(v8f_abcd(k)); CHK(cast.i[0] ==~0);
+ cast.f[1] = v4f_y(v8f_abcd(k)); CHK(cast.i[1] == 0);
+ cast.f[2] = v4f_z(v8f_abcd(k)); CHK(cast.i[2] ==~0);
+ cast.f[3] = v4f_w(v8f_abcd(k)); CHK(cast.i[3] ==~0);
+ cast.f[4] = v4f_x(v8f_efgh(k)); CHK(cast.i[4] == 0);
+ cast.f[5] = v4f_y(v8f_efgh(k)); CHK(cast.i[5] ==~0);
+ cast.f[6] = v4f_z(v8f_efgh(k)); CHK(cast.i[6] ==~0);
+ cast.f[7] = v4f_w(v8f_efgh(k)); CHK(cast.i[7] ==~0);
+
+ k = v8f_gt(i, j);
+ cast.f[0] = v4f_x(v8f_abcd(k)); CHK(cast.i[0] == 0);
+ cast.f[1] = v4f_y(v8f_abcd(k)); CHK(cast.i[1] ==~0);
+ cast.f[2] = v4f_z(v8f_abcd(k)); CHK(cast.i[2] == 0);
+ cast.f[3] = v4f_w(v8f_abcd(k)); CHK(cast.i[3] == 0);
+ cast.f[4] = v4f_x(v8f_efgh(k)); CHK(cast.i[4] ==~0);
+ cast.f[5] = v4f_y(v8f_efgh(k)); CHK(cast.i[5] == 0);
+ cast.f[6] = v4f_z(v8f_efgh(k)); CHK(cast.i[6] == 0);
+ cast.f[7] = v4f_w(v8f_efgh(k)); CHK(cast.i[7] == 0);
+
+ k = v8f_lt(i, j);
+ cast.f[0] = v4f_x(v8f_abcd(k)); CHK(cast.i[0] == 0);
+ cast.f[1] = v4f_y(v8f_abcd(k)); CHK(cast.i[1] == 0);
+ cast.f[2] = v4f_z(v8f_abcd(k)); CHK(cast.i[2] ==~0);
+ cast.f[3] = v4f_w(v8f_abcd(k)); CHK(cast.i[3] ==~0);
+ cast.f[4] = v4f_x(v8f_efgh(k)); CHK(cast.i[4] == 0);
+ cast.f[5] = v4f_y(v8f_efgh(k)); CHK(cast.i[5] ==~0);
+ cast.f[6] = v4f_z(v8f_efgh(k)); CHK(cast.i[6] == 0);
+ cast.f[7] = v4f_w(v8f_efgh(k)); CHK(cast.i[7] ==~0);
+
+ j = v8f_set(0.0001f, 0.99999f, 2.f, 3.1f, 4.001f, 5.0002f, 6.f, 6.999999f);
+ k = v8f_eq_eps(i, j, v8f_set1(1.e-4f));
+ cast.f[0] = v4f_x(v8f_abcd(k)); CHK(cast.i[0] ==~0);
+ cast.f[1] = v4f_y(v8f_abcd(k)); CHK(cast.i[1] ==~0);
+ cast.f[2] = v4f_z(v8f_abcd(k)); CHK(cast.i[2] ==~0);
+ cast.f[3] = v4f_w(v8f_abcd(k)); CHK(cast.i[3] == 0);
+ cast.f[4] = v4f_x(v8f_efgh(k)); CHK(cast.i[4] == 0);
+ cast.f[5] = v4f_y(v8f_efgh(k)); CHK(cast.i[5] == 0);
+ cast.f[6] = v4f_z(v8f_efgh(k)); CHK(cast.i[6] ==~0);
+ cast.f[7] = v4f_w(v8f_efgh(k)); CHK(cast.i[7] ==~0);
+
+ k = v8f_eq_eps(i, j, v8f_set(1.e-4f, 1.e-4f, 0.f, 0.1f, 1.e-3f, 2.e-4f, 0.f, 1.e-5f));
+ cast.f[0] = v4f_x(v8f_abcd(k)); CHK(cast.i[0] ==~0);
+ cast.f[1] = v4f_y(v8f_abcd(k)); CHK(cast.i[1] ==~0);
+ cast.f[2] = v4f_z(v8f_abcd(k)); CHK(cast.i[2] ==~0);
+ cast.f[3] = v4f_w(v8f_abcd(k)); CHK(cast.i[3] ==~0);
+ cast.f[4] = v4f_x(v8f_efgh(k)); CHK(cast.i[4] ==~0);
+ cast.f[5] = v4f_y(v8f_efgh(k)); CHK(cast.i[5] ==~0);
+ cast.f[6] = v4f_z(v8f_efgh(k)); CHK(cast.i[6] ==~0);
+ cast.f[7] = v4f_w(v8f_efgh(k)); CHK(cast.i[7] ==~0);
+
+ i = v8f_set(0.f, 1.f,2.f,3.f, 4.f,5.f,6.f,7.f);
+ j = v8f_set(0.f,-1.f,4.f,4.f,-2.f,6.f,6.f,8.f);
+
+ k = v8f_min(i, j);
+ CHK(v4f_x(v8f_abcd(k)) == 0.f);
+ CHK(v4f_y(v8f_abcd(k)) ==-1.f);
+ CHK(v4f_z(v8f_abcd(k)) == 2.f);
+ CHK(v4f_w(v8f_abcd(k)) == 3.f);
+ CHK(v4f_x(v8f_efgh(k)) ==-2.f);
+ CHK(v4f_y(v8f_efgh(k)) == 5.f);
+ CHK(v4f_z(v8f_efgh(k)) == 6.f);
+ CHK(v4f_w(v8f_efgh(k)) == 7.f);
+
+ k = v8f_max(i, j);
+ CHK(v4f_x(v8f_abcd(k)) == 0.f);
+ CHK(v4f_y(v8f_abcd(k)) == 1.f);
+ CHK(v4f_z(v8f_abcd(k)) == 4.f);
+ CHK(v4f_w(v8f_abcd(k)) == 4.f);
+ CHK(v4f_x(v8f_efgh(k)) == 4.f);
+ CHK(v4f_y(v8f_efgh(k)) == 6.f);
+ CHK(v4f_z(v8f_efgh(k)) == 6.f);
+ CHK(v4f_w(v8f_efgh(k)) == 8.f);
+
+ CHK(v8f_reduce_min(i) == 0.f);
+ CHK(v8f_reduce_min(j) ==-2.f);
+ CHK(v8f_reduce_max(i) == 7.f);
+ CHK(v8f_reduce_max(j) == 8.f);
+
+ k = v8f_clamp(i,
+ v8f_set(1.f, 1.f, 3.1f, 5.f, 4.f, 0.f, 0.f, -1.f),
+ v8f_set(1.f, 1.f, 4.f, 6.f, 4.f, 1.f, 6.f, 5.f));
+
+ CHK(v4f_x(v8f_abcd(k)) == 1.f);
+ CHK(v4f_y(v8f_abcd(k)) == 1.f);
+ CHK(v4f_z(v8f_abcd(k)) == 3.1f);
+ CHK(v4f_w(v8f_abcd(k)) == 5.f);
+ CHK(v4f_x(v8f_efgh(k)) == 4.f);
+ CHK(v4f_y(v8f_efgh(k)) == 1.f);
+ CHK(v4f_z(v8f_efgh(k)) == 6.f);
+ CHK(v4f_w(v8f_efgh(k)) == 5.f);
+
+ return 0;
+}
+