rsimd

Make SIMD instruction sets easier to use
git clone git://git.meso-star.fr/rsimd.git
Log | Files | Refs | README | LICENSE

commit 07f25d2d6f1905054d968f9d7aefd8eff73b4f02
parent 9870c031f34427d5d4a01e34fd5f72a42a8e0f28
Author: vaplv <vaplv@free.fr>
Date:   Sat,  2 Jun 2018 15:08:47 +0200

Add and test the v8i_T API

Diffstat:
M cmake/CMakeLists.txt | 1 +
M src/avx/avx.h | 3 +--
A src/avx/avxi.h | 205 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M src/sse/ssei.h | 16 ++++++++++++++++
M src/test_v4i.c | 4 ++++
A src/test_v8i.c | 182 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
6 files changed, 409 insertions(+), 2 deletions(-)

diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt
@@ -115,6 +115,7 @@ if(NOT NO_TEST)
 
   if(AVX AND CMAKE_COMPILER_IS_GNUCC)
     new_test(test_v8f "-mavx")
+    new_test(test_v8i "-mavx")
   endif(AVX AND CMAKE_COMPILER_IS_GNUCC)
 
 endif(NOT NO_TEST)
diff --git a/src/avx/avx.h b/src/avx/avx.h
@@ -17,8 +17,7 @@
 #define RSIMD_AVX_H
 
 #include "avxf.h"
-
-typedef __m256i v8i_T;
+#include "avxi.h"
 
 /* Reinterpret cast */
 static FINLINE v8i_T v8f_rcast_v8i(const v8f_T v) {return _mm256_castps_si256(v);}
diff --git a/src/avx/avxi.h b/src/avx/avxi.h
@@ -0,0 +1,205 @@
+/* Copyright (C) 2014-2018 Vincent Forest (vaplv@free.fr)
+ *
+ * The RSIMD library is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * The RSIMD library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with the RSIMD library. If not, see <http://www.gnu.org/licenses/>. */
+
+#ifndef RSIMD_AVXI_H
+#define RSIMD_AVXI_H
+
+/*
+ * 8 packed signed integers
+ */
+
+#include "avx.h"
+
+#include <rsys/math.h>
+#include <immintrin.h>
+
+typedef __m256i v8i_T;
+
+/*******************************************************************************
+ * Set operations
+ ******************************************************************************/
+static FINLINE int32_t*
+v8i_store(int32_t dst[8], v8i_T v)
+{
+  ASSERT(dst && IS_ALIGNED(dst, 32));
+  _mm256_store_si256((v8i_T*)dst, v);
+  return dst;
+}
+
+static FINLINE v8i_T
+v8i_load(const int32_t src[8])
+{
+  ASSERT(src && IS_ALIGNED(src, 32));
+  return _mm256_load_si256((const v8i_T*)src);
+}
+
+static FINLINE v8i_T
+v8i_set1(const int32_t i)
+{
+  return _mm256_set1_epi32(i);
+}
+
+static FINLINE v8i_T
+v8i_set
+  (const int32_t a, const int32_t b, const int32_t c, const int32_t d,
+   const int32_t e, const int32_t f, const int32_t g, const int32_t h)
+{
+  return _mm256_set_epi32(h, g, f, e, d, c, b, a);
+}
+
+static FINLINE v8i_T
+v8i_zero(void)
+{
+  return _mm256_setzero_si256();
+}
+
+/*******************************************************************************
+ * Extract components
+ ******************************************************************************/
+static FINLINE v4i_T
+v8i_abcd(const v8i_T v)
+{
+  return _mm256_extractf128_si256(v, 0);
+}
+
+static FINLINE v4i_T
+v8i_efgh(const v8i_T v)
+{
+  return _mm256_extractf128_si256(v, 1);
+}
+
+/*******************************************************************************
+ * Bitwise operators
+ ******************************************************************************/
+static FINLINE v8i_T
+v8i_or(const v8i_T v0, const v8i_T v1)
+{
+  const v8f_T a = _mm256_castsi256_ps(v0);
+  const v8f_T b = _mm256_castsi256_ps(v1);
+  const v8f_T c = _mm256_or_ps(a, b);
+  return _mm256_castps_si256(c);
+}
+
+static FINLINE v8i_T
+v8i_and(const v8i_T v0, const v8i_T v1)
+{
+  const v8f_T a = _mm256_castsi256_ps(v0);
+  const v8f_T b = _mm256_castsi256_ps(v1);
+  const v8f_T c = _mm256_and_ps(a, b);
+  return _mm256_castps_si256(c);
+}
+
+static FINLINE v8i_T
+v8i_andnot(const v8i_T v0, const v8i_T v1)
+{
+  const v8f_T a = _mm256_castsi256_ps(v0);
+  const v8f_T b = _mm256_castsi256_ps(v1);
+  const v8f_T c = _mm256_andnot_ps(a, b);
+  return _mm256_castps_si256(c);
+}
+
+static FINLINE v8i_T
+v8i_xor(const v8i_T v0, const v8i_T v1)
+{
+  const v8f_T a = _mm256_castsi256_ps(v0);
+  const v8f_T b = _mm256_castsi256_ps(v1);
+  const v8f_T c = _mm256_xor_ps(a, b);
+  return _mm256_castps_si256(c);
+}
+
+/*******************************************************************************
+ * Comparators
+ ******************************************************************************/
+static FINLINE v8i_T
+v8i_eq(const v8i_T v0, const v8i_T v1)
+{
+  ALIGN(32) int32_t a[8];
+  ALIGN(32) int32_t b[8];
+  v8i_store(a, v0);
+  v8i_store(b, v1);
+  return v8i_set
+    (-(a[0]==b[0]),-(a[1]==b[1]),-(a[2]==b[2]),-(a[3]==b[3]),
+     -(a[4]==b[4]),-(a[5]==b[5]),-(a[6]==b[6]),-(a[7]==b[7]));
+
+}
+
+static FINLINE v8i_T
+v8i_neq(const v8i_T v0, const v8i_T v1)
+{
+  ALIGN(32) int32_t a[8];
+  ALIGN(32) int32_t b[8];
+  v8i_store(a, v0);
+  v8i_store(b, v1);
+  return v8i_set
+    (-(a[0]!=b[0]),-(a[1]!=b[1]),-(a[2]!=b[2]),-(a[3]!=b[3]),
+     -(a[4]!=b[4]),-(a[5]!=b[5]),-(a[6]!=b[6]),-(a[7]!=b[7]));
+
+}
+
+static FINLINE v8i_T
+v8i_sel(const v8i_T vfalse, const v8i_T vtrue, const v8i_T vcond)
+{
+  const v8f_T a = _mm256_castsi256_ps(vfalse);
+  const v8f_T b = _mm256_castsi256_ps(vtrue);
+  const v8f_T c = _mm256_castsi256_ps(vcond);
+  return _mm256_castps_si256(_mm256_blendv_ps(a, b, c));
+}
+
+static FINLINE v8i_T
+v8i_min(const v8i_T v0, const v8i_T v1)
+{
+  ALIGN(32) int32_t a[8];
+  ALIGN(32) int32_t b[8];
+  v8i_store(a, v0);
+  v8i_store(b, v1);
+  return v8i_set
+    (MMIN(a[0],b[0]), MMIN(a[1],b[1]), MMIN(a[2],b[2]), MMIN(a[3],b[3]),
+     MMIN(a[4],b[4]), MMIN(a[5],b[5]), MMIN(a[6],b[6]), MMIN(a[7],b[7]));
+}
+
+static FINLINE v8i_T
+v8i_max(const v8i_T v0, const v8i_T v1)
+{
+  ALIGN(32) int32_t a[8];
+  ALIGN(32) int32_t b[8];
+  v8i_store(a, v0);
+  v8i_store(b, v1);
+  return v8i_set
+    (MMAX(a[0],b[0]), MMAX(a[1],b[1]), MMAX(a[2],b[2]), MMAX(a[3],b[3]),
+     MMAX(a[4],b[4]), MMAX(a[5],b[5]), MMAX(a[6],b[6]), MMAX(a[7],b[7]));
+}
+
+static FINLINE int32_t
+v8i_reduce_min_i32(const v8i_T v)
+{
+  ALIGN(32) int32_t tmp[8];
+  v8i_store(tmp, v);
+  return MMIN
+    (MMIN(MMIN(tmp[0], tmp[1]), MMIN(tmp[2], tmp[3])),
+     MMIN(MMIN(tmp[4], tmp[5]), MMIN(tmp[6], tmp[7])));
+}
+
+static FINLINE int32_t
+v8i_reduce_max_i32(const v8i_T v)
+{
+  ALIGN(32) int32_t tmp[8];
+  v8i_store(tmp, v);
+  return MMAX
+    (MMAX(MMAX(tmp[0], tmp[1]), MMAX(tmp[2], tmp[3])),
+     MMAX(MMAX(tmp[4], tmp[5]), MMAX(tmp[6], tmp[7])));
+}
+
+#endif /* RSIMD_AVXI_H */
+
diff --git a/src/sse/ssei.h b/src/sse/ssei.h
@@ -266,5 +266,21 @@ v4i_reduce_max(const v4i_T v)
 #endif
 }
 
+static FINLINE int32_t
+v4i_reduce_min_i32(const v4i_T v)
+{
+  ALIGN(16) int32_t a[4];
+  v4i_store(a, v);
+  return MMIN(MMIN(a[0], a[1]), MMIN(a[2], a[3]));
+}
+
+static FINLINE int32_t
+v4i_reduce_max_i32(const v4i_T v)
+{
+  ALIGN(16) int32_t a[4];
+  v4i_store(a, v);
+  return MMAX(MMAX(a[0], a[1]), MMAX(a[2], a[3]));
+}
+
 #endif /* RSIMD_SSEI_H */
 
diff --git a/src/test_v4i.c b/src/test_v4i.c
@@ -217,24 +217,28 @@ main(int argc, char** argv)
   CHK(v4i_y(k) == 1);
   CHK(v4i_z(k) == 1);
   CHK(v4i_w(k) == 1);
+  CHK(v4i_reduce_min_i32(i) == 1);
 
   k = v4i_reduce_min(j);
   CHK(v4i_x(k) == -4);
   CHK(v4i_y(k) == -4);
   CHK(v4i_z(k) == -4);
   CHK(v4i_w(k) == -4);
+  CHK(v4i_reduce_min_i32(j) == -4);
 
   k = v4i_reduce_max(i);
   CHK(v4i_x(k) == 4);
   CHK(v4i_y(k) == 4);
   CHK(v4i_z(k) == 4);
   CHK(v4i_w(k) == 4);
+  CHK(v4i_reduce_max_i32(i) == 4);
 
   k = v4i_reduce_max(j);
   CHK(v4i_x(k) == 6);
   CHK(v4i_y(k) == 6);
   CHK(v4i_z(k) == 6);
   CHK(v4i_w(k) == 6);
+  CHK(v4i_reduce_max_i32(j) == 6);
 
   return 0;
 }
diff --git a/src/test_v8i.c b/src/test_v8i.c
@@ -0,0 +1,182 @@
+/* Copyright (C) 2014-2018 Vincent Forest (vaplv@free.fr)
+ *
+ * The RSIMD library is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * The RSIMD library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with the RSIMD library. If not, see <http://www.gnu.org/licenses/>. */
+
+#include "rsimd.h"
+
+int
+main(int argc, char** argv)
+{
+  v8i_T i, j, k;
+  ALIGN(32) int32_t tmp[8] = {0,1,2,3,4,5,6,7};
+  (void)argc, (void)argv;
+
+  i = v8i_load(tmp);
+  CHK(v4i_x(v8i_abcd(i)) == 0);
+  CHK(v4i_y(v8i_abcd(i)) == 1);
+  CHK(v4i_z(v8i_abcd(i)) == 2);
+  CHK(v4i_w(v8i_abcd(i)) == 3);
+  CHK(v4i_x(v8i_efgh(i)) == 4);
+  CHK(v4i_y(v8i_efgh(i)) == 5);
+  CHK(v4i_z(v8i_efgh(i)) == 6);
+  CHK(v4i_w(v8i_efgh(i)) == 7);
+
+  tmp[0]= tmp[1] = tmp[2] = tmp[3] = 0;
+  tmp[4]= tmp[5] = tmp[6] = tmp[7] = 0;
+  CHK(v8i_store(tmp, i) == tmp);
+  CHK(tmp[0] == 0);
+  CHK(tmp[1] == 1);
+  CHK(tmp[2] == 2);
+  CHK(tmp[3] == 3);
+  CHK(tmp[4] == 4);
+  CHK(tmp[5] == 5);
+  CHK(tmp[6] == 6);
+  CHK(tmp[7] == 7);
+
+  i = v8i_set(1, 2, 3, 4, 5, 6, 7, 8);
+  CHK(v4i_x(v8i_abcd(i)) == 1);
+  CHK(v4i_y(v8i_abcd(i)) == 2);
+  CHK(v4i_z(v8i_abcd(i)) == 3);
+  CHK(v4i_w(v8i_abcd(i)) == 4);
+  CHK(v4i_x(v8i_efgh(i)) == 5);
+  CHK(v4i_y(v8i_efgh(i)) == 6);
+  CHK(v4i_z(v8i_efgh(i)) == 7);
+  CHK(v4i_w(v8i_efgh(i)) == 8);
+
+  i = v8i_set1(-1);
+  CHK(v4i_x(v8i_abcd(i)) == -1);
+  CHK(v4i_y(v8i_abcd(i)) == -1);
+  CHK(v4i_z(v8i_abcd(i)) == -1);
+  CHK(v4i_w(v8i_abcd(i)) == -1);
+  CHK(v4i_x(v8i_efgh(i)) == -1);
+  CHK(v4i_y(v8i_efgh(i)) == -1);
+  CHK(v4i_z(v8i_efgh(i)) == -1);
+  CHK(v4i_w(v8i_efgh(i)) == -1);
+
+  i = v8i_zero();
+  CHK(v4i_x(v8i_abcd(i)) == 0);
+  CHK(v4i_y(v8i_abcd(i)) == 0);
+  CHK(v4i_z(v8i_abcd(i)) == 0);
+  CHK(v4i_w(v8i_abcd(i)) == 0);
+  CHK(v4i_x(v8i_efgh(i)) == 0);
+  CHK(v4i_y(v8i_efgh(i)) == 0);
+  CHK(v4i_z(v8i_efgh(i)) == 0);
+  CHK(v4i_w(v8i_efgh(i)) == 0);
+
+  i = v8i_set
+    (0x00010203, 0x04050607, 0x08090A0B, 0x0C0D0E0F,
+     0x00102030, 0x40506070, (int32_t)0x8090A0B0, (int32_t)0xC0D0E0F0);
+  j = v8i_set
+    (0x01020401, 0x70605040, 0x0F1F2F3F, 0x00000000,
+     0x10204010, 0x06050400, (int32_t)0xF1F2F3F0, 0x10000000);
+  k = v8i_or(i, j);
+  CHK(v4i_x(v8i_abcd(k)) == (int32_t)0x01030603);
+  CHK(v4i_y(v8i_abcd(k)) == (int32_t)0x74655647);
+  CHK(v4i_z(v8i_abcd(k)) == (int32_t)0x0F1F2F3F);
+  CHK(v4i_w(v8i_abcd(k)) == (int32_t)0x0C0D0E0F);
+  CHK(v4i_x(v8i_efgh(k)) == (int32_t)0x10306030);
+  CHK(v4i_y(v8i_efgh(k)) == (int32_t)0x46556470);
+  CHK(v4i_z(v8i_efgh(k)) == (int32_t)0xF1F2F3F0);
+  CHK(v4i_w(v8i_efgh(k)) == (int32_t)0xD0D0E0F0);
+
+  k = v8i_and(i, j);
+  CHK(v4i_x(v8i_abcd(k)) == (int32_t)0x00000001);
+  CHK(v4i_y(v8i_abcd(k)) == (int32_t)0x00000000);
+  CHK(v4i_z(v8i_abcd(k)) == (int32_t)0x08090A0B);
+  CHK(v4i_w(v8i_abcd(k)) == (int32_t)0x00000000);
+  CHK(v4i_x(v8i_efgh(k)) == (int32_t)0x00000010);
+  CHK(v4i_y(v8i_efgh(k)) == (int32_t)0x00000000);
+  CHK(v4i_z(v8i_efgh(k)) == (int32_t)0x8090A0B0);
+  CHK(v4i_w(v8i_efgh(k)) == (int32_t)0x00000000);
+
+  k = v8i_andnot(i, j);
+  CHK(v4i_x(v8i_abcd(k)) == (int32_t)0x01020400);
+  CHK(v4i_y(v8i_abcd(k)) == (int32_t)0x70605040);
+  CHK(v4i_z(v8i_abcd(k)) == (int32_t)0x07162534);
+  CHK(v4i_w(v8i_abcd(k)) == (int32_t)0x00000000);
+  CHK(v4i_x(v8i_efgh(k)) == (int32_t)0x10204000);
+  CHK(v4i_y(v8i_efgh(k)) == (int32_t)0x06050400);
+  CHK(v4i_z(v8i_efgh(k)) == (int32_t)0x71625340);
+  CHK(v4i_w(v8i_efgh(k)) == (int32_t)0x10000000);
+
+  k = v8i_xor(i, j);
+  CHK(v4i_x(v8i_abcd(k)) == (int32_t)0x01030602);
+  CHK(v4i_y(v8i_abcd(k)) == (int32_t)0x74655647);
+  CHK(v4i_z(v8i_abcd(k)) == (int32_t)0x07162534);
+  CHK(v4i_w(v8i_abcd(k)) == (int32_t)0x0C0D0E0F);
+  CHK(v4i_x(v8i_efgh(k)) == (int32_t)0x10306020);
+  CHK(v4i_y(v8i_efgh(k)) == (int32_t)0x46556470);
+  CHK(v4i_z(v8i_efgh(k)) == (int32_t)0x71625340);
+  CHK(v4i_w(v8i_efgh(k)) == (int32_t)0XD0D0E0F0);
+
+  i = v8i_set( 1, 2,3,4,5, 6,7,8);
+  j = v8i_set(-2,-4,3,6,5,-1,8,8);
+
+  k = v8i_eq(i, j);
+  CHK(v4i_x(v8i_abcd(k)) == 0);
+  CHK(v4i_y(v8i_abcd(k)) == 0);
+  CHK(v4i_z(v8i_abcd(k)) ==~0);
+  CHK(v4i_w(v8i_abcd(k)) == 0);
+  CHK(v4i_x(v8i_efgh(k)) ==~0);
+  CHK(v4i_y(v8i_efgh(k)) == 0);
+  CHK(v4i_z(v8i_efgh(k)) == 0);
+  CHK(v4i_w(v8i_efgh(k)) ==~0);
+
+  k = v8i_neq(i, j);
+  CHK(v4i_x(v8i_abcd(k)) ==~0);
+  CHK(v4i_y(v8i_abcd(k)) ==~0);
+  CHK(v4i_z(v8i_abcd(k)) == 0);
+  CHK(v4i_w(v8i_abcd(k)) ==~0);
+  CHK(v4i_x(v8i_efgh(k)) == 0);
+  CHK(v4i_y(v8i_efgh(k)) ==~0);
+  CHK(v4i_z(v8i_efgh(k)) ==~0);
+  CHK(v4i_w(v8i_efgh(k)) == 0);
+
+  k = v8i_sel(i, j, v8i_set(~0,~0,0,~0,0,0,~0,0));
+  CHK(v4i_x(v8i_abcd(k)) ==-2);
+  CHK(v4i_y(v8i_abcd(k)) ==-4);
+  CHK(v4i_z(v8i_abcd(k)) == 3);
+  CHK(v4i_w(v8i_abcd(k)) == 6);
+  CHK(v4i_x(v8i_efgh(k)) == 5);
+  CHK(v4i_y(v8i_efgh(k)) == 6);
+  CHK(v4i_z(v8i_efgh(k)) == 8);
+  CHK(v4i_w(v8i_efgh(k)) == 8);
+
+  k = v8i_min(i, j);
+  CHK(v4i_x(v8i_abcd(k)) ==-2);
+  CHK(v4i_y(v8i_abcd(k)) ==-4);
+  CHK(v4i_z(v8i_abcd(k)) == 3);
+  CHK(v4i_w(v8i_abcd(k)) == 4);
+  CHK(v4i_x(v8i_efgh(k)) == 5);
+  CHK(v4i_y(v8i_efgh(k)) ==-1);
+  CHK(v4i_z(v8i_efgh(k)) == 7);
+  CHK(v4i_w(v8i_efgh(k)) == 8);
+
+  k = v8i_max(i, j);
+  CHK(v4i_x(v8i_abcd(k)) == 1);
+  CHK(v4i_y(v8i_abcd(k)) == 2);
+  CHK(v4i_z(v8i_abcd(k)) == 3);
+  CHK(v4i_w(v8i_abcd(k)) == 6);
+  CHK(v4i_x(v8i_efgh(k)) == 5);
+  CHK(v4i_y(v8i_efgh(k)) == 6);
+  CHK(v4i_z(v8i_efgh(k)) == 8);
+  CHK(v4i_w(v8i_efgh(k)) == 8);
+
+  CHK(v8i_reduce_min_i32(i) == 1);
+  CHK(v8i_reduce_min_i32(j) ==-4);
+  CHK(v8i_reduce_max_i32(i) == 8);
+  CHK(v8i_reduce_max_i32(j) == 8);
+
+  return 0;
+}