rsimd

Make SIMD instruction sets easier to use
git clone git://git.meso-star.fr/rsimd.git
Log | Files | Refs | README | LICENSE

commit 7087e6b6dd782bd4034d8a8394926a8bb3e8a4ed
parent bc48e6fdcd0908c6a2efe2fa264bf40e2485365e
Author: vaplv <vaplv@free.fr>
Date:   Sun, 17 Jun 2018 16:55:49 +0200

Add the v8i_set_v4i function

Use this new function to rewrite some v8i functions and improve their
performance.

Diffstat:
M cmake/CMakeLists.txt | 11 ++++++++++-
M src/avx/avxi.h | 83 +++++++++++++++++++++++++++++++++++++++----------------------------------------
M src/sse/ssei.h | 8 ++------
M src/test_v8i.c | 10 ++++++++++
4 files changed, 63 insertions(+), 49 deletions(-)

diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt @@ -62,6 +62,10 @@ set(RSIMD_FILES_INC_SSE sse/ssef.h sse/ssei.h sse/sse_swz.h) +set(RSIMD_FILES_INC_AVX + avx/avx.h + avx/avxf.h + avx/avxi.h) set(RSIMD_FILES_SRC aosf44.c aosq.c @@ -69,9 +73,13 @@ set(RSIMD_FILES_SRC set(RSIMD_FILES_DOC COPYING COPYING.LESSER README.md) rcmake_prepend_path(RSIMD_FILES_INC_LEGACY ${RSIMD_SOURCE_DIR}) rcmake_prepend_path(RSIMD_FILES_INC_SSE ${RSIMD_SOURCE_DIR}) +rcmake_prepend_path(RSIMD_FILES_INC_AVX ${RSIMD_SOURCE_DIR}) rcmake_prepend_path(RSIMD_FILES_SRC ${RSIMD_SOURCE_DIR}) rcmake_prepend_path(RSIMD_FILES_DOC ${PROJECT_SOURCE_DIR}/../) -set(RSIMD_FILES_INC ${RSIMD_FILES_INC_LEGACY} ${RSIMD_FILES_INC_SSE}) +set(RSIMD_FILES_INC + ${RSIMD_FILES_INC_LEGACY} + ${RSIMD_FILES_INC_SSE} + ${RSIMD_FILES_INC_AVX}) add_library(rsimd SHARED ${RSIMD_FILES_INC} ${RSIMD_FILES_SRC}) set_target_properties(rsimd PROPERTIES DEFINE_SYMBOL RSIMD_SHARED_BUILD) @@ -129,5 +137,6 @@ install(TARGETS rsimd RUNTIME DESTINATION bin) install(FILES ${RSIMD_FILES_INC_LEGACY} DESTINATION include/rsimd) install(FILES ${RSIMD_FILES_INC_SSE} DESTINATION include/rsimd/sse) +install(FILES ${RSIMD_FILES_INC_AVX} DESTINATION include/rsimd/avx) install(FILES ${RSIMD_FILES_DOC} DESTINATION share/doc/rsimd) diff --git a/src/avx/avxi.h b/src/avx/avxi.h @@ -20,8 +20,6 @@ * 8 packed signed integers */ -#include "avx.h" - #include <rsys/math.h> #include <immintrin.h> @@ -65,6 +63,15 @@ v8i_zero(void) return _mm256_setzero_si256(); } +static FINLINE v8i_T +v8i_set_v4i(const v4i_T abcd, const v4i_T efgh) +{ + v8i_T tmp = v8i_zero(); + tmp = _mm256_insertf128_si256(tmp, abcd, 0); + tmp = _mm256_insertf128_si256(tmp, efgh, 1); + return tmp; +} + /******************************************************************************* * Extract components ******************************************************************************/ @@ -125,27 +132,25 @@ v8i_xor(const v8i_T v0, const v8i_T v1) static FINLINE v8i_T v8i_eq(const 
v8i_T v0, const v8i_T v1) { - ALIGN(32) int32_t a[8]; - ALIGN(32) int32_t b[8]; - v8i_store(a, v0); - v8i_store(b, v1); - return v8i_set - (-(a[0]==b[0]),-(a[1]==b[1]),-(a[2]==b[2]),-(a[3]==b[3]), - -(a[4]==b[4]),-(a[5]==b[5]),-(a[6]==b[6]),-(a[7]==b[7])); - + const v4i_T v0_abcd = v8i_abcd(v0); + const v4i_T v0_efgh = v8i_efgh(v0); + const v4i_T v1_abcd = v8i_abcd(v1); + const v4i_T v1_efgh = v8i_efgh(v1); + const v4i_T abcd = v4i_eq(v0_abcd, v1_abcd); + const v4i_T efgh = v4i_eq(v0_efgh, v1_efgh); + return v8i_set_v4i(abcd, efgh); } static FINLINE v8i_T v8i_neq(const v8i_T v0, const v8i_T v1) { - ALIGN(32) int32_t a[8]; - ALIGN(32) int32_t b[8]; - v8i_store(a, v0); - v8i_store(b, v1); - return v8i_set - (-(a[0]!=b[0]),-(a[1]!=b[1]),-(a[2]!=b[2]),-(a[3]!=b[3]), - -(a[4]!=b[4]),-(a[5]!=b[5]),-(a[6]!=b[6]),-(a[7]!=b[7])); - + const v4i_T v0_abcd = v8i_abcd(v0); + const v4i_T v0_efgh = v8i_efgh(v0); + const v4i_T v1_abcd = v8i_abcd(v1); + const v4i_T v1_efgh = v8i_efgh(v1); + const v4i_T abcd = v4i_neq(v0_abcd, v1_abcd); + const v4i_T efgh = v4i_neq(v0_efgh, v1_efgh); + return v8i_set_v4i(abcd, efgh); } static FINLINE v8i_T @@ -160,45 +165,39 @@ v8i_sel(const v8i_T vfalse, const v8i_T vtrue, const v8i_T vcond) static FINLINE v8i_T v8i_min(const v8i_T v0, const v8i_T v1) { - ALIGN(32) int32_t a[8]; - ALIGN(32) int32_t b[8]; - v8i_store(a, v0); - v8i_store(b, v1); - return v8i_set - (MMIN(a[0],b[0]), MMIN(a[1],b[1]), MMIN(a[2],b[2]), MMIN(a[3],b[3]), - MMIN(a[4],b[4]), MMIN(a[5],b[5]), MMIN(a[6],b[6]), MMIN(a[7],b[7])); + const v4i_T v0_abcd = v8i_abcd(v0); + const v4i_T v0_efgh = v8i_efgh(v0); + const v4i_T v1_abcd = v8i_abcd(v1); + const v4i_T v1_efgh = v8i_efgh(v1); + const v4i_T abcd = v4i_min(v0_abcd, v1_abcd); + const v4i_T efgh = v4i_min(v0_efgh, v1_efgh); + return v8i_set_v4i(abcd, efgh); } static FINLINE v8i_T v8i_max(const v8i_T v0, const v8i_T v1) { - ALIGN(32) int32_t a[8]; - ALIGN(32) int32_t b[8]; - v8i_store(a, v0); - v8i_store(b, v1); - return v8i_set - 
(MMAX(a[0],b[0]), MMAX(a[1],b[1]), MMAX(a[2],b[2]), MMAX(a[3],b[3]), - MMAX(a[4],b[4]), MMAX(a[5],b[5]), MMAX(a[6],b[6]), MMAX(a[7],b[7])); + const v4i_T v0_abcd = v8i_abcd(v0); + const v4i_T v0_efgh = v8i_efgh(v0); + const v4i_T v1_abcd = v8i_abcd(v1); + const v4i_T v1_efgh = v8i_efgh(v1); + const v4i_T abcd = v4i_max(v0_abcd, v1_abcd); + const v4i_T efgh = v4i_max(v0_efgh, v1_efgh); + return v8i_set_v4i(abcd, efgh); } static FINLINE int32_t v8i_reduce_min_i32(const v8i_T v) { - ALIGN(32) int32_t tmp[8]; - v8i_store(tmp, v); - return MMIN - (MMIN(MMIN(tmp[0], tmp[1]), MMIN(tmp[2], tmp[3])), - MMIN(MMIN(tmp[4], tmp[5]), MMIN(tmp[6], tmp[7]))); + const v4i_T tmp = v4i_min(v8i_abcd(v), v8i_efgh(v)); + return v4i_x(v4i_reduce_min(tmp)); } static FINLINE int32_t v8i_reduce_max_i32(const v8i_T v) { - ALIGN(32) int32_t tmp[8]; - v8i_store(tmp, v); - return MMAX - (MMAX(MMAX(tmp[0], tmp[1]), MMAX(tmp[2], tmp[3])), - MMAX(MMAX(tmp[4], tmp[5]), MMAX(tmp[6], tmp[7]))); + const v4i_T tmp = v4i_max(v8i_abcd(v), v8i_efgh(v)); + return v4i_x(v4i_reduce_max(tmp)); } #endif /* RSIMD_AVXI_H */ diff --git a/src/sse/ssei.h b/src/sse/ssei.h @@ -269,17 +269,13 @@ v4i_reduce_max(const v4i_T v) static FINLINE int32_t v4i_reduce_min_i32(const v4i_T v) { - ALIGN(16) int32_t a[4]; - v4i_store(a, v); - return MMIN(MMIN(a[0], a[1]), MMIN(a[2], a[3])); + return v4i_x(v4i_reduce_min(v)); } static FINLINE int32_t v4i_reduce_max_i32(const v4i_T v) { - ALIGN(16) int32_t a[4]; - v4i_store(a, v); - return MMAX(MMAX(a[0], a[1]), MMAX(a[2], a[3])); + return v4i_x(v4i_reduce_max(v)); } #endif /* RSIMD_SSEI_H */ diff --git a/src/test_v8i.c b/src/test_v8i.c @@ -74,6 +74,16 @@ main(int argc, char** argv) CHK(v4i_z(v8i_efgh(i)) == 0); CHK(v4i_w(v8i_efgh(i)) == 0); + i = v8i_set_v4i(v4i_set(-1,-2,3,4), v4i_set(5,6,-7,-8)); + CHK(v4i_x(v8i_abcd(i)) ==-1); + CHK(v4i_y(v8i_abcd(i)) ==-2); + CHK(v4i_z(v8i_abcd(i)) == 3); + CHK(v4i_w(v8i_abcd(i)) == 4); + CHK(v4i_x(v8i_efgh(i)) == 5); + CHK(v4i_y(v8i_efgh(i)) == 
6); + CHK(v4i_z(v8i_efgh(i)) ==-7); + CHK(v4i_w(v8i_efgh(i)) ==-8); + i = v8i_set (0x00010203, 0x04050607, 0x08090A0B, 0x0C0D0E0F, 0x00102030, 0x40506070, (int32_t)0x8090A0B0, (int32_t)0xC0D0E0F0);