commit 7087e6b6dd782bd4034d8a8394926a8bb3e8a4ed
parent bc48e6fdcd0908c6a2efe2fa264bf40e2485365e
Author: vaplv <vaplv@free.fr>
Date: Sun, 17 Jun 2018 16:55:49 +0200
Add the v8i_set_v4i function
Use this new function to rewrite some v8i functions and improve their
performance.
Diffstat:
4 files changed, 63 insertions(+), 49 deletions(-)
diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt
@@ -62,6 +62,10 @@ set(RSIMD_FILES_INC_SSE
sse/ssef.h
sse/ssei.h
sse/sse_swz.h)
+set(RSIMD_FILES_INC_AVX
+ avx/avx.h
+ avx/avxf.h
+ avx/avxi.h)
set(RSIMD_FILES_SRC
aosf44.c
aosq.c
@@ -69,9 +73,13 @@ set(RSIMD_FILES_SRC
set(RSIMD_FILES_DOC COPYING COPYING.LESSER README.md)
rcmake_prepend_path(RSIMD_FILES_INC_LEGACY ${RSIMD_SOURCE_DIR})
rcmake_prepend_path(RSIMD_FILES_INC_SSE ${RSIMD_SOURCE_DIR})
+rcmake_prepend_path(RSIMD_FILES_INC_AVX ${RSIMD_SOURCE_DIR})
rcmake_prepend_path(RSIMD_FILES_SRC ${RSIMD_SOURCE_DIR})
rcmake_prepend_path(RSIMD_FILES_DOC ${PROJECT_SOURCE_DIR}/../)
-set(RSIMD_FILES_INC ${RSIMD_FILES_INC_LEGACY} ${RSIMD_FILES_INC_SSE})
+set(RSIMD_FILES_INC
+ ${RSIMD_FILES_INC_LEGACY}
+ ${RSIMD_FILES_INC_SSE}
+ ${RSIMD_FILES_INC_AVX})
add_library(rsimd SHARED ${RSIMD_FILES_INC} ${RSIMD_FILES_SRC})
set_target_properties(rsimd PROPERTIES DEFINE_SYMBOL RSIMD_SHARED_BUILD)
@@ -129,5 +137,6 @@ install(TARGETS rsimd
RUNTIME DESTINATION bin)
install(FILES ${RSIMD_FILES_INC_LEGACY} DESTINATION include/rsimd)
install(FILES ${RSIMD_FILES_INC_SSE} DESTINATION include/rsimd/sse)
+install(FILES ${RSIMD_FILES_INC_AVX} DESTINATION include/rsimd/avx)
install(FILES ${RSIMD_FILES_DOC} DESTINATION share/doc/rsimd)
diff --git a/src/avx/avxi.h b/src/avx/avxi.h
@@ -20,8 +20,6 @@
* 8 packed signed integers
*/
-#include "avx.h"
-
#include <rsys/math.h>
#include <immintrin.h>
@@ -65,6 +63,15 @@ v8i_zero(void)
return _mm256_setzero_si256();
}
+static FINLINE v8i_T
+v8i_set_v4i(const v4i_T abcd, const v4i_T efgh)
+{
+ v8i_T tmp = v8i_zero();
+ tmp = _mm256_insertf128_si256(tmp, abcd, 0);
+ tmp = _mm256_insertf128_si256(tmp, efgh, 1);
+ return tmp;
+}
+
/*******************************************************************************
* Extract components
******************************************************************************/
@@ -125,27 +132,25 @@ v8i_xor(const v8i_T v0, const v8i_T v1)
static FINLINE v8i_T
v8i_eq(const v8i_T v0, const v8i_T v1)
{
- ALIGN(32) int32_t a[8];
- ALIGN(32) int32_t b[8];
- v8i_store(a, v0);
- v8i_store(b, v1);
- return v8i_set
- (-(a[0]==b[0]),-(a[1]==b[1]),-(a[2]==b[2]),-(a[3]==b[3]),
- -(a[4]==b[4]),-(a[5]==b[5]),-(a[6]==b[6]),-(a[7]==b[7]));
-
+ const v4i_T v0_abcd = v8i_abcd(v0);
+ const v4i_T v0_efgh = v8i_efgh(v0);
+ const v4i_T v1_abcd = v8i_abcd(v1);
+ const v4i_T v1_efgh = v8i_efgh(v1);
+ const v4i_T abcd = v4i_eq(v0_abcd, v1_abcd);
+ const v4i_T efgh = v4i_eq(v0_efgh, v1_efgh);
+ return v8i_set_v4i(abcd, efgh);
}
static FINLINE v8i_T
v8i_neq(const v8i_T v0, const v8i_T v1)
{
- ALIGN(32) int32_t a[8];
- ALIGN(32) int32_t b[8];
- v8i_store(a, v0);
- v8i_store(b, v1);
- return v8i_set
- (-(a[0]!=b[0]),-(a[1]!=b[1]),-(a[2]!=b[2]),-(a[3]!=b[3]),
- -(a[4]!=b[4]),-(a[5]!=b[5]),-(a[6]!=b[6]),-(a[7]!=b[7]));
-
+ const v4i_T v0_abcd = v8i_abcd(v0);
+ const v4i_T v0_efgh = v8i_efgh(v0);
+ const v4i_T v1_abcd = v8i_abcd(v1);
+ const v4i_T v1_efgh = v8i_efgh(v1);
+ const v4i_T abcd = v4i_neq(v0_abcd, v1_abcd);
+ const v4i_T efgh = v4i_neq(v0_efgh, v1_efgh);
+ return v8i_set_v4i(abcd, efgh);
}
static FINLINE v8i_T
@@ -160,45 +165,39 @@ v8i_sel(const v8i_T vfalse, const v8i_T vtrue, const v8i_T vcond)
static FINLINE v8i_T
v8i_min(const v8i_T v0, const v8i_T v1)
{
- ALIGN(32) int32_t a[8];
- ALIGN(32) int32_t b[8];
- v8i_store(a, v0);
- v8i_store(b, v1);
- return v8i_set
- (MMIN(a[0],b[0]), MMIN(a[1],b[1]), MMIN(a[2],b[2]), MMIN(a[3],b[3]),
- MMIN(a[4],b[4]), MMIN(a[5],b[5]), MMIN(a[6],b[6]), MMIN(a[7],b[7]));
+ const v4i_T v0_abcd = v8i_abcd(v0);
+ const v4i_T v0_efgh = v8i_efgh(v0);
+ const v4i_T v1_abcd = v8i_abcd(v1);
+ const v4i_T v1_efgh = v8i_efgh(v1);
+ const v4i_T abcd = v4i_min(v0_abcd, v1_abcd);
+ const v4i_T efgh = v4i_min(v0_efgh, v1_efgh);
+ return v8i_set_v4i(abcd, efgh);
}
static FINLINE v8i_T
v8i_max(const v8i_T v0, const v8i_T v1)
{
- ALIGN(32) int32_t a[8];
- ALIGN(32) int32_t b[8];
- v8i_store(a, v0);
- v8i_store(b, v1);
- return v8i_set
- (MMAX(a[0],b[0]), MMAX(a[1],b[1]), MMAX(a[2],b[2]), MMAX(a[3],b[3]),
- MMAX(a[4],b[4]), MMAX(a[5],b[5]), MMAX(a[6],b[6]), MMAX(a[7],b[7]));
+ const v4i_T v0_abcd = v8i_abcd(v0);
+ const v4i_T v0_efgh = v8i_efgh(v0);
+ const v4i_T v1_abcd = v8i_abcd(v1);
+ const v4i_T v1_efgh = v8i_efgh(v1);
+ const v4i_T abcd = v4i_max(v0_abcd, v1_abcd);
+ const v4i_T efgh = v4i_max(v0_efgh, v1_efgh);
+ return v8i_set_v4i(abcd, efgh);
}
static FINLINE int32_t
v8i_reduce_min_i32(const v8i_T v)
{
- ALIGN(32) int32_t tmp[8];
- v8i_store(tmp, v);
- return MMIN
- (MMIN(MMIN(tmp[0], tmp[1]), MMIN(tmp[2], tmp[3])),
- MMIN(MMIN(tmp[4], tmp[5]), MMIN(tmp[6], tmp[7])));
+ const v4i_T tmp = v4i_min(v8i_abcd(v), v8i_efgh(v));
+ return v4i_x(v4i_reduce_min(tmp));
}
static FINLINE int32_t
v8i_reduce_max_i32(const v8i_T v)
{
- ALIGN(32) int32_t tmp[8];
- v8i_store(tmp, v);
- return MMAX
- (MMAX(MMAX(tmp[0], tmp[1]), MMAX(tmp[2], tmp[3])),
- MMAX(MMAX(tmp[4], tmp[5]), MMAX(tmp[6], tmp[7])));
+ const v4i_T tmp = v4i_max(v8i_abcd(v), v8i_efgh(v));
+ return v4i_x(v4i_reduce_max(tmp));
}
#endif /* RSIMD_AVXI_H */
diff --git a/src/sse/ssei.h b/src/sse/ssei.h
@@ -269,17 +269,13 @@ v4i_reduce_max(const v4i_T v)
static FINLINE int32_t
v4i_reduce_min_i32(const v4i_T v)
{
- ALIGN(16) int32_t a[4];
- v4i_store(a, v);
- return MMIN(MMIN(a[0], a[1]), MMIN(a[2], a[3]));
+ return v4i_x(v4i_reduce_min(v));
}
static FINLINE int32_t
v4i_reduce_max_i32(const v4i_T v)
{
- ALIGN(16) int32_t a[4];
- v4i_store(a, v);
- return MMAX(MMAX(a[0], a[1]), MMAX(a[2], a[3]));
+ return v4i_x(v4i_reduce_max(v));
}
#endif /* RSIMD_SSEI_H */
diff --git a/src/test_v8i.c b/src/test_v8i.c
@@ -74,6 +74,16 @@ main(int argc, char** argv)
CHK(v4i_z(v8i_efgh(i)) == 0);
CHK(v4i_w(v8i_efgh(i)) == 0);
+ i = v8i_set_v4i(v4i_set(-1,-2,3,4), v4i_set(5,6,-7,-8));
+ CHK(v4i_x(v8i_abcd(i)) ==-1);
+ CHK(v4i_y(v8i_abcd(i)) ==-2);
+ CHK(v4i_z(v8i_abcd(i)) == 3);
+ CHK(v4i_w(v8i_abcd(i)) == 4);
+ CHK(v4i_x(v8i_efgh(i)) == 5);
+ CHK(v4i_y(v8i_efgh(i)) == 6);
+ CHK(v4i_z(v8i_efgh(i)) ==-7);
+ CHK(v4i_w(v8i_efgh(i)) ==-8);
+
i = v8i_set
(0x00010203, 0x04050607, 0x08090A0B, 0x0C0D0E0F,
0x00102030, 0x40506070, (int32_t)0x8090A0B0, (int32_t)0xC0D0E0F0);