rsimd

Make SIMD instruction sets easier to use
git clone git://git.meso-star.fr/rsimd.git
Log | Files | Refs | README | LICENSE

commit 292240923e34d35b08b0e11ea6ca2b82542a46e8
parent ee22862cb041351d56c14327a801b09519e71674
Author: vaplv <vaplv@free.fr>
Date:   Fri, 17 Oct 2014 16:17:01 +0200

Add and test the AoS quaternion SIMD functions

Diffstat:
Mcmake/CMakeLists.txt | 3+++
Asrc/aosq.c | 78++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Asrc/aosq.h | 115+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Asrc/test_aosq.c | 145+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
4 files changed, 341 insertions(+), 0 deletions(-)

diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt @@ -42,6 +42,7 @@ set(VERSION ${VERSION_MAJOR}.${VERSION_MINOR}.${VERSION_PATCH}) set(RSIMD_FILES_INC aosf33.h aosf44.h + aosq.h rsimd.h sse/sse.h sse/ssef.h @@ -49,6 +50,7 @@ set(RSIMD_FILES_INC sse/sse_swz.h) set(RSIMD_FILES_SRC aosf44.c + aosq.c sse/ssef.c) rcmake_prepend_path(RSIMD_FILES_INC ${RSIMD_SOURCE_DIR}) rcmake_prepend_path(RSIMD_FILES_SRC ${RSIMD_SOURCE_DIR}) @@ -79,6 +81,7 @@ new_test(test_v4f) new_test(test_v4i) new_test(test_aosf33) new_test(test_aosf44) +new_test(test_aosq) ################################################################################ # Install directives diff --git a/src/aosq.c b/src/aosq.c @@ -0,0 +1,78 @@ +/* Copyright (C) 2014 Vincent Forest (vaplv@free.fr) + * + * The RSIMD library is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * The RSIMD library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with the RSIMD library. If not, see <http://www.gnu.org/licenses/>. */ + +#include "aosq.h" + +v4f_T +aosq_slerp(const v4f_T from, const v4f_T to, const v4f_T vvvv) +{ + v4f_T tmp_cos_omega, cos_omega, omega, rcp_sin_omega; + v4f_T one_sub_v; + v4f_T mask; + v4f_T tmp0, tmp1, tmp2; + v4f_T scale0, scale1; + float f; + + f = v4f_x(vvvv); + if(f == 0.f) + return from; + else if(f == 1.f) + return to; + + tmp_cos_omega = v4f_dot(from, to); + + mask = v4f_lt(tmp_cos_omega, v4f_zero()); + tmp0 = v4f_sel(to, v4f_minus(to), mask); + cos_omega = v4f_sel(tmp_cos_omega, v4f_minus(tmp_cos_omega), mask); + + omega = v4f_acos(cos_omega); + rcp_sin_omega = v4f_rcp(v4f_sin(omega)); + one_sub_v = v4f_sub(v4f_set1(1.f), vvvv); + tmp1 = v4f_mul(v4f_sin(v4f_mul(one_sub_v, omega)), rcp_sin_omega); + tmp2 = v4f_mul(v4f_sin(v4f_mul(omega, vvvv)), rcp_sin_omega); + + mask = v4f_gt(v4f_sub(v4f_set1(1.f), cos_omega), v4f_set1(1.e-6f)); + scale0 = v4f_sel(one_sub_v, tmp1, mask); + scale1 = v4f_sel(vvvv, tmp2, mask); + + return v4f_madd(from, scale0, v4f_mul(tmp0, scale1)); +} + +void +aosq_to_aosf33(const v4f_T q, v4f_T out[3]) +{ + const v4f_T i2j2k2_ = v4f_add(q, q); + + const v4f_T r0 = /* { jj2 + kk2, ij2 + ak2, ik2 - aj2 } */ + v4f_madd(v4f_mul(v4f_zzyy(i2j2k2_), v4f_zwwz(q)), + v4f_set(1.f, 1.f, -1.f, 0.f), + v4f_mul(v4f_yyzz(i2j2k2_), v4f_yxxy(q))); + const v4f_T r1 = /* { ij2 - ak2, ii2 + kk2, jk2 + ai2 } */ + v4f_madd(v4f_mul(v4f_zzxx(i2j2k2_), v4f_wzwz(q)), + v4f_set(-1.f, 1.f, 1.f, 0.f), + v4f_mul(v4f_yxzw(i2j2k2_), v4f_xxyy(q))); + const v4f_T r2 = /* { ik2 + aj2, jk2 - ai2, ii2 + jj2 } */ + v4f_madd(v4f_mul(v4f_yxyx(i2j2k2_), v4f_wwyy(q)), + v4f_set(1.f, -1.f, 1.f, 0.f), + v4f_mul(v4f_zzxx(i2j2k2_), v4f_xyxy(q))); + + out[0] = /* { 1 - (jj2 + kk2), ij2 + ak2, ik2 - aj2 } */ + v4f_madd(r0, v4f_set(-1.f, 1.f, 1.f, 0.f), v4f_set(1.f, 0.f, 0.f, 0.f)); + out[1] = /* { ij2 - ak2, 1 - (ii2 + kk2), jk2 + ai2 } */ + v4f_madd(r1, v4f_set(1.f, -1.f, 1.f, 0.f), v4f_set(0.f, 1.f, 0.f, 0.f)); + out[2] = /* { ik2 + aj2, jk2 - ai2, 1 - (ii2 + jj2) } */ + v4f_madd(r2, v4f_set(1.f, 1.f, -1.f, 0.f), v4f_set(0.f, 0.f, 1.f, 0.f)); +} + diff --git a/src/aosq.h b/src/aosq.h @@ -0,0 +1,115 @@ +/* Copyright (C) 2014 Vincent Forest (vaplv@free.fr) + * + * The RSIMD library is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * The RSIMD library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with the RSIMD library. If not, see <http://www.gnu.org/licenses/>. */ + +#ifndef AOSQ_H +#define AOSQ_H + +#include "rsimd.h" + +/* + * Functions on AoS quaternion encoded into a v4f_T as { i, j, k, a } + */ + +/******************************************************************************* + * Set operations + ******************************************************************************/ +static FINLINE v4f_T +aosq_identity(void) +{ + return v4f_set(0.f, 0.f, 0.f, 1.f); +} + +static FINLINE v4f_T +aosq_set_axis_angle(const v4f_T xyz_, const v4f_T aaaa) +{ + const v4f_T half_angle = v4f_mul(aaaa, v4f_set1(0.5f)); + v4f_T s, c; + v4f_T axis1; + v4f_T sssc; + + v4f_sincos(half_angle, &s, &c); + + axis1 = v4f_xyzd(xyz_, v4f_set1(1.f)); + sssc = v4f_xyzd(s, c); + + /* { x*sin(a/2), y*sin(a/2), z*sin(a/2), cos(a/2) } */ + return v4f_mul(axis1, sssc); +} + +/******************************************************************************* + * Comparison operations + ******************************************************************************/ +static FINLINE v4f_T +aosq_eq(const v4f_T q0, const v4f_T q1) +{ + const v4f_T r0 = v4f_eq(q0, q1); + const v4f_T r1 = v4f_and(v4f_xxyy(r0), v4f_zzww(r0)); + return v4f_and(v4f_xxyy(r1), v4f_zzww(r1)); +} + +static FINLINE v4f_T +aosq_eq_eps(const v4f_T q0, const v4f_T q1, const v4f_T eps) +{ + const v4f_T r0 = v4f_eq_eps(q0, q1, eps); + const v4f_T r1 = v4f_and(v4f_xxyy(r0), v4f_zzww(r0)); + return v4f_and(v4f_xxyy(r1), v4f_zzww(r1)); +} + +/******************************************************************************* + * Arithmetic operations + ******************************************************************************/ +#define SBIT__ (int32_t)0x80000000 +static FINLINE v4f_T +aosq_mul(const v4f_T q0, const v4f_T q1) +{ + const v4f_T a = v4f_mul(v4f_xor(v4f_mask(0, 0, SBIT__, 0), q0), v4f_wzyx(q1)); + const v4f_T b = v4f_mul(v4f_xor(v4f_mask(SBIT__, 0, 0, 0), q0), v4f_zwxy(q1)); + const v4f_T c = v4f_mul(v4f_xor(v4f_mask(0, SBIT__, 0, 0), q0), v4f_yxwz(q1)); + const v4f_T d = v4f_mul(v4f_xor(v4f_mask(SBIT__, SBIT__, SBIT__, 0), q0), q1); + const v4f_T ijij = v4f_xayb(v4f_sum(a), v4f_sum(b)); + const v4f_T kaka = v4f_xayb(v4f_sum(c), v4f_sum(d)); + return v4f_xyab(ijij, kaka); +} + +static FINLINE v4f_T /* { -ix, -jy, -jz, a } */ +aosq_conj(const v4f_T q) +{ + return v4f_xor(q, v4f_mask(SBIT__, SBIT__, SBIT__, 0)); +} +#undef SBIT__ + +static FINLINE v4f_T +aosq_calca(const v4f_T ijk_) +{ + const v4f_T ijk_square_len = v4f_dot3(ijk_, ijk_); + return v4f_sqrt(v4f_abs(v4f_sub(v4f_set1(1.f), ijk_square_len))); +} + +static FINLINE v4f_T +aosq_nlerp(const v4f_T from, const v4f_T to, const v4f_T aaaa) +{ + return v4f_normalize(v4f_lerp(from, to, aaaa)); +} + +RSIMD_API v4f_T aosq_slerp(const v4f_T from, const v4f_T to, const v4f_T aaaa); + +/******************************************************************************* + * Conversion + ******************************************************************************/ +RSIMD_API void aosq_to_aosf33(const v4f_T q, v4f_T out[3]); + +#endif /* AOSQ_H */ + + diff --git a/src/test_aosq.c b/src/test_aosq.c @@ -0,0 +1,145 @@ +/* Copyright (C) 2014 Vincent Forest (vaplv@free.fr) + * + * The RSIMD library is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * The RSIMD library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with the RSIMD library. If not, see <http://www.gnu.org/licenses/>. */ + +#include "aosq.h" +#include "aosf33.h" +#include <rsys/float33.h> + +#define AOSF33_EQ_EPS(M, A, B, C, D, E, F, G, H, I, Eps) \ + { \ + float a[9], b[9]; \ + b[0] = (A); b[1] = (B); b[2] = (C); \ + b[3] = (D); b[4] = (E); b[5] = (F); \ + b[6] = (G); b[7] = (H); b[8] = (I); \ + CHECK(f33_eq_eps(aosf33_store(a, (M)), b, Eps), 1); \ + } (void)0 + +int +main(int argc, char** argv) +{ + union { int32_t i; float f; } cast; + v4f_T q0, q1, q2, t; + v4f_T m[3]; + (void)argc, (void)argv; + + q0 = aosq_identity(); + CHECK(v4f_x(q0), 0.f); + CHECK(v4f_y(q0), 0.f); + CHECK(v4f_z(q0), 0.f); + CHECK(v4f_w(q0), 1.f); + + q0 = aosq_set_axis_angle(v4f_set(2.f, 5.f, 1.f, 0.f), v4f_set1((float)PI*0.3f)); + CHECK(eq_eps(v4f_x(q0), 0.907981f, 1.e-6f), 1); + CHECK(eq_eps(v4f_y(q0), 2.269953f, 1.e-6f), 1); + CHECK(eq_eps(v4f_z(q0), 0.453991f, 1.e-6f), 1); + CHECK(eq_eps(v4f_w(q0), 0.891007f, 1.e-6f), 1); + + q0 = v4f_set(1.f, 2.f, 3.f, -3.f); + q1 = v4f_set(1.f, 2.f, 3.f, -3.f); + t = aosq_eq(q0, q1); + cast.f = v4f_x(t); CHECK(cast.i, (int32_t)0xFFFFFFFF); + cast.f = v4f_y(t); CHECK(cast.i, (int32_t)0xFFFFFFFF); + cast.f = v4f_z(t); CHECK(cast.i, (int32_t)0xFFFFFFFF); + cast.f = v4f_w(t); CHECK(cast.i, (int32_t)0xFFFFFFFF); + + q1 = v4f_set(0.f, 2.f, 3.f, -3.f); + t = aosq_eq(q0, q1); + cast.f = v4f_x(t); CHECK(cast.i, (int32_t)0x00000000); + cast.f = v4f_y(t); CHECK(cast.i, (int32_t)0x00000000); + cast.f = v4f_z(t); CHECK(cast.i, (int32_t)0x00000000); + cast.f = v4f_w(t); CHECK(cast.i, (int32_t)0x00000000); + + q1 = v4f_set(1.f, 0.f, 3.f, -3.f); + t = aosq_eq(q0, q1); + cast.f = v4f_x(t); CHECK(cast.i, (int32_t)0x00000000); + cast.f = v4f_y(t); CHECK(cast.i, (int32_t)0x00000000); + cast.f = v4f_z(t); CHECK(cast.i, (int32_t)0x00000000); + cast.f = v4f_w(t); CHECK(cast.i, (int32_t)0x00000000); + + q1 = v4f_set(1.f, 2.f, 0.f, -3.f); + t = aosq_eq(q0, q1); + cast.f = v4f_x(t); CHECK(cast.i, (int32_t)0x00000000); + cast.f = v4f_y(t); CHECK(cast.i, (int32_t)0x00000000); + cast.f = v4f_z(t); CHECK(cast.i, (int32_t)0x00000000); + cast.f = v4f_w(t); CHECK(cast.i, (int32_t)0x00000000); + + q1 = v4f_set(1.f, 2.f, 3.f, 0.f); + t = aosq_eq(q0, q1); + cast.f = v4f_x(t); CHECK(cast.i, (int32_t)0x00000000); + cast.f = v4f_y(t); CHECK(cast.i, (int32_t)0x00000000); + cast.f = v4f_z(t); CHECK(cast.i, (int32_t)0x00000000); + cast.f = v4f_w(t); CHECK(cast.i, (int32_t)0x00000000); + + q1 = v4f_set(1.01f, 2.f, 3.02f, -3.f); + t = aosq_eq_eps(q0, q1, v4f_set1(0.01f)); + cast.f = v4f_x(t); CHECK(cast.i, (int32_t)0x00000000); + cast.f = v4f_y(t); CHECK(cast.i, (int32_t)0x00000000); + cast.f = v4f_z(t); CHECK(cast.i, (int32_t)0x00000000); + cast.f = v4f_w(t); CHECK(cast.i, (int32_t)0x00000000); + t = aosq_eq_eps(q0, q1, v4f_set1(0.02f)); + cast.f = v4f_x(t); CHECK(cast.i, (int32_t)0xFFFFFFFF); + cast.f = v4f_y(t); CHECK(cast.i, (int32_t)0xFFFFFFFF); + cast.f = v4f_z(t); CHECK(cast.i, (int32_t)0xFFFFFFFF); + cast.f = v4f_w(t); CHECK(cast.i, (int32_t)0xFFFFFFFF); + + q0 = v4f_set(1.f, 2.f, 3.f, 4.f); + q1 = v4f_set(5.f, 6.f, 7.f, 8.f); + q2 = aosq_mul(q0, q1); + CHECK(v4f_x(q2), 24.f); + CHECK(v4f_y(q2), 48.f); + CHECK(v4f_z(q2), 48.f); + CHECK(v4f_w(q2), -6.f); + + q2 = aosq_conj(q0); + CHECK(v4f_x(q2), -1.f); + CHECK(v4f_y(q2), -2.f); + CHECK(v4f_z(q2), -3.f); + CHECK(v4f_w(q2), 4.f); + + q0 = v4f_normalize(v4f_set(1.f, 2.f, 5.f, 0.5f)); + q1 = v4f_xyzz(q0); + q1 = v4f_xyzd(q1, aosq_calca(q1)); + CHECK(v4f_x(q0), v4f_x(q1)); + CHECK(v4f_y(q0), v4f_y(q1)); + CHECK(v4f_z(q0), v4f_z(q1)); + CHECK(eq_eps(v4f_w(q0), v4f_w(q1), 1.e-6f), 1); + + q0 = v4f_set(1.f, 2.f, 3.f, 5.f); + q1 = v4f_set(2.f, 6.f, 7.f, 6.f); + q2 = aosq_slerp(q0, q1, v4f_set1(0.3f)); + CHECK(eq_eps(v4f_x(q2), 1.3f, 1.e-6f), 1); + CHECK(eq_eps(v4f_y(q2), 3.2f, 1.e-6f), 1); + CHECK(eq_eps(v4f_z(q2), 4.2f, 1.e-6f), 1); + CHECK(eq_eps(v4f_w(q2), 5.3f, 1.e-6f), 1); + + q0 = v4f_set(2.f, 5.f, 17.f, 9.f); + aosq_to_aosf33(q0, m); + AOSF33_EQ_EPS(m, + -627.f, 326.f, -22.f, + -286.f, -585.f, 206.f, + 158.f, 134.f, -57.f, + 1.e-6f); + + q0 = v4f_normalize(q0); + aosq_to_aosf33(q0, m); + AOSF33_EQ_EPS(m, + -0.573935f, 0.817043f, -0.055138f, + -0.716792f, -0.468672f, 0.516291f, + 0.395990f, 0.335840f, 0.854637f, + 1.e-6f); + + return 0; +} +