rsimd

Make SIMD instruction sets easier to use
git clone git://git.meso-star.fr/rsimd.git
Log | Files | Refs | README | LICENSE

commit ee22862cb041351d56c14327a801b09519e71674
parent 0add4c02c451f18ec21f41fa90ce7a360377eb28
Author: vaplv <vaplv@free.fr>
Date:   Fri, 17 Oct 2014 15:44:33 +0200

Add and test the AoS float44 SIMD functions

Diffstat:
Mcmake/CMakeLists.txt | 6+++++-
Asrc/aosf44.c | 112+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Asrc/aosf44.h | 400+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Asrc/test_aosf44.c | 420+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
4 files changed, 937 insertions(+), 1 deletion(-)

diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt @@ -41,12 +41,15 @@ set(VERSION ${VERSION_MAJOR}.${VERSION_MINOR}.${VERSION_PATCH}) set(RSIMD_FILES_INC aosf33.h + aosf44.h rsimd.h sse/sse.h sse/ssef.h sse/ssei.h sse/sse_swz.h) -set(RSIMD_FILES_SRC sse/ssef.c) +set(RSIMD_FILES_SRC + aosf44.c + sse/ssef.c) rcmake_prepend_path(RSIMD_FILES_INC ${RSIMD_SOURCE_DIR}) rcmake_prepend_path(RSIMD_FILES_SRC ${RSIMD_SOURCE_DIR}) @@ -75,6 +78,7 @@ endmacro(new_test) new_test(test_v4f) new_test(test_v4i) new_test(test_aosf33) +new_test(test_aosf44) ################################################################################ # Install directives diff --git a/src/aosf44.c b/src/aosf44.c @@ -0,0 +1,112 @@ +/* Copyright (C) 2014 Vincent Forest (vaplv@free.fr) + * + * The RSIMD library is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * The RSIMD library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with the RSIMD library. If not, see <http://www.gnu.org/licenses/>. 
*/ + +#include "aosf44.h" + +v4f_T +aosf44_inverse(v4f_T res[4], const v4f_T m[4]) +{ + v4f_T c0, c1, c2, c3, r3; + v4f_T f33_023_c0, f33_023_c1, f33_023_c2, f33_023_c3; + v4f_T f33_123_c0, f33_123_c1, f33_123_c2, f33_123_c3; + v4f_T f33_013_c0, f33_013_c1, f33_013_c2, f33_013_c3; + v4f_T f33_012_012[3], f33_012_013[3], f33_012_023[3], f33_012_123[3]; + v4f_T f33_023_012[3], f33_023_013[3], f33_023_023[3], f33_023_123[3]; + v4f_T f33_123_012[3], f33_123_013[3], f33_123_023[3], f33_123_123[3]; + v4f_T f33_013_012[3], f33_013_013[3], f33_013_023[3], f33_013_123[3]; + v4f_T det_012, det_023, det_123, det_013; + v4f_T cofacts, det, idet, mpmp_idet, pmpm_idet; + ASSERT(res && m); + + /* Retrieve the columns 0, 1, 2 and 3 and the row 3 of the "m" matrix. */ + c0 = m[0]; + c1 = m[1]; + c2 = m[2]; + c3 = m[3]; + r3 = aosf44_row3(m); + + /* Define the 3x3 sub-matrix and compute their determinant */ + aosf33_set(f33_012_012, c0, c1, c2); + aosf33_set(f33_012_013, c0, c1, c3); + aosf33_set(f33_012_023, c0, c2, c3); + aosf33_set(f33_012_123, c1, c2, c3); + det_012 = v4f_048C + (aosf33_det(f33_012_123), + aosf33_det(f33_012_023), + aosf33_det(f33_012_013), + aosf33_det(f33_012_012)); + + f33_023_c0 = v4f_xzww(c0); + f33_023_c1 = v4f_xzww(c1); + f33_023_c2 = v4f_xzww(c2); + f33_023_c3 = v4f_xzww(c3); + aosf33_set(f33_023_012, f33_023_c0, f33_023_c1, f33_023_c2); + aosf33_set(f33_023_013, f33_023_c0, f33_023_c1, f33_023_c3); + aosf33_set(f33_023_023, f33_023_c0, f33_023_c2, f33_023_c3); + aosf33_set(f33_023_123, f33_023_c1, f33_023_c2, f33_023_c3); + det_023 = v4f_048C + (aosf33_det(f33_023_123), + aosf33_det(f33_023_023), + aosf33_det(f33_023_013), + aosf33_det(f33_023_012)); + + f33_123_c0 = v4f_yzww(c0); + f33_123_c1 = v4f_yzww(c1); + f33_123_c2 = v4f_yzww(c2); + f33_123_c3 = v4f_yzww(c3); + aosf33_set(f33_123_012, f33_123_c0, f33_123_c1, f33_123_c2); + aosf33_set(f33_123_013, f33_123_c0, f33_123_c1, f33_123_c3); + aosf33_set(f33_123_023, f33_123_c0, f33_123_c2, f33_123_c3); + 
aosf33_set(f33_123_123, f33_123_c1, f33_123_c2, f33_123_c3); + det_123 = v4f_048C + (aosf33_det(f33_123_123), + aosf33_det(f33_123_023), + aosf33_det(f33_123_013), + aosf33_det(f33_123_012)); + + f33_013_c0 = v4f_xyww(c0); + f33_013_c1 = v4f_xyww(c1); + f33_013_c2 = v4f_xyww(c2); + f33_013_c3 = v4f_xyww(c3); + aosf33_set(f33_013_012, f33_013_c0, f33_013_c1, f33_013_c2); + aosf33_set(f33_013_013, f33_013_c0, f33_013_c1, f33_013_c3); + aosf33_set(f33_013_023, f33_013_c0, f33_013_c2, f33_013_c3); + aosf33_set(f33_013_123, f33_013_c1, f33_013_c2, f33_013_c3); + det_013 = v4f_048C + (aosf33_det(f33_013_123), + aosf33_det(f33_013_023), + aosf33_det(f33_013_013), + aosf33_det(f33_013_012)); + + /* Compute the cofactors of the column 3 */ + cofacts = v4f_mul(det_012, v4f_set(-1.f, 1.f, -1.f, 1.f)); + + /* Compute the determinant of the "m" matrix */ + det = v4f_dot(cofacts, r3); + + /* Invert the matrix */ + idet = v4f_rcp(det); + mpmp_idet = v4f_xor + (idet, v4f_mask((int32_t)0x80000000, 0, (int32_t)0x80000000, 0)); + pmpm_idet = v4f_xor + (idet, v4f_mask(0, (int32_t)0x80000000, 0, (int32_t)0x80000000)); + res[0] = v4f_mul(det_123, pmpm_idet); + res[1] = v4f_mul(det_023, mpmp_idet); + res[2] = v4f_mul(det_013, pmpm_idet); + res[3] = v4f_mul(det_012, mpmp_idet); + + return det; +} + diff --git a/src/aosf44.h b/src/aosf44.h @@ -0,0 +1,400 @@ +/* Copyright (C) 2014 Vincent Forest (vaplv@free.fr) + * + * The RSIMD library is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * The RSIMD library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public License + * along with the RSIMD library. If not, see <http://www.gnu.org/licenses/>. */ + +#ifndef AOSF44_H +#define AOSF44_H + +#include "aosf33.h" +#include "rsimd.h" + +/* + * Functions on column major AoS float44 matrices. A 4x4 matrix is a set of 4 + * 4-wide SIMD float vectors, each representing a matrix column. + */ + +/******************************************************************************* + * Set operations + ******************************************************************************/ +static FINLINE float* /* Write the 4 columns of m to dst[0..15]; return dst */ +aosf44_store(float dst[16], const v4f_T m[4]) +{ + ASSERT(m && dst); + + if(IS_ALIGNED(dst, 16)) { + v4f_store(dst + 0, m[0]); + v4f_store(dst + 4, m[1]); + v4f_store(dst + 8, m[2]); + v4f_store(dst + 12, m[3]); + } else { /* Unaligned dst: go through a 16-byte aligned temporary */ + ALIGN(16) float tmp[4]; + int i; + FOR_EACH(i, 0, 4) { + v4f_store(tmp, m[i]); + dst[i*4 + 0] = tmp[0]; + dst[i*4 + 1] = tmp[1]; + dst[i*4 + 2] = tmp[2]; + dst[i*4 + 3] = tmp[3]; + } + } + return dst; +} + +static FINLINE v4f_T* /* Read the 4 columns of m from src[0..15]; return m */ +aosf44_load(v4f_T m[4], const float src[16]) +{ + ASSERT(m && src); + if(IS_ALIGNED(src, 16)) { + m[0] = v4f_load(src + 0); + m[1] = v4f_load(src + 4); + m[2] = v4f_load(src + 8); + m[3] = v4f_load(src + 12); + } else { /* Unaligned src: column i is made of src[i*4 + 0 .. i*4 + 3] */ + int i; + FOR_EACH(i, 0, 4) + m[i] = v4f_set(src[i*4+0], src[i*4+1], src[i*4+2], src[i*4+3]); + } + return m; +} + +static FINLINE v4f_T* /* Set the columns of m to c0, c1, c2 and c3; return m */ +aosf44_set + (v4f_T m[4], const v4f_T c0, const v4f_T c1, const v4f_T c2, const v4f_T c3) +{ + ASSERT(m); + m[0] = c0, m[1] = c1, m[2] = c2, m[3] = c3; + return m; +} + +static FINLINE v4f_T* /* Set m to the identity matrix; return m */ +aosf44_identity(v4f_T m[4]) +{ + ASSERT(m); + m[0] = v4f_set(1.f, 0.f, 0.f, 0.f); + m[1] = v4f_set(0.f, 1.f, 0.f, 0.f); + m[2] = v4f_set(0.f, 0.f, 1.f, 0.f); + m[3] = v4f_set(0.f, 0.f, 0.f, 1.f); + return m; +} + +static FINLINE v4f_T* /* Set the 4 columns of m to v4f_zero(); return m */ +aosf44_zero(v4f_T m[4]) +{ + ASSERT(m); + m[0] = v4f_zero(); + m[1] = v4f_zero(); + m[2] = v4f_zero(); + m[3] = v4f_zero(); + return m; +} + 
+static FINLINE v4f_T* +aosf44_set_row0(v4f_T m[4], const v4f_T v) +{ + const v4f_T xyzw = v; + const v4f_T yyww = v4f_yyww(v); + const v4f_T zwzw = v4f_zwzw(v); + const v4f_T wwww = v4f_yyww(zwzw); + ASSERT(m); + m[0] = v4f_ayzw(m[0], xyzw); + m[1] = v4f_ayzw(m[1], yyww); + m[2] = v4f_ayzw(m[2], zwzw); + m[3] = v4f_ayzw(m[3], wwww); + return m; +} + +static FINLINE v4f_T* +aosf44_set_row1(v4f_T m[4], const v4f_T v) +{ + ASSERT(m); + m[0] = v4f_xbzw(m[0], v4f_xxyy(v)); + m[1] = v4f_xbzw(m[1], v); + m[2] = v4f_xbzw(m[2], v4f_zzww(v)); + m[3] = v4f_xbzw(m[3], v4f_zwzw(v)); + return m; +} + +static FINLINE v4f_T* +aosf44_set_row2(v4f_T m[4], const v4f_T v) +{ + ASSERT(m); + m[0] = v4f_xycw(m[0], v4f_xyxy(v)); + m[1] = v4f_xycw(m[1], v4f_xxyy(v)); + m[2] = v4f_xycw(m[2], v); + m[3] = v4f_xycw(m[3], v4f_zzww(v)); + return m; +} + +static FINLINE v4f_T* +aosf44_set_row3(v4f_T m[4], const v4f_T v) +{ + ASSERT(m); + m[0] = v4f_xyzd(m[0], v4f_xxxx(v)); + m[1] = v4f_xyzd(m[1], v4f_xxyy(v)); + m[2] = v4f_xyzd(m[2], v4f_xxzz(v)); + m[3] = v4f_xyzd(m[3], v); + return m; +} + +static FINLINE v4f_T* +aosf44_set_row(v4f_T m[4], const v4f_T v, const int id) +{ + const v4f_T mask = v4f_mask(-(id==0), -(id==1), -(id==2), -(id==3)); + ASSERT(m && id >= 0 && id <= 3); + m[0] = v4f_sel(m[0], v4f_xxxx(v), mask); + m[1] = v4f_sel(m[1], v4f_yyyy(v), mask); + m[2] = v4f_sel(m[2], v4f_zzzz(v), mask); + m[3] = v4f_sel(m[3], v4f_wwww(v), mask); + return m; +} + +static FINLINE v4f_T* +aosf44_set_col(v4f_T m[4], const v4f_T v, const int id) +{ + ASSERT(m && id >= 0 && id <= 3); + m[id] = v; + return m; +} + +/******************************************************************************* + * Get operations + ******************************************************************************/ +static FINLINE v4f_T +aosf44_row0(const v4f_T m[4]) +{ + ASSERT(m); + return v4f_048C + (v4f_xxxx(m[0]), v4f_xxxx(m[1]), v4f_xxxx(m[2]), v4f_xxxx(m[3])); +} + +static FINLINE v4f_T +aosf44_row1(const v4f_T 
m[4]) +{ + ASSERT(m); + return v4f_048C + (v4f_yyyy(m[0]), v4f_yyyy(m[1]), v4f_yyyy(m[2]), v4f_yyyy(m[3])); +} + +static FINLINE v4f_T +aosf44_row2(const v4f_T m[4]) +{ + ASSERT(m); + return v4f_048C + (v4f_zzzz(m[0]), v4f_zzzz(m[1]), v4f_zzzz(m[2]), v4f_zzzz(m[3])); +} + +static FINLINE v4f_T +aosf44_row3(const v4f_T m[4]) +{ + ASSERT(m); + return v4f_048C + (v4f_wwww(m[0]), v4f_wwww(m[1]), v4f_wwww(m[2]), v4f_wwww(m[3])); +} + +static FINLINE v4f_T +aosf44_row(const v4f_T m[4], const int id) +{ + ASSERT(m && id >= 0 && id <= 3); + if(id == 0) { + return aosf44_row0(m); + } else if(id == 1) { + return aosf44_row1(m); + } else if(id == 2) { + return aosf44_row2(m); + } else { + return aosf44_row3(m); + } +} + +static FINLINE v4f_T +aosf44_col(const v4f_T m[4], const int id) +{ + ASSERT(m && id >= 0 && id <= 3); + return m[id]; +} + +/******************************************************************************* + * Arithmetic operations + ******************************************************************************/ +static FINLINE v4f_T* +aosf44_add(v4f_T res[4], const v4f_T m0[4], const v4f_T m1[4]) +{ + ASSERT(res && m0 && m1); + res[0] = v4f_add(m0[0], m1[0]); + res[1] = v4f_add(m0[1], m1[1]); + res[2] = v4f_add(m0[2], m1[2]); + res[3] = v4f_add(m0[3], m1[3]); + return res; +} + +static FINLINE v4f_T* +aosf44_sub(v4f_T res[4], const v4f_T m0[4], const v4f_T m1[4]) +{ + ASSERT(res && m0 && m1); + res[0] = v4f_sub(m0[0], m1[0]); + res[1] = v4f_sub(m0[1], m1[1]); + res[2] = v4f_sub(m0[2], m1[2]); + res[3] = v4f_sub(m0[3], m1[3]); + return res; +} + +static FINLINE v4f_T* +aosf44_minus(v4f_T res[4], const v4f_T m[4]) +{ + ASSERT(res && m); + res[0] = v4f_minus(m[0]); + res[1] = v4f_minus(m[1]); + res[2] = v4f_minus(m[2]); + res[3] = v4f_minus(m[3]); + return res; +} + +static FINLINE v4f_T* +aosf44_abs(v4f_T res[4], const v4f_T m[4]) +{ + ASSERT(res && m); + res[0] = v4f_abs(m[0]); + res[1] = v4f_abs(m[1]); + res[2] = v4f_abs(m[2]); + res[3] = v4f_abs(m[3]); + 
return res; +} + +static FINLINE v4f_T* +aosf44_mul(v4f_T res[4], const v4f_T m[4], const v4f_T v) +{ + ASSERT(res && m); + res[0] = v4f_mul(m[0], v); + res[1] = v4f_mul(m[1], v); + res[2] = v4f_mul(m[2], v); + res[3] = v4f_mul(m[3], v); + return res; +} + +static FINLINE v4f_T +aosf44_mulf4(const v4f_T m[4], const v4f_T v) +{ + v4f_T r0, r1, r2; + ASSERT(m); + r0 = v4f_mul(m[0], v4f_xxxx(v)); + r1 = v4f_madd(m[1], v4f_yyyy(v), r0); + r2 = v4f_madd(m[2], v4f_zzzz(v), r1); + return v4f_madd(m[3], v4f_wwww(v), r2); +} + +static FINLINE v4f_T +aosf4_mulf44(v4f_T v, const v4f_T m[4]) +{ + v4f_T xxxx, yyyy, zzzz, wwww, xyxy, zwzw; + ASSERT(m); + xxxx = v4f_dot(v, m[0]); + yyyy = v4f_dot(v, m[1]); + zzzz = v4f_dot(v, m[2]); + wwww = v4f_dot(v, m[3]); + xyxy = v4f_xayb(xxxx, yyyy); + zwzw = v4f_xayb(zzzz, wwww); + return v4f_xyab(xyxy, zwzw); +} + +static FINLINE v4f_T* +aosf44_mulf44 + (v4f_T res[4], const v4f_T m0[4], const v4f_T m1[4]) +{ + v4f_T c0, c1, c2, c3; + ASSERT(res && m0 && m1); + c0 = aosf44_mulf4(m0, m1[0]); + c1 = aosf44_mulf4(m0, m1[1]); + c2 = aosf44_mulf4(m0, m1[2]); + c3 = aosf44_mulf4(m0, m1[3]); + res[0] = c0; + res[1] = c1; + res[2] = c2; + res[3] = c3; + return res; +} + +static FINLINE v4f_T* +aosf44_transpose(v4f_T res[4], const v4f_T m[4]) +{ + v4f_T in_c0, in_c1, in_c2, in_c3; + v4f_T x0x2y0y2, x1x3y1y3, z0z2w0w2, z1z3w1w3; + ASSERT(res && m); + in_c0 = m[0]; + in_c1 = m[1]; + in_c2 = m[2]; + in_c3 = m[3]; + x0x2y0y2 = v4f_xayb(in_c0, in_c2); + x1x3y1y3 = v4f_xayb(in_c1, in_c3); + z0z2w0w2 = v4f_zcwd(in_c0, in_c2); + z1z3w1w3 = v4f_zcwd(in_c1, in_c3); + res[0] = v4f_xayb(x0x2y0y2, x1x3y1y3); + res[1] = v4f_zcwd(x0x2y0y2, x1x3y1y3); + res[2] = v4f_xayb(z0z2w0w2, z1z3w1w3); + res[3] = v4f_zcwd(z0z2w0w2, z1z3w1w3); + return res; +} + +static FINLINE v4f_T +aosf44_det(const v4f_T m[4]) +{ + v4f_T xxxx, yyyy, zzzz, wwww, xyxy, zwzw, xyzw; + v4f_T f33_012_012[3], f33_012_013[3], f33_012_023[3], f33_012_123[3]; + ASSERT(m); + aosf33_set(f33_012_012, 
m[0], m[1], m[2]); + aosf33_set(f33_012_013, m[0], m[1], m[3]); + aosf33_set(f33_012_023, m[0], m[2], m[3]); + aosf33_set(f33_012_123, m[1], m[2], m[3]); + xxxx = v4f_minus(aosf33_det(f33_012_123)); + yyyy = aosf33_det(f33_012_023); + zzzz = v4f_minus(aosf33_det(f33_012_013)); + wwww = aosf33_det(f33_012_012); + xyxy = v4f_xayb(xxxx, yyyy); + zwzw = v4f_xayb(zzzz, wwww); + xyzw = v4f_xyab(xyxy, zwzw); + return v4f_dot(xyzw, aosf44_row3(m)); +} + +RSIMD_API v4f_T /* Return the determinant */ +aosf44_inverse(v4f_T out[4], const v4f_T in[4]); + +static FINLINE v4f_T /* Return the determinant */ +aosf44_invtrans(v4f_T out[4], const v4f_T a[4]) +{ + v4f_T det; + ASSERT(out && a); + det = aosf44_inverse(out, a); + aosf44_transpose(out, out); + return det; +} + +static FINLINE v4f_T +aosf44_eq(const v4f_T a[4], const v4f_T b[4]) +{ + ASSERT(a && b); + if(a == b) { + return v4f_true(); + } else { + const v4f_T eq_c0 = v4f_eq(a[0], b[0]); + const v4f_T eq_c1 = v4f_eq(a[1], b[1]); + const v4f_T eq_c2 = v4f_eq(a[2], b[2]); + const v4f_T eq_c3 = v4f_eq(a[3], b[3]); + const v4f_T eq = v4f_and(v4f_and(eq_c0, eq_c1), v4f_and(eq_c2, eq_c3)); + const v4f_T tmp = v4f_and(v4f_xzxz(eq), v4f_ywyw(eq)); + const v4f_T ret = v4f_and(tmp, v4f_yxwz(tmp)); + return ret; + } +} + +#endif /* AOSF44_H */ + diff --git a/src/test_aosf44.c b/src/test_aosf44.c @@ -0,0 +1,420 @@ +/* Copyright (C) 2014 Vincent Forest (vaplv@free.fr) + * + * The RSIMD library is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * The RSIMD library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public License + * along with the RSIMD library. If not, see <http://www.gnu.org/licenses/>. */ + +#include "aosf44.h" +#include <rsys/float44.h> + +#define AOSF44_EQ_EPS(Mat, A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Eps)\ + { \ + float a[16], b[16]; \ + b[0] = (A); b[1] = (B); b[2] = (C); b[3] = (D); \ + b[4] = (E); b[5] = (F); b[6] = (G); b[7] = (H); \ + b[8] = (I); b[9] = (J); b[10]= (K); b[11]= (L); \ + b[12]= (M); b[13]= (N); b[14]= (O); b[15]= (P); \ + CHECK(f44_eq_eps(aosf44_store(a, (Mat)), b, Eps), 1); \ + } (void)0 +#define AOSF44_EQ(Mat, A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P) \ + AOSF44_EQ_EPS(Mat, A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, 0.f) + +int +main(int argc, char** argv) +{ + v4f_T m[4], n[4], o[4], v; + ALIGN(16) float tmp[16]; + (void)argc, (void)argv; + + CHECK(aosf44_set(m, + v4f_set(0.f, 1.f, 2.f, 3.f), + v4f_set(4.f, 5.f, 6.f, 7.f), + v4f_set(8.f, 9.f, 10.f, 11.f), + v4f_set(12.f, 13.f, 14.f, 15.f)), m); + AOSF44_EQ(m, + 0.f, 1.f, 2.f, 3.f, + 4.f, 5.f, 6.f, 7.f, + 8.f, 9.f, 10.f, 11.f, + 12.f, 13.f, 14.f, 15.f); + + CHECK(aosf44_store(tmp, m), tmp); + CHECK(tmp[0], 0.f); + CHECK(tmp[1], 1.f); + CHECK(tmp[2], 2.f); + CHECK(tmp[3], 3.f); + CHECK(tmp[4], 4.f); + CHECK(tmp[5], 5.f); + CHECK(tmp[6], 6.f); + CHECK(tmp[7], 7.f); + CHECK(tmp[8], 8.f); + CHECK(tmp[9], 9.f); + CHECK(tmp[10], 10.f); + CHECK(tmp[11], 11.f); + CHECK(tmp[12], 12.f); + CHECK(tmp[13], 13.f); + CHECK(tmp[14], 14.f); + CHECK(tmp[15], 15.f); + + tmp[0] = 0.f; tmp[1] = 2.f; tmp[2] = 4.f; tmp[3] = 6.f; + tmp[4] = 8.f; tmp[5] = 10.f; tmp[6] = 12.f; tmp[7] = 14.f; + tmp[8] = 16.f; tmp[9] = 18.f; tmp[10] = 20.f; tmp[11] = 22.f; + tmp[12] = 24.f; tmp[13] = 26.f; tmp[14] = 28.f; tmp[15] = 30.f; + CHECK(aosf44_load(m, tmp), m); + AOSF44_EQ(m, + 0.f, 2.f, 4.f, 6.f, + 8.f, 10.f, 12.f, 14.f, + 16.f, 18.f, 20.f, 22.f, + 24.f, 26.f, 28.f, 30.f); + + CHECK(aosf44_identity(m), m); + AOSF44_EQ(m, + 1.f, 
0.f, 0.f, 0.f, + 0.f, 1.f, 0.f, 0.f, + 0.f, 0.f, 1.f, 0.f, + 0.f, 0.f, 0.f, 1.f); + + CHECK(aosf44_zero(m), m); + AOSF44_EQ(m, + 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f); + + CHECK(aosf44_set_row0(m, v4f_set(0.f, 1.f, 2.f, 3.f)), m); + AOSF44_EQ(m, + 0.f, 0.f, 0.f, 0.f, + 1.f, 0.f, 0.f, 0.f, + 2.f, 0.f, 0.f, 0.f, + 3.f, 0.f, 0.f, 0.f); + CHECK(aosf44_set_row1(m, v4f_set(4.f, 5.f, 6.f, 7.f)), m); + AOSF44_EQ(m, + 0.f, 4.f, 0.f, 0.f, + 1.f, 5.f, 0.f, 0.f, + 2.f, 6.f, 0.f, 0.f, + 3.f, 7.f, 0.f, 0.f); + CHECK(aosf44_set_row2(m, v4f_set(8.f, 9.f, 10.f, 11.f)), m); + AOSF44_EQ(m, + 0.f, 4.f, 8.f, 0.f, + 1.f, 5.f, 9.f, 0.f, + 2.f, 6.f, 10.f, 0.f, + 3.f, 7.f, 11.f, 0.f); + CHECK(aosf44_set_row3(m, v4f_set(12.f, 13.f, 14.f, 15.f)), m); + AOSF44_EQ(m, + 0.f, 4.f, 8.f, 12.f, + 1.f, 5.f, 9.f, 13.f, + 2.f, 6.f, 10.f, 14.f, + 3.f, 7.f, 11.f, 15.f); + + CHECK(aosf44_zero(m), m); + CHECK(aosf44_set_row(m, v4f_set(0.f, 1.f, 2.f, 3.f), 0), m); + AOSF44_EQ(m, + 0.f, 0.f, 0.f, 0.f, + 1.f, 0.f, 0.f, 0.f, + 2.f, 0.f, 0.f, 0.f, + 3.f, 0.f, 0.f, 0.f); + CHECK(aosf44_set_row(m, v4f_set(4.f, 5.f, 6.f, 7.f), 1), m); + AOSF44_EQ(m, + 0.f, 4.f, 0.f, 0.f, + 1.f, 5.f, 0.f, 0.f, + 2.f, 6.f, 0.f, 0.f, + 3.f, 7.f, 0.f, 0.f); + CHECK(aosf44_set_row(m, v4f_set(8.f, 9.f, 10.f, 11.f), 2), m); + AOSF44_EQ(m, + 0.f, 4.f, 8.f, 0.f, + 1.f, 5.f, 9.f, 0.f, + 2.f, 6.f, 10.f, 0.f, + 3.f, 7.f, 11.f, 0.f); + CHECK(aosf44_set_row(m, v4f_set(12.f, 13.f, 14.f, 15.f), 3), m); + AOSF44_EQ(m, + 0.f, 4.f, 8.f, 12.f, + 1.f, 5.f, 9.f, 13.f, + 2.f, 6.f, 10.f, 14.f, + 3.f, 7.f, 11.f, 15.f); + + CHECK(aosf44_zero(m), m); + CHECK(aosf44_set_col(m, v4f_set(0.f, 1.f, 2.f, 3.f), 0), m); + AOSF44_EQ(m, + 0.f, 1.f, 2.f, 3.f, + 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f); + CHECK(aosf44_set_col(m, v4f_set(4.f, 5.f, 6.f, 7.f), 1), m); + AOSF44_EQ(m, + 0.f, 1.f, 2.f, 3.f, + 4.f, 5.f, 6.f, 7.f, + 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f); + CHECK(aosf44_set_col(m, 
v4f_set(8.f, 9.f, 10.f, 11.f), 2), m); + AOSF44_EQ(m, + 0.f, 1.f, 2.f, 3.f, + 4.f, 5.f, 6.f, 7.f, + 8.f, 9.f, 10.f, 11.f, + 0.f, 0.f, 0.f, 0.f); + CHECK(aosf44_set_col(m, v4f_set(12.f, 13.f, 14.f, 15.f), 3), m); + AOSF44_EQ(m, + 0.f, 1.f, 2.f, 3.f, + 4.f, 5.f, 6.f, 7.f, + 8.f, 9.f, 10.f, 11.f, + 12.f, 13.f, 14.f, 15.f); + + v = aosf44_row0(m); + CHECK(v4f_x(v), 0.f); + CHECK(v4f_y(v), 4.f); + CHECK(v4f_z(v), 8.f); + CHECK(v4f_w(v), 12.f); + + v = aosf44_row1(m); + CHECK(v4f_x(v), 1.f); + CHECK(v4f_y(v), 5.f); + CHECK(v4f_z(v), 9.f); + CHECK(v4f_w(v), 13.f); + + v = aosf44_row2(m); + CHECK(v4f_x(v), 2.f); + CHECK(v4f_y(v), 6.f); + CHECK(v4f_z(v), 10.f); + CHECK(v4f_w(v), 14.f); + + v = aosf44_row3(m); + CHECK(v4f_x(v), 3.f); + CHECK(v4f_y(v), 7.f); + CHECK(v4f_z(v), 11.f); + CHECK(v4f_w(v), 15.f); + + v = aosf44_row(m, 0); + CHECK(v4f_x(v), 0.f); + CHECK(v4f_y(v), 4.f); + CHECK(v4f_z(v), 8.f); + CHECK(v4f_w(v), 12.f); + + v = aosf44_row(m, 1); + CHECK(v4f_x(v), 1.f); + CHECK(v4f_y(v), 5.f); + CHECK(v4f_z(v), 9.f); + CHECK(v4f_w(v), 13.f); + + v = aosf44_row(m, 2); + CHECK(v4f_x(v), 2.f); + CHECK(v4f_y(v), 6.f); + CHECK(v4f_z(v), 10.f); + CHECK(v4f_w(v), 14.f); + + v = aosf44_row(m, 3); + CHECK(v4f_x(v), 3.f); + CHECK(v4f_y(v), 7.f); + CHECK(v4f_z(v), 11.f); + CHECK(v4f_w(v), 15.f); + + v = aosf44_col(m, 0); + CHECK(v4f_x(v), 0.f); + CHECK(v4f_y(v), 1.f); + CHECK(v4f_z(v), 2.f); + CHECK(v4f_w(v), 3.f); + + v = aosf44_col(m, 1); + CHECK(v4f_x(v), 4.f); + CHECK(v4f_y(v), 5.f); + CHECK(v4f_z(v), 6.f); + CHECK(v4f_w(v), 7.f); + + v = aosf44_col(m, 2); + CHECK(v4f_x(v), 8.f); + CHECK(v4f_y(v), 9.f); + CHECK(v4f_z(v), 10.f); + CHECK(v4f_w(v), 11.f); + + v = aosf44_col(m, 3); + CHECK(v4f_x(v), 12.f); + CHECK(v4f_y(v), 13.f); + CHECK(v4f_z(v), 14.f); + CHECK(v4f_w(v), 15.f); + + CHECK(aosf44_set(m, + v4f_set(0.f, 1.f, 2.f, 3.f), + v4f_set(4.f, 5.f, 6.f, 7.f), + v4f_set(8.f, 9.f, 10.f, 11.f), + v4f_set(12.f, 13.f, 14.f, 15.f)), m); + CHECK(aosf44_set(n, + v4f_set(0.f, 2.f, 
1.f, 3.f), + v4f_set(1.f, -2.f, -1.f, -3.f), + v4f_set(1.f, 0.f, 0.f, 2.f), + v4f_set(3.f, 2.f, 1.f, 0.f)), n); + CHECK(aosf44_add(o, m, n), o); + AOSF44_EQ(o, + 0.f, 3.f, 3.f, 6.f, + 5.f, 3.f, 5.f, 4.f, + 9.f, 9.f, 10.f, 13.f, + 15.f, 15.f, 15.f, 15.f); + + CHECK(aosf44_sub(o, m, n), o); + AOSF44_EQ(o, + 0.f, -1.f, 1.f, 0.f, + 3.f, 7.f, 7.f, 10.f, + 7.f, 9.f, 10.f, 9.f, + 9.f, 11.f, 13.f, 15.f); + + CHECK(aosf44_minus(o, n), o); + AOSF44_EQ(o, + 0.f, -2.f, -1.f, -3.f, + -1.f, 2.f, 1.f, 3.f, + -1.f, 0.f, 0.f, -2.f, + -3.f, -2.f, -1.f, 0.f); + + CHECK(aosf44_abs(o, o), o); + AOSF44_EQ(o, + 0.f, 2.f, 1.f, 3.f, + 1.f, 2.f, 1.f, 3.f, + 1.f, 0.f, 0.f, 2.f, + 3.f, 2.f, 1.f, 0.f); + + CHECK(aosf44_mul(o, n, v4f_set(1.f, 2.f, 3.f, 2.f)), o); + AOSF44_EQ(o, + 0.f, 4.f, 3.f, 6.f, + 1.f, -4.f, -3.f, -6.f, + 1.f, 0.f, 0.f, 4.f, + 3.f, 4.f, 3.f, 0.f); + + aosf44_set(m, + v4f_set(0.f, 1.f, 2.f, 3.f), + v4f_set(4.f, 5.f, 6.f, 7.f), + v4f_set(8.f, 9.f, 10.f, 11.f), + v4f_set(12.f, 13.f, 14.f, 15.f)); + v = aosf44_mulf4(m, v4f_set(1.f, 2.f, 3.f, 1.f)); + CHECK(v4f_x(v), 44.f); + CHECK(v4f_y(v), 51.f); + CHECK(v4f_z(v), 58.f); + CHECK(v4f_w(v), 65.f); + + v = aosf4_mulf44(v4f_set(1.f, 2.f, 3.f, 1.f), m); + CHECK(v4f_x(v), 11.f); + CHECK(v4f_y(v), 39.f); + CHECK(v4f_z(v), 67.f); + CHECK(v4f_w(v), 95.f); + + aosf44_set(m, + v4f_set(1.f, 2.f, 3.f, 4.f), + v4f_set(4.f, 5.f, 6.f, 7.f), + v4f_set(7.f, 8.f, 9.f, 10.f), + v4f_set(10.f, 11.f, 12.f, 13.f)); + aosf44_set(n, + v4f_set(2.f, 9.f, 8.f, 1.f), + v4f_set(1.f, -2.f, 2.f, 1.f), + v4f_set(1.f, -8.f, -4.f, 2.f), + v4f_set(1.f, 3.f, 4.f, 2.f)); + CHECK(aosf44_mulf44(o, m, n), o); + AOSF44_EQ(o, + 104.f, 124.f, 144.f, 164.f, + 17.f, 19.f, 21.f, 23.f, + -39.f, -48.f, -57.f, -66.f, + 61.f, 71.f, 81.f, 91.f); + + CHECK(aosf44_transpose(o, n), o); + AOSF44_EQ(o, + 2.f, 1.f, 1.f, 1.f, + 9.f, -2.f, -8.f, 3.f, + 8.f, 2.f, -4.f, 4.f, + 1.f, 1.f, 2.f, 2.f); + + v = aosf44_det(n); + CHECK(v4f_x(v), 78.f); + CHECK(v4f_y(v), 78.f); + CHECK(v4f_z(v), 
78.f); + CHECK(v4f_w(v), 78.f); + + v = aosf44_inverse(m, n); + CHECK(v4f_x(v), 78.f); + CHECK(v4f_y(v), 78.f); + CHECK(v4f_z(v), 78.f); + CHECK(v4f_w(v), 78.f); + CHECK(aosf44_mulf44(o, m, n), o); + AOSF44_EQ_EPS(o, + 1.f, 0.f, 0.f, 0.f, + 0.f, 1.f, 0.f, 0.f, + 0.f, 0.f, 1.f, 0.f, + 0.f, 0.f, 0.f, 1.f, + 1.e-6f); + + v = aosf44_invtrans(o, n); + CHECK(v4f_x(v), 78.f); + CHECK(v4f_y(v), 78.f); + CHECK(v4f_z(v), 78.f); + CHECK(v4f_w(v), 78.f); + AOSF44_EQ(o, + v4f_x(m[0]), v4f_x(m[1]), v4f_x(m[2]), v4f_x(m[3]), + v4f_y(m[0]), v4f_y(m[1]), v4f_y(m[2]), v4f_y(m[3]), + v4f_z(m[0]), v4f_z(m[1]), v4f_z(m[2]), v4f_z(m[3]), + v4f_w(m[0]), v4f_w(m[1]), v4f_w(m[2]), v4f_w(m[3])); + + aosf44_set(m, + v4f_set(0.f, 1.f, 2.f, 3.f), + v4f_set(5.f, 5.f, 6.f, 7.f), + v4f_set(8.f, 9.f, 10.f, 11.f), + v4f_set(12.f, 13.f, 14.f, 15.f)); + aosf44_set(n, + v4f_set(0.f, 1.f, 2.f, 3.f), + v4f_set(5.f, 5.f, 6.f, 7.f), + v4f_set(8.f, 9.f, 10.f, 11.f), + v4f_set(12.f, 13.f, 14.f, 15.f)); + + v = aosf44_eq(m, n); + CHECK(v4f_mask_x(v), ~0); + CHECK(v4f_mask_y(v), ~0); + CHECK(v4f_mask_z(v), ~0); + CHECK(v4f_mask_w(v), ~0); + + n[0] = v4f_set(0.f, 1.0f, 2.f, 4.f); + v = aosf44_eq(m, n); + CHECK(v4f_mask_x(v), 0); + CHECK(v4f_mask_y(v), 0); + CHECK(v4f_mask_z(v), 0); + CHECK(v4f_mask_w(v), 0); + n[0] = v4f_set(0.f, 1.0f, 2.f, 3.f); + + n[1] = v4f_set(4.f, 5.0f, 6.f, 7.f); + v = aosf44_eq(m, n); + CHECK(v4f_mask_x(v), 0); + CHECK(v4f_mask_y(v), 0); + CHECK(v4f_mask_z(v), 0); + CHECK(v4f_mask_w(v), 0); + n[1] = v4f_set(5.f, 5.0f, 6.f, 7.f); + + m[2] = v4f_set(8.f, -9.0f, 10.f, 11.f); + v = aosf44_eq(m, n); + CHECK(v4f_mask_x(v), 0); + CHECK(v4f_mask_y(v), 0); + CHECK(v4f_mask_z(v), 0); + CHECK(v4f_mask_w(v), 0); + m[2] = v4f_set(8.f, 9.0f, 10.f, 11.f); + + n[3] = v4f_set(12.f, 13.1f, 14.f, 15.f); + v = aosf44_eq(m, n); + CHECK(v4f_mask_x(v), 0); + CHECK(v4f_mask_y(v), 0); + CHECK(v4f_mask_z(v), 0); + CHECK(v4f_mask_w(v), 0); + + v = aosf44_eq(m, m); + CHECK(v4f_mask_x(v), ~0); + 
CHECK(v4f_mask_y(v), ~0); + CHECK(v4f_mask_z(v), ~0); + CHECK(v4f_mask_w(v), ~0); + n[3] = v4f_set(12.f, 13.0f, 14.f, 15.f); + + v = aosf44_eq(m, n); + CHECK(v4f_mask_x(v), ~0); + CHECK(v4f_mask_y(v), ~0); + CHECK(v4f_mask_z(v), ~0); + CHECK(v4f_mask_w(v), ~0); + return 0; +} + +