rsimd

Make SIMD instruction sets easier to use
git clone git://git.meso-star.fr/rsimd.git
Log | Files | Refs | README | LICENSE

aosf44.h (9775B)


      1 /* Copyright (C) 2014-2019, 2021, 2023, 2025 Vincent Forest (vaplv@free.fr)
      2  *
      3  * The RSIMD library is free software: you can redistribute it and/or modify
      4  * it under the terms of the GNU General Public License as published
      5  * by the Free Software Foundation, either version 3 of the License, or
      6  * (at your option) any later version.
      7  *
      8  * The RSIMD library is distributed in the hope that it will be useful,
      9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
     10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
     11  * GNU General Public License for more details.
     12  *
     13  * You should have received a copy of the GNU General Public License
     14  * along with the RSIMD library. If not, see <http://www.gnu.org/licenses/>. */
     15 
     16 #ifndef AOSF44_H
     17 #define AOSF44_H
     18 
     19 #include "aosf33.h"
     20 #include "rsimd.h"
     21 
     22 /*
     23  * Functions on column major AoS float44 matrices. A 4x4 matrix is a set of 4
     24  * 4-wide SIMD float vectors, each representing a matrix column.
     25  */
     26 
     27 /*******************************************************************************
     28  * Set operations
     29  ******************************************************************************/
     30 static FINLINE float*
     31 aosf44_store(float dst[16], const v4f_T m[4])
     32 {
     33   ASSERT(m && dst);
     34 
     35   if(IS_ALIGNED(dst, 16)) {
     36     v4f_store(dst + 0, m[0]);
     37     v4f_store(dst + 4, m[1]);
     38     v4f_store(dst + 8, m[2]);
     39     v4f_store(dst + 12, m[3]);
     40   } else {
     41     ALIGN(16) float tmp[4];
     42     int i;
     43     FOR_EACH(i, 0, 4) {
     44       v4f_store(tmp, m[i]);
     45       dst[i*4 + 0] = tmp[0];
     46       dst[i*4 + 1] = tmp[1];
     47       dst[i*4 + 2] = tmp[2];
     48       dst[i*4 + 3] = tmp[3];
     49     }
     50   }
     51   return dst;
     52 }
     53 
     54 static FINLINE v4f_T*
     55 aosf44_load(v4f_T m[4], const float src[16])
     56 {
     57   ASSERT(m && src);
     58   if(IS_ALIGNED(src, 16)) {
     59     m[0] = v4f_load(src + 0);
     60     m[1] = v4f_load(src + 4);
     61     m[2] = v4f_load(src + 8);
     62     m[3] = v4f_load(src + 12);
     63   } else {
     64     int i;
     65     FOR_EACH(i, 0, 4)
     66       m[i] = v4f_set(src[i*3+0], src[i*3+1], src[i*4+2], src[i*4+3]);
     67   }
     68   return m;
     69 }
     70 
     71 static FINLINE v4f_T*
     72 aosf44_set
     73   (v4f_T m[4], const v4f_T c0, const v4f_T c1, const v4f_T c2, const v4f_T c3)
     74 {
     75   ASSERT(m);
     76   m[0] = c0, m[1] = c1, m[2] = c2, m[3] = c3;
     77   return m;
     78 }
     79 
     80 static FINLINE v4f_T*
     81 aosf44_identity(v4f_T m[4])
     82 {
     83   ASSERT(m);
     84   m[0] = v4f_set(1.f, 0.f, 0.f, 0.f);
     85   m[1] = v4f_set(0.f, 1.f, 0.f, 0.f);
     86   m[2] = v4f_set(0.f, 0.f, 1.f, 0.f);
     87   m[3] = v4f_set(0.f, 0.f, 0.f, 1.f);
     88   return m;
     89 }
     90 
     91 static FINLINE v4f_T*
     92 aosf44_zero(v4f_T m[4])
     93 {
     94   ASSERT(m);
     95   m[0] = v4f_zero();
     96   m[1] = v4f_zero();
     97   m[2] = v4f_zero();
     98   m[3] = v4f_zero();
     99   return m;
    100 }
    101 
    102 static FINLINE v4f_T*
    103 aosf44_set_row0(v4f_T m[4], const v4f_T v)
    104 {
    105   const v4f_T xyzw = v;
    106   const v4f_T yyww = v4f_yyww(v);
    107   const v4f_T zwzw = v4f_zwzw(v);
    108   const v4f_T wwww = v4f_yyww(zwzw);
    109   ASSERT(m);
    110   m[0] = v4f_ayzw(m[0], xyzw);
    111   m[1] = v4f_ayzw(m[1], yyww);
    112   m[2] = v4f_ayzw(m[2], zwzw);
    113   m[3] = v4f_ayzw(m[3], wwww);
    114   return m;
    115 }
    116 
    117 static FINLINE v4f_T*
    118 aosf44_set_row1(v4f_T m[4], const v4f_T v)
    119 {
    120   ASSERT(m);
    121   m[0] = v4f_xbzw(m[0], v4f_xxyy(v));
    122   m[1] = v4f_xbzw(m[1], v);
    123   m[2] = v4f_xbzw(m[2], v4f_zzww(v));
    124   m[3] = v4f_xbzw(m[3], v4f_zwzw(v));
    125   return m;
    126 }
    127 
    128 static FINLINE v4f_T*
    129 aosf44_set_row2(v4f_T m[4], const v4f_T v)
    130 {
    131   ASSERT(m);
    132   m[0] = v4f_xycw(m[0], v4f_xyxy(v));
    133   m[1] = v4f_xycw(m[1], v4f_xxyy(v));
    134   m[2] = v4f_xycw(m[2], v);
    135   m[3] = v4f_xycw(m[3], v4f_zzww(v));
    136   return m;
    137 }
    138 
    139 static FINLINE v4f_T*
    140 aosf44_set_row3(v4f_T m[4], const v4f_T v)
    141 {
    142   ASSERT(m);
    143   m[0] = v4f_xyzd(m[0], v4f_xxxx(v));
    144   m[1] = v4f_xyzd(m[1], v4f_xxyy(v));
    145   m[2] = v4f_xyzd(m[2], v4f_xxzz(v));
    146   m[3] = v4f_xyzd(m[3], v);
    147   return m;
    148 }
    149 
    150 static FINLINE v4f_T*
    151 aosf44_set_row(v4f_T m[4], const v4f_T v, const int id)
    152 {
    153   const v4f_T mask = v4f_mask(-(id==0), -(id==1), -(id==2), -(id==3));
    154   ASSERT(m && id >= 0 && id <= 3);
    155   m[0] = v4f_sel(m[0], v4f_xxxx(v), mask);
    156   m[1] = v4f_sel(m[1], v4f_yyyy(v), mask);
    157   m[2] = v4f_sel(m[2], v4f_zzzz(v), mask);
    158   m[3] = v4f_sel(m[3], v4f_wwww(v), mask);
    159   return m;
    160 }
    161 
    162 static FINLINE v4f_T*
    163 aosf44_set_col(v4f_T m[4], const v4f_T v, const int id)
    164 {
    165   ASSERT(m && id >= 0 && id <= 3);
    166   m[id] = v;
    167   return m;
    168 }
    169 
    170 /*******************************************************************************
    171  * Get operations
    172  ******************************************************************************/
    173 static FINLINE v4f_T
    174 aosf44_row0(const v4f_T m[4])
    175 {
    176   ASSERT(m);
    177   return v4f_048C
    178     (v4f_xxxx(m[0]), v4f_xxxx(m[1]), v4f_xxxx(m[2]), v4f_xxxx(m[3]));
    179 }
    180 
    181 static FINLINE v4f_T
    182 aosf44_row1(const v4f_T m[4])
    183 {
    184   ASSERT(m);
    185   return v4f_048C
    186     (v4f_yyyy(m[0]), v4f_yyyy(m[1]), v4f_yyyy(m[2]), v4f_yyyy(m[3]));
    187 }
    188 
    189 static FINLINE v4f_T
    190 aosf44_row2(const v4f_T m[4])
    191 {
    192   ASSERT(m);
    193   return v4f_048C
    194     (v4f_zzzz(m[0]), v4f_zzzz(m[1]), v4f_zzzz(m[2]), v4f_zzzz(m[3]));
    195 }
    196 
    197 static FINLINE v4f_T
    198 aosf44_row3(const v4f_T m[4])
    199 {
    200   ASSERT(m);
    201   return v4f_048C
    202     (v4f_wwww(m[0]), v4f_wwww(m[1]), v4f_wwww(m[2]), v4f_wwww(m[3]));
    203 }
    204 
    205 static FINLINE v4f_T
    206 aosf44_row(const v4f_T m[4], const int id)
    207 {
    208   ASSERT(m && id >= 0 && id <= 3);
    209   if(id == 0) {
    210     return aosf44_row0(m);
    211   } else if(id == 1) {
    212     return aosf44_row1(m);
    213   } else if(id == 2) {
    214     return aosf44_row2(m);
    215   } else {
    216     return aosf44_row3(m);
    217   }
    218 }
    219 
    220 static FINLINE v4f_T
    221 aosf44_col(const v4f_T m[4], const int id)
    222 {
    223   ASSERT(m && id >= 0 && id <= 3);
    224   return m[id];
    225 }
    226 
    227 /*******************************************************************************
    228  * Arithmetic operations
    229  ******************************************************************************/
    230 static FINLINE v4f_T*
    231 aosf44_add(v4f_T res[4], const v4f_T m0[4], const v4f_T m1[4])
    232 {
    233   ASSERT(res && m0 && m1);
    234   res[0] = v4f_add(m0[0], m1[0]);
    235   res[1] = v4f_add(m0[1], m1[1]);
    236   res[2] = v4f_add(m0[2], m1[2]);
    237   res[3] = v4f_add(m0[3], m1[3]);
    238   return res;
    239 }
    240 
    241 static FINLINE v4f_T*
    242 aosf44_sub(v4f_T res[4], const v4f_T m0[4], const v4f_T m1[4])
    243 {
    244   ASSERT(res && m0 && m1);
    245   res[0] = v4f_sub(m0[0], m1[0]);
    246   res[1] = v4f_sub(m0[1], m1[1]);
    247   res[2] = v4f_sub(m0[2], m1[2]);
    248   res[3] = v4f_sub(m0[3], m1[3]);
    249   return res;
    250 }
    251 
    252 static FINLINE v4f_T*
    253 aosf44_minus(v4f_T res[4], const v4f_T m[4])
    254 {
    255   ASSERT(res && m);
    256   res[0] = v4f_minus(m[0]);
    257   res[1] = v4f_minus(m[1]);
    258   res[2] = v4f_minus(m[2]);
    259   res[3] = v4f_minus(m[3]);
    260   return res;
    261 }
    262 
    263 static FINLINE v4f_T*
    264 aosf44_abs(v4f_T res[4], const v4f_T m[4])
    265 {
    266   ASSERT(res && m);
    267   res[0] = v4f_abs(m[0]);
    268   res[1] = v4f_abs(m[1]);
    269   res[2] = v4f_abs(m[2]);
    270   res[3] = v4f_abs(m[3]);
    271   return res;
    272 }
    273 
    274 static FINLINE v4f_T*
    275 aosf44_mul(v4f_T res[4], const v4f_T m[4], const v4f_T v)
    276 {
    277   ASSERT(res && m);
    278   res[0] = v4f_mul(m[0], v);
    279   res[1] = v4f_mul(m[1], v);
    280   res[2] = v4f_mul(m[2], v);
    281   res[3] = v4f_mul(m[3], v);
    282   return res;
    283 }
    284 
    285 static FINLINE v4f_T
    286 aosf44_mulf4(const v4f_T m[4], const v4f_T v)
    287 {
    288   v4f_T r0, r1, r2;
    289   ASSERT(m);
    290   r0 = v4f_mul(m[0], v4f_xxxx(v));
    291   r1 = v4f_madd(m[1], v4f_yyyy(v), r0);
    292   r2 = v4f_madd(m[2], v4f_zzzz(v), r1);
    293   return v4f_madd(m[3], v4f_wwww(v), r2);
    294 }
    295 
    296 static FINLINE v4f_T
    297 aosf4_mulf44(v4f_T v, const v4f_T m[4])
    298 {
    299   v4f_T xxxx, yyyy, zzzz, wwww, xyxy, zwzw;
    300   ASSERT(m);
    301   xxxx = v4f_dot(v, m[0]);
    302   yyyy = v4f_dot(v, m[1]);
    303   zzzz = v4f_dot(v, m[2]);
    304   wwww = v4f_dot(v, m[3]);
    305   xyxy = v4f_xayb(xxxx, yyyy);
    306   zwzw = v4f_xayb(zzzz, wwww);
    307   return v4f_xyab(xyxy, zwzw);
    308 }
    309 
    310 static FINLINE v4f_T*
    311 aosf44_mulf44
    312   (v4f_T res[4], const v4f_T m0[4], const v4f_T m1[4])
    313 {
    314   v4f_T c0, c1, c2, c3;
    315   ASSERT(res && m0 && m1);
    316   c0 = aosf44_mulf4(m0, m1[0]);
    317   c1 = aosf44_mulf4(m0, m1[1]);
    318   c2 = aosf44_mulf4(m0, m1[2]);
    319   c3 = aosf44_mulf4(m0, m1[3]);
    320   res[0] = c0;
    321   res[1] = c1;
    322   res[2] = c2;
    323   res[3] = c3;
    324   return res;
    325 }
    326 
    327 static FINLINE v4f_T*
    328 aosf44_transpose(v4f_T res[4], const v4f_T m[4])
    329 {
    330   v4f_T in_c0, in_c1, in_c2, in_c3;
    331   v4f_T x0x2y0y2, x1x3y1y3, z0z2w0w2, z1z3w1w3;
    332   ASSERT(res && m);
    333   in_c0 = m[0];
    334   in_c1 = m[1];
    335   in_c2 = m[2];
    336   in_c3 = m[3];
    337   x0x2y0y2 = v4f_xayb(in_c0, in_c2);
    338   x1x3y1y3 = v4f_xayb(in_c1, in_c3);
    339   z0z2w0w2 = v4f_zcwd(in_c0, in_c2);
    340   z1z3w1w3 = v4f_zcwd(in_c1, in_c3);
    341   res[0] = v4f_xayb(x0x2y0y2, x1x3y1y3);
    342   res[1] = v4f_zcwd(x0x2y0y2, x1x3y1y3);
    343   res[2] = v4f_xayb(z0z2w0w2, z1z3w1w3);
    344   res[3] = v4f_zcwd(z0z2w0w2, z1z3w1w3);
    345   return res;
    346 }
    347 
    348 static FINLINE v4f_T
    349 aosf44_det(const v4f_T m[4])
    350 {
    351   v4f_T xxxx, yyyy, zzzz, wwww, xyxy, zwzw, xyzw;
    352   v4f_T f33_012_012[3], f33_012_013[3], f33_012_023[3], f33_012_123[3];
    353   ASSERT(m);
    354   aosf33_set(f33_012_012, m[0], m[1], m[2]);
    355   aosf33_set(f33_012_013, m[0], m[1], m[3]);
    356   aosf33_set(f33_012_023, m[0], m[2], m[3]);
    357   aosf33_set(f33_012_123, m[1], m[2], m[3]);
    358   xxxx = v4f_minus(aosf33_det(f33_012_123));
    359   yyyy = aosf33_det(f33_012_023);
    360   zzzz = v4f_minus(aosf33_det(f33_012_013));
    361   wwww = aosf33_det(f33_012_012);
    362   xyxy = v4f_xayb(xxxx, yyyy);
    363   zwzw = v4f_xayb(zzzz, wwww);
    364   xyzw = v4f_xyab(xyxy, zwzw);
    365   return v4f_dot(xyzw, aosf44_row3(m));
    366 }
    367 
    368 RSIMD_API v4f_T /* Return the determinant */
    369 aosf44_inverse(v4f_T out[4], const v4f_T in[4]);
    370 
    371 static FINLINE v4f_T /* Return the determinant */
    372 aosf44_invtrans(v4f_T out[4], const v4f_T a[4])
    373 {
    374   v4f_T det;
    375   ASSERT(out && a);
    376   det = aosf44_inverse(out, a);
    377   aosf44_transpose(out, out);
    378   return det;
    379 }
    380 
    381 static FINLINE v4f_T
    382 aosf44_eq(const v4f_T a[4], const v4f_T b[4])
    383 {
    384   ASSERT(a && b);
    385   if(a == b) {
    386     return v4f_true();
    387   } else {
    388     const v4f_T eq_c0 = v4f_eq(a[0], b[0]);
    389     const v4f_T eq_c1 = v4f_eq(a[1], b[1]);
    390     const v4f_T eq_c2 = v4f_eq(a[2], b[2]);
    391     const v4f_T eq_c3 = v4f_eq(a[3], b[3]);
    392     const v4f_T eq = v4f_and(v4f_and(eq_c0, eq_c1), v4f_and(eq_c2, eq_c3));
    393     const v4f_T tmp = v4f_and(v4f_xzxz(eq), v4f_ywyw(eq));
    394     const v4f_T ret = v4f_and(tmp, v4f_yxwz(tmp));
    395     return ret;
    396   }
    397 }
    398 
    399 #endif /* AOSF44_H */
    400