rsimd

Make SIMD instruction sets easier to use
git clone git://git.meso-star.fr/rsimd.git
Log | Files | Refs | README | LICENSE

avxf.h (7306B)


      1 /* Copyright (C) 2014-2019, 2021, 2023, 2025 Vincent Forest (vaplv@free.fr)
      2  *
      3  * The RSIMD library is free software: you can redistribute it and/or modify
      4  * it under the terms of the GNU General Public License as published
      5  * by the Free Software Foundation, either version 3 of the License, or
      6  * (at your option) any later version.
      7  *
      8  * The RSIMD library is distributed in the hope that it will be useful,
      9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
     10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
     11  * GNU General Public License for more details.
     12  *
     13  * You should have received a copy of the GNU General Public License
     14  * along with the RSIMD library. If not, see <http://www.gnu.org/licenses/>. */
     15 
     16 #ifndef RSIMD_AVXF_H
     17 #define RSIMD_AVXF_H
     18 
     19 /*
     20  * 8 packed single precision floating-point values
     21  */
     22 
     23 #include "avx.h"
     24 
     25 #include <rsys/math.h>
     26 #include <immintrin.h>
     27 
     28 typedef __m256 v8f_T; /* Vector of 8 packed single precision floats */
     29 
     30 /*******************************************************************************
     31  * Set operations
     32  ******************************************************************************/
     33 static FINLINE float*
     34 v8f_store(float dst[8], v8f_T v)
     35 {
     36   ASSERT(dst && IS_ALIGNED(dst, 32));
     37   _mm256_store_ps(dst, v);
     38   return dst;
     39 }
     40 
     41 static FINLINE v8f_T
     42 v8f_load(const float src[8])
     43 {
     44   ASSERT(src && IS_ALIGNED(src, 32));
     45   return _mm256_load_ps(src);
     46 }
     47 
     48 static FINLINE v8f_T
     49 v8f_loadu(const float f[8])
     50 {
     51   ASSERT(f);
     52   return _mm256_set_ps(f[7], f[6], f[5], f[4], f[3],f[2], f[1], f[0]);
     53 }
     54 
     55 static FINLINE v8f_T
     56 v8f_set1(const float x)
     57 {
     58   return _mm256_set1_ps(x);
     59 }
     60 
     61 static FINLINE v8f_T
     62 v8f_set
     63   (const float a, const float b, const float c, const float d,
     64    const float e, const float f, const float g, const float h)
     65 {
     66   return _mm256_set_ps(h, g, f, e, d, c, b, a);
     67 }
     68 
     69 static FINLINE v8f_T
     70 v8f_zero(void)
     71 {
     72   return _mm256_setzero_ps();
     73 }
     74 
     75 static FINLINE v8f_T
     76 v8f_mask
     77   (const int32_t a, const int32_t b, const int32_t c, const int32_t d,
     78    const int32_t e, const int32_t f, const int32_t g, const int32_t h)
     79 {
     80   return _mm256_castsi256_ps(_mm256_set_epi32(h, g, f, e, d, c, b, a));
     81 }
     82 
     83 static FINLINE v8f_T
     84 v8f_mask1(const int32_t x)
     85 {
     86   return _mm256_castsi256_ps(_mm256_set1_epi32(x));
     87 }
     88 
     89 static FINLINE v8f_T
     90 v8f_true(void)
     91 {
     92   return _mm256_castsi256_ps(_mm256_set1_epi32(~0));
     93 }
     94 
     95 static FINLINE v8f_T
     96 v8f_false(void)
     97 {
     98   return v8f_zero();
     99 }
    100 
    101 /*******************************************************************************
    102  * Extract components
    103  ******************************************************************************/
    104 static FINLINE v4f_T
    105 v8f_abcd(const v8f_T v)
    106 {
    107   return _mm256_extractf128_ps(v, 0);
    108 }
    109 
    110 static FINLINE v4f_T
    111 v8f_efgh(const v8f_T v)
    112 {
    113   return  _mm256_extractf128_ps(v, 1);
    114 }
    115 
    116 static FINLINE int
    117 v8f_movemask(const v8f_T v)
    118 {
    119   return _mm256_movemask_ps(v);
    120 }
    121 
    122 /*******************************************************************************
    123  * Bitwise operations
    124  ******************************************************************************/
    125 static FINLINE v8f_T
    126 v8f_or(const v8f_T v0, const v8f_T v1)
    127 {
    128   return _mm256_or_ps(v0, v1);
    129 }
    130 
    131 static FINLINE v8f_T
    132 v8f_and(const v8f_T v0, const v8f_T v1)
    133 {
    134   return _mm256_and_ps(v0, v1);
    135 }
    136 
    137 static FINLINE v8f_T
    138 v8f_andnot(const v8f_T v0, const v8f_T v1)
    139 {
    140   return _mm256_andnot_ps(v0, v1);
    141 }
    142 
    143 static FINLINE v8f_T
    144 v8f_xor(const v8f_T v0, const v8f_T v1)
    145 {
    146   return _mm256_xor_ps(v0, v1);
    147 }
    148 
    149 static FINLINE v8f_T
    150 v8f_sel(const v8f_T vfalse, const v8f_T vtrue, const v8f_T vcond)
    151 {
    152   return _mm256_blendv_ps(vfalse, vtrue, vcond);
    153 }
    154 
    155 /*******************************************************************************
    156  * Arithmetic operations
    157  ******************************************************************************/
    158 static FINLINE v8f_T
    159 v8f_minus(const v8f_T v)
    160 {
    161   return v8f_xor(v8f_set1(-0.f), v);
    162 }
    163 
    164 static FINLINE v8f_T
    165 v8f_add(const v8f_T v0, const v8f_T v1)
    166 {
    167   return _mm256_add_ps(v0, v1);
    168 }
    169 
    170 static FINLINE v8f_T
    171 v8f_sub(const v8f_T v0, const v8f_T v1)
    172 {
    173   return _mm256_sub_ps(v0, v1);
    174 }
    175 
    176 static FINLINE v8f_T
    177 v8f_mul(const v8f_T v0, const v8f_T v1)
    178 {
    179   return _mm256_mul_ps(v0, v1);
    180 }
    181 
    182 static FINLINE v8f_T
    183 v8f_div(const v8f_T v0, const v8f_T v1)
    184 {
    185   return _mm256_div_ps(v0, v1);
    186 }
    187 
    188 static FINLINE v8f_T
    189 v8f_madd(const v8f_T v0, const v8f_T v1, const v8f_T v2)
    190 {
    191   return _mm256_add_ps(_mm256_mul_ps(v0, v1), v2);
    192 }
    193 
    194 static FINLINE v8f_T
    195 v8f_abs(const v8f_T v)
    196 {
    197   const union { int32_t i; float f; } mask = { 0x7fffffff };
    198   return v8f_and(v, v8f_set1(mask.f));
    199 }
    200 
    201 static FINLINE v8f_T
    202 v8f_sqrt(const v8f_T v)
    203 {
    204   return _mm256_sqrt_ps(v);
    205 }
    206 
    207 static FINLINE v8f_T
    208 v8f_rsqrte(const v8f_T v)
    209 {
    210   return _mm256_rsqrt_ps(v);
    211 }
    212 
    213 static FINLINE v8f_T
    214 v8f_rsqrt(const v8f_T v)
    215 {
    216   const v8f_T y = v8f_rsqrte(v);
    217   const v8f_T yyv = v8f_mul(v8f_mul(y, y), v);
    218   const v8f_T tmp = v8f_sub(v8f_set1(1.5f), v8f_mul(yyv, v8f_set1(0.5f)));
    219   return v8f_mul(tmp, y);
    220 }
    221 
    222 static FINLINE v8f_T
    223 v8f_rcpe(const v8f_T v)
    224 {
    225   return _mm256_rcp_ps(v);
    226 }
    227 
    228 static FINLINE v8f_T
    229 v8f_rcp(const v8f_T v)
    230 {
    231   const v8f_T y = v8f_rcpe(v);
    232   const v8f_T tmp = v8f_sub(v8f_set1(2.f), v8f_mul(y, v));
    233   return v8f_mul(tmp, y);
    234 }
    235 
    236 static FINLINE v8f_T
    237 v8f_lerp(const v8f_T from, const v8f_T to, const v8f_T param)
    238 {
    239   return v8f_madd(v8f_sub(to, from), param, from);
    240 }
    241 
    242 /*******************************************************************************
    243  * Comparators
    244  ******************************************************************************/
    245 static FINLINE v8f_T
    246 v8f_eq(const v8f_T v0, const v8f_T v1)
    247 {
    248   return _mm256_cmp_ps(v0, v1, _CMP_EQ_OS);
    249 }
    250 
    251 static FINLINE v8f_T
    252 v8f_neq(const v8f_T v0, const v8f_T v1)
    253 {
    254   return _mm256_cmp_ps(v0, v1, _CMP_NEQ_OS);
    255 }
    256 
    257 static FINLINE v8f_T
    258 v8f_ge(const v8f_T v0, const v8f_T v1)
    259 {
    260   return _mm256_cmp_ps(v0, v1, _CMP_GE_OS);
    261 }
    262 
    263 static FINLINE v8f_T
    264 v8f_le(const v8f_T v0, const v8f_T v1)
    265 {
    266   return _mm256_cmp_ps(v0, v1, _CMP_LE_OS);
    267 }
    268 
    269 static FINLINE v8f_T
    270 v8f_gt(const v8f_T v0, const v8f_T v1)
    271 {
    272   return _mm256_cmp_ps(v0, v1, _CMP_GT_OS);
    273 }
    274 
    275 static FINLINE v8f_T
    276 v8f_lt(const v8f_T v0, const v8f_T v1)
    277 {
    278   return _mm256_cmp_ps(v0, v1, _CMP_LT_OS);
    279 }
    280 
    281 static FINLINE v8f_T
    282 v8f_eq_eps(const v8f_T v0, const v8f_T v1, const v8f_T eps)
    283 {
    284   return v8f_le(v8f_abs(v8f_sub(v0, v1)), eps);
    285 }
    286 
    287 static FINLINE v8f_T
    288 v8f_min(const v8f_T v0, const v8f_T v1)
    289 {
    290   return _mm256_min_ps(v0, v1);
    291 }
    292 
    293 static FINLINE v8f_T
    294 v8f_max(const v8f_T v0, const v8f_T v1)
    295 {
    296   return _mm256_max_ps(v0, v1);
    297 }
    298 
    299 static FINLINE float
    300 v8f_reduce_min(const v8f_T v0)
    301 {
    302   ALIGN(32) float tmp[8];
    303   const v8f_T v1 = _mm256_permute_ps(v0, _MM_SHUFFLE(1, 0, 3, 2));
    304   const v8f_T v2 = _mm256_min_ps(v0, v1);
    305   const v8f_T v3 = _mm256_permute_ps(v2, _MM_SHUFFLE(2, 3, 0, 1));
    306   const v8f_T v4 = _mm256_min_ps(v2, v3);
    307   _mm256_store_ps(tmp, v4);
    308   return MMIN(tmp[0], tmp[4]);
    309 }
    310 
    311 static FINLINE float
    312 v8f_reduce_max(const v8f_T v0)
    313 {
    314   ALIGN(32) float tmp[8];
    315   const v8f_T v1 = _mm256_permute_ps(v0, _MM_SHUFFLE(1, 0, 3, 2));
    316   const v8f_T v2 = _mm256_max_ps(v0, v1);
    317   const v8f_T v3 = _mm256_permute_ps(v2, _MM_SHUFFLE(2, 3, 0, 1));
    318   const v8f_T v4 = _mm256_max_ps(v2, v3);
    319   _mm256_store_ps(tmp, v4);
    320   return MMAX(tmp[0], tmp[4]);
    321 }
    322 
    323 static FINLINE v8f_T
    324 v8f_clamp(const v8f_T v, const v8f_T vmin, const v8f_T vmax)
    325 {
    326   return v8f_min(v8f_max(v, vmin), vmax);
    327 }
    328 
    329 #endif /* RSIMD_AVXF_H */
    330