rsimd

Make SIMD instruction sets easier to use
git clone git://git.meso-star.fr/rsimd.git
Log | Files | Refs | README | LICENSE

ssei.h (6776B)


      1 /* Copyright (C) 2014-2019, 2021, 2023, 2025 Vincent Forest (vaplv@free.fr)
      2  *
      3  * The RSIMD library is free software: you can redistribute it and/or modify
      4  * it under the terms of the GNU General Public License as published
      5  * by the Free Software Foundation, either version 3 of the License, or
      6  * (at your option) any later version.
      7  *
      8  * The RSIMD library is distributed in the hope that it will be useful,
      9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
     10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
     11  * GNU General Public License for more details.
     12  *
     13  * You should have received a copy of the GNU General Public License
     14  * along with the RSIMD library. If not, see <http://www.gnu.org/licenses/>. */
     15 
     16 #ifndef RSIMD_SSEI_H
     17 #define RSIMD_SSEI_H
     18 
     19 /*
     20  * 4 packed signed integers
     21  */
     22 
     23 #include "sse_swz.h"
     24 
     25 #include <rsys/math.h>
     26 #include <xmmintrin.h>
     27 #include <emmintrin.h>
     28 #ifdef SIMD_SSE4_1
     29   #include <smmintrin.h>
     30 #endif
     31 
/* SIMD register holding 4 packed 32-bit signed integers */
typedef __m128i v4i_T;
/* Extract lane `Id` (0..3) of `Vec` as a scalar int32.
 * NOTE(review): relies on the GCC/Clang builtin __builtin_ia32_vec_ext_v4si,
 * which is not portable to MSVC — confirm the set of supported compilers */
#define V4I_AT__(Vec, Id) __builtin_ia32_vec_ext_v4si((__v4si)Vec, Id)

/* Swizzle helper: returns {Vec[Op0], Vec[Op1], Vec[Op2], Vec[Op3]}.
 * _MM_SHUFFLE lists its selectors from the highest lane down, hence the
 * reversed argument order. */
#define v4i_SWZ__(Vec, Op0, Op1, Op2, Op3)                                     \
  _mm_shuffle_epi32(Vec, _MM_SHUFFLE(Op3, Op2, Op1, Op0))
GENERATE_V4_SWZ_FUNCS__(v4i) /* Swizzle operations */
     38 
     39 /*******************************************************************************
     40  * Set operations
     41  ******************************************************************************/
     42 static FINLINE int32_t*
     43 v4i_store(int32_t dst[4], v4i_T v)
     44 {
     45   ASSERT(dst && IS_ALIGNED(dst, 16));
     46   _mm_store_si128((v4i_T*)dst, v);
     47   return dst;
     48 }
     49 
     50 static FINLINE v4i_T
     51 v4i_load(const int32_t src[4])
     52 {
     53   ASSERT(src && IS_ALIGNED(src, 16));
     54   return _mm_load_si128((const v4i_T*)src);
     55 }
     56 
/* Broadcast the scalar `i` into all 4 lanes */
static FINLINE v4i_T
v4i_set1(const int32_t i)
{
  return _mm_set1_epi32(i);
}

/* Build the vector {x, y, z, w}. _mm_set_epi32 lists its arguments from the
 * highest lane down, hence the reversed order. */
static FINLINE v4i_T
v4i_set(const int32_t x, const int32_t y, const int32_t z, const int32_t w)
{
  return _mm_set_epi32(w, z, y, x);
}

/* Vector with all 4 lanes cleared to 0 */
static FINLINE v4i_T
v4i_zero(void)
{
  return _mm_setzero_si128();
}
     74 
     75 /*******************************************************************************
     76  * Extract int32 from SIMD packed representation
     77  ******************************************************************************/
/* v4i_x/y/z/w extract lane 0/1/2/3 of `v` as a scalar int32, respectively */
static FINLINE int32_t v4i_x(const v4i_T v) { return V4I_AT__(v, 0); }
static FINLINE int32_t v4i_y(const v4i_T v) { return V4I_AT__(v, 1); }
static FINLINE int32_t v4i_z(const v4i_T v) { return V4I_AT__(v, 2); }
static FINLINE int32_t v4i_w(const v4i_T v) { return V4I_AT__(v, 3); }
     82 
     83 /*******************************************************************************
     84  * Merge operations
     85  ******************************************************************************/
/* Interleave the low halves of the two operands: {x, a, y, b} */
static FINLINE v4i_T
v4i_xayb(const v4i_T xyzw, const v4i_T abcd)
{
  return _mm_unpacklo_epi32(xyzw, abcd);
}

/* Interleave the high halves of the two operands: {z, c, w, d} */
static FINLINE v4i_T
v4i_zcwd(const v4i_T xyzw, const v4i_T abcd)
{
  return _mm_unpackhi_epi32(xyzw, abcd);
}
     97 
     98 /*******************************************************************************
     99  * Bitwise operators
    100  ******************************************************************************/
/* Bitwise OR of the two 128-bit operands */
static FINLINE v4i_T
v4i_or(const v4i_T v0, const v4i_T v1)
{
  return _mm_or_si128(v0, v1);
}

/* Bitwise AND of the two 128-bit operands */
static FINLINE v4i_T
v4i_and(const v4i_T v0, const v4i_T v1)
{
  return _mm_and_si128(v0, v1);
}

/* Bitwise ~v0 & v1; note that it is the FIRST operand that is complemented,
 * matching the _mm_andnot_si128 convention */
static FINLINE v4i_T
v4i_andnot(const v4i_T v0, const v4i_T v1)
{
  return _mm_andnot_si128(v0, v1);
}

/* Bitwise XOR of the two 128-bit operands */
static FINLINE v4i_T
v4i_xor(const v4i_T v0, const v4i_T v1)
{
  return _mm_xor_si128(v0, v1);
}

/* Bitwise complement: XOR against an all-ones register */
static FINLINE v4i_T
v4i_not(const v4i_T v)
{
  return _mm_xor_si128(v, _mm_set1_epi32(-1));
}
    130 
/* Per-lane right shift of `v` by `rshift` bits.
 * NOTE(review): this is a LOGICAL shift (_mm_srli_epi32) — zeros, not copies
 * of the sign bit, are shifted in, even though this header handles signed
 * lanes. Use _mm_srai_epi32 if arithmetic-shift semantics are ever needed;
 * confirm existing callers expect the logical behavior before changing. */
static FINLINE v4i_T
v4i_rshift(const v4i_T v, const int32_t rshift)
{
  return _mm_srli_epi32(v, rshift);
}

/* Per-lane left shift of `v` by `lshift` bits */
static FINLINE v4i_T
v4i_lshift(const v4i_T v, const int32_t lshift)
{
  return _mm_slli_epi32(v, lshift);
}
    142 
    143 /*******************************************************************************
    144  * Arithmetic operators
    145  ******************************************************************************/
/* Per-lane addition (wraps on overflow; _mm_add_epi32 is not saturating) */
static FINLINE v4i_T
v4i_add(const v4i_T v0, const v4i_T v1)
{
  return _mm_add_epi32(v0, v1);
}

/* Per-lane subtraction v0 - v1 (wrapping) */
static FINLINE v4i_T
v4i_sub(const v4i_T v0, const v4i_T v1)
{
  return _mm_sub_epi32(v0, v1);
}

/* Per-lane negation via two's complement: -v == ~v + 1 */
static FINLINE v4i_T
v4i_minus(const v4i_T v)
{
  return v4i_add(v4i_not(v), v4i_set1(1));
}
    163 
    164 /*******************************************************************************
    165  * Comparators
    166  ******************************************************************************/
/* Each comparator below returns a per-lane mask: all ones (-1) in the lanes
 * where the relation holds, all zeros where it does not — suitable as the
 * condition operand of v4i_sel. */

/* v0 == v1 */
static FINLINE v4i_T
v4i_eq(const v4i_T v0, const v4i_T v1)
{
  return _mm_cmpeq_epi32(v0, v1);
}

/* v0 != v1, computed as the bitwise complement of v4i_eq */
static FINLINE v4i_T
v4i_neq(const v4i_T v0, const v4i_T v1)
{
  return v4i_xor(v4i_eq(v0, v1), v4i_set1(-1));
}

/* v0 > v1 (signed compare) */
static FINLINE v4i_T
v4i_gt(const v4i_T v0, const v4i_T v1)
{
  return _mm_cmpgt_epi32(v0, v1);
}

/* v0 < v1 (signed compare) */
static FINLINE v4i_T
v4i_lt(const v4i_T v0, const v4i_T v1)
{
  return _mm_cmplt_epi32(v0, v1);
}

/* v0 >= v1, the complement of v0 < v1 */
static FINLINE v4i_T
v4i_ge(const v4i_T v0, const v4i_T v1)
{
  return v4i_xor(v4i_lt(v0, v1), v4i_set1(-1));
}

/* v0 <= v1, the complement of v0 > v1 */
static FINLINE v4i_T
v4i_le(const v4i_T v0, const v4i_T v1)
{
  return v4i_xor(v4i_gt(v0, v1), v4i_set1(-1));
}
    202 
/* Per-lane select: returns vtrue where vcond is set, vfalse elsewhere.
 * vcond is expected to be a canonical comparison mask (all ones / all zeros
 * per lane) such as produced by v4i_eq/lt/gt & co.
 * NOTE(review): the two paths agree only for such masks — _mm_blendv_epi8
 * tests the sign bit of every byte, while the SSE2 fallback is a pure
 * bit-by-bit mux. */
static FINLINE v4i_T
v4i_sel(const v4i_T vfalse, const v4i_T vtrue, const v4i_T vcond)
{
#ifdef SIMD_SSE4_1
  return _mm_blendv_epi8(vfalse, vtrue, vcond);
#else
  /* f ^ (c & (f ^ t)) == (c & t) | (~c & f), per bit */
  return v4i_xor(vfalse, v4i_and(vcond, v4i_xor(vfalse, vtrue)));
#endif
}
    212 
    213 static FINLINE v4i_T
    214 v4i_min(const v4i_T v0, const v4i_T v1)
    215 {
    216 #ifdef SIMD_SSE4_1
    217   return _mm_min_epi32(v0, v1);
    218 #else
    219   ALIGN(16) int32_t a[4];
    220   ALIGN(16) int32_t b[4];
    221   v4i_store(a, v0);
    222   v4i_store(b, v1);
    223   return v4i_set
    224     (MMIN(a[0], b[0]),
    225      MMIN(a[1], b[1]),
    226      MMIN(a[2], b[2]),
    227      MMIN(a[3], b[3]));
    228 #endif
    229 }
    230 
    231 static FINLINE v4i_T
    232 v4i_max(const v4i_T v0, const v4i_T v1)
    233 {
    234 #ifdef SIMD_SSE4_1
    235   return _mm_max_epi32(v0, v1);
    236 #else
    237   ALIGN(16) int32_t a[4];
    238   ALIGN(16) int32_t b[4];
    239   v4i_store(a, v0);
    240   v4i_store(b, v1);
    241   return v4i_set
    242     (MMAX(a[0], b[0]),
    243      MMAX(a[1], b[1]),
    244      MMAX(a[2], b[2]),
    245      MMAX(a[3], b[3]));
    246 #endif
    247 }
    248 
/* Returns a vector whose 4 lanes all hold the minimum of the 4 lanes of v */
static FINLINE v4i_T
v4i_reduce_min(const v4i_T v)
{
#ifdef SIMD_SSE4_1
  /* Two rounds of pairwise min against lane swaps leave the overall minimum
   * in every lane */
  const v4i_T tmp = v4i_min(v4i_yxwz(v), v);
  return v4i_min(v4i_zwxy(tmp), tmp);
#else
  /* Scalar fallback: spill to memory and reduce lane by lane */
  ALIGN(16) int32_t a[4];
  v4i_store(a, v);
  return v4i_set1(MMIN(MMIN(a[0], a[1]), MMIN(a[2], a[3])));
#endif
}
    261 
/* Returns a vector whose 4 lanes all hold the maximum of the 4 lanes of v */
static FINLINE v4i_T
v4i_reduce_max(const v4i_T v)
{
#ifdef SIMD_SSE4_1
  /* Two rounds of pairwise max against lane swaps leave the overall maximum
   * in every lane */
  const v4i_T tmp = v4i_max(v4i_yxwz(v), v);
  return v4i_max(v4i_zwxy(tmp), tmp);
#else
  /* Scalar fallback: spill to memory and reduce lane by lane */
  ALIGN(16) int32_t a[4];
  v4i_store(a, v);
  return v4i_set1(MMAX(MMAX(a[0], a[1]), MMAX(a[2], a[3])));
#endif
}
    274 
    275 static FINLINE int32_t
    276 v4i_reduce_min_i32(const v4i_T v)
    277 {
    278   return v4i_x(v4i_reduce_min(v));
    279 }
    280 
    281 static FINLINE int32_t
    282 v4i_reduce_max_i32(const v4i_T v)
    283 {
    284   return v4i_x(v4i_reduce_max(v));
    285 }
    286 
    287 #endif /* RSIMD_SSEI_H */
    288