ssei.h (6776B)
/* Copyright (C) 2014-2019, 2021, 2023, 2025 Vincent Forest (vaplv@free.fr)
 *
 * The RSIMD library is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published
 * by the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * The RSIMD library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with the RSIMD library. If not, see <http://www.gnu.org/licenses/>. */

#ifndef RSIMD_SSEI_H
#define RSIMD_SSEI_H

/*
 * 4 packed signed integers
 *
 * SSE2 wrappers around a __m128i treated as 4 lanes of int32_t. Comparison
 * functions return per-lane masks (all-ones for true, all-zeros for false),
 * suitable as the `vcond` argument of v4i_sel. Some functions use a faster
 * SSE4.1 code path when SIMD_SSE4_1 is defined.
 */

#include "sse_swz.h"

#include <rsys/math.h>
#include <xmmintrin.h>
#include <emmintrin.h>
#ifdef SIMD_SSE4_1
#include <smmintrin.h>
#endif

typedef __m128i v4i_T;
/* Extract lane `Id` (0..3) as a scalar. GCC/Clang-specific built-in. */
#define V4I_AT__(Vec, Id) __builtin_ia32_vec_ext_v4si((__v4si)Vec, Id)

/* Lane permutation primitive used by the generated swizzle functions below.
 * Op0..Op3 select which source lane feeds result lanes 0..3 respectively
 * (_MM_SHUFFLE takes its operands in reverse order, hence the flip). */
#define v4i_SWZ__(Vec, Op0, Op1, Op2, Op3) \
  _mm_shuffle_epi32(Vec, _MM_SHUFFLE(Op3, Op2, Op1, Op0))
GENERATE_V4_SWZ_FUNCS__(v4i) /* Swizzle operations */

/*******************************************************************************
 * Set operations
 ******************************************************************************/
/* Store the 4 lanes of `v` into `dst`, which must be 16-byte aligned.
 * Returns `dst` for call chaining. */
static FINLINE int32_t*
v4i_store(int32_t dst[4], v4i_T v)
{
  ASSERT(dst && IS_ALIGNED(dst, 16));
  _mm_store_si128((v4i_T*)dst, v);
  return dst;
}

/* Load 4 int32 from `src`, which must be 16-byte aligned. */
static FINLINE v4i_T
v4i_load(const int32_t src[4])
{
  ASSERT(src && IS_ALIGNED(src, 16));
  return _mm_load_si128((const v4i_T*)src);
}

/* Broadcast the scalar `i` into all 4 lanes. */
static FINLINE v4i_T
v4i_set1(const int32_t i)
{
  return _mm_set1_epi32(i);
}

/* Build {x, y, z, w} with x in lane 0. Note _mm_set_epi32 takes its
 * arguments from the highest lane down, hence the reversed order. */
static FINLINE v4i_T
v4i_set(const int32_t x, const int32_t y, const int32_t z, const int32_t w)
{
  return _mm_set_epi32(w, z, y, x);
}

/* All 4 lanes set to zero. */
static FINLINE v4i_T
v4i_zero(void)
{
  return _mm_setzero_si128();
}

/*******************************************************************************
 * Extract int32 from SIMD packed representation
 ******************************************************************************/
static FINLINE int32_t v4i_x(const v4i_T v) { return V4I_AT__(v, 0); }
static FINLINE int32_t v4i_y(const v4i_T v) { return V4I_AT__(v, 1); }
static FINLINE int32_t v4i_z(const v4i_T v) { return V4I_AT__(v, 2); }
static FINLINE int32_t v4i_w(const v4i_T v) { return V4I_AT__(v, 3); }

/*******************************************************************************
 * Merge operations
 ******************************************************************************/
/* Interleave the low halves of the 2 vectors: {x, a, y, b}. */
static FINLINE v4i_T
v4i_xayb(const v4i_T xyzw, const v4i_T abcd)
{
  return _mm_unpacklo_epi32(xyzw, abcd);
}

/* Interleave the high halves of the 2 vectors: {z, c, w, d}. */
static FINLINE v4i_T
v4i_zcwd(const v4i_T xyzw, const v4i_T abcd)
{
  return _mm_unpackhi_epi32(xyzw, abcd);
}

/*******************************************************************************
 * Bitwise operators
 ******************************************************************************/
static FINLINE v4i_T
v4i_or(const v4i_T v0, const v4i_T v1)
{
  return _mm_or_si128(v0, v1);
}

static FINLINE v4i_T
v4i_and(const v4i_T v0, const v4i_T v1)
{
  return _mm_and_si128(v0, v1);
}

/* Returns (~v0) & v1 -- the operand order follows the intrinsic, i.e. the
 * FIRST argument is the one that is complemented. */
static FINLINE v4i_T
v4i_andnot(const v4i_T v0, const v4i_T v1)
{
  return _mm_andnot_si128(v0, v1);
}

static FINLINE v4i_T
v4i_xor(const v4i_T v0, const v4i_T v1)
{
  return _mm_xor_si128(v0, v1);
}

/* Bitwise complement: v XOR all-ones. */
static FINLINE v4i_T
v4i_not(const v4i_T v)
{
  return _mm_xor_si128(v, _mm_set1_epi32(-1));
}

/* NOTE(review): _mm_srli_epi32 is a LOGICAL (zero-filling) right shift even
 * though the lanes are signed; negative lanes do not keep their sign bit.
 * Presumably intentional for this library -- confirm against callers. */
static FINLINE v4i_T
v4i_rshift(const v4i_T v, const int32_t rshift)
{
  return _mm_srli_epi32(v, rshift);
}

/* Per-lane left shift by `lshift` bits. */
static FINLINE v4i_T
v4i_lshift(const v4i_T v, const int32_t lshift)
{
  return _mm_slli_epi32(v, lshift);
}

/*******************************************************************************
 * Arithmetic operators
 ******************************************************************************/
static FINLINE v4i_T
v4i_add(const v4i_T v0, const v4i_T v1)
{
  return _mm_add_epi32(v0, v1);
}

static FINLINE v4i_T
v4i_sub(const v4i_T v0, const v4i_T v1)
{
  return _mm_sub_epi32(v0, v1);
}

/* Per-lane negation via two's complement: -v == ~v + 1. */
static FINLINE v4i_T
v4i_minus(const v4i_T v)
{
  return v4i_add(v4i_not(v), v4i_set1(1));
}

/*******************************************************************************
 * Comparators
 *
 * Each comparator returns a per-lane mask: all bits set where the predicate
 * holds, all bits clear elsewhere.
 ******************************************************************************/
static FINLINE v4i_T
v4i_eq(const v4i_T v0, const v4i_T v1)
{
  return _mm_cmpeq_epi32(v0, v1);
}

/* != is the complement of ==, obtained by XOR-ing the mask with all-ones. */
static FINLINE v4i_T
v4i_neq(const v4i_T v0, const v4i_T v1)
{
  return v4i_xor(v4i_eq(v0, v1), v4i_set1(-1));
}

static FINLINE v4i_T
v4i_gt(const v4i_T v0, const v4i_T v1)
{
  return _mm_cmpgt_epi32(v0, v1);
}

static FINLINE v4i_T
v4i_lt(const v4i_T v0, const v4i_T v1)
{
  return _mm_cmplt_epi32(v0, v1);
}

/* >= is NOT(<). */
static FINLINE v4i_T
v4i_ge(const v4i_T v0, const v4i_T v1)
{
  return v4i_xor(v4i_lt(v0, v1), v4i_set1(-1));
}

/* <= is NOT(>). */
static FINLINE v4i_T
v4i_le(const v4i_T v0, const v4i_T v1)
{
  return v4i_xor(v4i_gt(v0, v1), v4i_set1(-1));
}

/* Per-lane select: lanes where `vcond` is all-ones take `vtrue`, lanes where
 * it is all-zeros take `vfalse`. `vcond` is expected to be a comparator mask;
 * the SSE2 fallback is a branchless bit-select
 * (vfalse XOR (vcond AND (vfalse XOR vtrue))). */
static FINLINE v4i_T
v4i_sel(const v4i_T vfalse, const v4i_T vtrue, const v4i_T vcond)
{
#ifdef SIMD_SSE4_1
  return _mm_blendv_epi8(vfalse, vtrue, vcond);
#else
  return v4i_xor(vfalse, v4i_and(vcond, v4i_xor(vfalse, vtrue)));
#endif
}

/* Per-lane minimum. The SSE2 fallback spills both vectors to aligned scalar
 * arrays and rebuilds the result lane by lane. */
static FINLINE v4i_T
v4i_min(const v4i_T v0, const v4i_T v1)
{
#ifdef SIMD_SSE4_1
  return _mm_min_epi32(v0, v1);
#else
  ALIGN(16) int32_t a[4];
  ALIGN(16) int32_t b[4];
  v4i_store(a, v0);
  v4i_store(b, v1);
  return v4i_set
    (MMIN(a[0], b[0]),
     MMIN(a[1], b[1]),
     MMIN(a[2], b[2]),
     MMIN(a[3], b[3]));
#endif
}

/* Per-lane maximum; same structure as v4i_min. */
static FINLINE v4i_T
v4i_max(const v4i_T v0, const v4i_T v1)
{
#ifdef SIMD_SSE4_1
  return _mm_max_epi32(v0, v1);
#else
  ALIGN(16) int32_t a[4];
  ALIGN(16) int32_t b[4];
  v4i_store(a, v0);
  v4i_store(b, v1);
  return v4i_set
    (MMAX(a[0], b[0]),
     MMAX(a[1], b[1]),
     MMAX(a[2], b[2]),
     MMAX(a[3], b[3]));
#endif
}

/* Horizontal minimum: every lane of the result holds min(x, y, z, w).
 * The SSE4.1 path reduces pairwise via swizzles; the fallback goes through
 * scalars and re-broadcasts. */
static FINLINE v4i_T
v4i_reduce_min(const v4i_T v)
{
#ifdef SIMD_SSE4_1
  const v4i_T tmp = v4i_min(v4i_yxwz(v), v);
  return v4i_min(v4i_zwxy(tmp), tmp);
#else
  ALIGN(16) int32_t a[4];
  v4i_store(a, v);
  return v4i_set1(MMIN(MMIN(a[0], a[1]), MMIN(a[2], a[3])));
#endif
}

/* Horizontal maximum: every lane of the result holds max(x, y, z, w). */
static FINLINE v4i_T
v4i_reduce_max(const v4i_T v)
{
#ifdef SIMD_SSE4_1
  const v4i_T tmp = v4i_max(v4i_yxwz(v), v);
  return v4i_max(v4i_zwxy(tmp), tmp);
#else
  ALIGN(16) int32_t a[4];
  v4i_store(a, v);
  return v4i_set1(MMAX(MMAX(a[0], a[1]), MMAX(a[2], a[3])));
#endif
}

/* Horizontal minimum returned as a scalar int32. */
static FINLINE int32_t
v4i_reduce_min_i32(const v4i_T v)
{
  return v4i_x(v4i_reduce_min(v));
}

/* Horizontal maximum returned as a scalar int32. */
static FINLINE int32_t
v4i_reduce_max_i32(const v4i_T v)
{
  return v4i_x(v4i_reduce_max(v));
}

#endif /* RSIMD_SSEI_H */