src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx.inl

   1 /****************************************************************************
   2  * Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  ****************************************************************************/
  23 #if !defined(__SIMD_LIB_AVX_HPP__)
  24 #error Do not include this file directly, use "simdlib.hpp" instead.
  25 #endif
  26
  27 //============================================================================
  28 // SIMD128 AVX (1) implementation
  29 //============================================================================
  30
  31 #define SIMD_WRAPPER_1(op) \
  32     static SIMDINLINE Float SIMDCALL op(Float a) { return _mm_##op(a); }
  33
  34 #define SIMD_WRAPPER_2(op) \
  35     static SIMDINLINE Float SIMDCALL op(Float a, Float b) { return _mm_##op(a, b); }
  36
  37 #define SIMD_DWRAPPER_2(op) \
  38     static SIMDINLINE Double SIMDCALL op(Double a, Double b) { return _mm_##op(a, b); }
  39
  40 #define SIMD_WRAPPER_2I(op)                               \
  41     template <int ImmT>                                   \
  42     static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
  43     {                                                     \
  44         return _mm_##op(a, b, ImmT);                      \
  45     }
  46
  47 #define SIMD_DWRAPPER_2I(op)                                 \
  48     template <int ImmT>                                      \
  49     static SIMDINLINE Double SIMDCALL op(Double a, Double b) \
  50     {                                                        \
  51         return _mm_##op(a, b, ImmT);                         \
  52     }
  53
  54 #define SIMD_WRAPPER_3(op) \
  55     static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c) { return _mm_##op(a, b, c); }
  56
  57 #define SIMD_IWRAPPER_1(op) \
  58     static SIMDINLINE Integer SIMDCALL op(Integer a) { return _mm_##op(a); }
  59
  60 #define SIMD_IWRAPPER_1I_(op, intrin)                \
  61     template <int ImmT>                              \
  62     static SIMDINLINE Integer SIMDCALL op(Integer a) \
  63     {                                                \
  64         return intrin(a, ImmT);                      \
  65     }
  66 #define SIMD_IWRAPPER_1I(op) SIMD_IWRAPPER_1I_(op, _mm_##op)
  67
  68 #define SIMD_IWRAPPER_2_(op, intrin) \
  69     static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) { return intrin(a, b); }
  70
  71 #define SIMD_IWRAPPER_2(op) \
  72     static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) { return _mm_##op(a, b); }
  73
  74 #define SIMD_IFWRAPPER_2(op, intrin)                            \
  75     static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
  76     {                                                           \
  77         return castps_si(intrin(castsi_ps(a), castsi_ps(b)));   \
  78     }
  79
  80 #define SIMD_IWRAPPER_2I(op)                                    \
  81     template <int ImmT>                                         \
  82     static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
  83     {                                                           \
  84         return _mm_##op(a, b, ImmT);                            \
  85     }
  86
  87 //-----------------------------------------------------------------------
  88 // Single precision floating point arithmetic operations
  89 //-----------------------------------------------------------------------
  90 SIMD_WRAPPER_2(add_ps);   // return a + b
  91 SIMD_WRAPPER_2(div_ps);   // return a / b
  92 SIMD_WRAPPER_2(max_ps);   // return (a > b) ? a : b
  93 SIMD_WRAPPER_2(min_ps);   // return (a < b) ? a : b
  94 SIMD_WRAPPER_2(mul_ps);   // return a * b
  95 SIMD_WRAPPER_1(rcp_ps);   // return 1.0f / a
  96 SIMD_WRAPPER_1(rsqrt_ps); // return 1.0f / sqrt(a)
  97 SIMD_WRAPPER_2(sub_ps);   // return a - b
  98
  99 static SIMDINLINE Float SIMDCALL fmadd_ps(Float a, Float b, Float c) // return (a * b) + c
 100 {
 101     return add_ps(mul_ps(a, b), c);
 102 }
 103 static SIMDINLINE Float SIMDCALL fmsub_ps(Float a, Float b, Float c) // return (a * b) - c
 104 {
 105     return sub_ps(mul_ps(a, b), c);
 106 }
 107
 108 template <RoundMode RMT>
 109 static SIMDINLINE Float SIMDCALL round_ps(Float a)
 110 {
 111     return _mm_round_ps(a, static_cast<int>(RMT));
 112 }
 113
 114 static SIMDINLINE Float SIMDCALL ceil_ps(Float a)
 115 {
 116     return round_ps<RoundMode::CEIL_NOEXC>(a);
 117 }
 118 static SIMDINLINE Float SIMDCALL floor_ps(Float a)
 119 {
 120     return round_ps<RoundMode::FLOOR_NOEXC>(a);
 121 }
 122
 123 //-----------------------------------------------------------------------
 124 // Integer (various width) arithmetic operations
 125 //-----------------------------------------------------------------------
 126 SIMD_IWRAPPER_1(abs_epi32); // return absolute_value(a) (int32)
 127 SIMD_IWRAPPER_2(add_epi32); // return a + b (int32)
 128 SIMD_IWRAPPER_2(add_epi8);  // return a + b (int8)
 129 SIMD_IWRAPPER_2(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
 130 SIMD_IWRAPPER_2(max_epi32); // return (a > b) ? a : b (int32)
 131 SIMD_IWRAPPER_2(max_epu32); // return (a > b) ? a : b (uint32)
 132 SIMD_IWRAPPER_2(min_epi32); // return (a < b) ? a : b (int32)
 133 SIMD_IWRAPPER_2(min_epu32); // return (a < b) ? a : b (uint32)
 134 SIMD_IWRAPPER_2(mul_epi32); // return a * b (int32)
 135
 136 // return (a * b) & 0xFFFFFFFF
 137 //
 138 // Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers,
 139 // and store the low 32 bits of the intermediate integers in dst.
 140 SIMD_IWRAPPER_2(mullo_epi32);
 141 SIMD_IWRAPPER_2(sub_epi32); // return a - b (int32)
 142 SIMD_IWRAPPER_2(sub_epi64); // return a - b (int64)
 143 SIMD_IWRAPPER_2(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8)
 144
 145 //-----------------------------------------------------------------------
 146 // Logical operations
 147 //-----------------------------------------------------------------------
 148 SIMD_WRAPPER_2(and_ps);                        // return a & b       (float treated as int)
 149 SIMD_IWRAPPER_2_(and_si, _mm_and_si128);       // return a & b       (int)
 150 SIMD_WRAPPER_2(andnot_ps);                     // return (~a) & b    (float treated as int)
 151 SIMD_IWRAPPER_2_(andnot_si, _mm_andnot_si128); // return (~a) & b    (int)
 152 SIMD_WRAPPER_2(or_ps);                         // return a | b       (float treated as int)
 153 SIMD_IWRAPPER_2_(or_si, _mm_or_si128);         // return a | b       (int)
 154 SIMD_WRAPPER_2(xor_ps);                        // return a ^ b       (float treated as int)
 155 SIMD_IWRAPPER_2_(xor_si, _mm_xor_si128);       // return a ^ b       (int)
 156
 157 //-----------------------------------------------------------------------
 158 // Shift operations
 159 //-----------------------------------------------------------------------
 160 SIMD_IWRAPPER_1I(slli_epi32); // return a << ImmT
 161 SIMD_IWRAPPER_1I(slli_epi64); // return a << ImmT
 162
 163 static SIMDINLINE Integer SIMDCALL sllv_epi32(Integer vA, Integer vB) // return a << b      (uint32)
 164 {
 165     int32_t a, count;
 166     a     = _mm_extract_epi32(vA, 0);
 167     count = _mm_extract_epi32(vB, 0);
 168     a <<= count;
 169     vA = _mm_insert_epi32(vA, a, 0);
 170
 171     a     = _mm_extract_epi32(vA, 1);
 172     count = _mm_extract_epi32(vB, 1);
 173     a <<= count;
 174     vA = _mm_insert_epi32(vA, a, 1);
 175
 176     a     = _mm_extract_epi32(vA, 2);
 177     count = _mm_extract_epi32(vB, 2);
 178     a <<= count;
 179     vA = _mm_insert_epi32(vA, a, 2);
 180
 181     a     = _mm_extract_epi32(vA, 3);
 182     count = _mm_extract_epi32(vB, 3);
 183     a <<= count;
 184     vA = _mm_insert_epi32(vA, a, 3);
 185
 186     return vA;
 187 }
 188
 189 SIMD_IWRAPPER_1I(srai_epi32);               // return a >> ImmT   (int32)
 190 SIMD_IWRAPPER_1I(srli_epi32);               // return a >> ImmT   (uint32)
 191 SIMD_IWRAPPER_1I_(srli_si, _mm_srli_si128); // return a >> (ImmT*8) (uint)
 192
 193 static SIMDINLINE Integer SIMDCALL srl_epi64(Integer a, Integer n)
 194 {
 195     return _mm_srl_epi64(a, n);
 196 }
 197
 198 template <int ImmT> // same as srli_si, but with Float cast to int
 199 static SIMDINLINE Float SIMDCALL srlisi_ps(Float a)
 200 {
 201     return castsi_ps(srli_si<ImmT>(castps_si(a)));
 202 }
 203
 204 static SIMDINLINE Integer SIMDCALL srlv_epi32(Integer vA, Integer vB) // return a >> b      (uint32)
 205 {
 206     int32_t a, count;
 207     a     = _mm_extract_epi32(vA, 0);
 208     count = _mm_extract_epi32(vB, 0);
 209     a >>= count;
 210     vA = _mm_insert_epi32(vA, a, 0);
 211
 212     a     = _mm_extract_epi32(vA, 1);
 213     count = _mm_extract_epi32(vB, 1);
 214     a >>= count;
 215     vA = _mm_insert_epi32(vA, a, 1);
 216
 217     a     = _mm_extract_epi32(vA, 2);
 218     count = _mm_extract_epi32(vB, 2);
 219     a >>= count;
 220     vA = _mm_insert_epi32(vA, a, 2);
 221
 222     a     = _mm_extract_epi32(vA, 3);
 223     count = _mm_extract_epi32(vB, 3);
 224     a >>= count;
 225     vA = _mm_insert_epi32(vA, a, 3);
 226
 227     return vA;
 228 }
 229
 230 //-----------------------------------------------------------------------
 231 // Conversion operations
 232 //-----------------------------------------------------------------------
 233 static SIMDINLINE Float SIMDCALL castpd_ps(Double a) // return *(Float*)(&a)
 234 {
 235     return _mm_castpd_ps(a);
 236 }
 237
 238 static SIMDINLINE Integer SIMDCALL castps_si(Float a) // return *(Integer*)(&a)
 239 {
 240     return _mm_castps_si128(a);
 241 }
 242
 243 static SIMDINLINE Double SIMDCALL castsi_pd(Integer a) // return *(Double*)(&a)
 244 {
 245     return _mm_castsi128_pd(a);
 246 }
 247
 248 static SIMDINLINE Double SIMDCALL castps_pd(Float a) // return *(Double*)(&a)
 249 {
 250     return _mm_castps_pd(a);
 251 }
 252
 253 static SIMDINLINE Float SIMDCALL castsi_ps(Integer a) // return *(Float*)(&a)
 254 {
 255     return _mm_castsi128_ps(a);
 256 }
 257
 258 static SIMDINLINE Float SIMDCALL cvtepi32_ps(Integer a) // return (float)a    (int32 --> float)
 259 {
 260     return _mm_cvtepi32_ps(a);
 261 }
 262
 263 static SIMDINLINE int32_t SIMDCALL cvtsi128_si32(Integer a) // return a.v[0]
 264 {
 265     return _mm_cvtsi128_si32(a);
 266 }
 267
 268 static SIMDINLINE Integer SIMDCALL cvtsi32_si128(int32_t n) // return a[0] = n, a[1]...a[3] = 0
 269 {
 270     return _mm_cvtsi32_si128(n);
 271 }
 272
 273 SIMD_IWRAPPER_1(cvtepu8_epi16);  // return (int16)a    (uint8 --> int16)
 274 SIMD_IWRAPPER_1(cvtepu8_epi32);  // return (int32)a    (uint8 --> int32)
 275 SIMD_IWRAPPER_1(cvtepu16_epi32); // return (int32)a    (uint16 --> int32)
 276 SIMD_IWRAPPER_1(cvtepu16_epi64); // return (int64)a    (uint16 --> int64)
 277 SIMD_IWRAPPER_1(cvtepu32_epi64); // return (int64)a    (uint32 --> int64)
 278
 279 static SIMDINLINE Integer SIMDCALL cvtps_epi32(Float a) // return (int32)a    (float --> int32)
 280 {
 281     return _mm_cvtps_epi32(a);
 282 }
 283
 284 static SIMDINLINE Integer SIMDCALL
 285                           cvttps_epi32(Float a) // return (int32)a    (rnd_to_zero(float) --> int32)
 286 {
 287     return _mm_cvttps_epi32(a);
 288 }
 289
 290 //-----------------------------------------------------------------------
 291 // Comparison operations
 292 //-----------------------------------------------------------------------
 293 template <CompareType CmpTypeT>
 294 static SIMDINLINE Float SIMDCALL cmp_ps(Float a, Float b) // return a (CmpTypeT) b
 295 {
 296     return _mm_cmp_ps(a, b, static_cast<const int>(CmpTypeT));
 297 }
 298 static SIMDINLINE Float SIMDCALL cmplt_ps(Float a, Float b)
 299 {
 300     return cmp_ps<CompareType::LT_OQ>(a, b);
 301 }
 302 static SIMDINLINE Float SIMDCALL cmpgt_ps(Float a, Float b)
 303 {
 304     return cmp_ps<CompareType::GT_OQ>(a, b);
 305 }
 306 static SIMDINLINE Float SIMDCALL cmpneq_ps(Float a, Float b)
 307 {
 308     return cmp_ps<CompareType::NEQ_OQ>(a, b);
 309 }
 310 static SIMDINLINE Float SIMDCALL cmpeq_ps(Float a, Float b)
 311 {
 312     return cmp_ps<CompareType::EQ_OQ>(a, b);
 313 }
 314 static SIMDINLINE Float SIMDCALL cmpge_ps(Float a, Float b)
 315 {
 316     return cmp_ps<CompareType::GE_OQ>(a, b);
 317 }
 318 static SIMDINLINE Float SIMDCALL cmple_ps(Float a, Float b)
 319 {
 320     return cmp_ps<CompareType::LE_OQ>(a, b);
 321 }
 322
 323 SIMD_IWRAPPER_2(cmpeq_epi8);  // return a == b (int8)
 324 SIMD_IWRAPPER_2(cmpeq_epi16); // return a == b (int16)
 325 SIMD_IWRAPPER_2(cmpeq_epi32); // return a == b (int32)
 326 SIMD_IWRAPPER_2(cmpeq_epi64); // return a == b (int64)
 327 SIMD_IWRAPPER_2(cmpgt_epi8);  // return a > b (int8)
 328 SIMD_IWRAPPER_2(cmpgt_epi16); // return a > b (int16)
 329 SIMD_IWRAPPER_2(cmpgt_epi32); // return a > b (int32)
 330 SIMD_IWRAPPER_2(cmpgt_epi64); // return a > b (int64)
 331 SIMD_IWRAPPER_2(cmplt_epi32); // return a < b (int32)
 332
 333 static SIMDINLINE bool SIMDCALL testz_ps(Float a,
 334                                          Float b) // return all_lanes_zero(a & b) ? 1 : 0 (float)
 335 {
 336     return 0 != _mm_testz_ps(a, b);
 337 }
 338
 339 static SIMDINLINE bool SIMDCALL testz_si(Integer a,
 340                                          Integer b) // return all_lanes_zero(a & b) ? 1 : 0 (int)
 341 {
 342     return 0 != _mm_testz_si128(a, b);
 343 }
 344
 345 //-----------------------------------------------------------------------
 346 // Blend / shuffle / permute operations
 347 //-----------------------------------------------------------------------
 348 SIMD_WRAPPER_2I(blend_ps); // return ImmT ? b : a  (float)
 349 SIMD_WRAPPER_3(blendv_ps); // return mask ? b : a  (float)
 350
 351 static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer a,
 352                                                 Integer b,
 353                                                 Float   mask) // return mask ? b : a (int)
 354 {
 355     return castps_si(blendv_ps(castsi_ps(a), castsi_ps(b), mask));
 356 }
 357
 358 static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer a,
 359                                                 Integer b,
 360                                                 Integer mask) // return mask ? b : a (int)
 361 {
 362     return castps_si(blendv_ps(castsi_ps(a), castsi_ps(b), castsi_ps(mask)));
 363 }
 364
 365 static SIMDINLINE Float SIMDCALL
 366                         broadcast_ss(float const* p) // return *p (all elements in vector get same value)
 367 {
 368     return _mm_broadcast_ss(p);
 369 }
 370
 371 SIMD_IWRAPPER_2(packs_epi16);  // See documentation for _mm_packs_epi16 and _mm512_packs_epi16
 372 SIMD_IWRAPPER_2(packs_epi32);  // See documentation for _mm_packs_epi32 and _mm512_packs_epi32
 373 SIMD_IWRAPPER_2(packus_epi16); // See documentation for _mm_packus_epi16 and _mm512_packus_epi16
 374 SIMD_IWRAPPER_2(packus_epi32); // See documentation for _mm_packus_epi32 and _mm512_packus_epi32
 375
 376 static SIMDINLINE Integer SIMDCALL
 377                           permute_epi32(Integer a, Integer swiz) // return a[swiz[i]] for each 32-bit lane i (float)
 378 {
 379     return castps_si(_mm_permutevar_ps(castsi_ps(a), swiz));
 380 }
 381
 382 static SIMDINLINE Float SIMDCALL
 383                         permute_ps(Float a, Integer swiz) // return a[swiz[i]] for each 32-bit lane i (float)
 384 {
 385     return _mm_permutevar_ps(a, swiz);
 386 }
 387
 388 SIMD_IWRAPPER_1I(shuffle_epi32);
 389
 390 template <int ImmT>
 391 static SIMDINLINE Integer SIMDCALL shuffle_epi64(Integer a, Integer b) = delete;
 392
 393 SIMD_IWRAPPER_2(shuffle_epi8);
 394 SIMD_DWRAPPER_2I(shuffle_pd);
 395 SIMD_WRAPPER_2I(shuffle_ps);
 396 SIMD_IWRAPPER_2(unpackhi_epi16);
 397
 398 // SIMD_IFWRAPPER_2(unpackhi_epi32, _mm_unpackhi_ps);
 399 static SIMDINLINE Integer SIMDCALL unpackhi_epi32(Integer a, Integer b)
 400 {
 401     return castps_si(_mm_unpackhi_ps(castsi_ps(a), castsi_ps(b)));
 402 }
 403
 404 SIMD_IWRAPPER_2(unpackhi_epi64);
 405 SIMD_IWRAPPER_2(unpackhi_epi8);
 406 SIMD_DWRAPPER_2(unpackhi_pd);
 407 SIMD_WRAPPER_2(unpackhi_ps);
 408 SIMD_IWRAPPER_2(unpacklo_epi16);
 409 SIMD_IFWRAPPER_2(unpacklo_epi32, _mm_unpacklo_ps);
 410 SIMD_IWRAPPER_2(unpacklo_epi64);
 411 SIMD_IWRAPPER_2(unpacklo_epi8);
 412 SIMD_DWRAPPER_2(unpacklo_pd);
 413 SIMD_WRAPPER_2(unpacklo_ps);
 414
 415 //-----------------------------------------------------------------------
 416 // Load / store operations
 417 //-----------------------------------------------------------------------
 418 template <ScaleFactor ScaleT>
 419 static SIMDINLINE Float SIMDCALL
 420                         i32gather_ps(float const* p, Integer idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
 421 {
 422     uint32_t* pOffsets = (uint32_t*)&idx;
 423     Float     vResult;
 424     float*    pResult = (float*)&vResult;
 425     for (uint32_t i = 0; i < SIMD_WIDTH; ++i)
 426     {
 427         uint32_t offset = pOffsets[i];
 428         offset          = offset * static_cast<uint32_t>(ScaleT);
 429         pResult[i]      = *(float const*)(((uint8_t const*)p + offset));
 430     }
 431
 432     return vResult;
 433 }
 434
 435 static SIMDINLINE Float SIMDCALL
 436                         load1_ps(float const* p) // return *p    (broadcast 1 value to all elements)
 437 {
 438     return broadcast_ss(p);
 439 }
 440
 441 static SIMDINLINE Float SIMDCALL
 442                         load_ps(float const* p) // return *p    (loads SIMD width elements from memory)
 443 {
 444     return _mm_load_ps(p);
 445 }
 446
 447 static SIMDINLINE Integer SIMDCALL load_si(Integer const* p) // return *p
 448 {
 449     return _mm_load_si128(&p->v);
 450 }
 451
 452 static SIMDINLINE Float SIMDCALL
 453                         loadu_ps(float const* p) // return *p    (same as load_ps but allows for unaligned mem)
 454 {
 455     return _mm_loadu_ps(p);
 456 }
 457
 458 static SIMDINLINE Integer SIMDCALL
 459                           loadu_si(Integer const* p) // return *p    (same as load_si but allows for unaligned mem)
 460 {
 461     return _mm_lddqu_si128(&p->v);
 462 }
 463
 464 // for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
 465 template <ScaleFactor ScaleT>
 466 static SIMDINLINE Float SIMDCALL
 467                         mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask)
 468 {
 469     uint32_t* pOffsets = (uint32_t*)&idx;
 470     Float     vResult  = old;
 471     float*    pResult  = (float*)&vResult;
 472     DWORD     index;
 473     uint32_t  umask = movemask_ps(mask);
 474     while (_BitScanForward(&index, umask))
 475     {
 476         umask &= ~(1 << index);
 477         uint32_t offset = pOffsets[index];
 478         offset          = offset * static_cast<uint32_t>(ScaleT);
 479         pResult[index]  = *(float const*)(((uint8_t const*)p + offset));
 480     }
 481
 482     return vResult;
 483 }
 484
 485 static SIMDINLINE void SIMDCALL maskstore_ps(float* p, Integer mask, Float src)
 486 {
 487     _mm_maskstore_ps(p, mask, src);
 488 }
 489
 490 static SIMDINLINE uint32_t SIMDCALL movemask_epi8(Integer a)
 491 {
 492     return static_cast<uint32_t>(_mm_movemask_epi8(a));
 493 }
 494
 495 static SIMDINLINE uint32_t SIMDCALL movemask_pd(Double a)
 496 {
 497     return static_cast<uint32_t>(_mm_movemask_pd(a));
 498 }
 499 static SIMDINLINE uint32_t SIMDCALL movemask_ps(Float a)
 500 {
 501     return static_cast<uint32_t>(_mm_movemask_ps(a));
 502 }
 503
 504 static SIMDINLINE Integer SIMDCALL set1_epi32(int i) // return i (all elements are same value)
 505 {
 506     return _mm_set1_epi32(i);
 507 }
 508
 509 static SIMDINLINE Integer SIMDCALL set1_epi8(char i) // return i (all elements are same value)
 510 {
 511     return _mm_set1_epi8(i);
 512 }
 513
 514 static SIMDINLINE Float SIMDCALL set1_ps(float f) // return f (all elements are same value)
 515 {
 516     return _mm_set1_ps(f);
 517 }
 518
 519 static SIMDINLINE Float SIMDCALL setzero_ps() // return 0 (float)
 520 {
 521     return _mm_setzero_ps();
 522 }
 523
 524 static SIMDINLINE Integer SIMDCALL setzero_si() // return 0 (integer)
 525 {
 526     return _mm_setzero_si128();
 527 }
 528
 529 static SIMDINLINE void SIMDCALL
 530                        store_ps(float* p, Float a) // *p = a   (stores all elements contiguously in memory)
 531 {
 532     _mm_store_ps(p, a);
 533 }
 534
 535 static SIMDINLINE void SIMDCALL store_si(Integer* p, Integer a) // *p = a
 536 {
 537     _mm_store_si128(&p->v, a);
 538 }
 539
 540 static SIMDINLINE void SIMDCALL
 541                        storeu_si(Integer* p, Integer a) // *p = a    (same as store_si but allows for unaligned mem)
 542 {
 543     _mm_storeu_si128(&p->v, a);
 544 }
 545
 546 static SIMDINLINE void SIMDCALL
 547                        stream_ps(float* p, Float a) // *p = a   (same as store_ps, but doesn't keep memory in cache)
 548 {
 549     _mm_stream_ps(p, a);
 550 }
 551
 552 static SIMDINLINE Float SIMDCALL set_ps(float in3, float in2, float in1, float in0)
 553 {
 554     return _mm_set_ps(in3, in2, in1, in0);
 555 }
 556
 557 static SIMDINLINE Integer SIMDCALL set_epi32(int in3, int in2, int in1, int in0)
 558 {
 559     return _mm_set_epi32(in3, in2, in1, in0);
 560 }
 561
 562 template <int ImmT>
 563 static SIMDINLINE float SIMDCALL extract_ps(Float a)
 564 {
 565     int tmp = _mm_extract_ps(a, ImmT);
 566     return *reinterpret_cast<float*>(&tmp);
 567 }
 568
 569 static SIMDINLINE Float SIMDCALL vmask_ps(int32_t mask)
 570 {
 571     Integer       vec = set1_epi32(mask);
 572     const Integer bit = set_epi32(0x08, 0x04, 0x02, 0x01);
 573     vec               = and_si(vec, bit);
 574     vec               = cmplt_epi32(setzero_si(), vec);
 575     return castsi_ps(vec);
 576 }
 577
 578 #undef SIMD_WRAPPER_1
 579 #undef SIMD_WRAPPER_2
 580 #undef SIMD_DWRAPPER_2
 581 #undef SIMD_DWRAPPER_2I
 582 #undef SIMD_WRAPPER_2I
 583 #undef SIMD_WRAPPER_3
 584 #undef SIMD_IWRAPPER_1
 585 #undef SIMD_IWRAPPER_2
 586 #undef SIMD_IFWRAPPER_2
 587 #undef SIMD_IWRAPPER_2I
 588 #undef SIMD_IWRAPPER_1
 589 #undef SIMD_IWRAPPER_1I
 590 #undef SIMD_IWRAPPER_1I_
 591 #undef SIMD_IWRAPPER_2
 592 #undef SIMD_IWRAPPER_2_
 593 #undef SIMD_IWRAPPER_2I