src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx.inl

   1 /****************************************************************************
   2  * Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  ****************************************************************************/
  23 #if !defined(__SIMD_LIB_AVX_HPP__)
  24 #error Do not include this file directly, use "simdlib.hpp" instead.
  25 #endif
  26
// 128-bit AVX implementation that the SIMD_EMU_* wrappers below call once per
// half (v4[0] / v4[1]) to emulate 256-bit integer operations.
using SIMD128T = SIMD128Impl::AVXImpl;
  28
  29 //============================================================================
  30 // SIMD256 AVX (1) implementation
  31 //============================================================================
  32
// Generates a unary Float wrapper `op(a)` that forwards to the `_mm256_<op>` intrinsic.
#define SIMD_WRAPPER_1(op) \
    static SIMDINLINE Float SIMDCALL op(Float const& a) { return _mm256_##op(a); }

// Generates a binary Float wrapper `op(a, b)` that forwards to the `_mm256_<op>` intrinsic.
#define SIMD_WRAPPER_2(op)                                              \
    static SIMDINLINE Float SIMDCALL op(Float const& a, Float const& b) \
    {                                                                   \
        return _mm256_##op(a, b);                                       \
    }

// Generates a binary Double wrapper `op(a, b)` that forwards to the `_mm256_<op>` intrinsic.
#define SIMD_DWRAPPER_2(op)                                                \
    static SIMDINLINE Double SIMDCALL op(Double const& a, Double const& b) \
    {                                                                      \
        return _mm256_##op(a, b);                                          \
    }
  47
// Generates a binary Float wrapper `op<ImmT>(a, b)`; the template argument is passed
// to `_mm256_<op>` as its trailing immediate operand.
#define SIMD_WRAPPER_2I(op)                                             \
    template <int ImmT>                                                 \
    static SIMDINLINE Float SIMDCALL op(Float const& a, Float const& b) \
    {                                                                   \
        return _mm256_##op(a, b, ImmT);                                 \
    }

// Double-precision counterpart of SIMD_WRAPPER_2I: `op<ImmT>(a, b)` forwards to
// `_mm256_<op>(a, b, ImmT)`.
#define SIMD_DWRAPPER_2I(op)                                               \
    template <int ImmT>                                                    \
    static SIMDINLINE Double SIMDCALL op(Double const& a, Double const& b) \
    {                                                                      \
        return _mm256_##op(a, b, ImmT);                                    \
    }
  61
// Generates a ternary Float wrapper `op(a, b, c)` that forwards to the `_mm256_<op>` intrinsic.
#define SIMD_WRAPPER_3(op)                                                              \
    static SIMDINLINE Float SIMDCALL op(Float const& a, Float const& b, Float const& c) \
    {                                                                                   \
        return _mm256_##op(a, b, c);                                                    \
    }
  67
// Generates a unary Integer wrapper `op(a)` that forwards to the `_mm256_<op>` intrinsic.
// Not instantiated in the visible part of this file.
#define SIMD_IWRAPPER_1(op) \
    static SIMDINLINE Integer SIMDCALL op(Integer const& a) { return _mm256_##op(a); }

// Generates a binary Integer wrapper `op(a, b)` that forwards to the `_mm256_<op>` intrinsic.
// Not instantiated in the visible part of this file.
#define SIMD_IWRAPPER_2(op)                                                   \
    static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b) \
    {                                                                         \
        return _mm256_##op(a, b);                                             \
    }
  76
// Generates a binary Integer wrapper `op(a, b)` implemented with a float intrinsic:
// both operands are reinterpreted as Float (castsi_ps), passed to `intrin`, and the
// result is reinterpreted back to Integer (castps_si).
#define SIMD_IFWRAPPER_2(op, intrin)                                          \
    static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b) \
    {                                                                         \
        return castps_si(intrin(castsi_ps(a), castsi_ps(b)));                 \
    }

// Same as SIMD_IFWRAPPER_2, but `op` is a template on ImmT, which is passed to
// `intrin` as its trailing immediate operand.
#define SIMD_IFWRAPPER_2I(op, intrin)                                         \
    template <int ImmT>                                                       \
    static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b) \
    {                                                                         \
        return castps_si(intrin(castsi_ps(a), castsi_ps(b), ImmT));           \
    }
  89
// Generates a binary Integer wrapper `op<ImmT>(a, b)` that forwards to
// `_mm256_<intrin>(a, b, ImmT)`; use it when the wrapper name differs from the
// intrinsic suffix.
#define SIMD_IWRAPPER_2I_(op, intrin)                                         \
    template <int ImmT>                                                       \
    static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b) \
    {                                                                         \
        return _mm256_##intrin(a, b, ImmT);                                   \
    }
// Shorthand for the case where the wrapper name equals the intrinsic suffix.
#define SIMD_IWRAPPER_2I(op) SIMD_IWRAPPER_2I_(op, op)
  97
// Generates a ternary Integer wrapper `op(a, b, c)` that forwards to the `_mm256_<op>` intrinsic.
// Not instantiated in the visible part of this file.
#define SIMD_IWRAPPER_3(op)                                                                     \
    static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b, Integer const& c) \
    {                                                                                           \
        return _mm256_##op(a, b, c);                                                            \
    }
 103
// Emulated integer SIMD: each 256-bit operation is carried out as two calls into the
// 128-bit implementation (SIMD128T), one per half of the operand.

// Generates a unary Integer wrapper `op(a)` that applies SIMD128T::op to a.v4[0] and
// a.v4[1] independently and returns the two results as one Integer.
#define SIMD_EMU_IWRAPPER_1(op)                             \
    static SIMDINLINE Integer SIMDCALL op(Integer const& a) \
    {                                                       \
        return Integer{                                     \
            SIMD128T::op(a.v4[0]),                          \
            SIMD128T::op(a.v4[1]),                          \
        };                                                  \
    }
// Generates two overloads of a unary wrapper `op` whose input is a single 128-bit value:
//   - op(Integer const&)              reads only a.v4[0];
//   - op(SIMD128Impl::Integer const&) takes the 128-bit value directly.
// In both, the first result half is SIMD128T::op of the source as-is, and the second
// result half is SIMD128T::op of the source shifted right with SIMD128T::srli_si<shift>.
// This file instantiates it for the widening cvtepu* conversions.
// NOTE(review): `shift` is presumably a byte count, matching the srli_si comment in
// this file ("a >> (ImmT*8)") - confirm against SIMD128T::srli_si.
#define SIMD_EMU_IWRAPPER_1L(op, shift)                                  \
    static SIMDINLINE Integer SIMDCALL op(Integer const& a)              \
    {                                                                    \
        return Integer{                                                  \
            SIMD128T::op(a.v4[0]),                                       \
            SIMD128T::op(SIMD128T::template srli_si<shift>(a.v4[0])),    \
        };                                                               \
    }                                                                    \
    static SIMDINLINE Integer SIMDCALL op(SIMD128Impl::Integer const& a) \
    {                                                                    \
        return Integer{                                                  \
            SIMD128T::op(a),                                             \
            SIMD128T::op(SIMD128T::template srli_si<shift>(a)),          \
        };                                                               \
    }
 128
// Generates a unary Integer wrapper `op<ImmT>(a)` that applies SIMD128T::op<ImmT> to
// a.v4[0] and a.v4[1] independently and returns the two results as one Integer.
#define SIMD_EMU_IWRAPPER_1I(op)                            \
    template <int ImmT>                                     \
    static SIMDINLINE Integer SIMDCALL op(Integer const& a) \
    {                                                       \
        return Integer{                                     \
            SIMD128T::template op<ImmT>(a.v4[0]),           \
            SIMD128T::template op<ImmT>(a.v4[1]),           \
        };                                                  \
    }
 138
// Generates a binary Integer wrapper `op(a, b)` that applies SIMD128T::op to the
// matching halves of a and b (v4[0] with v4[0], v4[1] with v4[1]) and returns the
// two results as one Integer.
#define SIMD_EMU_IWRAPPER_2(op)                                               \
    static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b) \
    {                                                                         \
        return Integer{                                                       \
            SIMD128T::op(a.v4[0], b.v4[0]),                                   \
            SIMD128T::op(a.v4[1], b.v4[1]),                                   \
        };                                                                    \
    }
 147
 148 #define SIMD_EMU_IWRAPPER_2I(op)                                              \
 149     template <int ImmT>                                                       \
 150     static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b) \
 151     {                                                                         \
 152         return Integer{                                                       \
 153             SIMD128T::template op<ImmT>(a.v4[0], b.v[0]),                     \
 154             SIMD128T::template op<ImmT>(a.v4[1], b.v[1]),                     \
 155         };                                                                    \
 156     }
 157
 158 //-----------------------------------------------------------------------
 159 // Single precision floating point arithmetic operations
 160 //-----------------------------------------------------------------------
SIMD_WRAPPER_2(add_ps); // return a + b  (float, via _mm256_add_ps)
SIMD_WRAPPER_2(div_ps); // return a / b  (float, via _mm256_div_ps)
 163
 164 static SIMDINLINE Float SIMDCALL fmadd_ps(Float const& a,
 165                                           Float const& b,
 166                                           Float const& c) // return (a * b) + c
 167 {
 168     return add_ps(mul_ps(a, b), c);
 169 }
 170
 171 static SIMDINLINE Float SIMDCALL fmsub_ps(Float const& a,
 172                                           Float const& b,
 173                                           Float const& c) // return (a * b) - c
 174 {
 175     return sub_ps(mul_ps(a, b), c);
 176 }
 177
SIMD_WRAPPER_2(max_ps);   // return (a > b) ? a : b
SIMD_WRAPPER_2(min_ps);   // return (a < b) ? a : b
SIMD_WRAPPER_2(mul_ps);   // return a * b
SIMD_WRAPPER_1(rcp_ps);   // return 1.0f / a        (forwards to _mm256_rcp_ps, an approximation)
SIMD_WRAPPER_1(rsqrt_ps); // return 1.0f / sqrt(a)  (forwards to _mm256_rsqrt_ps, an approximation)
SIMD_WRAPPER_2(sub_ps);   // return a - b
 184
// Rounds every float lane of `a` with _mm256_round_ps. The rounding behaviour is
// selected at compile time by RMT, whose integer value is passed as the intrinsic's
// immediate operand.
template <RoundMode RMT>
static SIMDINLINE Float SIMDCALL round_ps(Float const& a)
{
    return _mm256_round_ps(a, static_cast<int>(RMT));
}
 190
// Convenience wrapper: round_ps with RoundMode::CEIL_NOEXC.
static SIMDINLINE Float SIMDCALL ceil_ps(Float const& a)
{
    return round_ps<RoundMode::CEIL_NOEXC>(a);
}
// Convenience wrapper: round_ps with RoundMode::FLOOR_NOEXC.
static SIMDINLINE Float SIMDCALL floor_ps(Float const& a)
{
    return round_ps<RoundMode::FLOOR_NOEXC>(a);
}
 199
 200 //-----------------------------------------------------------------------
 201 // Integer (various width) arithmetic operations
 202 //-----------------------------------------------------------------------
// All of the following are emulated with two SIMD128T calls, one per 128-bit half.
SIMD_EMU_IWRAPPER_1(abs_epi32); // return absolute_value(a) (int32)
SIMD_EMU_IWRAPPER_2(add_epi32); // return a + b (int32)
SIMD_EMU_IWRAPPER_2(add_epi8);  // return a + b (int8)
SIMD_EMU_IWRAPPER_2(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
SIMD_EMU_IWRAPPER_2(max_epi32); // return (a > b) ? a : b (int32)
SIMD_EMU_IWRAPPER_2(max_epu32); // return (a > b) ? a : b (uint32)
SIMD_EMU_IWRAPPER_2(min_epi32); // return (a < b) ? a : b (int32)
SIMD_EMU_IWRAPPER_2(min_epu32); // return (a < b) ? a : b (uint32)
// NOTE(review): if SIMD128T::mul_epi32 wraps _mm_mul_epi32, this multiplies only the
// low int32 of each 64-bit element and yields int64 products - confirm.
SIMD_EMU_IWRAPPER_2(mul_epi32); // return a * b (int32)

// return (a * b) & 0xFFFFFFFF
//
// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers,
// and store the low 32 bits of the intermediate integers in dst.
SIMD_EMU_IWRAPPER_2(mullo_epi32);
SIMD_EMU_IWRAPPER_2(sub_epi32); // return a - b (int32)
SIMD_EMU_IWRAPPER_2(sub_epi64); // return a - b (int64)
SIMD_EMU_IWRAPPER_2(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8)
 221
 222 //-----------------------------------------------------------------------
 223 // Logical operations
 224 //-----------------------------------------------------------------------
// The *_ps forms forward to the 256-bit float intrinsics; the *_si forms are emulated
// with two SIMD128T calls, one per 128-bit half.
SIMD_WRAPPER_2(and_ps);         // return a & b       (float treated as int)
SIMD_EMU_IWRAPPER_2(and_si);    // return a & b       (int)
SIMD_WRAPPER_2(andnot_ps);      // return (~a) & b    (float treated as int)
SIMD_EMU_IWRAPPER_2(andnot_si); // return (~a) & b    (int)
SIMD_WRAPPER_2(or_ps);          // return a | b       (float treated as int)
SIMD_EMU_IWRAPPER_2(or_si);     // return a | b       (int)
SIMD_WRAPPER_2(xor_ps);         // return a ^ b       (float treated as int)
SIMD_EMU_IWRAPPER_2(xor_si);    // return a ^ b       (int)
 233
 234 //-----------------------------------------------------------------------
 235 // Shift operations
 236 //-----------------------------------------------------------------------
SIMD_EMU_IWRAPPER_1I(slli_epi32); // return a << ImmT   (int32, emulated per 128-bit half)
 238
 239 static SIMDINLINE Integer SIMDCALL sllv_epi32(Integer const& vA,
 240                                               Integer const& vCount) // return a << b      (uint32)
 241 {
 242     int32_t aHi, aLow, countHi, countLow;
 243     __m128i vAHi      = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA), 1));
 244     __m128i vALow     = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA), 0));
 245     __m128i vCountHi  = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vCount), 1));
 246     __m128i vCountLow = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vCount), 0));
 247
 248     aHi     = _mm_extract_epi32(vAHi, 0);
 249     countHi = _mm_extract_epi32(vCountHi, 0);
 250     aHi <<= countHi;
 251     vAHi = _mm_insert_epi32(vAHi, aHi, 0);
 252
 253     aLow     = _mm_extract_epi32(vALow, 0);
 254     countLow = _mm_extract_epi32(vCountLow, 0);
 255     aLow <<= countLow;
 256     vALow = _mm_insert_epi32(vALow, aLow, 0);
 257
 258     aHi     = _mm_extract_epi32(vAHi, 1);
 259     countHi = _mm_extract_epi32(vCountHi, 1);
 260     aHi <<= countHi;
 261     vAHi = _mm_insert_epi32(vAHi, aHi, 1);
 262
 263     aLow     = _mm_extract_epi32(vALow, 1);
 264     countLow = _mm_extract_epi32(vCountLow, 1);
 265     aLow <<= countLow;
 266     vALow = _mm_insert_epi32(vALow, aLow, 1);
 267
 268     aHi     = _mm_extract_epi32(vAHi, 2);
 269     countHi = _mm_extract_epi32(vCountHi, 2);
 270     aHi <<= countHi;
 271     vAHi = _mm_insert_epi32(vAHi, aHi, 2);
 272
 273     aLow     = _mm_extract_epi32(vALow, 2);
 274     countLow = _mm_extract_epi32(vCountLow, 2);
 275     aLow <<= countLow;
 276     vALow = _mm_insert_epi32(vALow, aLow, 2);
 277
 278     aHi     = _mm_extract_epi32(vAHi, 3);
 279     countHi = _mm_extract_epi32(vCountHi, 3);
 280     aHi <<= countHi;
 281     vAHi = _mm_insert_epi32(vAHi, aHi, 3);
 282
 283     aLow     = _mm_extract_epi32(vALow, 3);
 284     countLow = _mm_extract_epi32(vCountLow, 3);
 285     aLow <<= countLow;
 286     vALow = _mm_insert_epi32(vALow, aLow, 3);
 287
 288     __m256i ret = _mm256_set1_epi32(0);
 289     ret         = _mm256_insertf128_si256(ret, vAHi, 1);
 290     ret         = _mm256_insertf128_si256(ret, vALow, 0);
 291     return ret;
 292 }
 293
// Immediate-count right shifts, each emulated per 128-bit half through SIMD128T.
SIMD_EMU_IWRAPPER_1I(srai_epi32); // return a >> ImmT   (int32)
SIMD_EMU_IWRAPPER_1I(srli_epi32); // return a >> ImmT   (uint32)
SIMD_EMU_IWRAPPER_1I(srli_si);    // return a >> (ImmT*8) (uint); applied to each 128-bit half separately
 297
 298 template <int ImmT> // same as srli_si, but with Float cast to int
 299 static SIMDINLINE Float SIMDCALL srlisi_ps(Float const& a)
 300 {
 301     return castsi_ps(srli_si<ImmT>(castps_si(a)));
 302 }
 303
 304 static SIMDINLINE Integer SIMDCALL srlv_epi32(Integer const& vA,
 305                                               Integer const& vCount) // return a >> b      (uint32)
 306 {
 307     int32_t aHi, aLow, countHi, countLow;
 308     __m128i vAHi      = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA), 1));
 309     __m128i vALow     = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA), 0));
 310     __m128i vCountHi  = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vCount), 1));
 311     __m128i vCountLow = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vCount), 0));
 312
 313     aHi     = _mm_extract_epi32(vAHi, 0);
 314     countHi = _mm_extract_epi32(vCountHi, 0);
 315     aHi >>= countHi;
 316     vAHi = _mm_insert_epi32(vAHi, aHi, 0);
 317
 318     aLow     = _mm_extract_epi32(vALow, 0);
 319     countLow = _mm_extract_epi32(vCountLow, 0);
 320     aLow >>= countLow;
 321     vALow = _mm_insert_epi32(vALow, aLow, 0);
 322
 323     aHi     = _mm_extract_epi32(vAHi, 1);
 324     countHi = _mm_extract_epi32(vCountHi, 1);
 325     aHi >>= countHi;
 326     vAHi = _mm_insert_epi32(vAHi, aHi, 1);
 327
 328     aLow     = _mm_extract_epi32(vALow, 1);
 329     countLow = _mm_extract_epi32(vCountLow, 1);
 330     aLow >>= countLow;
 331     vALow = _mm_insert_epi32(vALow, aLow, 1);
 332
 333     aHi     = _mm_extract_epi32(vAHi, 2);
 334     countHi = _mm_extract_epi32(vCountHi, 2);
 335     aHi >>= countHi;
 336     vAHi = _mm_insert_epi32(vAHi, aHi, 2);
 337
 338     aLow     = _mm_extract_epi32(vALow, 2);
 339     countLow = _mm_extract_epi32(vCountLow, 2);
 340     aLow >>= countLow;
 341     vALow = _mm_insert_epi32(vALow, aLow, 2);
 342
 343     aHi     = _mm_extract_epi32(vAHi, 3);
 344     countHi = _mm_extract_epi32(vCountHi, 3);
 345     aHi >>= countHi;
 346     vAHi = _mm_insert_epi32(vAHi, aHi, 3);
 347
 348     aLow     = _mm_extract_epi32(vALow, 3);
 349     countLow = _mm_extract_epi32(vCountLow, 3);
 350     aLow >>= countLow;
 351     vALow = _mm_insert_epi32(vALow, aLow, 3);
 352
 353     __m256i ret = _mm256_set1_epi32(0);
 354     ret         = _mm256_insertf128_si256(ret, vAHi, 1);
 355     ret         = _mm256_insertf128_si256(ret, vALow, 0);
 356     return ret;
 357 }
 358
 359 //-----------------------------------------------------------------------
 360 // Conversion operations
 361 //-----------------------------------------------------------------------
// The six cast helpers below reinterpret the 256 bits of a vector as another vector
// type by forwarding to the matching _mm256_cast* intrinsic; no numeric conversion
// is performed.

// return *(Float*)(&a)
static SIMDINLINE Float SIMDCALL castpd_ps(Double const& a)
{
    return _mm256_castpd_ps(a);
}

// return *(Integer*)(&a)
static SIMDINLINE Integer SIMDCALL castps_si(Float const& a)
{
    return _mm256_castps_si256(a);
}

// return *(Double*)(&a)
static SIMDINLINE Double SIMDCALL castsi_pd(Integer const& a)
{
    return _mm256_castsi256_pd(a);
}

// return *(Double*)(&a)
static SIMDINLINE Double SIMDCALL castps_pd(Float const& a)
{
    return _mm256_castps_pd(a);
}

// return *(Integer*)(&a)
static SIMDINLINE Integer SIMDCALL castpd_si(Double const& a)
{
    return _mm256_castpd_si256(a);
}

// return *(Float*)(&a)
static SIMDINLINE Float SIMDCALL castsi_ps(Integer const& a)
{
    return _mm256_castsi256_ps(a);
}
 391
// return (float)a    (int32 --> float), via _mm256_cvtepi32_ps
static SIMDINLINE Float SIMDCALL
                        cvtepi32_ps(Integer const& a)
{
    return _mm256_cvtepi32_ps(a);
}

// Widening conversions. Each reads a 128-bit source (a.v4[0] or a SIMD128Impl::Integer)
// and fills both halves of the result; the second macro argument is the srli_si shift
// applied to the source before converting the upper half of the result.
SIMD_EMU_IWRAPPER_1L(cvtepu8_epi16, 8);  // return (int16)a    (uint8 --> int16)
SIMD_EMU_IWRAPPER_1L(cvtepu8_epi32, 4);  // return (int32)a    (uint8 --> int32)
SIMD_EMU_IWRAPPER_1L(cvtepu16_epi32, 8); // return (int32)a    (uint16 --> int32)
SIMD_EMU_IWRAPPER_1L(cvtepu16_epi64, 4); // return (int64)a    (uint16 --> int64)
SIMD_EMU_IWRAPPER_1L(cvtepu32_epi64, 8); // return (int64)a    (uint32 --> int64)

// return (int32)a    (float --> int32), via _mm256_cvtps_epi32
static SIMDINLINE Integer SIMDCALL
                          cvtps_epi32(Float const& a)
{
    return _mm256_cvtps_epi32(a);
}

// return (int32)a    (rnd_to_zero(float) --> int32), via _mm256_cvttps_epi32
static SIMDINLINE Integer SIMDCALL
                          cvttps_epi32(Float const& a)
{
    return _mm256_cvttps_epi32(a);
}
 415
 416 //-----------------------------------------------------------------------
 417 // Comparison operations
 418 //-----------------------------------------------------------------------
// return a (CmpTypeT) b
//
// Per-lane float comparison via _mm256_cmp_ps; CmpTypeT's integer value is passed as
// the intrinsic's immediate predicate.
template <CompareType CmpTypeT>
static SIMDINLINE Float SIMDCALL cmp_ps(Float const& a, Float const& b)
{
    return _mm256_cmp_ps(a, b, static_cast<const int>(CmpTypeT));
}
// Fixed-predicate shorthands for cmp_ps. All of them select an *_OQ CompareType.
// NOTE(review): presumably these map to the ordered, non-signalling _CMP_*_OQ
// predicates, in which case every comparison - including cmpneq_ps - is false when
// either operand is NaN. Confirm against the CompareType definition.
static SIMDINLINE Float SIMDCALL cmplt_ps(Float const& a, Float const& b)
{
    return cmp_ps<CompareType::LT_OQ>(a, b);
}
static SIMDINLINE Float SIMDCALL cmpgt_ps(Float const& a, Float const& b)
{
    return cmp_ps<CompareType::GT_OQ>(a, b);
}
static SIMDINLINE Float SIMDCALL cmpneq_ps(Float const& a, Float const& b)
{
    return cmp_ps<CompareType::NEQ_OQ>(a, b);
}
static SIMDINLINE Float SIMDCALL cmpeq_ps(Float const& a, Float const& b)
{
    return cmp_ps<CompareType::EQ_OQ>(a, b);
}
static SIMDINLINE Float SIMDCALL cmpge_ps(Float const& a, Float const& b)
{
    return cmp_ps<CompareType::GE_OQ>(a, b);
}
static SIMDINLINE Float SIMDCALL cmple_ps(Float const& a, Float const& b)
{
    return cmp_ps<CompareType::LE_OQ>(a, b);
}
 448
// Integer comparisons, each emulated with two SIMD128T calls (one per 128-bit half).
SIMD_EMU_IWRAPPER_2(cmpeq_epi8);  // return a == b (int8)
SIMD_EMU_IWRAPPER_2(cmpeq_epi16); // return a == b (int16)
SIMD_EMU_IWRAPPER_2(cmpeq_epi32); // return a == b (int32)
SIMD_EMU_IWRAPPER_2(cmpeq_epi64); // return a == b (int64)
SIMD_EMU_IWRAPPER_2(cmpgt_epi8);  // return a > b (int8)
SIMD_EMU_IWRAPPER_2(cmpgt_epi16); // return a > b (int16)
SIMD_EMU_IWRAPPER_2(cmpgt_epi32); // return a > b (int32)
SIMD_EMU_IWRAPPER_2(cmpgt_epi64); // return a > b (int64)
SIMD_EMU_IWRAPPER_2(cmplt_epi32); // return a < b (int32)
 458
// return all_lanes_zero(a & b) ? 1 : 0 (float)
//
// Converts the int result of _mm256_testz_ps to bool.
// NOTE(review): per the Intel description of _mm256_testz_ps, only the sign bit of
// each 32-bit lane of (a & b) is examined, not the whole lane - confirm callers only
// pass sign-bit masks.
static SIMDINLINE bool SIMDCALL
                       testz_ps(Float const& a, Float const& b)
{
    return 0 != _mm256_testz_ps(a, b);
}

// return all_lanes_zero(a & b) ? 1 : 0 (int)
//
// Converts the int result of _mm256_testz_si256 to bool.
static SIMDINLINE bool SIMDCALL
                       testz_si(Integer const& a, Integer const& b)
{
    return 0 != _mm256_testz_si256(a, b);
}
 470
 471 //-----------------------------------------------------------------------
 472 // Blend / shuffle / permute operations
 473 //-----------------------------------------------------------------------
SIMD_WRAPPER_2I(blend_ps);                       // return ImmT ? b : a  (float; ImmT is the immediate lane mask)
SIMD_IFWRAPPER_2I(blend_epi32, _mm256_blend_ps); // return ImmT ? b : a  (int32; done with the float blend on reinterpreted bits)
SIMD_WRAPPER_3(blendv_ps);                       // return mask ? b : a  (float; mask is a vector)
 477
 478 static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer const& a,
 479                                                 Integer const& b,
 480                                                 Float const&   mask) // return mask ? b : a (int)
 481 {
 482     return castps_si(blendv_ps(castsi_ps(a), castsi_ps(b), mask));
 483 }
 484
 485 static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer const& a,
 486                                                 Integer const& b,
 487                                                 Integer const& mask) // return mask ? b : a (int)
 488 {
 489     return castps_si(blendv_ps(castsi_ps(a), castsi_ps(b), castsi_ps(mask)));
 490 }
 491
// return *p (all elements in vector get same value)
//
// Loads the single float at p and replicates it to every lane via _mm256_broadcast_ss.
static SIMDINLINE Float SIMDCALL
                        broadcast_ss(float const* p)
{
    return _mm256_broadcast_ss(p);
}
 497
// Pack operations, emulated by running the SIMD128T pack on each 128-bit half
// independently (a.v4[n] with b.v4[n]).
SIMD_EMU_IWRAPPER_2(packs_epi16); // See documentation for _mm256_packs_epi16 and _mm512_packs_epi16
SIMD_EMU_IWRAPPER_2(packs_epi32); // See documentation for _mm256_packs_epi32 and _mm512_packs_epi32
SIMD_EMU_IWRAPPER_2(
    packus_epi16); // See documentation for _mm256_packus_epi16 and _mm512_packus_epi16
SIMD_EMU_IWRAPPER_2(
    packus_epi32); // See documentation for _mm256_packus_epi32 and _mm512_packus_epi32
 504
// Immediate-controlled float permute: forwards to _mm256_permute_ps with ImmT as the
// immediate control value.
template <int ImmT>
static SIMDINLINE Float SIMDCALL permute_ps(Float const& a)
{
    return _mm256_permute_ps(a, ImmT);
}
 510
 511 static SIMDINLINE Integer SIMDCALL permute_epi32(
 512     Integer const& a, Integer const& swiz) // return a[swiz[i]] for each 32-bit lane i (int32)
 513 {
 514     Integer result;
 515
 516     // Ugly slow implementation
 517     uint32_t const* pA      = reinterpret_cast<uint32_t const*>(&a);
 518     uint32_t const* pSwiz   = reinterpret_cast<uint32_t const*>(&swiz);
 519     uint32_t*       pResult = reinterpret_cast<uint32_t*>(&result);
 520
 521     for (uint32_t i = 0; i < SIMD_WIDTH; ++i)
 522     {
 523         pResult[i] = pA[0xF & pSwiz[i]];
 524     }
 525
 526     return result;
 527 }
 528
 529 static SIMDINLINE Float SIMDCALL
 530                         permute_ps(Float const& a, Integer const& swiz) // return a[swiz[i]] for each 32-bit lane i (float)
 531 {
 532     Float result;
 533
 534     // Ugly slow implementation
 535     float const*    pA      = reinterpret_cast<float const*>(&a);
 536     uint32_t const* pSwiz   = reinterpret_cast<uint32_t const*>(&swiz);
 537     float*          pResult = reinterpret_cast<float*>(&result);
 538
 539     for (uint32_t i = 0; i < SIMD_WIDTH; ++i)
 540     {
 541         pResult[i] = pA[0xF & pSwiz[i]];
 542     }
 543
 544     return result;
 545 }
 546
// 128-bit-granular permutes; ImmT is passed through as the intrinsic's immediate control.
SIMD_WRAPPER_2I(permute2f128_ps);                       // forwards to _mm256_permute2f128_ps
SIMD_DWRAPPER_2I(permute2f128_pd);                      // forwards to _mm256_permute2f128_pd
SIMD_IWRAPPER_2I_(permute2f128_si, permute2f128_si256); // forwards to _mm256_permute2f128_si256

SIMD_EMU_IWRAPPER_1I(shuffle_epi32); // SIMD128T::shuffle_epi32<ImmT> applied to each 128-bit half
 552
 553 template <int ImmT>
 554 static SIMDINLINE Integer SIMDCALL shuffle_epi64(Integer const& a, Integer const& b)
 555 {
 556     return castpd_si(shuffle_pd<ImmT>(castsi_pd(a), castsi_pd(b)));
 557 }
// Shuffle / unpack family.
//   SIMD_EMU_IWRAPPER_2 : emulated with SIMD128T on each 128-bit half.
//   SIMD_IFWRAPPER_2    : integer op carried out by the named float intrinsic on
//                         reinterpreted bits.
//   SIMD_WRAPPER_* / SIMD_DWRAPPER_* : direct forward to the _mm256_ intrinsic.
SIMD_EMU_IWRAPPER_2(shuffle_epi8);
SIMD_DWRAPPER_2I(shuffle_pd);
SIMD_WRAPPER_2I(shuffle_ps);
SIMD_EMU_IWRAPPER_2(unpackhi_epi16);
SIMD_IFWRAPPER_2(unpackhi_epi32, _mm256_unpackhi_ps);
SIMD_EMU_IWRAPPER_2(unpackhi_epi64);
SIMD_EMU_IWRAPPER_2(unpackhi_epi8);
SIMD_DWRAPPER_2(unpackhi_pd);
SIMD_WRAPPER_2(unpackhi_ps);
SIMD_EMU_IWRAPPER_2(unpacklo_epi16);
SIMD_IFWRAPPER_2(unpacklo_epi32, _mm256_unpacklo_ps);
SIMD_EMU_IWRAPPER_2(unpacklo_epi64);
SIMD_EMU_IWRAPPER_2(unpacklo_epi8);
SIMD_DWRAPPER_2(unpacklo_pd);
SIMD_WRAPPER_2(unpacklo_ps);
 573
 574 //-----------------------------------------------------------------------
 575 // Load / store operations
 576 //-----------------------------------------------------------------------
 577 template <ScaleFactor ScaleT>
 578 static SIMDINLINE Float SIMDCALL
 579                         i32gather_ps(float const* p, Integer const& idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
 580 {
 581     uint32_t* pOffsets = (uint32_t*)&idx;
 582     Float     vResult;
 583     float*    pResult = (float*)&vResult;
 584     for (uint32_t i = 0; i < SIMD_WIDTH; ++i)
 585     {
 586         uint32_t offset = pOffsets[i];
 587         offset          = offset * static_cast<uint32_t>(ScaleT);
 588         pResult[i]      = *(float const*)(((uint8_t const*)p + offset));
 589     }
 590
 591     return vResult;
 592 }
 593
// return *p    (broadcast 1 value to all elements); alias for broadcast_ss
static SIMDINLINE Float SIMDCALL
                        load1_ps(float const* p)
{
    return broadcast_ss(p);
}

// return *p    (loads SIMD width elements from memory)
// Uses _mm256_load_ps, the aligned load (contrast with loadu_ps).
static SIMDINLINE Float SIMDCALL
                        load_ps(float const* p)
{
    return _mm256_load_ps(p);
}

// return *p
// Uses _mm256_load_si256, the aligned integer load (contrast with loadu_si).
static SIMDINLINE Integer SIMDCALL load_si(Integer const* p)
{
    return _mm256_load_si256(&p->v);
}

// return *p    (same as load_ps but allows for unaligned mem)
static SIMDINLINE Float SIMDCALL
                        loadu_ps(float const* p)
{
    return _mm256_loadu_ps(p);
}

// return *p    (same as load_si but allows for unaligned mem)
// Implemented with _mm256_lddqu_si256.
static SIMDINLINE Integer SIMDCALL
                          loadu_si(Integer const* p)
{
    return _mm256_lddqu_si256(&p->v);
}
 622
 623 // for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
 624 template <ScaleFactor ScaleT>
 625 static SIMDINLINE Float SIMDCALL
 626                         mask_i32gather_ps(Float const& old, float const* p, Integer const& idx, Float const& mask)
 627 {
 628     uint32_t* pOffsets = (uint32_t*)&idx;
 629     Float     vResult  = old;
 630     float*    pResult  = (float*)&vResult;
 631     DWORD     index;
 632     uint32_t  umask = movemask_ps(mask);
 633     while (_BitScanForward(&index, umask))
 634     {
 635         umask &= ~(1 << index);
 636         uint32_t offset = pOffsets[index];
 637         offset          = offset * static_cast<uint32_t>(ScaleT);
 638         pResult[index]  = *(float const*)(((uint8_t const*)p + offset));
 639     }
 640
 641     return vResult;
 642 }
 643
// Masked store of `src` to p: forwards to _mm256_maskstore_ps, with `mask` selecting
// which lanes are written.
static SIMDINLINE void SIMDCALL maskstore_ps(float* p, Integer const& mask, Float const& src)
{
    _mm256_maskstore_ps(p, mask, src);
}
 648
 649 static SIMDINLINE uint32_t SIMDCALL movemask_epi8(Integer const& a)
 650 {
 651     return SIMD128T::movemask_epi8(a.v4[0]) | (SIMD128T::movemask_epi8(a.v4[1]) << 16);
 652 }
 653
// Returns the int result of _mm256_movemask_pd converted to uint32_t.
static SIMDINLINE uint32_t SIMDCALL movemask_pd(Double const& a)
{
    return static_cast<uint32_t>(_mm256_movemask_pd(a));
}
// Returns the int result of _mm256_movemask_ps converted to uint32_t.
static SIMDINLINE uint32_t SIMDCALL movemask_ps(Float const& a)
{
    return static_cast<uint32_t>(_mm256_movemask_ps(a));
}
 662
// return i (all elements are same value), 32-bit lanes
static SIMDINLINE Integer SIMDCALL set1_epi32(int i)
{
    return _mm256_set1_epi32(i);
}

// return i (all elements are same value), 8-bit lanes
static SIMDINLINE Integer SIMDCALL set1_epi8(char i)
{
    return _mm256_set1_epi8(i);
}

// return f (all elements are same value)
static SIMDINLINE Float SIMDCALL set1_ps(float f)
{
    return _mm256_set1_ps(f);
}

// return 0 (float)
static SIMDINLINE Float SIMDCALL setzero_ps()
{
    return _mm256_setzero_ps();
}

// return 0 (integer)
static SIMDINLINE Integer SIMDCALL setzero_si()
{
    return _mm256_setzero_si256();
}
 687
// *p = a   (stores all elements contiguously in memory)
// Uses _mm256_store_ps, the aligned store.
static SIMDINLINE void SIMDCALL
                       store_ps(float* p, Float const& a)
{
    _mm256_store_ps(p, a);
}

// *p = a
// Uses _mm256_store_si256, the aligned integer store.
static SIMDINLINE void SIMDCALL store_si(Integer* p, Integer const& a)
{
    _mm256_store_si256(&p->v, a);
}

// *p = a   (same as store_ps, but doesn't keep memory in cache)
// Uses _mm256_stream_ps.
static SIMDINLINE void SIMDCALL
                       stream_ps(float* p, Float const& a)
{
    _mm256_stream_ps(p, a);
}
 704
 705 //=======================================================================
 706 // Legacy interface (available only in SIMD256 width)
 707 //=======================================================================
 708
// Loads the 128-bit value at p and replicates it into both halves of the result via
// _mm256_broadcast_ps.
static SIMDINLINE Float SIMDCALL broadcast_ps(SIMD128Impl::Float const* p)
{
    return _mm256_broadcast_ps(&p->v);
}
 713
// extractf128_*: return the 128-bit half of `a` selected by ImmT, which is passed as
// the immediate operand of the matching _mm256_extractf128_* intrinsic.
template <int ImmT>
static SIMDINLINE SIMD128Impl::Double SIMDCALL extractf128_pd(Double const& a)
{
    return _mm256_extractf128_pd(a, ImmT);
}

template <int ImmT>
static SIMDINLINE SIMD128Impl::Float SIMDCALL extractf128_ps(Float const& a)
{
    return _mm256_extractf128_ps(a, ImmT);
}

template <int ImmT>
static SIMDINLINE SIMD128Impl::Integer SIMDCALL extractf128_si(Integer const& a)
{
    return _mm256_extractf128_si256(a, ImmT);
}

// insertf128_*: return a copy of `a` with the 128-bit half selected by ImmT replaced
// by `b`, via the matching _mm256_insertf128_* intrinsic.
template <int ImmT>
static SIMDINLINE Double SIMDCALL insertf128_pd(Double const& a, SIMD128Impl::Double const& b)
{
    return _mm256_insertf128_pd(a, b, ImmT);
}

template <int ImmT>
static SIMDINLINE Float SIMDCALL insertf128_ps(Float const& a, SIMD128Impl::Float const& b)
{
    return _mm256_insertf128_ps(a, b, ImmT);
}

template <int ImmT>
static SIMDINLINE Integer SIMDCALL insertf128_si(Integer const& a, SIMD128Impl::Integer const& b)
{
    return _mm256_insertf128_si256(a, b, ImmT);
}
 749
// Fallback for toolchains that do not provide _mm256_set_m128i as a macro: builds a
// 256-bit value with `lo` in the lower half and `hi` inserted as the upper half.
// NOTE(review): the #ifndef only detects a macro definition; where the header provides
// this as a function, the macro below takes its place - confirm that is intended.
#ifndef _mm256_set_m128i
#define _mm256_set_m128i(/* SIMD128Impl::Integer */ hi, /* SIMD128Impl::Integer */ lo) \
    _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
#endif

// Fallback for _mm256_loadu2_m128i under the same condition: two unaligned 128-bit
// loads combined with _mm256_set_m128i.
#ifndef _mm256_loadu2_m128i
#define _mm256_loadu2_m128i(/* SIMD128Impl::Integer const* */ hiaddr, \
                            /* SIMD128Impl::Integer const* */ loaddr) \
    _mm256_set_m128i(_mm_loadu_si128(hiaddr), _mm_loadu_si128(loaddr))
#endif
 760
// Builds a 256-bit Integer from two independent 128-bit locations via
// _mm256_loadu2_m128i: *phi supplies the upper half and *plo the lower half.
static SIMDINLINE Integer SIMDCALL loadu2_si(SIMD128Impl::Integer const* phi,
                                             SIMD128Impl::Integer const* plo)
{
    return _mm256_loadu2_m128i(&phi->v, &plo->v);
}
 766
// Builds an Integer from eight int32 values by forwarding them, in the same order, to
// _mm256_set_epi32 (arguments are listed from i7 down to i0).
static SIMDINLINE Integer SIMDCALL
                          set_epi32(int i7, int i6, int i5, int i4, int i3, int i2, int i1, int i0)
{
    return _mm256_set_epi32(i7, i6, i5, i4, i3, i2, i1, i0);
}

// Builds a Float from eight float values by forwarding them, in the same order, to
// _mm256_set_ps (arguments are listed from i7 down to i0).
static SIMDINLINE Float SIMDCALL
                        set_ps(float i7, float i6, float i5, float i4, float i3, float i2, float i1, float i0)
{
    return _mm256_set_ps(i7, i6, i5, i4, i3, i2, i1, i0);
}
 778
 779 static SIMDINLINE void SIMDCALL storeu2_si(SIMD128Impl::Integer* phi,
 780                                            SIMD128Impl::Integer* plo,
 781                                            Integer const&        src)
 782 {
 783     _mm256_storeu2_m128i(&phi->v, &plo->v, src);
 784 }
 785
 786 static SIMDINLINE Float SIMDCALL vmask_ps(int32_t mask)
 787 {
 788     Integer       vec = set1_epi32(mask);
 789     const Integer bit = set_epi32(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01);
 790     vec               = and_si(vec, bit);
 791     vec               = cmplt_epi32(setzero_si(), vec);
 792     return castsi_ps(vec);
 793 }
 794
 795 #undef SIMD_WRAPPER_1
 796 #undef SIMD_WRAPPER_2
 797 #undef SIMD_DWRAPPER_2
 798 #undef SIMD_DWRAPPER_2I
 799 #undef SIMD_WRAPPER_2I
 800 #undef SIMD_WRAPPER_3
 801 #undef SIMD_IWRAPPER_1
 802 #undef SIMD_IWRAPPER_2
 803 #undef SIMD_IFWRAPPER_2
 804 #undef SIMD_IFWRAPPER_2I
 805 #undef SIMD_IWRAPPER_2I
 806 #undef SIMD_IWRAPPER_2I_
 807 #undef SIMD_IWRAPPER_2_
 808 #undef SIMD_IWRAPPER_3
 809 #undef SIMD_EMU_IWRAPPER_1
 810 #undef SIMD_EMU_IWRAPPER_1I
 811 #undef SIMD_EMU_IWRAPPER_2
 812 #undef SIMD_EMU_IWRAPPER_2I