/****************************************************************************
 * Copyright (C) 2017 Intel Corporation. All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 ****************************************************************************/
#if !defined(__SIMD_LIB_AVX512_HPP__)
#error Do not include this file directly, use "simdlib.hpp" instead.
#endif

#if defined(__GNUC__) && !defined(__clang__) && !defined(__INTEL_COMPILER)
// gcc as of 7.1 was missing these intrinsics
#ifndef _mm512_cmpneq_ps_mask
#define _mm512_cmpneq_ps_mask(a,b) _mm512_cmp_ps_mask((a),(b),_CMP_NEQ_UQ)
#endif

#ifndef _mm512_cmplt_ps_mask
#define _mm512_cmplt_ps_mask(a,b) _mm512_cmp_ps_mask((a),(b),_CMP_LT_OS)
#endif

#ifndef _mm512_cmplt_pd_mask
#define _mm512_cmplt_pd_mask(a,b) _mm512_cmp_pd_mask((a),(b),_CMP_LT_OS)
#endif

#endif
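
// Note: the fallbacks above use the generic compare intrinsic with explicit
// predicate constants: _CMP_NEQ_UQ is not-equal (unordered, quiet), so NaN
// lanes compare as not-equal; _CMP_LT_OS is less-than (ordered, signaling),
// matching the semantics of the missing named intrinsics.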

//============================================================================
// SIMD16 AVX512 (F) implementation
// (compatible with Knights and Core processors)
//
//============================================================================

static const int TARGET_SIMD_WIDTH = 16;
using SIMD256T = SIMD256Impl::AVX2Impl;

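// The SIMD_*WRAPPER_* macros below stamp out one-line static member
// functions that forward to the matching _mm512_* intrinsic. The name
// encodes the signature: the digit is the operand count, a leading I means
// Integer operands, D means Double, IF means an Integer op routed through
// the float intrinsic, and 2I takes an immediate template parameter. The
// trailing-underscore variants accept the intrinsic (or intrinsic suffix)
// explicitly when it differs from the member function name.
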
#define SIMD_WRAPPER_1_(op, intrin) \
    static SIMDINLINE Float SIMDCALL op(Float a) \
    {\
        return intrin(a);\
    }

#define SIMD_WRAPPER_1(op) \
    SIMD_WRAPPER_1_(op, _mm512_##op)

#define SIMD_WRAPPER_2_(op, intrin) \
    static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
    {\
        return _mm512_##intrin(a, b);\
    }
#define SIMD_WRAPPER_2(op) SIMD_WRAPPER_2_(op, op)

#define SIMD_WRAPPERI_2_(op, intrin) \
    static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
    {\
        return _mm512_castsi512_ps(_mm512_##intrin(\
            _mm512_castps_si512(a), _mm512_castps_si512(b)));\
    }

#define SIMD_DWRAPPER_2(op) \
    static SIMDINLINE Double SIMDCALL op(Double a, Double b) \
    {\
        return _mm512_##op(a, b);\
    }

#define SIMD_WRAPPER_2I_(op, intrin) \
    template<int ImmT>\
    static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
    {\
        return _mm512_##intrin(a, b, ImmT);\
    }
#define SIMD_WRAPPER_2I(op) SIMD_WRAPPER_2I_(op, op)

#define SIMD_DWRAPPER_2I_(op, intrin) \
    template<int ImmT>\
    static SIMDINLINE Double SIMDCALL op(Double a, Double b) \
    {\
        return _mm512_##intrin(a, b, ImmT);\
    }
#define SIMD_DWRAPPER_2I(op) SIMD_DWRAPPER_2I_(op, op)

#define SIMD_WRAPPER_3(op) \
    static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c) \
    {\
        return _mm512_##op(a, b, c);\
    }

#define SIMD_IWRAPPER_1(op) \
    static SIMDINLINE Integer SIMDCALL op(Integer a) \
    {\
        return _mm512_##op(a);\
    }
#define SIMD_IWRAPPER_1_8(op) \
    static SIMDINLINE Integer SIMDCALL op(SIMD256Impl::Integer a) \
    {\
        return _mm512_##op(a);\
    }

#define SIMD_IWRAPPER_1_4(op) \
    static SIMDINLINE Integer SIMDCALL op(SIMD128Impl::Integer a) \
    {\
        return _mm512_##op(a);\
    }

#define SIMD_IWRAPPER_1I_(op, intrin) \
    template<int ImmT> \
    static SIMDINLINE Integer SIMDCALL op(Integer a) \
    {\
        return intrin(a, ImmT);\
    }
#define SIMD_IWRAPPER_1I(op) SIMD_IWRAPPER_1I_(op, _mm512_##op)

#define SIMD_IWRAPPER_2_(op, intrin) \
    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
    {\
        return _mm512_##intrin(a, b);\
    }
#define SIMD_IWRAPPER_2(op) SIMD_IWRAPPER_2_(op, op)

#define SIMD_IWRAPPER_2_CMP(op, cmp) \
    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
    {\
        return cmp(a, b);\
    }

#define SIMD_IFWRAPPER_2(op, intrin) \
    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
    {\
        return castps_si(_mm512_##intrin(castsi_ps(a), castsi_ps(b)));\
    }

#define SIMD_IWRAPPER_2I_(op, intrin) \
    template<int ImmT>\
    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
    {\
        return _mm512_##intrin(a, b, ImmT);\
    }
#define SIMD_IWRAPPER_2I(op) SIMD_IWRAPPER_2I_(op, op)

private:
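// Helpers that expand an AVX-512 predicate register value (__mmask16 /
// __mmask8) into a legacy AVX2-style vector mask: lanes whose mask bit is
// set become all-ones (-1), all other lanes become zero.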
static SIMDINLINE Integer vmask(__mmask16 m)
{
    return _mm512_maskz_set1_epi32(m, -1);
}

static SIMDINLINE Integer vmask(__mmask8 m)
{
    return _mm512_maskz_set1_epi64(m, -1LL);
}

public:
//-----------------------------------------------------------------------
// Single precision floating point arithmetic operations
//-----------------------------------------------------------------------
SIMD_WRAPPER_2(add_ps);   // return a + b
SIMD_WRAPPER_2(div_ps);   // return a / b
SIMD_WRAPPER_3(fmadd_ps); // return (a * b) + c
SIMD_WRAPPER_3(fmsub_ps); // return (a * b) - c
SIMD_WRAPPER_2(max_ps);   // return (a > b) ? a : b
SIMD_WRAPPER_2(min_ps);   // return (a < b) ? a : b
SIMD_WRAPPER_2(mul_ps);   // return a * b
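// Note: rcp14 / rsqrt14 are AVX-512F approximation instructions with a
// maximum relative error of 2^-14; they take the place of the coarser
// legacy rcp/rsqrt approximations used on the narrower SIMD targets.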
SIMD_WRAPPER_1_(rcp_ps, _mm512_rcp14_ps);     // return 1.0f / a
SIMD_WRAPPER_1_(rsqrt_ps, _mm512_rsqrt14_ps); // return 1.0f / sqrt(a)
SIMD_WRAPPER_2(sub_ps);                       // return a - b

template <RoundMode RMT>
static SIMDINLINE Float SIMDCALL round_ps(Float a)
{
    return _mm512_roundscale_ps(a, static_cast<int>(RMT));
}

static SIMDINLINE Float SIMDCALL ceil_ps(Float a) { return round_ps<RoundMode::CEIL_NOEXC>(a); }
static SIMDINLINE Float SIMDCALL floor_ps(Float a) { return round_ps<RoundMode::FLOOR_NOEXC>(a); }

//-----------------------------------------------------------------------
// Integer (various width) arithmetic operations
//-----------------------------------------------------------------------
SIMD_IWRAPPER_1(abs_epi32); // return absolute_value(a) (int32)
SIMD_IWRAPPER_2(add_epi32); // return a + b (int32)
//SIMD_IWRAPPER_2(add_epi8);  // return a + b (int8)
//SIMD_IWRAPPER_2(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
SIMD_IWRAPPER_2(max_epi32); // return (a > b) ? a : b (int32)
SIMD_IWRAPPER_2(max_epu32); // return (a > b) ? a : b (uint32)
SIMD_IWRAPPER_2(min_epi32); // return (a < b) ? a : b (int32)
SIMD_IWRAPPER_2(min_epu32); // return (a < b) ? a : b (uint32)
SIMD_IWRAPPER_2(mul_epi32); // return a * b (low int32 of each 64-bit lane, sign-extended --> int64)

// return (a * b) & 0xFFFFFFFF
//
// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers,
// and store the low 32 bits of the intermediate integers in dst.
SIMD_IWRAPPER_2(mullo_epi32);
SIMD_IWRAPPER_2(sub_epi32); // return a - b (int32)
SIMD_IWRAPPER_2(sub_epi64); // return a - b (int64)
//SIMD_IWRAPPER_2(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8)

//-----------------------------------------------------------------------
// Logical operations
//-----------------------------------------------------------------------
SIMD_IWRAPPER_2_(and_si, and_si512);       // return a & b    (int)
SIMD_IWRAPPER_2_(andnot_si, andnot_si512); // return (~a) & b (int)
SIMD_IWRAPPER_2_(or_si, or_si512);         // return a | b    (int)
SIMD_IWRAPPER_2_(xor_si, xor_si512);       // return a ^ b    (int)

// SIMD_WRAPPER_2(and_ps);    // return a & b    (float treated as int)
// SIMD_WRAPPER_2(andnot_ps); // return (~a) & b (float treated as int)
// SIMD_WRAPPER_2(or_ps);     // return a | b    (float treated as int)
// SIMD_WRAPPER_2(xor_ps);    // return a ^ b    (float treated as int)

//-----------------------------------------------------------------------
// Shift operations
//-----------------------------------------------------------------------
SIMD_IWRAPPER_1I(slli_epi32); // return a << ImmT
SIMD_IWRAPPER_2(sllv_epi32);  // return a << b (per-lane variable shift, uint32)
SIMD_IWRAPPER_1I(srai_epi32); // return a >> ImmT (int32)
SIMD_IWRAPPER_1I(srli_epi32); // return a >> ImmT (uint32)

#if 0
SIMD_IWRAPPER_1I_(srli_si, srli_si512); // return a >> (ImmT*8) (uint)

template<int ImmT> // same as srli_si, but with Float cast to int
static SIMDINLINE Float SIMDCALL srlisi_ps(Float a)
{
    return castsi_ps(srli_si<ImmT>(castps_si(a)));
}
#endif

SIMD_IWRAPPER_2(srlv_epi32);  // return a >> b (per-lane variable shift, uint32)

//-----------------------------------------------------------------------
// Conversion operations
//-----------------------------------------------------------------------
static SIMDINLINE Float SIMDCALL castpd_ps(Double a) // return *(Float*)(&a)
{
    return _mm512_castpd_ps(a);
}

static SIMDINLINE Integer SIMDCALL castps_si(Float a) // return *(Integer*)(&a)
{
    return _mm512_castps_si512(a);
}

static SIMDINLINE Double SIMDCALL castsi_pd(Integer a) // return *(Double*)(&a)
{
    return _mm512_castsi512_pd(a);
}

static SIMDINLINE Double SIMDCALL castps_pd(Float a) // return *(Double*)(&a)
{
    return _mm512_castps_pd(a);
}

static SIMDINLINE Integer SIMDCALL castpd_si(Double a) // return *(Integer*)(&a)
{
    return _mm512_castpd_si512(a);
}

static SIMDINLINE Float SIMDCALL castsi_ps(Integer a) // return *(Float*)(&a)
{
    return _mm512_castsi512_ps(a);
}

static SIMDINLINE Float SIMDCALL cvtepi32_ps(Integer a) // return (float)a (int32 --> float)
{
    return _mm512_cvtepi32_ps(a);
}

//SIMD_IWRAPPER_1_8(cvtepu8_epi16); // return (int16)a (uint8 --> int16)
SIMD_IWRAPPER_1_4(cvtepu8_epi32);   // return (int32)a (uint8 --> int32)
SIMD_IWRAPPER_1_8(cvtepu16_epi32);  // return (int32)a (uint16 --> int32)
SIMD_IWRAPPER_1_4(cvtepu16_epi64);  // return (int64)a (uint16 --> int64)
SIMD_IWRAPPER_1_8(cvtepu32_epi64);  // return (int64)a (uint32 --> int64)

static SIMDINLINE Integer SIMDCALL cvtps_epi32(Float a) // return (int32)a (float --> int32)
{
    return _mm512_cvtps_epi32(a);
}

static SIMDINLINE Integer SIMDCALL cvttps_epi32(Float a) // return (int32)a (rnd_to_zero(float) --> int32)
{
    return _mm512_cvttps_epi32(a);
}

//-----------------------------------------------------------------------
// Comparison operations
//-----------------------------------------------------------------------
template<CompareType CmpTypeT>
static SIMDINLINE Mask SIMDCALL cmp_ps_mask(Float a, Float b)
{
    return _mm512_cmp_ps_mask(a, b, static_cast<int>(CmpTypeT));
}

template<CompareType CmpTypeT>
static SIMDINLINE Float SIMDCALL cmp_ps(Float a, Float b) // return a (CmpTypeT) b
{
    // Expand the AVX-512 predicate mask into a legacy all-ones/all-zeros
    // vector mask so callers written against the narrower targets still work.
    __mmask16 result = cmp_ps_mask<CmpTypeT>(a, b);
    return castsi_ps(vmask(result));
}

static SIMDINLINE Float SIMDCALL cmplt_ps(Float a, Float b) { return cmp_ps<CompareType::LT_OQ>(a, b); }
static SIMDINLINE Float SIMDCALL cmpgt_ps(Float a, Float b) { return cmp_ps<CompareType::GT_OQ>(a, b); }
static SIMDINLINE Float SIMDCALL cmpneq_ps(Float a, Float b) { return cmp_ps<CompareType::NEQ_OQ>(a, b); }
static SIMDINLINE Float SIMDCALL cmpeq_ps(Float a, Float b) { return cmp_ps<CompareType::EQ_OQ>(a, b); }
static SIMDINLINE Float SIMDCALL cmpge_ps(Float a, Float b) { return cmp_ps<CompareType::GE_OQ>(a, b); }
static SIMDINLINE Float SIMDCALL cmple_ps(Float a, Float b) { return cmp_ps<CompareType::LE_OQ>(a, b); }
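
// Example (sketch, using only members of this class): keep a where a >= b,
// take b where a < b:
//   Float m = cmplt_ps(a, b);     // all-ones lanes where a < b
//   Float r = blendv_ps(a, b, m); // m ? b : a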

template<CompareTypeInt CmpTypeT>
static SIMDINLINE Integer SIMDCALL cmp_epi32(Integer a, Integer b)
{
    // Legacy vector mask generator
    __mmask16 result = _mm512_cmp_epi32_mask(a, b, static_cast<int>(CmpTypeT));
    return vmask(result);
}
template<CompareTypeInt CmpTypeT>
static SIMDINLINE Integer SIMDCALL cmp_epi64(Integer a, Integer b)
{
    // Legacy vector mask generator
    __mmask8 result = _mm512_cmp_epi64_mask(a, b, static_cast<int>(CmpTypeT));
    return vmask(result);
}

//SIMD_IWRAPPER_2_CMP(cmpeq_epi8,  cmp_epi8<CompareTypeInt::EQ>);  // return a == b (int8)
//SIMD_IWRAPPER_2_CMP(cmpeq_epi16, cmp_epi16<CompareTypeInt::EQ>); // return a == b (int16)
SIMD_IWRAPPER_2_CMP(cmpeq_epi32, cmp_epi32<CompareTypeInt::EQ>);   // return a == b (int32)
SIMD_IWRAPPER_2_CMP(cmpeq_epi64, cmp_epi64<CompareTypeInt::EQ>);   // return a == b (int64)
//SIMD_IWRAPPER_2_CMP(cmpgt_epi8,  cmp_epi8<CompareTypeInt::GT>);  // return a > b (int8)
//SIMD_IWRAPPER_2_CMP(cmpgt_epi16, cmp_epi16<CompareTypeInt::GT>); // return a > b (int16)
SIMD_IWRAPPER_2_CMP(cmpgt_epi32, cmp_epi32<CompareTypeInt::GT>);   // return a > b (int32)
SIMD_IWRAPPER_2_CMP(cmpgt_epi64, cmp_epi64<CompareTypeInt::GT>);   // return a > b (int64)
SIMD_IWRAPPER_2_CMP(cmplt_epi32, cmp_epi32<CompareTypeInt::LT>);   // return a < b (int32)

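// _mm512_test_epi32_mask sets a mask bit for each lane where (a & b) != 0,
// so a zero mask means every lane of (a & b) is zero -- the ZF result of
// the legacy vptest instruction.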
static SIMDINLINE bool SIMDCALL testz_ps(Float a, Float b) // return all_lanes_zero(a & b) ? 1 : 0 (float)
{
    return (0 == static_cast<int>(_mm512_test_epi32_mask(castps_si(a), castps_si(b))));
}

static SIMDINLINE bool SIMDCALL testz_si(Integer a, Integer b) // return all_lanes_zero(a & b) ? 1 : 0 (int)
{
    return (0 == static_cast<int>(_mm512_test_epi32_mask(a, b)));
}

//-----------------------------------------------------------------------
// Blend / shuffle / permute operations
//-----------------------------------------------------------------------
template <int ImmT>
static SIMDINLINE Float SIMDCALL blend_ps(Float a, Float b) // return ImmT ? b : a (float)
{
    return _mm512_mask_blend_ps(__mmask16(ImmT), a, b);
}

template <int ImmT>
static SIMDINLINE Integer SIMDCALL blend_epi32(Integer a, Integer b) // return ImmT ? b : a (int32)
{
    return _mm512_mask_blend_epi32(__mmask16(ImmT), a, b);
}

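// blendv selects with a vector mask instead of an immediate: movemask_ps
// collects the sign bit of each mask lane into a __mmask16, matching the
// MSB-based lane-select semantics of the legacy AVX blendv instructions.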
static SIMDINLINE Float SIMDCALL blendv_ps(Float a, Float b, Float mask) // return mask ? b : a (float)
{
    return _mm512_mask_blend_ps(__mmask16(movemask_ps(mask)), a, b);
}

static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer a, Integer b, Float mask) // return mask ? b : a (int)
{
    return castps_si(blendv_ps(castsi_ps(a), castsi_ps(b), mask));
}

static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer a, Integer b, Integer mask) // return mask ? b : a (int)
{
    return castps_si(blendv_ps(castsi_ps(a), castsi_ps(b), castsi_ps(mask)));
}

static SIMDINLINE Float SIMDCALL broadcast_ss(float const *p) // return *p (all elements in vector get same value)
{
    return _mm512_set1_ps(*p);
}

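// AVX512F provides no 256-bit float extract/insert (_mm512_extractf32x8_ps
// and _mm512_insertf32x8 require AVX512DQ, which Knights Landing lacks), so
// the float forms below are routed through the f64x4 variants; the casts do
// not change the bit pattern.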
template<int imm>
static SIMDINLINE SIMD256Impl::Float SIMDCALL extract_ps(Float a)
{
    return _mm256_castpd_ps(_mm512_extractf64x4_pd(_mm512_castps_pd(a), imm));
}

template<int imm>
static SIMDINLINE SIMD256Impl::Double SIMDCALL extract_pd(Double a)
{
    return _mm512_extractf64x4_pd(a, imm);
}

template<int imm>
static SIMDINLINE SIMD256Impl::Integer SIMDCALL extract_si(Integer a)
{
    return _mm512_extracti64x4_epi64(a, imm);
}

template<int imm>
static SIMDINLINE Float SIMDCALL insert_ps(Float a, SIMD256Impl::Float b)
{
    return _mm512_castpd_ps(_mm512_insertf64x4(_mm512_castps_pd(a), _mm256_castps_pd(b), imm));
}

template<int imm>
static SIMDINLINE Double SIMDCALL insert_pd(Double a, SIMD256Impl::Double b)
{
    return _mm512_insertf64x4(a, b, imm);
}

template<int imm>
static SIMDINLINE Integer SIMDCALL insert_si(Integer a, SIMD256Impl::Integer b)
{
    return _mm512_inserti64x4(a, b, imm);
}

// SIMD_IWRAPPER_2(packs_epi16);  // See documentation for _mm512_packs_epi16
// SIMD_IWRAPPER_2(packs_epi32);  // See documentation for _mm512_packs_epi32
// SIMD_IWRAPPER_2(packus_epi16); // See documentation for _mm512_packus_epi16
// SIMD_IWRAPPER_2(packus_epi32); // See documentation for _mm512_packus_epi32

static SIMDINLINE Integer SIMDCALL permute_epi32(Integer a, Integer swiz) // return a[swiz[i]] for each 32-bit lane i (int32)
{
    return _mm512_permutexvar_epi32(swiz, a);
}

static SIMDINLINE Float SIMDCALL permute_ps(Float a, Integer swiz) // return a[swiz[i]] for each 32-bit lane i (float)
{
    return _mm512_permutexvar_ps(swiz, a);
}

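// The legacy permute2f128 operations map onto the AVX-512 shuffle_*x4 /
// shuffle_*x2 forms, which pick whole 128-bit lanes from the (a, b) pair
// under control of the immediate.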
SIMD_WRAPPER_2I_(permute2f128_ps, shuffle_f32x4);
SIMD_DWRAPPER_2I_(permute2f128_pd, shuffle_f64x2);
SIMD_IWRAPPER_2I_(permute2f128_si, shuffle_i32x4);

SIMD_IWRAPPER_1I(shuffle_epi32);

//SIMD_IWRAPPER_2(shuffle_epi8);
SIMD_DWRAPPER_2I(shuffle_pd);
SIMD_WRAPPER_2I(shuffle_ps);

template<int ImmT>
static SIMDINLINE Integer SIMDCALL shuffle_epi64(Integer a, Integer b)
{
    return castpd_si(shuffle_pd<ImmT>(castsi_pd(a), castsi_pd(b)));
}

SIMD_IWRAPPER_2(unpackhi_epi16);

//SIMD_IFWRAPPER_2(unpackhi_epi32, _mm512_unpackhi_ps);
static SIMDINLINE Integer SIMDCALL unpackhi_epi32(Integer a, Integer b)
{
    return castps_si(_mm512_unpackhi_ps(castsi_ps(a), castsi_ps(b)));
}

SIMD_IWRAPPER_2(unpackhi_epi64);
//SIMD_IWRAPPER_2(unpackhi_epi8);
SIMD_DWRAPPER_2(unpackhi_pd);
SIMD_WRAPPER_2(unpackhi_ps);
//SIMD_IWRAPPER_2(unpacklo_epi16);
SIMD_IFWRAPPER_2(unpacklo_epi32, unpacklo_ps);
SIMD_IWRAPPER_2(unpacklo_epi64);
//SIMD_IWRAPPER_2(unpacklo_epi8);
SIMD_DWRAPPER_2(unpacklo_pd);
SIMD_WRAPPER_2(unpacklo_ps);

//-----------------------------------------------------------------------
// Load / store operations
//-----------------------------------------------------------------------
template<ScaleFactor ScaleT>
static SIMDINLINE Float SIMDCALL i32gather_ps(float const* p, Integer idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
{
    return _mm512_i32gather_ps(idx, p, static_cast<int>(ScaleT));
}

static SIMDINLINE Float SIMDCALL load1_ps(float const *p) // return *p (broadcast 1 value to all elements)
{
    return broadcast_ss(p);
}

static SIMDINLINE Float SIMDCALL load_ps(float const *p) // return *p (loads SIMD width elements from memory)
{
    return _mm512_load_ps(p);
}

static SIMDINLINE Integer SIMDCALL load_si(Integer const *p) // return *p
{
    return _mm512_load_si512(&p->v);
}

static SIMDINLINE Float SIMDCALL loadu_ps(float const *p) // return *p (same as load_ps but allows for unaligned mem)
{
    return _mm512_loadu_ps(p);
}

static SIMDINLINE Integer SIMDCALL loadu_si(Integer const *p) // return *p (same as load_si but allows for unaligned mem)
{
    return _mm512_loadu_si512(p);
}

// for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
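// Note: the implementation below enables a gather lane whenever its mask
// lane is nonzero (compare-not-equal against zero) rather than testing
// bit 31 alone, and it does not zero the mask afterward; for the all-ones /
// all-zeros lane masks produced by the cmp_* helpers above, the two
// formulations agree.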
template<ScaleFactor ScaleT>
static SIMDINLINE Float SIMDCALL mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask)
{
    __mmask16 k = _mm512_cmpneq_ps_mask(mask, setzero_ps());

    return _mm512_mask_i32gather_ps(old, k, idx, p, static_cast<int>(ScaleT));
}

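// maskstore_ps emulates the legacy vmaskmovps semantics: a lane is written
// only when the sign bit of its int32 mask lane is set, i.e. when the lane
// is negative as a signed integer.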
static SIMDINLINE void SIMDCALL maskstore_ps(float *p, Integer mask, Float src)
{
    Mask m = _mm512_cmplt_epi32_mask(mask, setzero_si());
    _mm512_mask_store_ps(p, m, src);
}

//static SIMDINLINE uint64_t SIMDCALL movemask_epi8(Integer a)
//{
//    __mmask64 m = _mm512_cmplt_epi8_mask(a, setzero_si());
//    return static_cast<uint64_t>(m);
//}

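// movemask packs one bit per lane -- the sign bit -- into an ordinary
// integer, mirroring the legacy vmovmskps / vmovmskpd instructions; here
// each lane is tested against a sign-bit-only constant.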
static SIMDINLINE uint32_t SIMDCALL movemask_pd(Double a)
{
    __mmask8 m = _mm512_test_epi64_mask(castpd_si(a), set1_epi64(0x8000000000000000LL));
    return static_cast<uint32_t>(m);
}
static SIMDINLINE uint32_t SIMDCALL movemask_ps(Float a)
{
    __mmask16 m = _mm512_test_epi32_mask(castps_si(a), set1_epi32(0x80000000));
    return static_cast<uint32_t>(m);
}

static SIMDINLINE Integer SIMDCALL set1_epi64(long long i) // return i (all elements are same value)
{
    return _mm512_set1_epi64(i);
}

static SIMDINLINE Integer SIMDCALL set1_epi32(int i) // return i (all elements are same value)
{
    return _mm512_set1_epi32(i);
}

static SIMDINLINE Integer SIMDCALL set1_epi8(char i) // return i (all elements are same value)
{
    return _mm512_set1_epi8(i);
}

static SIMDINLINE Float SIMDCALL set1_ps(float f) // return f (all elements are same value)
{
    return _mm512_set1_ps(f);
}

static SIMDINLINE Double SIMDCALL setzero_pd() // return 0 (double)
{
    return _mm512_setzero_pd();
}

static SIMDINLINE Float SIMDCALL setzero_ps() // return 0 (float)
{
    return _mm512_setzero_ps();
}

static SIMDINLINE Integer SIMDCALL setzero_si() // return 0 (integer)
{
    return _mm512_setzero_si512();
}

static SIMDINLINE void SIMDCALL store_ps(float *p, Float a) // *p = a (stores all elements contiguously in memory)
{
    _mm512_store_ps(p, a);
}

static SIMDINLINE void SIMDCALL store_si(Integer *p, Integer a) // *p = a
{
    _mm512_store_si512(&p->v, a);
}

static SIMDINLINE void SIMDCALL storeu_si(Integer *p, Integer a) // *p = a (same as store_si but allows for unaligned mem)
{
    _mm512_storeu_si512(&p->v, a);
}

static SIMDINLINE void SIMDCALL stream_ps(float *p, Float a) // *p = a (same as store_ps, but doesn't keep memory in cache)
{
    _mm512_stream_ps(p, a);
}

static SIMDINLINE Integer SIMDCALL set_epi32(
    int i15, int i14, int i13, int i12, int i11, int i10, int i9, int i8,
    int i7, int i6, int i5, int i4, int i3, int i2, int i1, int i0)
{
    return _mm512_set_epi32(
        i15, i14, i13, i12, i11, i10, i9, i8,
        i7, i6, i5, i4, i3, i2, i1, i0);
}

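// The 8-wide set_epi32 / set_ps overloads below zero the upper eight lanes
// so that code written against the SIMD256 targets sees identical lane
// values on this 16-wide target.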
static SIMDINLINE Integer SIMDCALL set_epi32(
    int i7, int i6, int i5, int i4, int i3, int i2, int i1, int i0)
{
    return set_epi32(
        0, 0, 0, 0, 0, 0, 0, 0,
        i7, i6, i5, i4, i3, i2, i1, i0);
}

static SIMDINLINE Float SIMDCALL set_ps(
    float i15, float i14, float i13, float i12, float i11, float i10, float i9, float i8,
    float i7, float i6, float i5, float i4, float i3, float i2, float i1, float i0)
{
    return _mm512_set_ps(
        i15, i14, i13, i12, i11, i10, i9, i8,
        i7, i6, i5, i4, i3, i2, i1, i0);
}

static SIMDINLINE Float SIMDCALL set_ps(
    float i7, float i6, float i5, float i4, float i3, float i2, float i1, float i0)
{
    return set_ps(
        0, 0, 0, 0, 0, 0, 0, 0,
        i7, i6, i5, i4, i3, i2, i1, i0);
}

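// vmask_ps builds a legacy vector mask straight from an integer lane
// bitmask: bit i of 'mask' decides whether lane i is all-ones or zero.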
static SIMDINLINE Float SIMDCALL vmask_ps(int32_t mask)
{
    return castsi_ps(_mm512_maskz_mov_epi32(__mmask16(mask), set1_epi32(-1)));
}

#undef SIMD_WRAPPER_1_
#undef SIMD_WRAPPER_1
#undef SIMD_WRAPPER_2_
#undef SIMD_WRAPPER_2
#undef SIMD_WRAPPERI_2_
#undef SIMD_DWRAPPER_2
#undef SIMD_WRAPPER_2I_
#undef SIMD_WRAPPER_2I
#undef SIMD_DWRAPPER_2I_
#undef SIMD_DWRAPPER_2I
#undef SIMD_WRAPPER_3
#undef SIMD_IWRAPPER_1
#undef SIMD_IWRAPPER_1_8
#undef SIMD_IWRAPPER_1_4
#undef SIMD_IWRAPPER_1I_
#undef SIMD_IWRAPPER_1I
#undef SIMD_IWRAPPER_2_
#undef SIMD_IWRAPPER_2
#undef SIMD_IWRAPPER_2_CMP
#undef SIMD_IFWRAPPER_2
#undef SIMD_IWRAPPER_2I_
#undef SIMD_IWRAPPER_2I