src/gallium/drivers/swr/rasterizer/common/simdlib_512_emu.inl

   1 /****************************************************************************
   2  * Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  ****************************************************************************/
  23 #if !defined(__SIMD_LIB_AVX_HPP__)
  24 #error Do not include this file directly, use "simdlib.hpp" instead.
  25 #endif
  26
  27 //============================================================================
  28 // SIMD16 AVX (1) implementation
  29 //============================================================================
  30
  31 static const int TARGET_SIMD_WIDTH = 8;
  32 using SIMD128T                     = SIMD128Impl::AVXImpl;
  33
  34 #define SIMD_WRAPPER_1(op)                              \
  35     static SIMDINLINE Float SIMDCALL op(Float const& a) \
  36     {                                                   \
  37         return Float{                                   \
  38             SIMD256T::op(a.v8[0]),                      \
  39             SIMD256T::op(a.v8[1]),                      \
  40         };                                              \
  41     }
  42
  43 #define SIMD_WRAPPER_2(op)                                              \
  44     static SIMDINLINE Float SIMDCALL op(Float const& a, Float const& b) \
  45     {                                                                   \
  46         return Float{                                                   \
  47             SIMD256T::op(a.v8[0], b.v8[0]),                             \
  48             SIMD256T::op(a.v8[1], b.v8[1]),                             \
  49         };                                                              \
  50     }
  51
  52 #define SIMD_WRAPPER_2I(op)                                                              \
  53     template <int ImmT>                                                                  \
  54     static SIMDINLINE Float SIMDCALL op(Float const& a, Float const& b)                  \
  55     {                                                                                    \
  56         return Float{                                                                    \
  57             SIMD256T::template op<0xFF & ImmT>(a.v8[0], b.v8[0]),                        \
  58             SIMD256T::template op<0xFF & (ImmT >> TARGET_SIMD_WIDTH)>(a.v8[1], b.v8[1]), \
  59         };                                                                               \
  60     }
  61
  62 #define SIMD_WRAPPER_2I_1(op)                                           \
  63     template <int ImmT>                                                 \
  64     static SIMDINLINE Float SIMDCALL op(Float const& a, Float const& b) \
  65     {                                                                   \
  66         return Float{                                                   \
  67             SIMD256T::template op<ImmT>(a.v8[0], b.v8[0]),              \
  68             SIMD256T::template op<ImmT>(a.v8[1], b.v8[1]),              \
  69         };                                                              \
  70     }
  71
  72 #define SIMD_WRAPPER_3(op)                                                              \
  73     static SIMDINLINE Float SIMDCALL op(Float const& a, Float const& b, Float const& c) \
  74     {                                                                                   \
  75         return Float{                                                                   \
  76             SIMD256T::op(a.v8[0], b.v8[0], c.v8[0]),                                    \
  77             SIMD256T::op(a.v8[1], b.v8[1], c.v8[1]),                                    \
  78         };                                                                              \
  79     }
  80
  81 #define SIMD_IWRAPPER_1(op)                                 \
  82     static SIMDINLINE Integer SIMDCALL op(Integer const& a) \
  83     {                                                       \
  84         return Integer{                                     \
  85             SIMD256T::op(a.v8[0]),                          \
  86             SIMD256T::op(a.v8[1]),                          \
  87         };                                                  \
  88     }
  89
  90 #define SIMD_IWRAPPER_2(op)                                                   \
  91     static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b) \
  92     {                                                                         \
  93         return Integer{                                                       \
  94             SIMD256T::op(a.v8[0], b.v8[0]),                                   \
  95             SIMD256T::op(a.v8[1], b.v8[1]),                                   \
  96         };                                                                    \
  97     }
  98
  99 #define SIMD_IWRAPPER_2I(op)                                                             \
 100     template <int ImmT>                                                                  \
 101     static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b)            \
 102     {                                                                                    \
 103         return Integer{                                                                  \
 104             SIMD256T::template op<0xFF & ImmT>(a.v8[0], b.v8[0]),                        \
 105             SIMD256T::template op<0xFF & (ImmT >> TARGET_SIMD_WIDTH)>(a.v8[1], b.v8[1]), \
 106         };                                                                               \
 107     }
 108
 109 #define SIMD_IWRAPPER_2I_1(op)                                                \
 110     template <int ImmT>                                                       \
 111     static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b) \
 112     {                                                                         \
 113         return Integer{                                                       \
 114             SIMD256T::template op<ImmT>(a.v8[0], b.v8[0]),                    \
 115             SIMD256T::template op<ImmT>(a.v8[1], b.v8[1]),                    \
 116         };                                                                    \
 117     }
 118
 119 #define SIMD_IWRAPPER_2I_2(op)                                                \
 120     template <int ImmT>                                                       \
 121     static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b) \
 122     {                                                                         \
 123         return Integer{                                                       \
 124             SIMD256T::template op<0xF & ImmT>(a.v8[0], b.v8[0]),              \
 125             SIMD256T::template op<0xF & (ImmT >> 4)>(a.v8[1], b.v8[1]),       \
 126         };                                                                    \
 127     }
 128
 129 #define SIMD_IWRAPPER_3(op)                                                                     \
 130     static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b, Integer const& c) \
 131     {                                                                                           \
 132         return Integer{                                                                         \
 133             SIMD256T::op(a.v8[0], b.v8[0], c.v8[0]),                                            \
 134             SIMD256T::op(a.v8[1], b.v8[1], c.v8[1]),                                            \
 135         };                                                                                      \
 136     }
 137
 138 //-----------------------------------------------------------------------
 139 // Single precision floating point arithmetic operations
 140 //-----------------------------------------------------------------------
 141 SIMD_WRAPPER_2(add_ps);   // return a + b
 142 SIMD_WRAPPER_2(div_ps);   // return a / b
 143 SIMD_WRAPPER_3(fmadd_ps); // return (a * b) + c
 144 SIMD_WRAPPER_3(fmsub_ps); // return (a * b) - c
 145 SIMD_WRAPPER_2(max_ps);   // return (a > b) ? a : b
 146 SIMD_WRAPPER_2(min_ps);   // return (a < b) ? a : b
 147 SIMD_WRAPPER_2(mul_ps);   // return a * b
 148 SIMD_WRAPPER_1(rcp_ps);   // return 1.0f / a
 149 SIMD_WRAPPER_1(rsqrt_ps); // return 1.0f / sqrt(a)
 150 SIMD_WRAPPER_2(sub_ps);   // return a - b
 151
 152 template <RoundMode RMT>
 153 static SIMDINLINE Float SIMDCALL round_ps(Float const& a)
 154 {
 155     return Float{
 156         SIMD256T::template round_ps<RMT>(a.v8[0]),
 157         SIMD256T::template round_ps<RMT>(a.v8[1]),
 158     };
 159 }
 160
 161 static SIMDINLINE Float SIMDCALL ceil_ps(Float const& a)
 162 {
 163     return round_ps<RoundMode::CEIL_NOEXC>(a);
 164 }
 165 static SIMDINLINE Float SIMDCALL floor_ps(Float const& a)
 166 {
 167     return round_ps<RoundMode::FLOOR_NOEXC>(a);
 168 }
 169
 170 //-----------------------------------------------------------------------
 171 // Integer (various width) arithmetic operations
 172 //-----------------------------------------------------------------------
 173 SIMD_IWRAPPER_1(abs_epi32); // return absolute_value(a) (int32)
 174 SIMD_IWRAPPER_2(add_epi32); // return a + b (int32)
 175 SIMD_IWRAPPER_2(add_epi8);  // return a + b (int8)
 176 SIMD_IWRAPPER_2(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
 177 SIMD_IWRAPPER_2(max_epi32); // return (a > b) ? a : b (int32)
 178 SIMD_IWRAPPER_2(max_epu32); // return (a > b) ? a : b (uint32)
 179 SIMD_IWRAPPER_2(min_epi32); // return (a < b) ? a : b (int32)
 180 SIMD_IWRAPPER_2(min_epu32); // return (a < b) ? a : b (uint32)
 181 SIMD_IWRAPPER_2(mul_epi32); // return a * b (int32)
 182
 183 // return (a * b) & 0xFFFFFFFF
 184 //
 185 // Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers,
 186 // and store the low 32 bits of the intermediate integers in dst.
 187 SIMD_IWRAPPER_2(mullo_epi32);
 188 SIMD_IWRAPPER_2(sub_epi32); // return a - b (int32)
 189 SIMD_IWRAPPER_2(sub_epi64); // return a - b (int64)
 190 SIMD_IWRAPPER_2(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8)
 191
 192 //-----------------------------------------------------------------------
 193 // Logical operations
 194 //-----------------------------------------------------------------------
 195 SIMD_WRAPPER_2(and_ps);     // return a & b       (float treated as int)
 196 SIMD_IWRAPPER_2(and_si);    // return a & b       (int)
 197 SIMD_WRAPPER_2(andnot_ps);  // return (~a) & b    (float treated as int)
 198 SIMD_IWRAPPER_2(andnot_si); // return (~a) & b    (int)
 199 SIMD_WRAPPER_2(or_ps);      // return a | b       (float treated as int)
 200 SIMD_IWRAPPER_2(or_si);     // return a | b       (int)
 201 SIMD_WRAPPER_2(xor_ps);     // return a ^ b       (float treated as int)
 202 SIMD_IWRAPPER_2(xor_si);    // return a ^ b       (int)
 203
 204 //-----------------------------------------------------------------------
 205 // Shift operations
 206 //-----------------------------------------------------------------------
 207 template <int ImmT>
 208 static SIMDINLINE Integer SIMDCALL slli_epi32(Integer const& a) // return a << ImmT
 209 {
 210     return Integer{
 211         SIMD256T::template slli_epi32<ImmT>(a.v8[0]),
 212         SIMD256T::template slli_epi32<ImmT>(a.v8[1]),
 213     };
 214 }
 215
 216 SIMD_IWRAPPER_2(sllv_epi32); // return a << b      (uint32)
 217
 218 template <int ImmT>
 219 static SIMDINLINE Integer SIMDCALL srai_epi32(Integer const& a) // return a >> ImmT   (int32)
 220 {
 221     return Integer{
 222         SIMD256T::template srai_epi32<ImmT>(a.v8[0]),
 223         SIMD256T::template srai_epi32<ImmT>(a.v8[1]),
 224     };
 225 }
 226
 227 template <int ImmT>
 228 static SIMDINLINE Integer SIMDCALL srli_epi32(Integer const& a) // return a >> ImmT   (uint32)
 229 {
 230     return Integer{
 231         SIMD256T::template srli_epi32<ImmT>(a.v8[0]),
 232         SIMD256T::template srli_epi32<ImmT>(a.v8[1]),
 233     };
 234 }
 235
 236 template <int ImmT>                                          // for each 128-bit lane:
 237 static SIMDINLINE Integer SIMDCALL srli_si(Integer const& a) //  return a >> (ImmT*8) (uint)
 238 {
 239     return Integer{
 240         SIMD256T::template srli_si<ImmT>(a.v8[0]),
 241         SIMD256T::template srli_si<ImmT>(a.v8[1]),
 242     };
 243 }
 244 template <int ImmT>
 245 static SIMDINLINE Float SIMDCALL
 246                         srlisi_ps(Float const& a) // same as srli_si, but with Float cast to int
 247 {
 248     return Float{
 249         SIMD256T::template srlisi_ps<ImmT>(a.v8[0]),
 250         SIMD256T::template srlisi_ps<ImmT>(a.v8[1]),
 251     };
 252 }
 253
 254 SIMD_IWRAPPER_2(srlv_epi32); // return a >> b      (uint32)
 255
 256 //-----------------------------------------------------------------------
 257 // Conversion operations
 258 //-----------------------------------------------------------------------
 259 static SIMDINLINE Float SIMDCALL castpd_ps(Double const& a) // return *(Float*)(&a)
 260 {
 261     return Float{
 262         SIMD256T::castpd_ps(a.v8[0]),
 263         SIMD256T::castpd_ps(a.v8[1]),
 264     };
 265 }
 266
 267 static SIMDINLINE Integer SIMDCALL castps_si(Float const& a) // return *(Integer*)(&a)
 268 {
 269     return Integer{
 270         SIMD256T::castps_si(a.v8[0]),
 271         SIMD256T::castps_si(a.v8[1]),
 272     };
 273 }
 274
 275 static SIMDINLINE Double SIMDCALL castsi_pd(Integer const& a) // return *(Double*)(&a)
 276 {
 277     return Double{
 278         SIMD256T::castsi_pd(a.v8[0]),
 279         SIMD256T::castsi_pd(a.v8[1]),
 280     };
 281 }
 282
 283 static SIMDINLINE Double SIMDCALL castps_pd(Float const& a) // return *(Double*)(&a)
 284 {
 285     return Double{
 286         SIMD256T::castps_pd(a.v8[0]),
 287         SIMD256T::castps_pd(a.v8[1]),
 288     };
 289 }
 290
 291 static SIMDINLINE Float SIMDCALL castsi_ps(Integer const& a) // return *(Float*)(&a)
 292 {
 293     return Float{
 294         SIMD256T::castsi_ps(a.v8[0]),
 295         SIMD256T::castsi_ps(a.v8[1]),
 296     };
 297 }
 298
 299 static SIMDINLINE Float SIMDCALL
 300                         cvtepi32_ps(Integer const& a) // return (float)a    (int32 --> float)
 301 {
 302     return Float{
 303         SIMD256T::cvtepi32_ps(a.v8[0]),
 304         SIMD256T::cvtepi32_ps(a.v8[1]),
 305     };
 306 }
 307
 308 static SIMDINLINE Integer SIMDCALL
 309                           cvtepu8_epi16(SIMD256Impl::Integer const& a) // return (int16)a    (uint8 --> int16)
 310 {
 311     return Integer{
 312         SIMD256T::cvtepu8_epi16(a.v4[0]),
 313         SIMD256T::cvtepu8_epi16(a.v4[1]),
 314     };
 315 }
 316
 317 static SIMDINLINE Integer SIMDCALL
 318                           cvtepu8_epi32(SIMD256Impl::Integer const& a) // return (int32)a    (uint8 --> int32)
 319 {
 320     return Integer{
 321         SIMD256T::cvtepu8_epi32(a.v4[0]),
 322         SIMD256T::cvtepu8_epi32(SIMD128T::template srli_si<8>(a.v4[0])),
 323     };
 324 }
 325
 326 static SIMDINLINE Integer SIMDCALL
 327                           cvtepu16_epi32(SIMD256Impl::Integer const& a) // return (int32)a    (uint16 --> int32)
 328 {
 329     return Integer{
 330         SIMD256T::cvtepu16_epi32(a.v4[0]),
 331         SIMD256T::cvtepu16_epi32(a.v4[1]),
 332     };
 333 }
 334
 335 static SIMDINLINE Integer SIMDCALL
 336                           cvtepu16_epi64(SIMD256Impl::Integer const& a) // return (int64)a    (uint16 --> int64)
 337 {
 338     return Integer{
 339         SIMD256T::cvtepu16_epi64(a.v4[0]),
 340         SIMD256T::cvtepu16_epi64(SIMD128T::template srli_si<8>(a.v4[0])),
 341     };
 342 }
 343
 344 static SIMDINLINE Integer SIMDCALL
 345                           cvtepu32_epi64(SIMD256Impl::Integer const& a) // return (int64)a    (uint32 --> int64)
 346 {
 347     return Integer{
 348         SIMD256T::cvtepu32_epi64(a.v4[0]),
 349         SIMD256T::cvtepu32_epi64(a.v4[1]),
 350     };
 351 }
 352
 353 static SIMDINLINE Integer SIMDCALL
 354                           cvtps_epi32(Float const& a) // return (int32)a    (float --> int32)
 355 {
 356     return Integer{
 357         SIMD256T::cvtps_epi32(a.v8[0]),
 358         SIMD256T::cvtps_epi32(a.v8[1]),
 359     };
 360 }
 361
 362 static SIMDINLINE Integer SIMDCALL
 363                           cvttps_epi32(Float const& a) // return (int32)a    (rnd_to_zero(float) --> int32)
 364 {
 365     return Integer{
 366         SIMD256T::cvtps_epi32(a.v8[0]),
 367         SIMD256T::cvtps_epi32(a.v8[1]),
 368     };
 369 }
 370
 371 //-----------------------------------------------------------------------
 372 // Comparison operations
 373 //-----------------------------------------------------------------------
 374 template <CompareType CmpTypeT>
 375 static SIMDINLINE Float SIMDCALL cmp_ps(Float const& a, Float const& b) // return a (CmpTypeT) b
 376 {
 377     return Float{
 378         SIMD256T::template cmp_ps<CmpTypeT>(a.v8[0], b.v8[0]),
 379         SIMD256T::template cmp_ps<CmpTypeT>(a.v8[1], b.v8[1]),
 380     };
 381 }
 382 static SIMDINLINE Float SIMDCALL cmplt_ps(Float const& a, Float const& b)
 383 {
 384     return cmp_ps<CompareType::LT_OQ>(a, b);
 385 }
 386 static SIMDINLINE Float SIMDCALL cmpgt_ps(Float const& a, Float const& b)
 387 {
 388     return cmp_ps<CompareType::GT_OQ>(a, b);
 389 }
 390 static SIMDINLINE Float SIMDCALL cmpneq_ps(Float const& a, Float const& b)
 391 {
 392     return cmp_ps<CompareType::NEQ_OQ>(a, b);
 393 }
 394 static SIMDINLINE Float SIMDCALL cmpeq_ps(Float const& a, Float const& b)
 395 {
 396     return cmp_ps<CompareType::EQ_OQ>(a, b);
 397 }
 398 static SIMDINLINE Float SIMDCALL cmpge_ps(Float const& a, Float const& b)
 399 {
 400     return cmp_ps<CompareType::GE_OQ>(a, b);
 401 }
 402 static SIMDINLINE Float SIMDCALL cmple_ps(Float const& a, Float const& b)
 403 {
 404     return cmp_ps<CompareType::LE_OQ>(a, b);
 405 }
 406
 407 template <CompareType CmpTypeT>
 408 static SIMDINLINE Mask SIMDCALL cmp_ps_mask(Float const& a, Float const& b)
 409 {
 410     return static_cast<Mask>(movemask_ps(cmp_ps<CmpTypeT>(a, b)));
 411 }
 412
 413 SIMD_IWRAPPER_2(cmpeq_epi8);  // return a == b (int8)
 414 SIMD_IWRAPPER_2(cmpeq_epi16); // return a == b (int16)
 415 SIMD_IWRAPPER_2(cmpeq_epi32); // return a == b (int32)
 416 SIMD_IWRAPPER_2(cmpeq_epi64); // return a == b (int64)
 417 SIMD_IWRAPPER_2(cmpgt_epi8);  // return a > b (int8)
 418 SIMD_IWRAPPER_2(cmpgt_epi16); // return a > b (int16)
 419 SIMD_IWRAPPER_2(cmpgt_epi32); // return a > b (int32)
 420 SIMD_IWRAPPER_2(cmpgt_epi64); // return a > b (int64)
 421 SIMD_IWRAPPER_2(cmplt_epi32); // return a < b (int32)
 422
 423 static SIMDINLINE bool SIMDCALL
 424                        testz_ps(Float const& a, Float const& b) // return all_lanes_zero(a & b) ? 1 : 0 (float)
 425 {
 426     return 0 != (SIMD256T::testz_ps(a.v8[0], b.v8[0]) & SIMD256T::testz_ps(a.v8[1], b.v8[1]));
 427 }
 428
 429 static SIMDINLINE bool SIMDCALL
 430                        testz_si(Integer const& a, Integer const& b) // return all_lanes_zero(a & b) ? 1 : 0 (int)
 431 {
 432     return 0 != (SIMD256T::testz_si(a.v8[0], b.v8[0]) & SIMD256T::testz_si(a.v8[1], b.v8[1]));
 433 }
 434
 435 //-----------------------------------------------------------------------
 436 // Blend / shuffle / permute operations
 437 //-----------------------------------------------------------------------
 438 SIMD_WRAPPER_2I(blend_ps);     // return ImmT ? b : a  (float)
 439 SIMD_IWRAPPER_2I(blend_epi32); // return ImmT ? b : a  (int32)
 440 SIMD_WRAPPER_3(blendv_ps);     // return mask ? b : a  (float)
 441 static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer const& a,
 442                                                 Integer const& b,
 443                                                 Float const&   mask) // return mask ? b : a (int)
 444 {
 445     return Integer{
 446         SIMD256T::blendv_epi32(a.v8[0], b.v8[0], mask.v8[0]),
 447         SIMD256T::blendv_epi32(a.v8[1], b.v8[1], mask.v8[1]),
 448     };
 449 }
 450
 451 static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer const& a,
 452                                                 Integer const& b,
 453                                                 Integer const& mask) // return mask ? b : a (int)
 454 {
 455     return Integer{
 456         SIMD256T::blendv_epi32(a.v8[0], b.v8[0], mask.v8[0]),
 457         SIMD256T::blendv_epi32(a.v8[1], b.v8[1], mask.v8[1]),
 458     };
 459 }
 460
 461 static SIMDINLINE Float SIMDCALL
 462                         broadcast_ss(float const* p) // return *p (all elements in vector get same value)
 463 {
 464     float f = *p;
 465     return Float{
 466         SIMD256T::set1_ps(f),
 467         SIMD256T::set1_ps(f),
 468     };
 469 }
 470
 471 template <int imm>
 472 static SIMDINLINE SIMD256Impl::Float SIMDCALL extract_ps(Float const& a)
 473 {
 474     SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
 475     return a.v8[imm];
 476 }
 477
 478 template <int imm>
 479 static SIMDINLINE SIMD256Impl::Double SIMDCALL extract_pd(Double const& a)
 480 {
 481     SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
 482     return a.v8[imm];
 483 }
 484
 485 template <int imm>
 486 static SIMDINLINE SIMD256Impl::Integer SIMDCALL extract_si(Integer const& a)
 487 {
 488     SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
 489     return a.v8[imm];
 490 }
 491
 492 template <int imm>
 493 static SIMDINLINE Float SIMDCALL insert_ps(Float const& a, SIMD256Impl::Float const& b)
 494 {
 495     SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
 496     Float r   = a;
 497     r.v8[imm] = b;
 498     return r;
 499 }
 500
 501 template <int imm>
 502 static SIMDINLINE Double SIMDCALL insert_pd(Double const& a, SIMD256Impl::Double const& b)
 503 {
 504     SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
 505     Double r  = a;
 506     r.v8[imm] = b;
 507     return r;
 508 }
 509
 510 template <int imm>
 511 static SIMDINLINE Integer SIMDCALL insert_si(Integer const& a, SIMD256Impl::Integer const& b)
 512 {
 513     SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
 514     Integer r = a;
 515     r.v8[imm] = b;
 516     return r;
 517 }
 518
 519 SIMD_IWRAPPER_2(packs_epi16);  // See documentation for _mm256_packs_epi16 and _mm512_packs_epi16
 520 SIMD_IWRAPPER_2(packs_epi32);  // See documentation for _mm256_packs_epi32 and _mm512_packs_epi32
 521 SIMD_IWRAPPER_2(packus_epi16); // See documentation for _mm256_packus_epi16 and _mm512_packus_epi16
 522 SIMD_IWRAPPER_2(packus_epi32); // See documentation for _mm256_packus_epi32 and _mm512_packus_epi32
 523
 524 template <int ImmT>
 525 static SIMDINLINE Float SIMDCALL permute_ps(Float const& a)
 526 {
 527     return Float{
 528         SIMD256T::template permute_ps<ImmT>(a.v8[0]),
 529         SIMD256T::template permute_ps<ImmT>(a.v8[1]),
 530     };
 531 }
 532
 533 static SIMDINLINE Integer SIMDCALL permute_epi32(
 534     Integer const& a, Integer const& swiz) // return a[swiz[i]] for each 32-bit lane i (int32)
 535 {
 536     return castps_si(permute_ps(castsi_ps(a), swiz));
 537 }
 538
 539 static SIMDINLINE Float SIMDCALL
 540                         permute_ps(Float const& a, Integer const& swiz) // return a[swiz[i]] for each 32-bit lane i (float)
 541 {
 542     const auto mask = SIMD256T::set1_epi32(7);
 543
 544     auto lolo = SIMD256T::permute_ps(a.v8[0], SIMD256T::and_si(swiz.v8[0], mask));
 545     auto lohi = SIMD256T::permute_ps(a.v8[1], SIMD256T::and_si(swiz.v8[0], mask));
 546
 547     auto hilo = SIMD256T::permute_ps(a.v8[0], SIMD256T::and_si(swiz.v8[1], mask));
 548     auto hihi = SIMD256T::permute_ps(a.v8[1], SIMD256T::and_si(swiz.v8[1], mask));
 549
 550     return Float{
 551         SIMD256T::blendv_ps(
 552             lolo, lohi, SIMD256T::castsi_ps(SIMD256T::cmpgt_epi32(swiz.v8[0], mask))),
 553         SIMD256T::blendv_ps(
 554             hilo, hihi, SIMD256T::castsi_ps(SIMD256T::cmpgt_epi32(swiz.v8[1], mask))),
 555     };
 556 }
 557
 558 // All of the 512-bit permute2f128_XX intrinsics do the following:
 559 //
//      SELECT4(src, control) {
//          CASE(control[1:0])
//              0: tmp[127:0] := src[127:0]
//              1: tmp[127:0] := src[255:128]
//              2: tmp[127:0] := src[383:256]
//              3: tmp[127:0] := src[511:384]
//          ESAC
//          RETURN tmp[127:0]
//      }
//
//      dst[127:0]   := SELECT4(a[511:0], imm8[1:0])
//      dst[255:128] := SELECT4(a[511:0], imm8[3:2])
//      dst[383:256] := SELECT4(b[511:0], imm8[5:4])
//      dst[511:384] := SELECT4(b[511:0], imm8[7:6])
//      dst[MAX:512] := 0
 575 //
 576 // Since the 256-bit AVX instructions use a 4-bit control field (instead
 577 // of 2-bit for AVX512), we need to expand the control bits sent to the
 578 // AVX instructions for emulation.
 579 //
 580 template <int shuf>
 581 static SIMDINLINE Float SIMDCALL permute2f128_ps(Float const& a, Float const& b)
 582 {
 583     return Float{
 584         SIMD256T::template permute2f128_ps<((shuf & 0x03) << 0) | ((shuf & 0x0C) << 2)>(a.v8[0],
 585                                                                                         a.v8[1]),
 586         SIMD256T::template permute2f128_ps<((shuf & 0x30) >> 4) | ((shuf & 0xC0) >> 2)>(b.v8[0],
 587                                                                                         b.v8[1]),
 588     };
 589 }
 590
 591 template <int shuf>
 592 static SIMDINLINE Double SIMDCALL permute2f128_pd(Double const& a, Double const& b)
 593 {
 594     return Double{
 595         SIMD256T::template permute2f128_pd<((shuf & 0x03) << 0) | ((shuf & 0x0C) << 2)>(a.v8[0],
 596                                                                                         a.v8[1]),
 597         SIMD256T::template permute2f128_pd<((shuf & 0x30) >> 4) | ((shuf & 0xC0) >> 2)>(b.v8[0],
 598                                                                                         b.v8[1]),
 599     };
 600 }
 601
 602 template <int shuf>
 603 static SIMDINLINE Integer SIMDCALL permute2f128_si(Integer const& a, Integer const& b)
 604 {
 605     return Integer{
 606         SIMD256T::template permute2f128_si<((shuf & 0x03) << 0) | ((shuf & 0x0C) << 2)>(a.v8[0],
 607                                                                                         a.v8[1]),
 608         SIMD256T::template permute2f128_si<((shuf & 0x30) >> 4) | ((shuf & 0xC0) >> 2)>(b.v8[0],
 609                                                                                         b.v8[1]),
 610     };
 611 }
 612
 613 SIMD_IWRAPPER_2I_1(shuffle_epi32);
 614 SIMD_IWRAPPER_2I_2(shuffle_epi64);
 615 SIMD_IWRAPPER_2(shuffle_epi8);
 616 SIMD_WRAPPER_2I_1(shuffle_pd);
 617 SIMD_WRAPPER_2I_1(shuffle_ps);
 618 SIMD_IWRAPPER_2(unpackhi_epi16);
 619 SIMD_IWRAPPER_2(unpackhi_epi32);
 620 SIMD_IWRAPPER_2(unpackhi_epi64);
 621 SIMD_IWRAPPER_2(unpackhi_epi8);
 622 SIMD_WRAPPER_2(unpackhi_pd);
 623 SIMD_WRAPPER_2(unpackhi_ps);
 624 SIMD_IWRAPPER_2(unpacklo_epi16);
 625 SIMD_IWRAPPER_2(unpacklo_epi32);
 626 SIMD_IWRAPPER_2(unpacklo_epi64);
 627 SIMD_IWRAPPER_2(unpacklo_epi8);
 628 SIMD_WRAPPER_2(unpacklo_pd);
 629 SIMD_WRAPPER_2(unpacklo_ps);
 630
 631 //-----------------------------------------------------------------------
 632 // Load / store operations
 633 //-----------------------------------------------------------------------
 634 template <ScaleFactor ScaleT = ScaleFactor::SF_1>
 635 static SIMDINLINE Float SIMDCALL
 636                         i32gather_ps(float const* p, Integer const& idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
 637 {
 638     return Float{
 639         SIMD256T::template i32gather_ps<ScaleT>(p, idx.v8[0]),
 640         SIMD256T::template i32gather_ps<ScaleT>(p, idx.v8[1]),
 641     };
 642 }
 643
 644 template <ScaleFactor ScaleT = ScaleFactor::SF_1>
 645 static SIMDINLINE Float SIMDCALL
 646                         sw_i32gather_ps(float const* p, Integer const& idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
 647 {
 648     return Float{
 649         SIMD256T::template sw_i32gather_ps<ScaleT>(p, idx.v8[0]),
 650         SIMD256T::template sw_i32gather_ps<ScaleT>(p, idx.v8[1]),
 651     };
 652 }
 653
 654 static SIMDINLINE Float SIMDCALL
 655                         load1_ps(float const* p) // return *p    (broadcast 1 value to all elements)
 656 {
 657     return broadcast_ss(p);
 658 }
 659
 660 static SIMDINLINE Float SIMDCALL
 661                         load_ps(float const* p) // return *p    (loads SIMD width elements from memory)
 662 {
 663     return Float{SIMD256T::load_ps(p), SIMD256T::load_ps(p + TARGET_SIMD_WIDTH)};
 664 }
 665
 666 static SIMDINLINE Integer SIMDCALL load_si(Integer const* p) // return *p
 667 {
 668     return Integer{
 669         SIMD256T::load_si(&p->v8[0]),
 670         SIMD256T::load_si(&p->v8[1]),
 671     };
 672 }
 673
 674 static SIMDINLINE Float SIMDCALL
 675                         loadu_ps(float const* p) // return *p    (same as load_ps but allows for unaligned mem)
 676 {
 677     return Float{SIMD256T::loadu_ps(p), SIMD256T::loadu_ps(p + TARGET_SIMD_WIDTH)};
 678 }
 679
 680 static SIMDINLINE Integer SIMDCALL
 681                           loadu_si(Integer const* p) // return *p    (same as load_si but allows for unaligned mem)
 682 {
 683     return Integer{
 684         SIMD256T::loadu_si(&p->v8[0]),
 685         SIMD256T::loadu_si(&p->v8[1]),
 686     };
 687 }
 688
 689 // for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
 690 template <ScaleFactor ScaleT = ScaleFactor::SF_1>
 691 static SIMDINLINE Float SIMDCALL
 692                         mask_i32gather_ps(Float const& old, float const* p, Integer const& idx, Float const& mask)
 693 {
 694     return Float{
 695         SIMD256T::template mask_i32gather_ps<ScaleT>(old.v8[0], p, idx.v8[0], mask.v8[0]),
 696         SIMD256T::template mask_i32gather_ps<ScaleT>(old.v8[1], p, idx.v8[1], mask.v8[1]),
 697     };
 698 }
 699
 700 template <ScaleFactor ScaleT = ScaleFactor::SF_1>
 701 static SIMDINLINE Float SIMDCALL
 702                         sw_mask_i32gather_ps(Float const& old, float const* p, Integer const& idx, Float const& mask)
 703 {
 704     return Float{
 705         SIMD256T::template sw_mask_i32gather_ps<ScaleT>(old.v8[0], p, idx.v8[0], mask.v8[0]),
 706         SIMD256T::template sw_mask_i32gather_ps<ScaleT>(old.v8[1], p, idx.v8[1], mask.v8[1]),
 707     };
 708 }
 709
 710 static SIMDINLINE void SIMDCALL maskstore_ps(float* p, Integer const& mask, Float const& src)
 711 {
 712     SIMD256T::maskstore_ps(p, mask.v8[0], src.v8[0]);
 713     SIMD256T::maskstore_ps(p + TARGET_SIMD_WIDTH, mask.v8[1], src.v8[1]);
 714 }
 715
 716 static SIMDINLINE uint64_t SIMDCALL movemask_epi8(Integer const& a)
 717 {
 718     uint64_t mask = static_cast<uint64_t>(SIMD256T::movemask_epi8(a.v8[0]));
 719     mask |= static_cast<uint64_t>(SIMD256T::movemask_epi8(a.v8[1])) << (TARGET_SIMD_WIDTH * 4);
 720
 721     return mask;
 722 }
 723
 724 static SIMDINLINE uint32_t SIMDCALL movemask_pd(Double const& a)
 725 {
 726     uint32_t mask = static_cast<uint32_t>(SIMD256T::movemask_pd(a.v8[0]));
 727     mask |= static_cast<uint32_t>(SIMD256T::movemask_pd(a.v8[1])) << (TARGET_SIMD_WIDTH / 2);
 728
 729     return mask;
 730 }
 731 static SIMDINLINE uint32_t SIMDCALL movemask_ps(Float const& a)
 732 {
 733     uint32_t mask = static_cast<uint32_t>(SIMD256T::movemask_ps(a.v8[0]));
 734     mask |= static_cast<uint32_t>(SIMD256T::movemask_ps(a.v8[1])) << TARGET_SIMD_WIDTH;
 735
 736     return mask;
 737 }
 738
 739 static SIMDINLINE Integer SIMDCALL set1_epi32(int i) // return i (all elements are same value)
 740 {
 741     return Integer{SIMD256T::set1_epi32(i), SIMD256T::set1_epi32(i)};
 742 }
 743
 744 static SIMDINLINE Integer SIMDCALL set1_epi8(char i) // return i (all elements are same value)
 745 {
 746     return Integer{SIMD256T::set1_epi8(i), SIMD256T::set1_epi8(i)};
 747 }
 748
 749 static SIMDINLINE Float SIMDCALL set1_ps(float f) // return f (all elements are same value)
 750 {
 751     return Float{SIMD256T::set1_ps(f), SIMD256T::set1_ps(f)};
 752 }
 753
 754 static SIMDINLINE Float SIMDCALL setzero_ps() // return 0 (float)
 755 {
 756     return Float{SIMD256T::setzero_ps(), SIMD256T::setzero_ps()};
 757 }
 758
 759 static SIMDINLINE Integer SIMDCALL setzero_si() // return 0 (integer)
 760 {
 761     return Integer{SIMD256T::setzero_si(), SIMD256T::setzero_si()};
 762 }
 763
 764 static SIMDINLINE void SIMDCALL
 765                        store_ps(float* p, Float const& a) // *p = a   (stores all elements contiguously in memory)
 766 {
 767     SIMD256T::store_ps(p, a.v8[0]);
 768     SIMD256T::store_ps(p + TARGET_SIMD_WIDTH, a.v8[1]);
 769 }
 770
 771 static SIMDINLINE void SIMDCALL store_si(Integer* p, Integer const& a) // *p = a
 772 {
 773     SIMD256T::store_si(&p->v8[0], a.v8[0]);
 774     SIMD256T::store_si(&p->v8[1], a.v8[1]);
 775 }
 776
 777 static SIMDINLINE void SIMDCALL
 778                        stream_ps(float* p, Float const& a) // *p = a   (same as store_ps, but doesn't keep memory in cache)
 779 {
 780     SIMD256T::stream_ps(p, a.v8[0]);
 781     SIMD256T::stream_ps(p + TARGET_SIMD_WIDTH, a.v8[1]);
 782 }
 783
 784 static SIMDINLINE Integer SIMDCALL set_epi32(int i15,
 785                                              int i14,
 786                                              int i13,
 787                                              int i12,
 788                                              int i11,
 789                                              int i10,
 790                                              int i9,
 791                                              int i8,
 792                                              int i7,
 793                                              int i6,
 794                                              int i5,
 795                                              int i4,
 796                                              int i3,
 797                                              int i2,
 798                                              int i1,
 799                                              int i0)
 800 {
 801     return Integer{SIMD256T::set_epi32(i7, i6, i5, i4, i3, i2, i1, i0),
 802                    SIMD256T::set_epi32(i15, i14, i13, i12, i11, i10, i9, i8)};
 803 }
 804
 805 static SIMDINLINE Integer SIMDCALL
 806                           set_epi32(int i7, int i6, int i5, int i4, int i3, int i2, int i1, int i0)
 807 {
 808     return set_epi32(0, 0, 0, 0, 0, 0, 0, 0, i7, i6, i5, i4, i3, i2, i1, i0);
 809 }
 810
 811 static SIMDINLINE Float SIMDCALL set_ps(float i15,
 812                                         float i14,
 813                                         float i13,
 814                                         float i12,
 815                                         float i11,
 816                                         float i10,
 817                                         float i9,
 818                                         float i8,
 819                                         float i7,
 820                                         float i6,
 821                                         float i5,
 822                                         float i4,
 823                                         float i3,
 824                                         float i2,
 825                                         float i1,
 826                                         float i0)
 827 {
 828     return Float{SIMD256T::set_ps(i7, i6, i5, i4, i3, i2, i1, i0),
 829                  SIMD256T::set_ps(i15, i14, i13, i12, i11, i10, i9, i8)};
 830 }
 831
 832 static SIMDINLINE Float SIMDCALL
 833                         set_ps(float i7, float i6, float i5, float i4, float i3, float i2, float i1, float i0)
 834 {
 835     return set_ps(0, 0, 0, 0, 0, 0, 0, 0, i7, i6, i5, i4, i3, i2, i1, i0);
 836 }
 837
 838 static SIMDINLINE Float SIMDCALL vmask_ps(int32_t mask)
 839 {
 840     return Float{SIMD256T::vmask_ps(mask), SIMD256T::vmask_ps(mask >> TARGET_SIMD_WIDTH)};
 841 }
 842
// Undefine the SIMD_WRAPPER_* / SIMD_IWRAPPER_* helper macros so they do not
// remain defined after this point. SIMD_WRAPPER_1 is defined near the top of
// this file; the others are presumably defined alongside it (confirm).
#undef SIMD_WRAPPER_1
#undef SIMD_WRAPPER_2
#undef SIMD_WRAPPER_2I
#undef SIMD_WRAPPER_2I_1
#undef SIMD_WRAPPER_3
#undef SIMD_IWRAPPER_1
#undef SIMD_IWRAPPER_2
#undef SIMD_IWRAPPER_2I
#undef SIMD_IWRAPPER_2I_1
#undef SIMD_IWRAPPER_3