1 /****************************************************************************
2 * Copyright (C) 2017 Intel Corporation. All Rights Reserved.
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the Software.
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
22 ****************************************************************************/
23 #if !defined(__SIMD_LIB_AVX_HPP__)
24 #error Do not include this file directly, use "simdlib.hpp" instead.
27 //============================================================================
28 // SIMD16 AVX (1) implementation
29 //============================================================================
31 static const int TARGET_SIMD_WIDTH = 8;
32 using SIMD128T = SIMD128Impl::AVXImpl;
34 #define SIMD_WRAPPER_1(op) \
35 static SIMDINLINE Float SIMDCALL op(Float const &a) \
39 SIMD256T::op(a.v8[0]),\
40 SIMD256T::op(a.v8[1]),\
44 #define SIMD_WRAPPER_2(op) \
45 static SIMDINLINE Float SIMDCALL op(Float const &a, Float const &b) \
49 SIMD256T::op(a.v8[0], b.v8[0]),\
50 SIMD256T::op(a.v8[1], b.v8[1]),\
54 #define SIMD_WRAPPER_2I(op) \
56 static SIMDINLINE Float SIMDCALL op(Float const &a, Float const &b) \
60 SIMD256T::template op<0xFF & ImmT>(a.v8[0], b.v8[0]),\
61 SIMD256T::template op<0xFF & (ImmT >> TARGET_SIMD_WIDTH)>(a.v8[1], b.v8[1]),\
65 #define SIMD_WRAPPER_2I_1(op) \
67 static SIMDINLINE Float SIMDCALL op(Float const &a, Float const &b) \
71 SIMD256T::template op<ImmT>(a.v8[0], b.v8[0]),\
72 SIMD256T::template op<ImmT>(a.v8[1], b.v8[1]),\
76 #define SIMD_WRAPPER_3(op) \
77 static SIMDINLINE Float SIMDCALL op(Float const &a, Float const &b, Float const &c) \
81 SIMD256T::op(a.v8[0], b.v8[0], c.v8[0]),\
82 SIMD256T::op(a.v8[1], b.v8[1], c.v8[1]),\
86 #define SIMD_IWRAPPER_1(op) \
87 static SIMDINLINE Integer SIMDCALL op(Integer const &a) \
91 SIMD256T::op(a.v8[0]),\
92 SIMD256T::op(a.v8[1]),\
96 #define SIMD_IWRAPPER_2(op) \
97 static SIMDINLINE Integer SIMDCALL op(Integer const &a, Integer const &b) \
101 SIMD256T::op(a.v8[0], b.v8[0]),\
102 SIMD256T::op(a.v8[1], b.v8[1]),\
106 #define SIMD_IWRAPPER_2I(op) \
108 static SIMDINLINE Integer SIMDCALL op(Integer const &a, Integer const &b) \
112 SIMD256T::template op<0xFF & ImmT>(a.v8[0], b.v8[0]),\
113 SIMD256T::template op<0xFF & (ImmT >> TARGET_SIMD_WIDTH)>(a.v8[1], b.v8[1]),\
117 #define SIMD_IWRAPPER_2I_1(op) \
119 static SIMDINLINE Integer SIMDCALL op(Integer const &a, Integer const &b) \
123 SIMD256T::template op<ImmT>(a.v8[0], b.v8[0]),\
124 SIMD256T::template op<ImmT>(a.v8[1], b.v8[1]),\
128 #define SIMD_IWRAPPER_2I_2(op) \
130 static SIMDINLINE Integer SIMDCALL op(Integer const &a, Integer const &b) \
134 SIMD256T::template op<0xF & ImmT>(a.v8[0], b.v8[0]),\
135 SIMD256T::template op<0xF & (ImmT >> 4)>(a.v8[1], b.v8[1]),\
139 #define SIMD_IWRAPPER_3(op) \
140 static SIMDINLINE Integer SIMDCALL op(Integer const &a, Integer const &b, Integer const &c) \
144 SIMD256T::op(a.v8[0], b.v8[0], c.v8[0]),\
145 SIMD256T::op(a.v8[1], b.v8[1], c.v8[1]),\
149 //-----------------------------------------------------------------------
150 // Single precision floating point arithmetic operations
151 //-----------------------------------------------------------------------
152 SIMD_WRAPPER_2(add_ps); // return a + b
153 SIMD_WRAPPER_2(div_ps); // return a / b
154 SIMD_WRAPPER_3(fmadd_ps); // return (a * b) + c
155 SIMD_WRAPPER_3(fmsub_ps); // return (a * b) - c
156 SIMD_WRAPPER_2(max_ps); // return (a > b) ? a : b
157 SIMD_WRAPPER_2(min_ps); // return (a < b) ? a : b
158 SIMD_WRAPPER_2(mul_ps); // return a * b
159 SIMD_WRAPPER_1(rcp_ps); // return 1.0f / a
160 SIMD_WRAPPER_1(rsqrt_ps); // return 1.0f / sqrt(a)
161 SIMD_WRAPPER_2(sub_ps); // return a - b
163 template <RoundMode RMT>
164 static SIMDINLINE Float SIMDCALL round_ps(Float const &a)
168 SIMD256T::template round_ps<RMT>(a.v8[0]),
169 SIMD256T::template round_ps<RMT>(a.v8[1]),
173 static SIMDINLINE Float SIMDCALL ceil_ps(Float const &a) { return round_ps<RoundMode::CEIL_NOEXC>(a); }
174 static SIMDINLINE Float SIMDCALL floor_ps(Float const &a) { return round_ps<RoundMode::FLOOR_NOEXC>(a); }
176 //-----------------------------------------------------------------------
177 // Integer (various width) arithmetic operations
178 //-----------------------------------------------------------------------
179 SIMD_IWRAPPER_1(abs_epi32); // return absolute_value(a) (int32)
180 SIMD_IWRAPPER_2(add_epi32); // return a + b (int32)
181 SIMD_IWRAPPER_2(add_epi8); // return a + b (int8)
182 SIMD_IWRAPPER_2(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
183 SIMD_IWRAPPER_2(max_epi32); // return (a > b) ? a : b (int32)
184 SIMD_IWRAPPER_2(max_epu32); // return (a > b) ? a : b (uint32)
185 SIMD_IWRAPPER_2(min_epi32); // return (a < b) ? a : b (int32)
186 SIMD_IWRAPPER_2(min_epu32); // return (a < b) ? a : b (uint32)
187 SIMD_IWRAPPER_2(mul_epi32); // return a * b (int32)
189 // return (a * b) & 0xFFFFFFFF
191 // Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers,
192 // and store the low 32 bits of the intermediate integers in dst.
193 SIMD_IWRAPPER_2(mullo_epi32);
194 SIMD_IWRAPPER_2(sub_epi32); // return a - b (int32)
195 SIMD_IWRAPPER_2(sub_epi64); // return a - b (int64)
196 SIMD_IWRAPPER_2(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8)
198 //-----------------------------------------------------------------------
199 // Logical operations
200 //-----------------------------------------------------------------------
201 SIMD_WRAPPER_2(and_ps); // return a & b (float treated as int)
202 SIMD_IWRAPPER_2(and_si); // return a & b (int)
203 SIMD_WRAPPER_2(andnot_ps); // return (~a) & b (float treated as int)
204 SIMD_IWRAPPER_2(andnot_si); // return (~a) & b (int)
205 SIMD_WRAPPER_2(or_ps); // return a | b (float treated as int)
206 SIMD_IWRAPPER_2(or_si); // return a | b (int)
207 SIMD_WRAPPER_2(xor_ps); // return a ^ b (float treated as int)
208 SIMD_IWRAPPER_2(xor_si); // return a ^ b (int)
211 //-----------------------------------------------------------------------
213 //-----------------------------------------------------------------------
215 static SIMDINLINE Integer SIMDCALL slli_epi32(Integer const &a) // return a << ImmT
219 SIMD256T::template slli_epi32<ImmT>(a.v8[0]),
220 SIMD256T::template slli_epi32<ImmT>(a.v8[1]),
224 SIMD_IWRAPPER_2(sllv_epi32); // return a << b (uint32)
227 static SIMDINLINE Integer SIMDCALL srai_epi32(Integer const &a) // return a >> ImmT (int32)
231 SIMD256T::template srai_epi32<ImmT>(a.v8[0]),
232 SIMD256T::template srai_epi32<ImmT>(a.v8[1]),
237 static SIMDINLINE Integer SIMDCALL srli_epi32(Integer const &a) // return a >> ImmT (uint32)
241 SIMD256T::template srli_epi32<ImmT>(a.v8[0]),
242 SIMD256T::template srli_epi32<ImmT>(a.v8[1]),
246 template<int ImmT> // for each 128-bit lane:
247 static SIMDINLINE Integer SIMDCALL srli_si(Integer const &a) // return a >> (ImmT*8) (uint)
251 SIMD256T::template srli_si<ImmT>(a.v8[0]),
252 SIMD256T::template srli_si<ImmT>(a.v8[1]),
256 static SIMDINLINE Float SIMDCALL srlisi_ps(Float const &a) // same as srli_si, but with Float cast to int
260 SIMD256T::template srlisi_ps<ImmT>(a.v8[0]),
261 SIMD256T::template srlisi_ps<ImmT>(a.v8[1]),
265 SIMD_IWRAPPER_2(srlv_epi32); // return a >> b (uint32)
267 //-----------------------------------------------------------------------
268 // Conversion operations
269 //-----------------------------------------------------------------------
270 static SIMDINLINE Float SIMDCALL castpd_ps(Double const &a) // return *(Float*)(&a)
274 SIMD256T::castpd_ps(a.v8[0]),
275 SIMD256T::castpd_ps(a.v8[1]),
279 static SIMDINLINE Integer SIMDCALL castps_si(Float const &a) // return *(Integer*)(&a)
283 SIMD256T::castps_si(a.v8[0]),
284 SIMD256T::castps_si(a.v8[1]),
288 static SIMDINLINE Double SIMDCALL castsi_pd(Integer const &a) // return *(Double*)(&a)
292 SIMD256T::castsi_pd(a.v8[0]),
293 SIMD256T::castsi_pd(a.v8[1]),
297 static SIMDINLINE Double SIMDCALL castps_pd(Float const &a) // return *(Double*)(&a)
301 SIMD256T::castps_pd(a.v8[0]),
302 SIMD256T::castps_pd(a.v8[1]),
306 static SIMDINLINE Float SIMDCALL castsi_ps(Integer const &a) // return *(Float*)(&a)
310 SIMD256T::castsi_ps(a.v8[0]),
311 SIMD256T::castsi_ps(a.v8[1]),
315 static SIMDINLINE Float SIMDCALL cvtepi32_ps(Integer const &a) // return (float)a (int32 --> float)
319 SIMD256T::cvtepi32_ps(a.v8[0]),
320 SIMD256T::cvtepi32_ps(a.v8[1]),
324 static SIMDINLINE Integer SIMDCALL cvtepu8_epi16(SIMD256Impl::Integer const &a) // return (int16)a (uint8 --> int16)
328 SIMD256T::cvtepu8_epi16(a.v4[0]),
329 SIMD256T::cvtepu8_epi16(a.v4[1]),
333 static SIMDINLINE Integer SIMDCALL cvtepu8_epi32(SIMD256Impl::Integer const &a) // return (int32)a (uint8 --> int32)
337 SIMD256T::cvtepu8_epi32(a.v4[0]),
338 SIMD256T::cvtepu8_epi32(SIMD128T::template srli_si<8>(a.v4[0])),
342 static SIMDINLINE Integer SIMDCALL cvtepu16_epi32(SIMD256Impl::Integer const &a) // return (int32)a (uint16 --> int32)
346 SIMD256T::cvtepu16_epi32(a.v4[0]),
347 SIMD256T::cvtepu16_epi32(a.v4[1]),
351 static SIMDINLINE Integer SIMDCALL cvtepu16_epi64(SIMD256Impl::Integer const &a) // return (int64)a (uint16 --> int64)
355 SIMD256T::cvtepu16_epi64(a.v4[0]),
356 SIMD256T::cvtepu16_epi64(SIMD128T::template srli_si<8>(a.v4[0])),
360 static SIMDINLINE Integer SIMDCALL cvtepu32_epi64(SIMD256Impl::Integer const &a) // return (int64)a (uint32 --> int64)
364 SIMD256T::cvtepu32_epi64(a.v4[0]),
365 SIMD256T::cvtepu32_epi64(a.v4[1]),
369 static SIMDINLINE Integer SIMDCALL cvtps_epi32(Float const &a) // return (int32)a (float --> int32)
373 SIMD256T::cvtps_epi32(a.v8[0]),
374 SIMD256T::cvtps_epi32(a.v8[1]),
378 static SIMDINLINE Integer SIMDCALL cvttps_epi32(Float const &a) // return (int32)a (rnd_to_zero(float) --> int32)
382 SIMD256T::cvtps_epi32(a.v8[0]),
383 SIMD256T::cvtps_epi32(a.v8[1]),
387 //-----------------------------------------------------------------------
388 // Comparison operations
389 //-----------------------------------------------------------------------
390 template<CompareType CmpTypeT>
391 static SIMDINLINE Float SIMDCALL cmp_ps(Float const &a, Float const &b) // return a (CmpTypeT) b
395 SIMD256T::template cmp_ps<CmpTypeT>(a.v8[0], b.v8[0]),
396 SIMD256T::template cmp_ps<CmpTypeT>(a.v8[1], b.v8[1]),
399 static SIMDINLINE Float SIMDCALL cmplt_ps(Float const &a, Float const &b) { return cmp_ps<CompareType::LT_OQ>(a, b); }
400 static SIMDINLINE Float SIMDCALL cmpgt_ps(Float const &a, Float const &b) { return cmp_ps<CompareType::GT_OQ>(a, b); }
401 static SIMDINLINE Float SIMDCALL cmpneq_ps(Float const &a, Float const &b) { return cmp_ps<CompareType::NEQ_OQ>(a, b); }
402 static SIMDINLINE Float SIMDCALL cmpeq_ps(Float const &a, Float const &b) { return cmp_ps<CompareType::EQ_OQ>(a, b); }
403 static SIMDINLINE Float SIMDCALL cmpge_ps(Float const &a, Float const &b) { return cmp_ps<CompareType::GE_OQ>(a, b); }
404 static SIMDINLINE Float SIMDCALL cmple_ps(Float const &a, Float const &b) { return cmp_ps<CompareType::LE_OQ>(a, b); }
406 template<CompareType CmpTypeT>
407 static SIMDINLINE Mask SIMDCALL cmp_ps_mask(Float const &a, Float const &b)
409 return static_cast<Mask>(movemask_ps(cmp_ps<CmpTypeT>(a, b)));
413 SIMD_IWRAPPER_2(cmpeq_epi8); // return a == b (int8)
414 SIMD_IWRAPPER_2(cmpeq_epi16); // return a == b (int16)
415 SIMD_IWRAPPER_2(cmpeq_epi32); // return a == b (int32)
416 SIMD_IWRAPPER_2(cmpeq_epi64); // return a == b (int64)
417 SIMD_IWRAPPER_2(cmpgt_epi8); // return a > b (int8)
418 SIMD_IWRAPPER_2(cmpgt_epi16); // return a > b (int16)
419 SIMD_IWRAPPER_2(cmpgt_epi32); // return a > b (int32)
420 SIMD_IWRAPPER_2(cmpgt_epi64); // return a > b (int64)
421 SIMD_IWRAPPER_2(cmplt_epi32); // return a < b (int32)
423 static SIMDINLINE bool SIMDCALL testz_ps(Float const &a, Float const &b) // return all_lanes_zero(a & b) ? 1 : 0 (float)
425 return 0 != (SIMD256T::testz_ps(a.v8[0], b.v8[0]) &
426 SIMD256T::testz_ps(a.v8[1], b.v8[1]));
429 static SIMDINLINE bool SIMDCALL testz_si(Integer const &a, Integer const &b) // return all_lanes_zero(a & b) ? 1 : 0 (int)
431 return 0 != (SIMD256T::testz_si(a.v8[0], b.v8[0]) &
432 SIMD256T::testz_si(a.v8[1], b.v8[1]));
435 //-----------------------------------------------------------------------
436 // Blend / shuffle / permute operations
437 //-----------------------------------------------------------------------
438 SIMD_WRAPPER_2I(blend_ps); // return ImmT ? b : a (float)
439 SIMD_IWRAPPER_2I(blend_epi32); // return ImmT ? b : a (int32)
440 SIMD_WRAPPER_3(blendv_ps); // return mask ? b : a (float)
441 static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer const &a, Integer const &b, Float const &mask) // return mask ? b : a (int)
445 SIMD256T::blendv_epi32(a.v8[0], b.v8[0], mask.v8[0]),
446 SIMD256T::blendv_epi32(a.v8[1], b.v8[1], mask.v8[1]),
450 static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer const &a, Integer const &b, Integer const &mask) // return mask ? b : a (int)
454 SIMD256T::blendv_epi32(a.v8[0], b.v8[0], mask.v8[0]),
455 SIMD256T::blendv_epi32(a.v8[1], b.v8[1], mask.v8[1]),
459 static SIMDINLINE Float SIMDCALL broadcast_ss(float const *p) // return *p (all elements in vector get same value)
464 SIMD256T::set1_ps(f),
465 SIMD256T::set1_ps(f),
470 static SIMDINLINE SIMD256Impl::Float SIMDCALL extract_ps(Float const &a)
472 SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
477 static SIMDINLINE SIMD256Impl::Double SIMDCALL extract_pd(Double const &a)
479 SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
484 static SIMDINLINE SIMD256Impl::Integer SIMDCALL extract_si(Integer const &a)
486 SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
491 static SIMDINLINE Float SIMDCALL insert_ps(Float const &a, SIMD256Impl::Float const &b)
493 SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
500 static SIMDINLINE Double SIMDCALL insert_pd(Double const &a, SIMD256Impl::Double const &b)
502 SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
509 static SIMDINLINE Integer SIMDCALL insert_si(Integer const &a, SIMD256Impl::Integer const &b)
511 SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
517 SIMD_IWRAPPER_2(packs_epi16); // See documentation for _mm256_packs_epi16 and _mm512_packs_epi16
518 SIMD_IWRAPPER_2(packs_epi32); // See documentation for _mm256_packs_epi32 and _mm512_packs_epi32
519 SIMD_IWRAPPER_2(packus_epi16); // See documentation for _mm256_packus_epi16 and _mm512_packus_epi16
520 SIMD_IWRAPPER_2(packus_epi32); // See documentation for _mm256_packus_epi32 and _mm512_packus_epi32
523 static SIMDINLINE Float SIMDCALL permute_ps(Float const &a)
527 SIMD256T::template permute_ps<ImmT>(a.v8[0]),
528 SIMD256T::template permute_ps<ImmT>(a.v8[1]),
532 static SIMDINLINE Integer SIMDCALL permute_epi32(Integer const &a, Integer const &swiz) // return a[swiz[i]] for each 32-bit lane i (int32)
534 return castps_si(permute_ps(castsi_ps(a), swiz));
537 static SIMDINLINE Float SIMDCALL permute_ps(Float const &a, Integer const &swiz) // return a[swiz[i]] for each 32-bit lane i (float)
539 const auto mask = SIMD256T::set1_epi32(7);
541 auto lolo = SIMD256T::permute_ps(a.v8[0], SIMD256T::and_si(swiz.v8[0], mask));
542 auto lohi = SIMD256T::permute_ps(a.v8[1], SIMD256T::and_si(swiz.v8[0], mask));
544 auto hilo = SIMD256T::permute_ps(a.v8[0], SIMD256T::and_si(swiz.v8[1], mask));
545 auto hihi = SIMD256T::permute_ps(a.v8[1], SIMD256T::and_si(swiz.v8[1], mask));
549 SIMD256T::blendv_ps(lolo, lohi, SIMD256T::castsi_ps(SIMD256T::cmpgt_epi32(swiz.v8[0], mask))),
550 SIMD256T::blendv_ps(hilo, hihi, SIMD256T::castsi_ps(SIMD256T::cmpgt_epi32(swiz.v8[1], mask))),
554 // All of the 512-bit permute2f128_XX intrinsics do the following:
556 // SELECT4(src, control) {
557 // CASE(control[1:0])
558 // 0: tmp[127:0] : = src[127:0]
559 // 1 : tmp[127:0] : = src[255:128]
560 // 2 : tmp[127:0] : = src[383:256]
561 // 3 : tmp[127:0] : = src[511:384]
566 // dst[127:0] : = SELECT4(a[511:0], imm8[1:0])
567 // dst[255:128] : = SELECT4(a[511:0], imm8[3:2])
568 // dst[383:256] : = SELECT4(b[511:0], imm8[5:4])
569 // dst[511:384] : = SELECT4(b[511:0], imm8[7:6])
570 // dst[MAX:512] : = 0
572 // Since the 256-bit AVX instructions use a 4-bit control field (instead
573 // of 2-bit for AVX512), we need to expand the control bits sent to the
574 // AVX instructions for emulation.
577 static SIMDINLINE Float SIMDCALL permute2f128_ps(Float const &a, Float const &b)
581 SIMD256T::template permute2f128_ps<((shuf & 0x03) << 0) | ((shuf & 0x0C) << 2)>(a.v8[0], a.v8[1]),
582 SIMD256T::template permute2f128_ps<((shuf & 0x30) >> 4) | ((shuf & 0xC0) >> 2)>(b.v8[0], b.v8[1]),
587 static SIMDINLINE Double SIMDCALL permute2f128_pd(Double const &a, Double const &b)
591 SIMD256T::template permute2f128_pd<((shuf & 0x03) << 0) | ((shuf & 0x0C) << 2)>(a.v8[0], a.v8[1]),
592 SIMD256T::template permute2f128_pd<((shuf & 0x30) >> 4) | ((shuf & 0xC0) >> 2)>(b.v8[0], b.v8[1]),
597 static SIMDINLINE Integer SIMDCALL permute2f128_si(Integer const &a, Integer const &b)
601 SIMD256T::template permute2f128_si<((shuf & 0x03) << 0) | ((shuf & 0x0C) << 2)>(a.v8[0], a.v8[1]),
602 SIMD256T::template permute2f128_si<((shuf & 0x30) >> 4) | ((shuf & 0xC0) >> 2)>(b.v8[0], b.v8[1]),
606 SIMD_IWRAPPER_2I_1(shuffle_epi32);
607 SIMD_IWRAPPER_2I_2(shuffle_epi64);
608 SIMD_IWRAPPER_2(shuffle_epi8);
609 SIMD_WRAPPER_2I_1(shuffle_pd);
610 SIMD_WRAPPER_2I_1(shuffle_ps);
611 SIMD_IWRAPPER_2(unpackhi_epi16);
612 SIMD_IWRAPPER_2(unpackhi_epi32);
613 SIMD_IWRAPPER_2(unpackhi_epi64);
614 SIMD_IWRAPPER_2(unpackhi_epi8);
615 SIMD_WRAPPER_2(unpackhi_pd);
616 SIMD_WRAPPER_2(unpackhi_ps);
617 SIMD_IWRAPPER_2(unpacklo_epi16);
618 SIMD_IWRAPPER_2(unpacklo_epi32);
619 SIMD_IWRAPPER_2(unpacklo_epi64);
620 SIMD_IWRAPPER_2(unpacklo_epi8);
621 SIMD_WRAPPER_2(unpacklo_pd);
622 SIMD_WRAPPER_2(unpacklo_ps);
624 //-----------------------------------------------------------------------
625 // Load / store operations
626 //-----------------------------------------------------------------------
627 template<ScaleFactor ScaleT>
628 static SIMDINLINE Float SIMDCALL i32gather_ps(float const* p, Integer const &idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
632 SIMD256T::template i32gather_ps<ScaleT>(p, idx.v8[0]),
633 SIMD256T::template i32gather_ps<ScaleT>(p, idx.v8[1]),
637 static SIMDINLINE Float SIMDCALL load1_ps(float const *p) // return *p (broadcast 1 value to all elements)
639 return broadcast_ss(p);
642 static SIMDINLINE Float SIMDCALL load_ps(float const *p) // return *p (loads SIMD width elements from memory)
646 SIMD256T::load_ps(p),
647 SIMD256T::load_ps(p + TARGET_SIMD_WIDTH)
651 static SIMDINLINE Integer SIMDCALL load_si(Integer const *p) // return *p
655 SIMD256T::load_si(&p->v8[0]),
656 SIMD256T::load_si(&p->v8[1]),
660 static SIMDINLINE Float SIMDCALL loadu_ps(float const *p) // return *p (same as load_ps but allows for unaligned mem)
664 SIMD256T::loadu_ps(p),
665 SIMD256T::loadu_ps(p + TARGET_SIMD_WIDTH)
669 static SIMDINLINE Integer SIMDCALL loadu_si(Integer const *p) // return *p (same as load_si but allows for unaligned mem)
673 SIMD256T::loadu_si(&p->v8[0]),
674 SIMD256T::loadu_si(&p->v8[1]),
678 // for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
679 template<ScaleFactor ScaleT>
680 static SIMDINLINE Float SIMDCALL mask_i32gather_ps(Float const &old, float const* p, Integer const &idx, Float const &mask)
684 SIMD256T::template mask_i32gather_ps<ScaleT>(old.v8[0], p, idx.v8[0], mask.v8[0]),
685 SIMD256T::template mask_i32gather_ps<ScaleT>(old.v8[1], p, idx.v8[1], mask.v8[1]),
689 static SIMDINLINE void SIMDCALL maskstore_ps(float *p, Integer const &mask, Float const &src)
691 SIMD256T::maskstore_ps(p, mask.v8[0], src.v8[0]);
692 SIMD256T::maskstore_ps(p + TARGET_SIMD_WIDTH, mask.v8[1], src.v8[1]);
695 static SIMDINLINE uint64_t SIMDCALL movemask_epi8(Integer const &a)
697 uint64_t mask = static_cast<uint64_t>(SIMD256T::movemask_epi8(a.v8[0]));
698 mask |= static_cast<uint64_t>(SIMD256T::movemask_epi8(a.v8[1])) << (TARGET_SIMD_WIDTH * 4);
703 static SIMDINLINE uint32_t SIMDCALL movemask_pd(Double const &a)
705 uint32_t mask = static_cast<uint32_t>(SIMD256T::movemask_pd(a.v8[0]));
706 mask |= static_cast<uint32_t>(SIMD256T::movemask_pd(a.v8[1])) << (TARGET_SIMD_WIDTH / 2);
710 static SIMDINLINE uint32_t SIMDCALL movemask_ps(Float const &a)
712 uint32_t mask = static_cast<uint32_t>(SIMD256T::movemask_ps(a.v8[0]));
713 mask |= static_cast<uint32_t>(SIMD256T::movemask_ps(a.v8[1])) << TARGET_SIMD_WIDTH;
718 static SIMDINLINE Integer SIMDCALL set1_epi32(int i) // return i (all elements are same value)
722 SIMD256T::set1_epi32(i),
723 SIMD256T::set1_epi32(i)
727 static SIMDINLINE Integer SIMDCALL set1_epi8(char i) // return i (all elements are same value)
731 SIMD256T::set1_epi8(i),
732 SIMD256T::set1_epi8(i)
736 static SIMDINLINE Float SIMDCALL set1_ps(float f) // return f (all elements are same value)
740 SIMD256T::set1_ps(f),
745 static SIMDINLINE Float SIMDCALL setzero_ps() // return 0 (float)
749 SIMD256T::setzero_ps(),
750 SIMD256T::setzero_ps()
754 static SIMDINLINE Integer SIMDCALL setzero_si() // return 0 (integer)
758 SIMD256T::setzero_si(),
759 SIMD256T::setzero_si()
763 static SIMDINLINE void SIMDCALL store_ps(float *p, Float const &a) // *p = a (stores all elements contiguously in memory)
765 SIMD256T::store_ps(p, a.v8[0]);
766 SIMD256T::store_ps(p + TARGET_SIMD_WIDTH, a.v8[1]);
769 static SIMDINLINE void SIMDCALL store_si(Integer *p, Integer const &a) // *p = a
771 SIMD256T::store_si(&p->v8[0], a.v8[0]);
772 SIMD256T::store_si(&p->v8[1], a.v8[1]);
775 static SIMDINLINE void SIMDCALL stream_ps(float *p, Float const &a) // *p = a (same as store_ps, but doesn't keep memory in cache)
777 SIMD256T::stream_ps(p, a.v8[0]);
778 SIMD256T::stream_ps(p + TARGET_SIMD_WIDTH, a.v8[1]);
781 static SIMDINLINE Integer SIMDCALL set_epi32(
782 int i15, int i14, int i13, int i12, int i11, int i10, int i9, int i8,
783 int i7, int i6, int i5, int i4, int i3, int i2, int i1, int i0)
788 i7, i6, i5, i4, i3, i2, i1, i0),
790 i15, i14, i13, i12, i11, i10, i9, i8)
794 static SIMDINLINE Integer SIMDCALL set_epi32(
795 int i7, int i6, int i5, int i4, int i3, int i2, int i1, int i0)
798 0, 0, 0, 0, 0, 0, 0, 0,
799 i7, i6, i5, i4, i3, i2, i1, i0);
802 static SIMDINLINE Float SIMDCALL set_ps(
803 float i15, float i14, float i13, float i12, float i11, float i10, float i9, float i8,
804 float i7, float i6, float i5, float i4, float i3, float i2, float i1, float i0)
809 i7, i6, i5, i4, i3, i2, i1, i0),
811 i15, i14, i13, i12, i11, i10, i9, i8)
815 static SIMDINLINE Float SIMDCALL set_ps(
816 float i7, float i6, float i5, float i4, float i3, float i2, float i1, float i0)
819 0, 0, 0, 0, 0, 0, 0, 0,
820 i7, i6, i5, i4, i3, i2, i1, i0);
823 static SIMDINLINE Float SIMDCALL vmask_ps(int32_t mask)
827 SIMD256T::vmask_ps(mask),
828 SIMD256T::vmask_ps(mask >> TARGET_SIMD_WIDTH)
832 #undef SIMD_WRAPPER_1
833 #undef SIMD_WRAPPER_2
834 #undef SIMD_WRAPPER_2I
835 #undef SIMD_WRAPPER_2I_1
836 #undef SIMD_WRAPPER_3
837 #undef SIMD_IWRAPPER_1
838 #undef SIMD_IWRAPPER_2
839 #undef SIMD_IWRAPPER_2I
840 #undef SIMD_IWRAPPER_2I_1
841 #undef SIMD_IWRAPPER_3