1 /****************************************************************************
2 * Copyright (C) 2017 Intel Corporation. All Rights Reserved.
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the Software.
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
22 ****************************************************************************/
23 #if !defined(__SIMD_LIB_AVX_HPP__)
24 #error Do not include this file directly, use "simdlib.hpp" instead.
27 //============================================================================
28 // SIMD16 AVX (1) implementation
29 //============================================================================
31 static const int TARGET_SIMD_WIDTH = 8;
32 using SIMD128T = SIMD128Impl::AVXImpl;
34 #define SIMD_WRAPPER_1(op) \
35 static SIMDINLINE Float SIMDCALL op(Float const &a) \
39 SIMD256T::op(a.v8[0]),\
40 SIMD256T::op(a.v8[1]),\
44 #define SIMD_WRAPPER_2(op) \
45 static SIMDINLINE Float SIMDCALL op(Float const &a, Float const &b) \
49 SIMD256T::op(a.v8[0], b.v8[0]),\
50 SIMD256T::op(a.v8[1], b.v8[1]),\
54 #define SIMD_WRAPPER_2I(op) \
56 static SIMDINLINE Float SIMDCALL op(Float const &a, Float const &b) \
60 SIMD256T::template op<0xFF & ImmT>(a.v8[0], b.v8[0]),\
61 SIMD256T::template op<0xFF & (ImmT >> TARGET_SIMD_WIDTH)>(a.v8[1], b.v8[1]),\
65 #define SIMD_WRAPPER_2I_1(op) \
67 static SIMDINLINE Float SIMDCALL op(Float const &a, Float const &b) \
71 SIMD256T::template op<ImmT>(a.v8[0], b.v8[0]),\
72 SIMD256T::template op<ImmT>(a.v8[1], b.v8[1]),\
76 #define SIMD_WRAPPER_3(op) \
77 static SIMDINLINE Float SIMDCALL op(Float const &a, Float const &b, Float const &c) \
81 SIMD256T::op(a.v8[0], b.v8[0], c.v8[0]),\
82 SIMD256T::op(a.v8[1], b.v8[1], c.v8[1]),\
86 #define SIMD_IWRAPPER_1(op) \
87 static SIMDINLINE Integer SIMDCALL op(Integer const &a) \
91 SIMD256T::op(a.v8[0]),\
92 SIMD256T::op(a.v8[1]),\
96 #define SIMD_IWRAPPER_2(op) \
97 static SIMDINLINE Integer SIMDCALL op(Integer const &a, Integer const &b) \
101 SIMD256T::op(a.v8[0], b.v8[0]),\
102 SIMD256T::op(a.v8[1], b.v8[1]),\
106 #define SIMD_IWRAPPER_2I(op) \
108 static SIMDINLINE Integer SIMDCALL op(Integer const &a, Integer const &b) \
112 SIMD256T::template op<0xFF & ImmT>(a.v8[0], b.v8[0]),\
113 SIMD256T::template op<0xFF & (ImmT >> TARGET_SIMD_WIDTH)>(a.v8[1], b.v8[1]),\
117 #define SIMD_IWRAPPER_2I_1(op) \
119 static SIMDINLINE Integer SIMDCALL op(Integer const &a, Integer const &b) \
123 SIMD256T::template op<ImmT>(a.v8[0], b.v8[0]),\
124 SIMD256T::template op<ImmT>(a.v8[1], b.v8[1]),\
128 #define SIMD_IWRAPPER_2I_2(op) \
130 static SIMDINLINE Integer SIMDCALL op(Integer const &a, Integer const &b) \
134 SIMD256T::template op<0xF & ImmT>(a.v8[0], b.v8[0]),\
135 SIMD256T::template op<0xF & (ImmT >> 4)>(a.v8[1], b.v8[1]),\
139 #define SIMD_IWRAPPER_3(op) \
140 static SIMDINLINE Integer SIMDCALL op(Integer const &a, Integer const &b, Integer const &c) \
144 SIMD256T::op(a.v8[0], b.v8[0], c.v8[0]),\
145 SIMD256T::op(a.v8[1], b.v8[1], c.v8[1]),\
149 //-----------------------------------------------------------------------
150 // Single precision floating point arithmetic operations
151 //-----------------------------------------------------------------------
152 SIMD_WRAPPER_2(add_ps); // return a + b
153 SIMD_WRAPPER_2(div_ps); // return a / b
154 SIMD_WRAPPER_3(fmadd_ps); // return (a * b) + c
155 SIMD_WRAPPER_3(fmsub_ps); // return (a * b) - c
156 SIMD_WRAPPER_2(max_ps); // return (a > b) ? a : b
157 SIMD_WRAPPER_2(min_ps); // return (a < b) ? a : b
158 SIMD_WRAPPER_2(mul_ps); // return a * b
159 SIMD_WRAPPER_1(rcp_ps); // return 1.0f / a
160 SIMD_WRAPPER_1(rsqrt_ps); // return 1.0f / sqrt(a)
161 SIMD_WRAPPER_2(sub_ps); // return a - b
163 template <RoundMode RMT>
164 static SIMDINLINE Float SIMDCALL round_ps(Float const &a)
168 SIMD256T::template round_ps<RMT>(a.v8[0]),
169 SIMD256T::template round_ps<RMT>(a.v8[1]),
173 static SIMDINLINE Float SIMDCALL ceil_ps(Float const &a) { return round_ps<RoundMode::CEIL_NOEXC>(a); }
174 static SIMDINLINE Float SIMDCALL floor_ps(Float const &a) { return round_ps<RoundMode::FLOOR_NOEXC>(a); }
176 //-----------------------------------------------------------------------
177 // Integer (various width) arithmetic operations
178 //-----------------------------------------------------------------------
179 SIMD_IWRAPPER_1(abs_epi32); // return absolute_value(a) (int32)
180 SIMD_IWRAPPER_2(add_epi32); // return a + b (int32)
181 SIMD_IWRAPPER_2(add_epi8); // return a + b (int8)
182 SIMD_IWRAPPER_2(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
183 SIMD_IWRAPPER_2(max_epi32); // return (a > b) ? a : b (int32)
184 SIMD_IWRAPPER_2(max_epu32); // return (a > b) ? a : b (uint32)
185 SIMD_IWRAPPER_2(min_epi32); // return (a < b) ? a : b (int32)
186 SIMD_IWRAPPER_2(min_epu32); // return (a < b) ? a : b (uint32)
187 SIMD_IWRAPPER_2(mul_epi32); // return a * b (int32)
189 // return (a * b) & 0xFFFFFFFF
191 // Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers,
192 // and store the low 32 bits of the intermediate integers in dst.
193 SIMD_IWRAPPER_2(mullo_epi32);
194 SIMD_IWRAPPER_2(sub_epi32); // return a - b (int32)
195 SIMD_IWRAPPER_2(sub_epi64); // return a - b (int64)
196 SIMD_IWRAPPER_2(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8)
198 //-----------------------------------------------------------------------
199 // Logical operations
200 //-----------------------------------------------------------------------
201 SIMD_WRAPPER_2(and_ps); // return a & b (float treated as int)
202 SIMD_IWRAPPER_2(and_si); // return a & b (int)
203 SIMD_WRAPPER_2(andnot_ps); // return (~a) & b (float treated as int)
204 SIMD_IWRAPPER_2(andnot_si); // return (~a) & b (int)
205 SIMD_WRAPPER_2(or_ps); // return a | b (float treated as int)
206 SIMD_IWRAPPER_2(or_si); // return a | b (int)
207 SIMD_WRAPPER_2(xor_ps); // return a ^ b (float treated as int)
208 SIMD_IWRAPPER_2(xor_si); // return a ^ b (int)
211 //-----------------------------------------------------------------------
213 //-----------------------------------------------------------------------
215 static SIMDINLINE Integer SIMDCALL slli_epi32(Integer const &a) // return a << ImmT
219 SIMD256T::template slli_epi32<ImmT>(a.v8[0]),
220 SIMD256T::template slli_epi32<ImmT>(a.v8[1]),
224 SIMD_IWRAPPER_2(sllv_epi32); // return a << b (uint32)
227 static SIMDINLINE Integer SIMDCALL srai_epi32(Integer const &a) // return a >> ImmT (int32)
231 SIMD256T::template srai_epi32<ImmT>(a.v8[0]),
232 SIMD256T::template srai_epi32<ImmT>(a.v8[1]),
237 static SIMDINLINE Integer SIMDCALL srli_epi32(Integer const &a) // return a >> ImmT (uint32)
241 SIMD256T::template srli_epi32<ImmT>(a.v8[0]),
242 SIMD256T::template srli_epi32<ImmT>(a.v8[1]),
246 template<int ImmT> // for each 128-bit lane:
247 static SIMDINLINE Integer SIMDCALL srli_si(Integer const &a) // return a >> (ImmT*8) (uint)
251 SIMD256T::template srli_si<ImmT>(a.v8[0]),
252 SIMD256T::template srli_si<ImmT>(a.v8[1]),
256 static SIMDINLINE Float SIMDCALL srlisi_ps(Float const &a) // same as srli_si, but with Float cast to int
260 SIMD256T::template srlisi_ps<ImmT>(a.v8[0]),
261 SIMD256T::template srlisi_ps<ImmT>(a.v8[1]),
265 SIMD_IWRAPPER_2(srlv_epi32); // return a >> b (uint32)
267 //-----------------------------------------------------------------------
268 // Conversion operations
269 //-----------------------------------------------------------------------
270 static SIMDINLINE Float SIMDCALL castpd_ps(Double const &a) // return *(Float*)(&a)
274 SIMD256T::castpd_ps(a.v8[0]),
275 SIMD256T::castpd_ps(a.v8[1]),
279 static SIMDINLINE Integer SIMDCALL castps_si(Float const &a) // return *(Integer*)(&a)
283 SIMD256T::castps_si(a.v8[0]),
284 SIMD256T::castps_si(a.v8[1]),
288 static SIMDINLINE Double SIMDCALL castsi_pd(Integer const &a) // return *(Double*)(&a)
292 SIMD256T::castsi_pd(a.v8[0]),
293 SIMD256T::castsi_pd(a.v8[1]),
297 static SIMDINLINE Double SIMDCALL castps_pd(Float const &a) // return *(Double*)(&a)
301 SIMD256T::castps_pd(a.v8[0]),
302 SIMD256T::castps_pd(a.v8[1]),
306 static SIMDINLINE Float SIMDCALL castsi_ps(Integer const &a) // return *(Float*)(&a)
310 SIMD256T::castsi_ps(a.v8[0]),
311 SIMD256T::castsi_ps(a.v8[1]),
315 static SIMDINLINE Float SIMDCALL cvtepi32_ps(Integer const &a) // return (float)a (int32 --> float)
319 SIMD256T::cvtepi32_ps(a.v8[0]),
320 SIMD256T::cvtepi32_ps(a.v8[1]),
324 static SIMDINLINE Integer SIMDCALL cvtepu8_epi16(SIMD256Impl::Integer const &a) // return (int16)a (uint8 --> int16)
328 SIMD256T::cvtepu8_epi16(a.v4[0]),
329 SIMD256T::cvtepu8_epi16(a.v4[1]),
333 static SIMDINLINE Integer SIMDCALL cvtepu8_epi32(SIMD256Impl::Integer const &a) // return (int32)a (uint8 --> int32)
337 SIMD256T::cvtepu8_epi32(a.v4[0]),
338 SIMD256T::cvtepu8_epi32(SIMD128T::template srli_si<8>(a.v4[0])),
342 static SIMDINLINE Integer SIMDCALL cvtepu16_epi32(SIMD256Impl::Integer const &a) // return (int32)a (uint16 --> int32)
346 SIMD256T::cvtepu16_epi32(a.v4[0]),
347 SIMD256T::cvtepu16_epi32(a.v4[1]),
351 static SIMDINLINE Integer SIMDCALL cvtepu16_epi64(SIMD256Impl::Integer const &a) // return (int64)a (uint16 --> int64)
355 SIMD256T::cvtepu16_epi64(a.v4[0]),
356 SIMD256T::cvtepu16_epi64(SIMD128T::template srli_si<8>(a.v4[0])),
360 static SIMDINLINE Integer SIMDCALL cvtepu32_epi64(SIMD256Impl::Integer const &a) // return (int64)a (uint32 --> int64)
364 SIMD256T::cvtepu32_epi64(a.v4[0]),
365 SIMD256T::cvtepu32_epi64(a.v4[1]),
369 static SIMDINLINE Integer SIMDCALL cvtps_epi32(Float const &a) // return (int32)a (float --> int32)
373 SIMD256T::cvtps_epi32(a.v8[0]),
374 SIMD256T::cvtps_epi32(a.v8[1]),
378 static SIMDINLINE Integer SIMDCALL cvttps_epi32(Float const &a) // return (int32)a (rnd_to_zero(float) --> int32)
382 SIMD256T::cvtps_epi32(a.v8[0]),
383 SIMD256T::cvtps_epi32(a.v8[1]),
387 //-----------------------------------------------------------------------
388 // Comparison operations
389 //-----------------------------------------------------------------------
390 template<CompareType CmpTypeT>
391 static SIMDINLINE Float SIMDCALL cmp_ps(Float const &a, Float const &b) // return a (CmpTypeT) b
395 SIMD256T::template cmp_ps<CmpTypeT>(a.v8[0], b.v8[0]),
396 SIMD256T::template cmp_ps<CmpTypeT>(a.v8[1], b.v8[1]),
399 static SIMDINLINE Float SIMDCALL cmplt_ps(Float const &a, Float const &b) { return cmp_ps<CompareType::LT_OQ>(a, b); }
400 static SIMDINLINE Float SIMDCALL cmpgt_ps(Float const &a, Float const &b) { return cmp_ps<CompareType::GT_OQ>(a, b); }
401 static SIMDINLINE Float SIMDCALL cmpneq_ps(Float const &a, Float const &b) { return cmp_ps<CompareType::NEQ_OQ>(a, b); }
402 static SIMDINLINE Float SIMDCALL cmpeq_ps(Float const &a, Float const &b) { return cmp_ps<CompareType::EQ_OQ>(a, b); }
403 static SIMDINLINE Float SIMDCALL cmpge_ps(Float const &a, Float const &b) { return cmp_ps<CompareType::GE_OQ>(a, b); }
404 static SIMDINLINE Float SIMDCALL cmple_ps(Float const &a, Float const &b) { return cmp_ps<CompareType::LE_OQ>(a, b); }
406 template<CompareType CmpTypeT>
407 static SIMDINLINE Mask SIMDCALL cmp_ps_mask(Float const &a, Float const &b)
409 return static_cast<Mask>(movemask_ps(cmp_ps<CmpTypeT>(a, b)));
413 SIMD_IWRAPPER_2(cmpeq_epi8); // return a == b (int8)
414 SIMD_IWRAPPER_2(cmpeq_epi16); // return a == b (int16)
415 SIMD_IWRAPPER_2(cmpeq_epi32); // return a == b (int32)
416 SIMD_IWRAPPER_2(cmpeq_epi64); // return a == b (int64)
417 SIMD_IWRAPPER_2(cmpgt_epi8); // return a > b (int8)
418 SIMD_IWRAPPER_2(cmpgt_epi16); // return a > b (int16)
419 SIMD_IWRAPPER_2(cmpgt_epi32); // return a > b (int32)
420 SIMD_IWRAPPER_2(cmpgt_epi64); // return a > b (int64)
421 SIMD_IWRAPPER_2(cmplt_epi32); // return a < b (int32)
423 static SIMDINLINE bool SIMDCALL testz_ps(Float const &a, Float const &b) // return all_lanes_zero(a & b) ? 1 : 0 (float)
425 return 0 != (SIMD256T::testz_ps(a.v8[0], b.v8[0]) &
426 SIMD256T::testz_ps(a.v8[1], b.v8[1]));
429 static SIMDINLINE bool SIMDCALL testz_si(Integer const &a, Integer const &b) // return all_lanes_zero(a & b) ? 1 : 0 (int)
431 return 0 != (SIMD256T::testz_si(a.v8[0], b.v8[0]) &
432 SIMD256T::testz_si(a.v8[1], b.v8[1]));
435 //-----------------------------------------------------------------------
436 // Blend / shuffle / permute operations
437 //-----------------------------------------------------------------------
438 SIMD_WRAPPER_2I(blend_ps); // return ImmT ? b : a (float)
439 SIMD_IWRAPPER_2I(blend_epi32); // return ImmT ? b : a (int32)
440 SIMD_WRAPPER_3(blendv_ps); // return mask ? b : a (float)
441 static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer const &a, Integer const &b, Float const &mask) // return mask ? b : a (int)
445 SIMD256T::blendv_epi32(a.v8[0], b.v8[0], mask.v8[0]),
446 SIMD256T::blendv_epi32(a.v8[1], b.v8[1], mask.v8[1]),
450 static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer const &a, Integer const &b, Integer const &mask) // return mask ? b : a (int)
454 SIMD256T::blendv_epi32(a.v8[0], b.v8[0], mask.v8[0]),
455 SIMD256T::blendv_epi32(a.v8[1], b.v8[1], mask.v8[1]),
459 static SIMDINLINE Float SIMDCALL broadcast_ss(float const *p) // return *p (all elements in vector get same value)
464 SIMD256T::set1_ps(f),
465 SIMD256T::set1_ps(f),
470 static SIMDINLINE SIMD256Impl::Float SIMDCALL extract_ps(Float const &a)
472 SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
477 static SIMDINLINE SIMD256Impl::Double SIMDCALL extract_pd(Double const &a)
479 SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
484 static SIMDINLINE SIMD256Impl::Integer SIMDCALL extract_si(Integer const &a)
486 SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
491 static SIMDINLINE Float SIMDCALL insert_ps(Float const &a, SIMD256Impl::Float const &b)
493 SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
500 static SIMDINLINE Double SIMDCALL insert_pd(Double const &a, SIMD256Impl::Double const &b)
502 SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
509 static SIMDINLINE Integer SIMDCALL insert_si(Integer const &a, SIMD256Impl::Integer const &b)
511 SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
517 SIMD_IWRAPPER_2(packs_epi16); // See documentation for _mm256_packs_epi16 and _mm512_packs_epi16
518 SIMD_IWRAPPER_2(packs_epi32); // See documentation for _mm256_packs_epi32 and _mm512_packs_epi32
519 SIMD_IWRAPPER_2(packus_epi16); // See documentation for _mm256_packus_epi16 and _mm512_packus_epi16
520 SIMD_IWRAPPER_2(packus_epi32); // See documentation for _mm256_packus_epi32 and _mm512_packus_epi32
523 static SIMDINLINE Float SIMDCALL permute_ps(Float const &a)
527 SIMD256T::template permute_ps<ImmT>(a.v8[0]),
528 SIMD256T::template permute_ps<ImmT>(a.v8[1]),
532 static SIMDINLINE Integer SIMDCALL permute_epi32(Integer const &a, Integer const &swiz) // return a[swiz[i]] for each 32-bit lane i (int32)
534 return castps_si(permute_ps(castsi_ps(a), swiz));
537 static SIMDINLINE Float SIMDCALL permute_ps(Float const &a, Integer const &swiz) // return a[swiz[i]] for each 32-bit lane i (float)
539 const auto mask = SIMD256T::set1_epi32(7);
541 auto lolo = SIMD256T::permute_ps(a.v8[0], SIMD256T::and_si(swiz.v8[0], mask));
542 auto lohi = SIMD256T::permute_ps(a.v8[1], SIMD256T::and_si(swiz.v8[0], mask));
544 auto hilo = SIMD256T::permute_ps(a.v8[0], SIMD256T::and_si(swiz.v8[1], mask));
545 auto hihi = SIMD256T::permute_ps(a.v8[1], SIMD256T::and_si(swiz.v8[1], mask));
549 SIMD256T::blendv_ps(lolo, lohi, SIMD256T::castsi_ps(SIMD256T::cmpgt_epi32(swiz.v8[0], mask))),
550 SIMD256T::blendv_ps(hilo, hihi, SIMD256T::castsi_ps(SIMD256T::cmpgt_epi32(swiz.v8[1], mask))),
554 // All of the 512-bit permute2f128_XX intrinsics do the following:
556 // SELECT4(src, control) {
557 // CASE(control[1:0])
558 // 0: tmp[127:0] : = src[127:0]
559 // 1 : tmp[127:0] : = src[255:128]
560 // 2 : tmp[127:0] : = src[383:256]
561 // 3 : tmp[127:0] : = src[511:384]
566 // dst[127:0] : = SELECT4(a[511:0], imm8[1:0])
567 // dst[255:128] : = SELECT4(a[511:0], imm8[3:2])
568 // dst[383:256] : = SELECT4(b[511:0], imm8[5:4])
569 // dst[511:384] : = SELECT4(b[511:0], imm8[7:6])
570 // dst[MAX:512] : = 0
572 // Since the 256-bit AVX instructions use a 4-bit control field (instead
573 // of 2-bit for AVX512), we need to expand the control bits sent to the
574 // AVX instructions for emulation.
577 static SIMDINLINE Float SIMDCALL permute2f128_ps(Float const &a, Float const &b)
581 SIMD256T::template permute2f128_ps<((shuf & 0x03) << 0) | ((shuf & 0x0C) << 2)>(a.v8[0], a.v8[1]),
582 SIMD256T::template permute2f128_ps<((shuf & 0x30) >> 4) | ((shuf & 0xC0) >> 2)>(b.v8[0], b.v8[1]),
587 static SIMDINLINE Double SIMDCALL permute2f128_pd(Double const &a, Double const &b)
591 SIMD256T::template permute2f128_pd<((shuf & 0x03) << 0) | ((shuf & 0x0C) << 2)>(a.v8[0], a.v8[1]),
592 SIMD256T::template permute2f128_pd<((shuf & 0x30) >> 4) | ((shuf & 0xC0) >> 2)>(b.v8[0], b.v8[1]),
597 static SIMDINLINE Integer SIMDCALL permute2f128_si(Integer const &a, Integer const &b)
601 SIMD256T::template permute2f128_si<((shuf & 0x03) << 0) | ((shuf & 0x0C) << 2)>(a.v8[0], a.v8[1]),
602 SIMD256T::template permute2f128_si<((shuf & 0x30) >> 4) | ((shuf & 0xC0) >> 2)>(b.v8[0], b.v8[1]),
606 SIMD_IWRAPPER_2I_1(shuffle_epi32);
607 SIMD_IWRAPPER_2I_2(shuffle_epi64);
608 SIMD_IWRAPPER_2(shuffle_epi8);
609 SIMD_WRAPPER_2I_1(shuffle_pd);
610 SIMD_WRAPPER_2I_1(shuffle_ps);
611 SIMD_IWRAPPER_2(unpackhi_epi16);
612 SIMD_IWRAPPER_2(unpackhi_epi32);
613 SIMD_IWRAPPER_2(unpackhi_epi64);
614 SIMD_IWRAPPER_2(unpackhi_epi8);
615 SIMD_WRAPPER_2(unpackhi_pd);
616 SIMD_WRAPPER_2(unpackhi_ps);
617 SIMD_IWRAPPER_2(unpacklo_epi16);
618 SIMD_IWRAPPER_2(unpacklo_epi32);
619 SIMD_IWRAPPER_2(unpacklo_epi64);
620 SIMD_IWRAPPER_2(unpacklo_epi8);
621 SIMD_WRAPPER_2(unpacklo_pd);
622 SIMD_WRAPPER_2(unpacklo_ps);
624 //-----------------------------------------------------------------------
625 // Load / store operations
626 //-----------------------------------------------------------------------
627 template<ScaleFactor ScaleT>
628 static SIMDINLINE Float SIMDCALL i32gather_ps(float const* p, Integer const &idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
632 SIMD256T::template i32gather_ps<ScaleT>(p, idx.v8[0]),
633 SIMD256T::template i32gather_ps<ScaleT>(p, idx.v8[1]),
637 static SIMDINLINE Float SIMDCALL load1_ps(float const *p) // return *p (broadcast 1 value to all elements)
639 return broadcast_ss(p);
642 static SIMDINLINE Float SIMDCALL load_ps(float const *p) // return *p (loads SIMD width elements from memory)
646 SIMD256T::load_ps(p),
647 SIMD256T::load_ps(p + TARGET_SIMD_WIDTH)
651 static SIMDINLINE Integer SIMDCALL load_si(Integer const *p) // return *p
655 SIMD256T::load_si(&p->v8[0]),
656 SIMD256T::load_si(&p->v8[1]),
660 static SIMDINLINE Float SIMDCALL loadu_ps(float const *p) // return *p (same as load_ps but allows for unaligned mem)
664 SIMD256T::loadu_ps(p),
665 SIMD256T::loadu_ps(p + TARGET_SIMD_WIDTH)
669 static SIMDINLINE Integer SIMDCALL loadu_si(Integer const *p) // return *p (same as load_si but allows for unaligned mem)
673 SIMD256T::loadu_si(&p->v8[0]),
674 SIMD256T::loadu_si(&p->v8[1]),
678 // for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
679 template<ScaleFactor ScaleT>
680 static SIMDINLINE Float SIMDCALL mask_i32gather_ps(Float const &old, float const* p, Integer const &idx, Float const &mask)
684 SIMD256T::template mask_i32gather_ps<ScaleT>(old.v8[0], p, idx.v8[0], mask.v8[0]),
685 SIMD256T::template mask_i32gather_ps<ScaleT>(old.v8[1], p, idx.v8[1], mask.v8[1]),
689 static SIMDINLINE void SIMDCALL maskstore_ps(float *p, Integer const &mask, Float const &src)
691 SIMD256T::maskstore_ps(p, mask.v8[0], src.v8[0]);
692 SIMD256T::maskstore_ps(p + TARGET_SIMD_WIDTH, mask.v8[1], src.v8[1]);
695 static SIMDINLINE uint64_t SIMDCALL movemask_epi8(Integer const &a)
697 uint64_t mask = static_cast<uint64_t>(SIMD256T::movemask_epi8(a.v8[0]));
698 mask |= static_cast<uint64_t>(SIMD256T::movemask_epi8(a.v8[1])) << (TARGET_SIMD_WIDTH * 4);
703 static SIMDINLINE uint32_t SIMDCALL movemask_pd(Double const &a)
705 uint32_t mask = static_cast<uint32_t>(SIMD256T::movemask_pd(a.v8[0]));
706 mask |= static_cast<uint32_t>(SIMD256T::movemask_pd(a.v8[1])) << (TARGET_SIMD_WIDTH / 2);
710 static SIMDINLINE uint32_t SIMDCALL movemask_ps(Float const &a)
712 uint32_t mask = static_cast<uint32_t>(SIMD256T::movemask_ps(a.v8[0]));
713 mask |= static_cast<uint32_t>(SIMD256T::movemask_ps(a.v8[1])) << TARGET_SIMD_WIDTH;
718 static SIMDINLINE Integer SIMDCALL set1_epi32(int i) // return i (all elements are same value)
722 SIMD256T::set1_epi32(i),
723 SIMD256T::set1_epi32(i)
727 static SIMDINLINE Integer SIMDCALL set1_epi8(char i) // return i (all elements are same value)
731 SIMD256T::set1_epi8(i),
732 SIMD256T::set1_epi8(i)
736 static SIMDINLINE Float SIMDCALL set1_ps(float f) // return f (all elements are same value)
740 SIMD256T::set1_ps(f),
745 static SIMDINLINE Float SIMDCALL setzero_ps() // return 0 (float)
749 SIMD256T::setzero_ps(),
750 SIMD256T::setzero_ps()
754 static SIMDINLINE Integer SIMDCALL setzero_si() // return 0 (integer)
758 SIMD256T::setzero_si(),
759 SIMD256T::setzero_si()
763 static SIMDINLINE void SIMDCALL store_ps(float *p, Float const &a) // *p = a (stores all elements contiguously in memory)
765 SIMD256T::store_ps(p, a.v8[0]);
766 SIMD256T::store_ps(p + TARGET_SIMD_WIDTH, a.v8[1]);
769 static SIMDINLINE void SIMDCALL store_si(Integer *p, Integer const &a) // *p = a
771 SIMD256T::store_si(&p->v8[0], a.v8[0]);
772 SIMD256T::store_si(&p->v8[1], a.v8[1]);
775 static SIMDINLINE void SIMDCALL stream_ps(float *p, Float const &a) // *p = a (same as store_ps, but doesn't keep memory in cache)
777 SIMD256T::stream_ps(p, a.v8[0]);
778 SIMD256T::stream_ps(p + TARGET_SIMD_WIDTH, a.v8[1]);
781 static SIMDINLINE Integer SIMDCALL set_epi32(
782 int i15, int i14, int i13, int i12, int i11, int i10, int i9, int i8,
783 int i7, int i6, int i5, int i4, int i3, int i2, int i1, int i0)
788 i7, i6, i5, i4, i3, i2, i1, i0),
790 i15, i14, i13, i12, i11, i10, i9, i8)
794 static SIMDINLINE Integer SIMDCALL set_epi32(
795 int i7, int i6, int i5, int i4, int i3, int i2, int i1, int i0)
798 0, 0, 0, 0, 0, 0, 0, 0,
799 i7, i6, i5, i4, i3, i2, i1, i0);
802 static SIMDINLINE Float SIMDCALL set_ps(
803 float i15, float i14, float i13, float i12, float i11, float i10, float i9, float i8,
804 float i7, float i6, float i5, float i4, float i3, float i2, float i1, float i0)
809 i7, i6, i5, i4, i3, i2, i1, i0),
811 i15, i14, i13, i12, i11, i10, i9, i8)
815 static SIMDINLINE Float SIMDCALL set_ps(
816 float i7, float i6, float i5, float i4, float i3, float i2, float i1, float i0)
819 0, 0, 0, 0, 0, 0, 0, 0,
820 i7, i6, i5, i4, i3, i2, i1, i0);
823 static SIMDINLINE Float SIMDCALL vmask_ps(int32_t mask)
827 SIMD256T::vmask_ps(mask),
828 SIMD256T::vmask_ps(mask >> TARGET_SIMD_WIDTH)
832 #undef SIMD_WRAPPER_1
833 #undef SIMD_WRAPPER_2
834 #undef SIMD_WRAPPER_2I
835 #undef SIMD_WRAPPER_2I_1
836 #undef SIMD_WRAPPER_3
837 #undef SIMD_IWRAPPER_1
838 #undef SIMD_IWRAPPER_2
839 #undef SIMD_IWRAPPER_2I
840 #undef SIMD_IWRAPPER_2I_1
841 #undef SIMD_IWRAPPER_3