1 /****************************************************************************
2 * Copyright (C) 2017 Intel Corporation. All Rights Reserved.
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
22 ****************************************************************************/
23 #if !defined(__SIMD_LIB_AVX_HPP__)
24 #error Do not include this file directly, use "simdlib.hpp" instead.
27 //============================================================================
28 // SIMD16 AVX (1) implementation
29 //============================================================================
31 static const int TARGET_SIMD_WIDTH = 8;
32 using SIMD128T = SIMD128Impl::AVXImpl;
34 #define SIMD_WRAPPER_1(op) \
35 static SIMDINLINE Float SIMDCALL op(Float const& a) \
38 SIMD256T::op(a.v8[0]), \
39 SIMD256T::op(a.v8[1]), \
43 #define SIMD_WRAPPER_2(op) \
44 static SIMDINLINE Float SIMDCALL op(Float const& a, Float const& b) \
47 SIMD256T::op(a.v8[0], b.v8[0]), \
48 SIMD256T::op(a.v8[1], b.v8[1]), \
52 #define SIMD_WRAPPER_2I(op) \
54 static SIMDINLINE Float SIMDCALL op(Float const& a, Float const& b) \
57 SIMD256T::template op<0xFF & ImmT>(a.v8[0], b.v8[0]), \
58 SIMD256T::template op<0xFF & (ImmT >> TARGET_SIMD_WIDTH)>(a.v8[1], b.v8[1]), \
62 #define SIMD_WRAPPER_2I_1(op) \
64 static SIMDINLINE Float SIMDCALL op(Float const& a, Float const& b) \
67 SIMD256T::template op<ImmT>(a.v8[0], b.v8[0]), \
68 SIMD256T::template op<ImmT>(a.v8[1], b.v8[1]), \
72 #define SIMD_WRAPPER_3(op) \
73 static SIMDINLINE Float SIMDCALL op(Float const& a, Float const& b, Float const& c) \
76 SIMD256T::op(a.v8[0], b.v8[0], c.v8[0]), \
77 SIMD256T::op(a.v8[1], b.v8[1], c.v8[1]), \
81 #define SIMD_IWRAPPER_1(op) \
82 static SIMDINLINE Integer SIMDCALL op(Integer const& a) \
85 SIMD256T::op(a.v8[0]), \
86 SIMD256T::op(a.v8[1]), \
90 #define SIMD_IWRAPPER_2(op) \
91 static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b) \
94 SIMD256T::op(a.v8[0], b.v8[0]), \
95 SIMD256T::op(a.v8[1], b.v8[1]), \
99 #define SIMD_IWRAPPER_2I(op) \
100 template <int ImmT> \
101 static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b) \
104 SIMD256T::template op<0xFF & ImmT>(a.v8[0], b.v8[0]), \
105 SIMD256T::template op<0xFF & (ImmT >> TARGET_SIMD_WIDTH)>(a.v8[1], b.v8[1]), \
109 #define SIMD_IWRAPPER_2I_1(op) \
110 template <int ImmT> \
111 static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b) \
114 SIMD256T::template op<ImmT>(a.v8[0], b.v8[0]), \
115 SIMD256T::template op<ImmT>(a.v8[1], b.v8[1]), \
119 #define SIMD_IWRAPPER_2I_2(op) \
120 template <int ImmT> \
121 static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b) \
124 SIMD256T::template op<0xF & ImmT>(a.v8[0], b.v8[0]), \
125 SIMD256T::template op<0xF & (ImmT >> 4)>(a.v8[1], b.v8[1]), \
129 #define SIMD_IWRAPPER_3(op) \
130 static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b, Integer const& c) \
133 SIMD256T::op(a.v8[0], b.v8[0], c.v8[0]), \
134 SIMD256T::op(a.v8[1], b.v8[1], c.v8[1]), \
138 //-----------------------------------------------------------------------
139 // Single precision floating point arithmetic operations
140 //-----------------------------------------------------------------------
141 SIMD_WRAPPER_2(add_ps); // return a + b
142 SIMD_WRAPPER_2(div_ps); // return a / b
143 SIMD_WRAPPER_3(fmadd_ps); // return (a * b) + c
144 SIMD_WRAPPER_3(fmsub_ps); // return (a * b) - c
145 SIMD_WRAPPER_2(max_ps); // return (a > b) ? a : b
146 SIMD_WRAPPER_2(min_ps); // return (a < b) ? a : b
147 SIMD_WRAPPER_2(mul_ps); // return a * b
148 SIMD_WRAPPER_1(rcp_ps); // return 1.0f / a
149 SIMD_WRAPPER_1(rsqrt_ps); // return 1.0f / sqrt(a)
150 SIMD_WRAPPER_2(sub_ps); // return a - b
152 template <RoundMode RMT>
153 static SIMDINLINE Float SIMDCALL round_ps(Float const& a)
156 SIMD256T::template round_ps<RMT>(a.v8[0]),
157 SIMD256T::template round_ps<RMT>(a.v8[1]),
161 static SIMDINLINE Float SIMDCALL ceil_ps(Float const& a)
163 return round_ps<RoundMode::CEIL_NOEXC>(a);
165 static SIMDINLINE Float SIMDCALL floor_ps(Float const& a)
167 return round_ps<RoundMode::FLOOR_NOEXC>(a);
170 //-----------------------------------------------------------------------
171 // Integer (various width) arithmetic operations
172 //-----------------------------------------------------------------------
173 SIMD_IWRAPPER_1(abs_epi32); // return absolute_value(a) (int32)
174 SIMD_IWRAPPER_2(add_epi32); // return a + b (int32)
175 SIMD_IWRAPPER_2(add_epi8); // return a + b (int8)
176 SIMD_IWRAPPER_2(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
177 SIMD_IWRAPPER_2(max_epi32); // return (a > b) ? a : b (int32)
178 SIMD_IWRAPPER_2(max_epu32); // return (a > b) ? a : b (uint32)
179 SIMD_IWRAPPER_2(min_epi32); // return (a < b) ? a : b (int32)
180 SIMD_IWRAPPER_2(min_epu32); // return (a < b) ? a : b (uint32)
181 SIMD_IWRAPPER_2(mul_epi32); // return a * b (int32)
183 // return (a * b) & 0xFFFFFFFF
185 // Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers,
186 // and store the low 32 bits of the intermediate integers in dst.
187 SIMD_IWRAPPER_2(mullo_epi32);
188 SIMD_IWRAPPER_2(sub_epi32); // return a - b (int32)
189 SIMD_IWRAPPER_2(sub_epi64); // return a - b (int64)
190 SIMD_IWRAPPER_2(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8)
192 //-----------------------------------------------------------------------
193 // Logical operations
194 //-----------------------------------------------------------------------
195 SIMD_WRAPPER_2(and_ps); // return a & b (float treated as int)
196 SIMD_IWRAPPER_2(and_si); // return a & b (int)
197 SIMD_WRAPPER_2(andnot_ps); // return (~a) & b (float treated as int)
198 SIMD_IWRAPPER_2(andnot_si); // return (~a) & b (int)
199 SIMD_WRAPPER_2(or_ps); // return a | b (float treated as int)
200 SIMD_IWRAPPER_2(or_si); // return a | b (int)
201 SIMD_WRAPPER_2(xor_ps); // return a ^ b (float treated as int)
202 SIMD_IWRAPPER_2(xor_si); // return a ^ b (int)
204 //-----------------------------------------------------------------------
// Shift operations
//-----------------------------------------------------------------------
208 static SIMDINLINE Integer SIMDCALL slli_epi32(Integer const& a) // return a << ImmT
211 SIMD256T::template slli_epi32<ImmT>(a.v8[0]),
212 SIMD256T::template slli_epi32<ImmT>(a.v8[1]),
216 SIMD_IWRAPPER_2(sllv_epi32); // return a << b (uint32)
219 static SIMDINLINE Integer SIMDCALL srai_epi32(Integer const& a) // return a >> ImmT (int32)
222 SIMD256T::template srai_epi32<ImmT>(a.v8[0]),
223 SIMD256T::template srai_epi32<ImmT>(a.v8[1]),
228 static SIMDINLINE Integer SIMDCALL srli_epi32(Integer const& a) // return a >> ImmT (uint32)
231 SIMD256T::template srli_epi32<ImmT>(a.v8[0]),
232 SIMD256T::template srli_epi32<ImmT>(a.v8[1]),
236 template <int ImmT> // for each 128-bit lane:
237 static SIMDINLINE Integer SIMDCALL srli_si(Integer const& a) // return a >> (ImmT*8) (uint)
240 SIMD256T::template srli_si<ImmT>(a.v8[0]),
241 SIMD256T::template srli_si<ImmT>(a.v8[1]),
245 static SIMDINLINE Float SIMDCALL
246 srlisi_ps(Float const& a) // same as srli_si, but with Float cast to int
249 SIMD256T::template srlisi_ps<ImmT>(a.v8[0]),
250 SIMD256T::template srlisi_ps<ImmT>(a.v8[1]),
254 SIMD_IWRAPPER_2(srlv_epi32); // return a >> b (uint32)
256 //-----------------------------------------------------------------------
257 // Conversion operations
258 //-----------------------------------------------------------------------
259 static SIMDINLINE Float SIMDCALL castpd_ps(Double const& a) // return *(Float*)(&a)
262 SIMD256T::castpd_ps(a.v8[0]),
263 SIMD256T::castpd_ps(a.v8[1]),
267 static SIMDINLINE Integer SIMDCALL castps_si(Float const& a) // return *(Integer*)(&a)
270 SIMD256T::castps_si(a.v8[0]),
271 SIMD256T::castps_si(a.v8[1]),
275 static SIMDINLINE Double SIMDCALL castsi_pd(Integer const& a) // return *(Double*)(&a)
278 SIMD256T::castsi_pd(a.v8[0]),
279 SIMD256T::castsi_pd(a.v8[1]),
283 static SIMDINLINE Double SIMDCALL castps_pd(Float const& a) // return *(Double*)(&a)
286 SIMD256T::castps_pd(a.v8[0]),
287 SIMD256T::castps_pd(a.v8[1]),
291 static SIMDINLINE Float SIMDCALL castsi_ps(Integer const& a) // return *(Float*)(&a)
294 SIMD256T::castsi_ps(a.v8[0]),
295 SIMD256T::castsi_ps(a.v8[1]),
299 static SIMDINLINE Float SIMDCALL
300 cvtepi32_ps(Integer const& a) // return (float)a (int32 --> float)
303 SIMD256T::cvtepi32_ps(a.v8[0]),
304 SIMD256T::cvtepi32_ps(a.v8[1]),
308 static SIMDINLINE Integer SIMDCALL
309 cvtepu8_epi16(SIMD256Impl::Integer const& a) // return (int16)a (uint8 --> int16)
312 SIMD256T::cvtepu8_epi16(a.v4[0]),
313 SIMD256T::cvtepu8_epi16(a.v4[1]),
317 static SIMDINLINE Integer SIMDCALL
318 cvtepu8_epi32(SIMD256Impl::Integer const& a) // return (int32)a (uint8 --> int32)
321 SIMD256T::cvtepu8_epi32(a.v4[0]),
322 SIMD256T::cvtepu8_epi32(SIMD128T::template srli_si<8>(a.v4[0])),
326 static SIMDINLINE Integer SIMDCALL
327 cvtepu16_epi32(SIMD256Impl::Integer const& a) // return (int32)a (uint16 --> int32)
330 SIMD256T::cvtepu16_epi32(a.v4[0]),
331 SIMD256T::cvtepu16_epi32(a.v4[1]),
335 static SIMDINLINE Integer SIMDCALL
336 cvtepu16_epi64(SIMD256Impl::Integer const& a) // return (int64)a (uint16 --> int64)
339 SIMD256T::cvtepu16_epi64(a.v4[0]),
340 SIMD256T::cvtepu16_epi64(SIMD128T::template srli_si<8>(a.v4[0])),
344 static SIMDINLINE Integer SIMDCALL
345 cvtepu32_epi64(SIMD256Impl::Integer const& a) // return (int64)a (uint32 --> int64)
348 SIMD256T::cvtepu32_epi64(a.v4[0]),
349 SIMD256T::cvtepu32_epi64(a.v4[1]),
353 static SIMDINLINE Integer SIMDCALL
354 cvtps_epi32(Float const& a) // return (int32)a (float --> int32)
357 SIMD256T::cvtps_epi32(a.v8[0]),
358 SIMD256T::cvtps_epi32(a.v8[1]),
362 static SIMDINLINE Integer SIMDCALL
363 cvttps_epi32(Float const& a) // return (int32)a (rnd_to_zero(float) --> int32)
366 SIMD256T::cvtps_epi32(a.v8[0]),
367 SIMD256T::cvtps_epi32(a.v8[1]),
371 //-----------------------------------------------------------------------
372 // Comparison operations
373 //-----------------------------------------------------------------------
374 template <CompareType CmpTypeT>
375 static SIMDINLINE Float SIMDCALL cmp_ps(Float const& a, Float const& b) // return a (CmpTypeT) b
378 SIMD256T::template cmp_ps<CmpTypeT>(a.v8[0], b.v8[0]),
379 SIMD256T::template cmp_ps<CmpTypeT>(a.v8[1], b.v8[1]),
382 static SIMDINLINE Float SIMDCALL cmplt_ps(Float const& a, Float const& b)
384 return cmp_ps<CompareType::LT_OQ>(a, b);
386 static SIMDINLINE Float SIMDCALL cmpgt_ps(Float const& a, Float const& b)
388 return cmp_ps<CompareType::GT_OQ>(a, b);
390 static SIMDINLINE Float SIMDCALL cmpneq_ps(Float const& a, Float const& b)
392 return cmp_ps<CompareType::NEQ_OQ>(a, b);
394 static SIMDINLINE Float SIMDCALL cmpeq_ps(Float const& a, Float const& b)
396 return cmp_ps<CompareType::EQ_OQ>(a, b);
398 static SIMDINLINE Float SIMDCALL cmpge_ps(Float const& a, Float const& b)
400 return cmp_ps<CompareType::GE_OQ>(a, b);
402 static SIMDINLINE Float SIMDCALL cmple_ps(Float const& a, Float const& b)
404 return cmp_ps<CompareType::LE_OQ>(a, b);
407 template <CompareType CmpTypeT>
408 static SIMDINLINE Mask SIMDCALL cmp_ps_mask(Float const& a, Float const& b)
410 return static_cast<Mask>(movemask_ps(cmp_ps<CmpTypeT>(a, b)));
413 SIMD_IWRAPPER_2(cmpeq_epi8); // return a == b (int8)
414 SIMD_IWRAPPER_2(cmpeq_epi16); // return a == b (int16)
415 SIMD_IWRAPPER_2(cmpeq_epi32); // return a == b (int32)
416 SIMD_IWRAPPER_2(cmpeq_epi64); // return a == b (int64)
417 SIMD_IWRAPPER_2(cmpgt_epi8); // return a > b (int8)
418 SIMD_IWRAPPER_2(cmpgt_epi16); // return a > b (int16)
419 SIMD_IWRAPPER_2(cmpgt_epi32); // return a > b (int32)
420 SIMD_IWRAPPER_2(cmpgt_epi64); // return a > b (int64)
421 SIMD_IWRAPPER_2(cmplt_epi32); // return a < b (int32)
423 static SIMDINLINE bool SIMDCALL
424 testz_ps(Float const& a, Float const& b) // return all_lanes_zero(a & b) ? 1 : 0 (float)
426 return 0 != (SIMD256T::testz_ps(a.v8[0], b.v8[0]) & SIMD256T::testz_ps(a.v8[1], b.v8[1]));
429 static SIMDINLINE bool SIMDCALL
430 testz_si(Integer const& a, Integer const& b) // return all_lanes_zero(a & b) ? 1 : 0 (int)
432 return 0 != (SIMD256T::testz_si(a.v8[0], b.v8[0]) & SIMD256T::testz_si(a.v8[1], b.v8[1]));
435 //-----------------------------------------------------------------------
436 // Blend / shuffle / permute operations
437 //-----------------------------------------------------------------------
438 SIMD_WRAPPER_2I(blend_ps); // return ImmT ? b : a (float)
439 SIMD_IWRAPPER_2I(blend_epi32); // return ImmT ? b : a (int32)
440 SIMD_WRAPPER_3(blendv_ps); // return mask ? b : a (float)
441 static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer const& a,
443 Float const& mask) // return mask ? b : a (int)
446 SIMD256T::blendv_epi32(a.v8[0], b.v8[0], mask.v8[0]),
447 SIMD256T::blendv_epi32(a.v8[1], b.v8[1], mask.v8[1]),
451 static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer const& a,
453 Integer const& mask) // return mask ? b : a (int)
456 SIMD256T::blendv_epi32(a.v8[0], b.v8[0], mask.v8[0]),
457 SIMD256T::blendv_epi32(a.v8[1], b.v8[1], mask.v8[1]),
461 static SIMDINLINE Float SIMDCALL
462 broadcast_ss(float const* p) // return *p (all elements in vector get same value)
466 SIMD256T::set1_ps(f),
467 SIMD256T::set1_ps(f),
472 static SIMDINLINE SIMD256Impl::Float SIMDCALL extract_ps(Float const& a)
474 SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
479 static SIMDINLINE SIMD256Impl::Double SIMDCALL extract_pd(Double const& a)
481 SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
486 static SIMDINLINE SIMD256Impl::Integer SIMDCALL extract_si(Integer const& a)
488 SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
493 static SIMDINLINE Float SIMDCALL insert_ps(Float const& a, SIMD256Impl::Float const& b)
495 SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
502 static SIMDINLINE Double SIMDCALL insert_pd(Double const& a, SIMD256Impl::Double const& b)
504 SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
511 static SIMDINLINE Integer SIMDCALL insert_si(Integer const& a, SIMD256Impl::Integer const& b)
513 SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
519 SIMD_IWRAPPER_2(packs_epi16); // See documentation for _mm256_packs_epi16 and _mm512_packs_epi16
520 SIMD_IWRAPPER_2(packs_epi32); // See documentation for _mm256_packs_epi32 and _mm512_packs_epi32
521 SIMD_IWRAPPER_2(packus_epi16); // See documentation for _mm256_packus_epi16 and _mm512_packus_epi16
522 SIMD_IWRAPPER_2(packus_epi32); // See documentation for _mm256_packus_epi32 and _mm512_packus_epi32
525 static SIMDINLINE Float SIMDCALL permute_ps(Float const& a)
528 SIMD256T::template permute_ps<ImmT>(a.v8[0]),
529 SIMD256T::template permute_ps<ImmT>(a.v8[1]),
533 static SIMDINLINE Integer SIMDCALL permute_epi32(
534 Integer const& a, Integer const& swiz) // return a[swiz[i]] for each 32-bit lane i (int32)
536 return castps_si(permute_ps(castsi_ps(a), swiz));
539 static SIMDINLINE Float SIMDCALL
540 permute_ps(Float const& a, Integer const& swiz) // return a[swiz[i]] for each 32-bit lane i (float)
542 const auto mask = SIMD256T::set1_epi32(7);
544 auto lolo = SIMD256T::permute_ps(a.v8[0], SIMD256T::and_si(swiz.v8[0], mask));
545 auto lohi = SIMD256T::permute_ps(a.v8[1], SIMD256T::and_si(swiz.v8[0], mask));
547 auto hilo = SIMD256T::permute_ps(a.v8[0], SIMD256T::and_si(swiz.v8[1], mask));
548 auto hihi = SIMD256T::permute_ps(a.v8[1], SIMD256T::and_si(swiz.v8[1], mask));
552 lolo, lohi, SIMD256T::castsi_ps(SIMD256T::cmpgt_epi32(swiz.v8[0], mask))),
554 hilo, hihi, SIMD256T::castsi_ps(SIMD256T::cmpgt_epi32(swiz.v8[1], mask))),
558 // All of the 512-bit permute2f128_XX intrinsics do the following:
560 // SELECT4(src, control) {
561 // CASE(control[1:0])
562 // 0 : tmp[127:0] : = src[127:0]
563 // 1 : tmp[127:0] : = src[255:128]
564 // 2 : tmp[127:0] : = src[383:256]
565 // 3 : tmp[127:0] : = src[511:384]
570 // dst[127:0] : = SELECT4(a[511:0], imm8[1:0])
571 // dst[255:128] : = SELECT4(a[511:0], imm8[3:2])
572 // dst[383:256] : = SELECT4(b[511:0], imm8[5:4])
573 // dst[511:384] : = SELECT4(b[511:0], imm8[7:6])
574 // dst[MAX:512] : = 0
576 // Since the 256-bit AVX instructions use a 4-bit control field (instead
577 // of 2-bit for AVX512), we need to expand the control bits sent to the
578 // AVX instructions for emulation.
581 static SIMDINLINE Float SIMDCALL permute2f128_ps(Float const& a, Float const& b)
584 SIMD256T::template permute2f128_ps<((shuf & 0x03) << 0) | ((shuf & 0x0C) << 2)>(a.v8[0],
586 SIMD256T::template permute2f128_ps<((shuf & 0x30) >> 4) | ((shuf & 0xC0) >> 2)>(b.v8[0],
592 static SIMDINLINE Double SIMDCALL permute2f128_pd(Double const& a, Double const& b)
595 SIMD256T::template permute2f128_pd<((shuf & 0x03) << 0) | ((shuf & 0x0C) << 2)>(a.v8[0],
597 SIMD256T::template permute2f128_pd<((shuf & 0x30) >> 4) | ((shuf & 0xC0) >> 2)>(b.v8[0],
603 static SIMDINLINE Integer SIMDCALL permute2f128_si(Integer const& a, Integer const& b)
606 SIMD256T::template permute2f128_si<((shuf & 0x03) << 0) | ((shuf & 0x0C) << 2)>(a.v8[0],
608 SIMD256T::template permute2f128_si<((shuf & 0x30) >> 4) | ((shuf & 0xC0) >> 2)>(b.v8[0],
613 SIMD_IWRAPPER_2I_1(shuffle_epi32);
614 SIMD_IWRAPPER_2I_2(shuffle_epi64);
615 SIMD_IWRAPPER_2(shuffle_epi8);
616 SIMD_WRAPPER_2I_1(shuffle_pd);
617 SIMD_WRAPPER_2I_1(shuffle_ps);
618 SIMD_IWRAPPER_2(unpackhi_epi16);
619 SIMD_IWRAPPER_2(unpackhi_epi32);
620 SIMD_IWRAPPER_2(unpackhi_epi64);
621 SIMD_IWRAPPER_2(unpackhi_epi8);
622 SIMD_WRAPPER_2(unpackhi_pd);
623 SIMD_WRAPPER_2(unpackhi_ps);
624 SIMD_IWRAPPER_2(unpacklo_epi16);
625 SIMD_IWRAPPER_2(unpacklo_epi32);
626 SIMD_IWRAPPER_2(unpacklo_epi64);
627 SIMD_IWRAPPER_2(unpacklo_epi8);
628 SIMD_WRAPPER_2(unpacklo_pd);
629 SIMD_WRAPPER_2(unpacklo_ps);
631 //-----------------------------------------------------------------------
632 // Load / store operations
633 //-----------------------------------------------------------------------
634 template <ScaleFactor ScaleT = ScaleFactor::SF_1>
635 static SIMDINLINE Float SIMDCALL
636 i32gather_ps(float const* p, Integer const& idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
639 SIMD256T::template i32gather_ps<ScaleT>(p, idx.v8[0]),
640 SIMD256T::template i32gather_ps<ScaleT>(p, idx.v8[1]),
644 template <ScaleFactor ScaleT = ScaleFactor::SF_1>
645 static SIMDINLINE Float SIMDCALL
646 sw_i32gather_ps(float const* p, Integer const& idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
649 SIMD256T::template sw_i32gather_ps<ScaleT>(p, idx.v8[0]),
650 SIMD256T::template sw_i32gather_ps<ScaleT>(p, idx.v8[1]),
654 static SIMDINLINE Float SIMDCALL
655 load1_ps(float const* p) // return *p (broadcast 1 value to all elements)
657 return broadcast_ss(p);
660 static SIMDINLINE Float SIMDCALL
661 load_ps(float const* p) // return *p (loads SIMD width elements from memory)
663 return Float{SIMD256T::load_ps(p), SIMD256T::load_ps(p + TARGET_SIMD_WIDTH)};
666 static SIMDINLINE Integer SIMDCALL load_si(Integer const* p) // return *p
669 SIMD256T::load_si(&p->v8[0]),
670 SIMD256T::load_si(&p->v8[1]),
674 static SIMDINLINE Float SIMDCALL
675 loadu_ps(float const* p) // return *p (same as load_ps but allows for unaligned mem)
677 return Float{SIMD256T::loadu_ps(p), SIMD256T::loadu_ps(p + TARGET_SIMD_WIDTH)};
680 static SIMDINLINE Integer SIMDCALL
681 loadu_si(Integer const* p) // return *p (same as load_si but allows for unaligned mem)
684 SIMD256T::loadu_si(&p->v8[0]),
685 SIMD256T::loadu_si(&p->v8[1]),
689 // for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
690 template <ScaleFactor ScaleT = ScaleFactor::SF_1>
691 static SIMDINLINE Float SIMDCALL
692 mask_i32gather_ps(Float const& old, float const* p, Integer const& idx, Float const& mask)
695 SIMD256T::template mask_i32gather_ps<ScaleT>(old.v8[0], p, idx.v8[0], mask.v8[0]),
696 SIMD256T::template mask_i32gather_ps<ScaleT>(old.v8[1], p, idx.v8[1], mask.v8[1]),
700 template <ScaleFactor ScaleT = ScaleFactor::SF_1>
701 static SIMDINLINE Float SIMDCALL
702 sw_mask_i32gather_ps(Float const& old, float const* p, Integer const& idx, Float const& mask)
705 SIMD256T::template sw_mask_i32gather_ps<ScaleT>(old.v8[0], p, idx.v8[0], mask.v8[0]),
706 SIMD256T::template sw_mask_i32gather_ps<ScaleT>(old.v8[1], p, idx.v8[1], mask.v8[1]),
710 static SIMDINLINE void SIMDCALL maskstore_ps(float* p, Integer const& mask, Float const& src)
712 SIMD256T::maskstore_ps(p, mask.v8[0], src.v8[0]);
713 SIMD256T::maskstore_ps(p + TARGET_SIMD_WIDTH, mask.v8[1], src.v8[1]);
716 static SIMDINLINE uint64_t SIMDCALL movemask_epi8(Integer const& a)
718 uint64_t mask = static_cast<uint64_t>(SIMD256T::movemask_epi8(a.v8[0]));
719 mask |= static_cast<uint64_t>(SIMD256T::movemask_epi8(a.v8[1])) << (TARGET_SIMD_WIDTH * 4);
724 static SIMDINLINE uint32_t SIMDCALL movemask_pd(Double const& a)
726 uint32_t mask = static_cast<uint32_t>(SIMD256T::movemask_pd(a.v8[0]));
727 mask |= static_cast<uint32_t>(SIMD256T::movemask_pd(a.v8[1])) << (TARGET_SIMD_WIDTH / 2);
731 static SIMDINLINE uint32_t SIMDCALL movemask_ps(Float const& a)
733 uint32_t mask = static_cast<uint32_t>(SIMD256T::movemask_ps(a.v8[0]));
734 mask |= static_cast<uint32_t>(SIMD256T::movemask_ps(a.v8[1])) << TARGET_SIMD_WIDTH;
739 static SIMDINLINE Integer SIMDCALL set1_epi32(int i) // return i (all elements are same value)
741 return Integer{SIMD256T::set1_epi32(i), SIMD256T::set1_epi32(i)};
744 static SIMDINLINE Integer SIMDCALL set1_epi8(char i) // return i (all elements are same value)
746 return Integer{SIMD256T::set1_epi8(i), SIMD256T::set1_epi8(i)};
749 static SIMDINLINE Float SIMDCALL set1_ps(float f) // return f (all elements are same value)
751 return Float{SIMD256T::set1_ps(f), SIMD256T::set1_ps(f)};
754 static SIMDINLINE Float SIMDCALL setzero_ps() // return 0 (float)
756 return Float{SIMD256T::setzero_ps(), SIMD256T::setzero_ps()};
759 static SIMDINLINE Integer SIMDCALL setzero_si() // return 0 (integer)
761 return Integer{SIMD256T::setzero_si(), SIMD256T::setzero_si()};
764 static SIMDINLINE void SIMDCALL
765 store_ps(float* p, Float const& a) // *p = a (stores all elements contiguously in memory)
767 SIMD256T::store_ps(p, a.v8[0]);
768 SIMD256T::store_ps(p + TARGET_SIMD_WIDTH, a.v8[1]);
771 static SIMDINLINE void SIMDCALL store_si(Integer* p, Integer const& a) // *p = a
773 SIMD256T::store_si(&p->v8[0], a.v8[0]);
774 SIMD256T::store_si(&p->v8[1], a.v8[1]);
777 static SIMDINLINE void SIMDCALL
778 stream_ps(float* p, Float const& a) // *p = a (same as store_ps, but doesn't keep memory in cache)
780 SIMD256T::stream_ps(p, a.v8[0]);
781 SIMD256T::stream_ps(p + TARGET_SIMD_WIDTH, a.v8[1]);
784 static SIMDINLINE Integer SIMDCALL set_epi32(int i15,
801 return Integer{SIMD256T::set_epi32(i7, i6, i5, i4, i3, i2, i1, i0),
802 SIMD256T::set_epi32(i15, i14, i13, i12, i11, i10, i9, i8)};
805 static SIMDINLINE Integer SIMDCALL
806 set_epi32(int i7, int i6, int i5, int i4, int i3, int i2, int i1, int i0)
808 return set_epi32(0, 0, 0, 0, 0, 0, 0, 0, i7, i6, i5, i4, i3, i2, i1, i0);
811 static SIMDINLINE Float SIMDCALL set_ps(float i15,
828 return Float{SIMD256T::set_ps(i7, i6, i5, i4, i3, i2, i1, i0),
829 SIMD256T::set_ps(i15, i14, i13, i12, i11, i10, i9, i8)};
832 static SIMDINLINE Float SIMDCALL
833 set_ps(float i7, float i6, float i5, float i4, float i3, float i2, float i1, float i0)
835 return set_ps(0, 0, 0, 0, 0, 0, 0, 0, i7, i6, i5, i4, i3, i2, i1, i0);
838 static SIMDINLINE Float SIMDCALL vmask_ps(int32_t mask)
840 return Float{SIMD256T::vmask_ps(mask), SIMD256T::vmask_ps(mask >> TARGET_SIMD_WIDTH)};
// Remove every helper macro defined by this file so none of them leaks into
// the code that includes it. The list must mirror the #define list at the top
// of the file; SIMD_IWRAPPER_2I_2 was previously missing here and therefore
// stayed defined after this file ended.
#undef SIMD_WRAPPER_1
#undef SIMD_WRAPPER_2
#undef SIMD_WRAPPER_2I
#undef SIMD_WRAPPER_2I_1
#undef SIMD_WRAPPER_3
#undef SIMD_IWRAPPER_1
#undef SIMD_IWRAPPER_2
#undef SIMD_IWRAPPER_2I
#undef SIMD_IWRAPPER_2I_1
#undef SIMD_IWRAPPER_2I_2
#undef SIMD_IWRAPPER_3