/****************************************************************************
 * Copyright (C) 2017 Intel Corporation. All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 ****************************************************************************/
#if !defined(__SIMD_LIB_AVX512_HPP__)
#error Do not include this file directly, use "simdlib.hpp" instead.
#endif

#if defined(__GNUC__) && !defined(__clang__) && !defined(__INTEL_COMPILER)
// gcc as of 7.1 was missing these intrinsics
#ifndef _mm512_cmpneq_ps_mask
#define _mm512_cmpneq_ps_mask(a,b) _mm512_cmp_ps_mask((a),(b),_CMP_NEQ_UQ)
#endif

#ifndef _mm512_cmplt_ps_mask
#define _mm512_cmplt_ps_mask(a,b) _mm512_cmp_ps_mask((a),(b),_CMP_LT_OS)
#endif

#ifndef _mm512_cmplt_pd_mask
#define _mm512_cmplt_pd_mask(a,b) _mm512_cmp_pd_mask((a),(b),_CMP_LT_OS)
#endif

#endif
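
// Note: the fallbacks above use the generic compare intrinsic with explicit
// predicate constants: _CMP_NEQ_UQ is not-equal (unordered, quiet), so NaN
// lanes compare as not-equal; _CMP_LT_OS is less-than (ordered, signaling),
// matching the semantics of the missing named intrinsics.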

//============================================================================
// SIMD16 AVX512 (F) implementation
// (compatible with Knights and Core processors)
//
//============================================================================

static const int TARGET_SIMD_WIDTH = 16;
using SIMD256T = SIMD256Impl::AVX2Impl;

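// The SIMD_*WRAPPER_* macros below stamp out one-line static member
// functions that forward to the matching _mm512_* intrinsic. The name
// encodes the signature: the digit is the operand count, a leading I means
// Integer operands, D means Double, IF means an Integer op routed through
// the float intrinsic, and 2I takes an immediate template parameter. The
// trailing-underscore variants accept the intrinsic (or intrinsic suffix)
// explicitly when it differs from the member function name.
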
#define SIMD_WRAPPER_1_(op, intrin) \
    static SIMDINLINE Float SIMDCALL op(Float a) \
    {\
        return intrin(a);\
    }

#define SIMD_WRAPPER_1(op) \
    SIMD_WRAPPER_1_(op, _mm512_##op)

#define SIMD_WRAPPER_2_(op, intrin) \
    static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
    {\
        return _mm512_##intrin(a, b);\
    }
#define SIMD_WRAPPER_2(op) SIMD_WRAPPER_2_(op, op)

#define SIMD_WRAPPERI_2_(op, intrin) \
    static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
    {\
        return _mm512_castsi512_ps(_mm512_##intrin(\
            _mm512_castps_si512(a), _mm512_castps_si512(b)));\
    }

#define SIMD_DWRAPPER_2(op) \
    static SIMDINLINE Double SIMDCALL op(Double a, Double b) \
    {\
        return _mm512_##op(a, b);\
    }

#define SIMD_WRAPPER_2I_(op, intrin) \
    template<int ImmT>\
    static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
    {\
        return _mm512_##intrin(a, b, ImmT);\
    }
#define SIMD_WRAPPER_2I(op) SIMD_WRAPPER_2I_(op, op)

#define SIMD_DWRAPPER_2I_(op, intrin) \
    template<int ImmT>\
    static SIMDINLINE Double SIMDCALL op(Double a, Double b) \
    {\
        return _mm512_##intrin(a, b, ImmT);\
    }
#define SIMD_DWRAPPER_2I(op) SIMD_DWRAPPER_2I_(op, op)

#define SIMD_WRAPPER_3(op) \
    static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c) \
    {\
        return _mm512_##op(a, b, c);\
    }

#define SIMD_IWRAPPER_1(op) \
    static SIMDINLINE Integer SIMDCALL op(Integer a) \
    {\
        return _mm512_##op(a);\
    }
#define SIMD_IWRAPPER_1_8(op) \
    static SIMDINLINE Integer SIMDCALL op(SIMD256Impl::Integer a) \
    {\
        return _mm512_##op(a);\
    }

#define SIMD_IWRAPPER_1_4(op) \
    static SIMDINLINE Integer SIMDCALL op(SIMD128Impl::Integer a) \
    {\
        return _mm512_##op(a);\
    }

#define SIMD_IWRAPPER_1I_(op, intrin) \
    template<int ImmT> \
    static SIMDINLINE Integer SIMDCALL op(Integer a) \
    {\
        return intrin(a, ImmT);\
    }
#define SIMD_IWRAPPER_1I(op) SIMD_IWRAPPER_1I_(op, _mm512_##op)

#define SIMD_IWRAPPER_2_(op, intrin) \
    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
    {\
        return _mm512_##intrin(a, b);\
    }
#define SIMD_IWRAPPER_2(op) SIMD_IWRAPPER_2_(op, op)

#define SIMD_IWRAPPER_2_CMP(op, cmp) \
    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
    {\
        return cmp(a, b);\
    }

#define SIMD_IFWRAPPER_2(op, intrin) \
    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
    {\
        return castps_si(_mm512_##intrin(castsi_ps(a), castsi_ps(b)));\
    }

#define SIMD_IWRAPPER_2I_(op, intrin) \
    template<int ImmT>\
    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
    {\
        return _mm512_##intrin(a, b, ImmT);\
    }
#define SIMD_IWRAPPER_2I(op) SIMD_IWRAPPER_2I_(op, op)

private:
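// Helpers that expand an AVX-512 predicate register value (__mmask16 /
// __mmask8) into a legacy AVX2-style vector mask: lanes whose mask bit is
// set become all-ones (-1), all other lanes become zero.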
static SIMDINLINE Integer vmask(__mmask16 m)
{
    return _mm512_maskz_set1_epi32(m, -1);
}

static SIMDINLINE Integer vmask(__mmask8 m)
{
    return _mm512_maskz_set1_epi64(m, -1LL);
}

public:
//-----------------------------------------------------------------------
// Single precision floating point arithmetic operations
//-----------------------------------------------------------------------
SIMD_WRAPPER_2(add_ps);   // return a + b
SIMD_WRAPPER_2(div_ps);   // return a / b
SIMD_WRAPPER_3(fmadd_ps); // return (a * b) + c
SIMD_WRAPPER_3(fmsub_ps); // return (a * b) - c
SIMD_WRAPPER_2(max_ps);   // return (a > b) ? a : b
SIMD_WRAPPER_2(min_ps);   // return (a < b) ? a : b
SIMD_WRAPPER_2(mul_ps);   // return a * b
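// Note: rcp14 / rsqrt14 are AVX-512F approximation instructions with a
// maximum relative error of 2^-14; they take the place of the coarser
// legacy rcp/rsqrt approximations used on the narrower SIMD targets.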
SIMD_WRAPPER_1_(rcp_ps, _mm512_rcp14_ps);     // return 1.0f / a
SIMD_WRAPPER_1_(rsqrt_ps, _mm512_rsqrt14_ps); // return 1.0f / sqrt(a)
SIMD_WRAPPER_2(sub_ps);                       // return a - b

template <RoundMode RMT>
static SIMDINLINE Float SIMDCALL round_ps(Float a)
{
    return _mm512_roundscale_ps(a, static_cast<int>(RMT));
}

static SIMDINLINE Float SIMDCALL ceil_ps(Float a) { return round_ps<RoundMode::CEIL_NOEXC>(a); }
static SIMDINLINE Float SIMDCALL floor_ps(Float a) { return round_ps<RoundMode::FLOOR_NOEXC>(a); }

//-----------------------------------------------------------------------
// Integer (various width) arithmetic operations
//-----------------------------------------------------------------------
SIMD_IWRAPPER_1(abs_epi32); // return absolute_value(a) (int32)
SIMD_IWRAPPER_2(add_epi32); // return a + b (int32)
//SIMD_IWRAPPER_2(add_epi8);  // return a + b (int8)
//SIMD_IWRAPPER_2(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
SIMD_IWRAPPER_2(max_epi32); // return (a > b) ? a : b (int32)
SIMD_IWRAPPER_2(max_epu32); // return (a > b) ? a : b (uint32)
SIMD_IWRAPPER_2(min_epi32); // return (a < b) ? a : b (int32)
SIMD_IWRAPPER_2(min_epu32); // return (a < b) ? a : b (uint32)
SIMD_IWRAPPER_2(mul_epi32); // return a * b (low int32 of each 64-bit lane, sign-extended --> int64)

// return (a * b) & 0xFFFFFFFF
//
// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers,
// and store the low 32 bits of the intermediate integers in dst.
SIMD_IWRAPPER_2(mullo_epi32);
SIMD_IWRAPPER_2(sub_epi32); // return a - b (int32)
SIMD_IWRAPPER_2(sub_epi64); // return a - b (int64)
//SIMD_IWRAPPER_2(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8)

//-----------------------------------------------------------------------
// Logical operations
//-----------------------------------------------------------------------
SIMD_IWRAPPER_2_(and_si, and_si512);       // return a & b    (int)
SIMD_IWRAPPER_2_(andnot_si, andnot_si512); // return (~a) & b (int)
SIMD_IWRAPPER_2_(or_si, or_si512);         // return a | b    (int)
SIMD_IWRAPPER_2_(xor_si, xor_si512);       // return a ^ b    (int)

// SIMD_WRAPPER_2(and_ps);    // return a & b    (float treated as int)
// SIMD_WRAPPER_2(andnot_ps); // return (~a) & b (float treated as int)
// SIMD_WRAPPER_2(or_ps);     // return a | b    (float treated as int)
// SIMD_WRAPPER_2(xor_ps);    // return a ^ b    (float treated as int)

//-----------------------------------------------------------------------
// Shift operations
//-----------------------------------------------------------------------
SIMD_IWRAPPER_1I(slli_epi32); // return a << ImmT
SIMD_IWRAPPER_2(sllv_epi32);  // return a << b (per-lane variable shift, uint32)
SIMD_IWRAPPER_1I(srai_epi32); // return a >> ImmT (int32)
SIMD_IWRAPPER_1I(srli_epi32); // return a >> ImmT (uint32)

#if 0
SIMD_IWRAPPER_1I_(srli_si, srli_si512); // return a >> (ImmT*8) (uint)

template<int ImmT> // same as srli_si, but with Float cast to int
static SIMDINLINE Float SIMDCALL srlisi_ps(Float a)
{
    return castsi_ps(srli_si<ImmT>(castps_si(a)));
}
#endif

SIMD_IWRAPPER_2(srlv_epi32);  // return a >> b (per-lane variable shift, uint32)

//-----------------------------------------------------------------------
// Conversion operations
//-----------------------------------------------------------------------
static SIMDINLINE Float SIMDCALL castpd_ps(Double a) // return *(Float*)(&a)
{
    return _mm512_castpd_ps(a);
}

static SIMDINLINE Integer SIMDCALL castps_si(Float a) // return *(Integer*)(&a)
{
    return _mm512_castps_si512(a);
}

static SIMDINLINE Double SIMDCALL castsi_pd(Integer a) // return *(Double*)(&a)
{
    return _mm512_castsi512_pd(a);
}

static SIMDINLINE Double SIMDCALL castps_pd(Float a) // return *(Double*)(&a)
{
    return _mm512_castps_pd(a);
}

static SIMDINLINE Integer SIMDCALL castpd_si(Double a) // return *(Integer*)(&a)
{
    return _mm512_castpd_si512(a);
}

static SIMDINLINE Float SIMDCALL castsi_ps(Integer a) // return *(Float*)(&a)
{
    return _mm512_castsi512_ps(a);
}

static SIMDINLINE Float SIMDCALL cvtepi32_ps(Integer a) // return (float)a (int32 --> float)
{
    return _mm512_cvtepi32_ps(a);
}

//SIMD_IWRAPPER_1_8(cvtepu8_epi16); // return (int16)a (uint8 --> int16)
SIMD_IWRAPPER_1_4(cvtepu8_epi32);   // return (int32)a (uint8 --> int32)
SIMD_IWRAPPER_1_8(cvtepu16_epi32);  // return (int32)a (uint16 --> int32)
SIMD_IWRAPPER_1_4(cvtepu16_epi64);  // return (int64)a (uint16 --> int64)
SIMD_IWRAPPER_1_8(cvtepu32_epi64);  // return (int64)a (uint32 --> int64)

static SIMDINLINE Integer SIMDCALL cvtps_epi32(Float a) // return (int32)a (float --> int32)
{
    return _mm512_cvtps_epi32(a);
}

static SIMDINLINE Integer SIMDCALL cvttps_epi32(Float a) // return (int32)a (rnd_to_zero(float) --> int32)
{
    return _mm512_cvttps_epi32(a);
}

//-----------------------------------------------------------------------
// Comparison operations
//-----------------------------------------------------------------------
template<CompareType CmpTypeT>
static SIMDINLINE Mask SIMDCALL cmp_ps_mask(Float a, Float b)
{
    return _mm512_cmp_ps_mask(a, b, static_cast<int>(CmpTypeT));
}

template<CompareType CmpTypeT>
static SIMDINLINE Float SIMDCALL cmp_ps(Float a, Float b) // return a (CmpTypeT) b
{
    // Expand the AVX-512 predicate mask into a legacy all-ones/all-zeros
    // vector mask so callers written against the narrower targets still work.
    __mmask16 result = cmp_ps_mask<CmpTypeT>(a, b);
    return castsi_ps(vmask(result));
}

static SIMDINLINE Float SIMDCALL cmplt_ps(Float a, Float b) { return cmp_ps<CompareType::LT_OQ>(a, b); }
static SIMDINLINE Float SIMDCALL cmpgt_ps(Float a, Float b) { return cmp_ps<CompareType::GT_OQ>(a, b); }
static SIMDINLINE Float SIMDCALL cmpneq_ps(Float a, Float b) { return cmp_ps<CompareType::NEQ_OQ>(a, b); }
static SIMDINLINE Float SIMDCALL cmpeq_ps(Float a, Float b) { return cmp_ps<CompareType::EQ_OQ>(a, b); }
static SIMDINLINE Float SIMDCALL cmpge_ps(Float a, Float b) { return cmp_ps<CompareType::GE_OQ>(a, b); }
static SIMDINLINE Float SIMDCALL cmple_ps(Float a, Float b) { return cmp_ps<CompareType::LE_OQ>(a, b); }
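
// Example (sketch, using only members of this class): keep a where a >= b,
// take b where a < b:
//   Float m = cmplt_ps(a, b);     // all-ones lanes where a < b
//   Float r = blendv_ps(a, b, m); // m ? b : a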

template<CompareTypeInt CmpTypeT>
static SIMDINLINE Integer SIMDCALL cmp_epi32(Integer a, Integer b)
{
    // Legacy vector mask generator
    __mmask16 result = _mm512_cmp_epi32_mask(a, b, static_cast<int>(CmpTypeT));
    return vmask(result);
}
template<CompareTypeInt CmpTypeT>
static SIMDINLINE Integer SIMDCALL cmp_epi64(Integer a, Integer b)
{
    // Legacy vector mask generator
    __mmask8 result = _mm512_cmp_epi64_mask(a, b, static_cast<int>(CmpTypeT));
    return vmask(result);
}

//SIMD_IWRAPPER_2_CMP(cmpeq_epi8,  cmp_epi8<CompareTypeInt::EQ>);  // return a == b (int8)
//SIMD_IWRAPPER_2_CMP(cmpeq_epi16, cmp_epi16<CompareTypeInt::EQ>); // return a == b (int16)
SIMD_IWRAPPER_2_CMP(cmpeq_epi32, cmp_epi32<CompareTypeInt::EQ>);   // return a == b (int32)
SIMD_IWRAPPER_2_CMP(cmpeq_epi64, cmp_epi64<CompareTypeInt::EQ>);   // return a == b (int64)
//SIMD_IWRAPPER_2_CMP(cmpgt_epi8,  cmp_epi8<CompareTypeInt::GT>);  // return a > b (int8)
//SIMD_IWRAPPER_2_CMP(cmpgt_epi16, cmp_epi16<CompareTypeInt::GT>); // return a > b (int16)
SIMD_IWRAPPER_2_CMP(cmpgt_epi32, cmp_epi32<CompareTypeInt::GT>);   // return a > b (int32)
SIMD_IWRAPPER_2_CMP(cmpgt_epi64, cmp_epi64<CompareTypeInt::GT>);   // return a > b (int64)
SIMD_IWRAPPER_2_CMP(cmplt_epi32, cmp_epi32<CompareTypeInt::LT>);   // return a < b (int32)

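// _mm512_test_epi32_mask sets a mask bit for each lane where (a & b) != 0,
// so a zero mask means every lane of (a & b) is zero -- the ZF result of
// the legacy vptest instruction.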
static SIMDINLINE bool SIMDCALL testz_ps(Float a, Float b) // return all_lanes_zero(a & b) ? 1 : 0 (float)
{
    return (0 == static_cast<int>(_mm512_test_epi32_mask(castps_si(a), castps_si(b))));
}

static SIMDINLINE bool SIMDCALL testz_si(Integer a, Integer b) // return all_lanes_zero(a & b) ? 1 : 0 (int)
{
    return (0 == static_cast<int>(_mm512_test_epi32_mask(a, b)));
}

//-----------------------------------------------------------------------
// Blend / shuffle / permute operations
//-----------------------------------------------------------------------
template <int ImmT>
static SIMDINLINE Float SIMDCALL blend_ps(Float a, Float b) // return ImmT ? b : a (float)
{
    return _mm512_mask_blend_ps(__mmask16(ImmT), a, b);
}

template <int ImmT>
static SIMDINLINE Integer SIMDCALL blend_epi32(Integer a, Integer b) // return ImmT ? b : a (int32)
{
    return _mm512_mask_blend_epi32(__mmask16(ImmT), a, b);
}

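// blendv selects with a vector mask instead of an immediate: movemask_ps
// collects the sign bit of each mask lane into a __mmask16, matching the
// MSB-based lane-select semantics of the legacy AVX blendv instructions.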
static SIMDINLINE Float SIMDCALL blendv_ps(Float a, Float b, Float mask) // return mask ? b : a (float)
{
    return _mm512_mask_blend_ps(__mmask16(movemask_ps(mask)), a, b);
}

static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer a, Integer b, Float mask) // return mask ? b : a (int)
{
    return castps_si(blendv_ps(castsi_ps(a), castsi_ps(b), mask));
}

static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer a, Integer b, Integer mask) // return mask ? b : a (int)
{
    return castps_si(blendv_ps(castsi_ps(a), castsi_ps(b), castsi_ps(mask)));
}

static SIMDINLINE Float SIMDCALL broadcast_ss(float const *p) // return *p (all elements in vector get same value)
{
    return _mm512_set1_ps(*p);
}

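// AVX512F provides no 256-bit float extract/insert (_mm512_extractf32x8_ps
// and _mm512_insertf32x8 require AVX512DQ, which Knights Landing lacks), so
// the float forms below are routed through the f64x4 variants; the casts do
// not change the bit pattern.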
template<int imm>
static SIMDINLINE SIMD256Impl::Float SIMDCALL extract_ps(Float a)
{
    return _mm256_castpd_ps(_mm512_extractf64x4_pd(_mm512_castps_pd(a), imm));
}

template<int imm>
static SIMDINLINE SIMD256Impl::Double SIMDCALL extract_pd(Double a)
{
    return _mm512_extractf64x4_pd(a, imm);
}

template<int imm>
static SIMDINLINE SIMD256Impl::Integer SIMDCALL extract_si(Integer a)
{
    return _mm512_extracti64x4_epi64(a, imm);
}

template<int imm>
static SIMDINLINE Float SIMDCALL insert_ps(Float a, SIMD256Impl::Float b)
{
    return _mm512_castpd_ps(_mm512_insertf64x4(_mm512_castps_pd(a), _mm256_castps_pd(b), imm));
}

template<int imm>
static SIMDINLINE Double SIMDCALL insert_pd(Double a, SIMD256Impl::Double b)
{
    return _mm512_insertf64x4(a, b, imm);
}

template<int imm>
static SIMDINLINE Integer SIMDCALL insert_si(Integer a, SIMD256Impl::Integer b)
{
    return _mm512_inserti64x4(a, b, imm);
}

// SIMD_IWRAPPER_2(packs_epi16);  // See documentation for _mm512_packs_epi16
// SIMD_IWRAPPER_2(packs_epi32);  // See documentation for _mm512_packs_epi32
// SIMD_IWRAPPER_2(packus_epi16); // See documentation for _mm512_packus_epi16
// SIMD_IWRAPPER_2(packus_epi32); // See documentation for _mm512_packus_epi32

static SIMDINLINE Integer SIMDCALL permute_epi32(Integer a, Integer swiz) // return a[swiz[i]] for each 32-bit lane i (int32)
{
    return _mm512_permutexvar_epi32(swiz, a);
}

static SIMDINLINE Float SIMDCALL permute_ps(Float a, Integer swiz) // return a[swiz[i]] for each 32-bit lane i (float)
{
    return _mm512_permutexvar_ps(swiz, a);
}

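// The legacy permute2f128 operations map onto the AVX-512 shuffle_*x4 /
// shuffle_*x2 forms, which pick whole 128-bit lanes from the (a, b) pair
// under control of the immediate.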
SIMD_WRAPPER_2I_(permute2f128_ps, shuffle_f32x4);
SIMD_DWRAPPER_2I_(permute2f128_pd, shuffle_f64x2);
SIMD_IWRAPPER_2I_(permute2f128_si, shuffle_i32x4);

SIMD_IWRAPPER_1I(shuffle_epi32);

//SIMD_IWRAPPER_2(shuffle_epi8);
SIMD_DWRAPPER_2I(shuffle_pd);
SIMD_WRAPPER_2I(shuffle_ps);

template<int ImmT>
static SIMDINLINE Integer SIMDCALL shuffle_epi64(Integer a, Integer b)
{
    return castpd_si(shuffle_pd<ImmT>(castsi_pd(a), castsi_pd(b)));
}

SIMD_IWRAPPER_2(unpackhi_epi16);

//SIMD_IFWRAPPER_2(unpackhi_epi32, _mm512_unpackhi_ps);
static SIMDINLINE Integer SIMDCALL unpackhi_epi32(Integer a, Integer b)
{
    return castps_si(_mm512_unpackhi_ps(castsi_ps(a), castsi_ps(b)));
}

SIMD_IWRAPPER_2(unpackhi_epi64);
//SIMD_IWRAPPER_2(unpackhi_epi8);
SIMD_DWRAPPER_2(unpackhi_pd);
SIMD_WRAPPER_2(unpackhi_ps);
//SIMD_IWRAPPER_2(unpacklo_epi16);
SIMD_IFWRAPPER_2(unpacklo_epi32, unpacklo_ps);
SIMD_IWRAPPER_2(unpacklo_epi64);
//SIMD_IWRAPPER_2(unpacklo_epi8);
SIMD_DWRAPPER_2(unpacklo_pd);
SIMD_WRAPPER_2(unpacklo_ps);

//-----------------------------------------------------------------------
// Load / store operations
//-----------------------------------------------------------------------
template<ScaleFactor ScaleT>
static SIMDINLINE Float SIMDCALL i32gather_ps(float const* p, Integer idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
{
    return _mm512_i32gather_ps(idx, p, static_cast<int>(ScaleT));
}

static SIMDINLINE Float SIMDCALL load1_ps(float const *p) // return *p (broadcast 1 value to all elements)
{
    return broadcast_ss(p);
}

static SIMDINLINE Float SIMDCALL load_ps(float const *p) // return *p (loads SIMD width elements from memory)
{
    return _mm512_load_ps(p);
}

static SIMDINLINE Integer SIMDCALL load_si(Integer const *p) // return *p
{
    return _mm512_load_si512(&p->v);
}

static SIMDINLINE Float SIMDCALL loadu_ps(float const *p) // return *p (same as load_ps but allows for unaligned mem)
{
    return _mm512_loadu_ps(p);
}

static SIMDINLINE Integer SIMDCALL loadu_si(Integer const *p) // return *p (same as load_si but allows for unaligned mem)
{
    return _mm512_loadu_si512(p);
}

// for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
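// Note: the implementation below enables a gather lane whenever its mask
// lane is nonzero (compare-not-equal against zero) rather than testing
// bit 31 alone, and it does not zero the mask afterward; for the all-ones /
// all-zeros lane masks produced by the cmp_* helpers above, the two
// formulations agree.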
template<ScaleFactor ScaleT>
static SIMDINLINE Float SIMDCALL mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask)
{
    __mmask16 k = _mm512_cmpneq_ps_mask(mask, setzero_ps());

    return _mm512_mask_i32gather_ps(old, k, idx, p, static_cast<int>(ScaleT));
}

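// maskstore_ps emulates the legacy vmaskmovps semantics: a lane is written
// only when the sign bit of its int32 mask lane is set, i.e. when the lane
// is negative as a signed integer.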
static SIMDINLINE void SIMDCALL maskstore_ps(float *p, Integer mask, Float src)
{
    Mask m = _mm512_cmplt_epi32_mask(mask, setzero_si());
    _mm512_mask_store_ps(p, m, src);
}

//static SIMDINLINE uint64_t SIMDCALL movemask_epi8(Integer a)
//{
//    __mmask64 m = _mm512_cmplt_epi8_mask(a, setzero_si());
//    return static_cast<uint64_t>(m);
//}

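// movemask packs one bit per lane -- the sign bit -- into an ordinary
// integer, mirroring the legacy vmovmskps / vmovmskpd instructions; here
// each lane is tested against a sign-bit-only constant.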
static SIMDINLINE uint32_t SIMDCALL movemask_pd(Double a)
{
    __mmask8 m = _mm512_test_epi64_mask(castpd_si(a), set1_epi64(0x8000000000000000LL));
    return static_cast<uint32_t>(m);
}
static SIMDINLINE uint32_t SIMDCALL movemask_ps(Float a)
{
    __mmask16 m = _mm512_test_epi32_mask(castps_si(a), set1_epi32(0x80000000));
    return static_cast<uint32_t>(m);
}

static SIMDINLINE Integer SIMDCALL set1_epi64(long long i) // return i (all elements are same value)
{
    return _mm512_set1_epi64(i);
}

static SIMDINLINE Integer SIMDCALL set1_epi32(int i) // return i (all elements are same value)
{
    return _mm512_set1_epi32(i);
}

static SIMDINLINE Integer SIMDCALL set1_epi8(char i) // return i (all elements are same value)
{
    return _mm512_set1_epi8(i);
}

static SIMDINLINE Float SIMDCALL set1_ps(float f) // return f (all elements are same value)
{
    return _mm512_set1_ps(f);
}

static SIMDINLINE Double SIMDCALL setzero_pd() // return 0 (double)
{
    return _mm512_setzero_pd();
}

static SIMDINLINE Float SIMDCALL setzero_ps() // return 0 (float)
{
    return _mm512_setzero_ps();
}

static SIMDINLINE Integer SIMDCALL setzero_si() // return 0 (integer)
{
    return _mm512_setzero_si512();
}

static SIMDINLINE void SIMDCALL store_ps(float *p, Float a) // *p = a (stores all elements contiguously in memory)
{
    _mm512_store_ps(p, a);
}

static SIMDINLINE void SIMDCALL store_si(Integer *p, Integer a) // *p = a
{
    _mm512_store_si512(&p->v, a);
}

static SIMDINLINE void SIMDCALL storeu_si(Integer *p, Integer a) // *p = a (same as store_si but allows for unaligned mem)
{
    _mm512_storeu_si512(&p->v, a);
}

static SIMDINLINE void SIMDCALL stream_ps(float *p, Float a) // *p = a (same as store_ps, but doesn't keep memory in cache)
{
    _mm512_stream_ps(p, a);
}

static SIMDINLINE Integer SIMDCALL set_epi32(
    int i15, int i14, int i13, int i12, int i11, int i10, int i9, int i8,
    int i7, int i6, int i5, int i4, int i3, int i2, int i1, int i0)
{
    return _mm512_set_epi32(
        i15, i14, i13, i12, i11, i10, i9, i8,
        i7, i6, i5, i4, i3, i2, i1, i0);
}

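// The 8-wide set_epi32 / set_ps overloads below zero the upper eight lanes
// so that code written against the SIMD256 targets sees identical lane
// values on this 16-wide target.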
static SIMDINLINE Integer SIMDCALL set_epi32(
    int i7, int i6, int i5, int i4, int i3, int i2, int i1, int i0)
{
    return set_epi32(
        0, 0, 0, 0, 0, 0, 0, 0,
        i7, i6, i5, i4, i3, i2, i1, i0);
}

static SIMDINLINE Float SIMDCALL set_ps(
    float i15, float i14, float i13, float i12, float i11, float i10, float i9, float i8,
    float i7, float i6, float i5, float i4, float i3, float i2, float i1, float i0)
{
    return _mm512_set_ps(
        i15, i14, i13, i12, i11, i10, i9, i8,
        i7, i6, i5, i4, i3, i2, i1, i0);
}

static SIMDINLINE Float SIMDCALL set_ps(
    float i7, float i6, float i5, float i4, float i3, float i2, float i1, float i0)
{
    return set_ps(
        0, 0, 0, 0, 0, 0, 0, 0,
        i7, i6, i5, i4, i3, i2, i1, i0);
}

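// vmask_ps builds a legacy vector mask straight from an integer lane
// bitmask: bit i of 'mask' decides whether lane i is all-ones or zero.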
static SIMDINLINE Float SIMDCALL vmask_ps(int32_t mask)
{
    return castsi_ps(_mm512_maskz_mov_epi32(__mmask16(mask), set1_epi32(-1)));
}

#undef SIMD_WRAPPER_1_
#undef SIMD_WRAPPER_1
#undef SIMD_WRAPPER_2_
#undef SIMD_WRAPPER_2
#undef SIMD_WRAPPERI_2_
#undef SIMD_DWRAPPER_2
#undef SIMD_WRAPPER_2I_
#undef SIMD_WRAPPER_2I
#undef SIMD_DWRAPPER_2I_
#undef SIMD_DWRAPPER_2I
#undef SIMD_WRAPPER_3
#undef SIMD_IWRAPPER_1
#undef SIMD_IWRAPPER_1_8
#undef SIMD_IWRAPPER_1_4
#undef SIMD_IWRAPPER_1I_
#undef SIMD_IWRAPPER_1I
#undef SIMD_IWRAPPER_2_
#undef SIMD_IWRAPPER_2
#undef SIMD_IWRAPPER_2_CMP
#undef SIMD_IFWRAPPER_2
#undef SIMD_IWRAPPER_2I_
#undef SIMD_IWRAPPER_2I