/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/

#ifndef __SWR_SIMD16INTRIN_H__
#define __SWR_SIMD16INTRIN_H__

#if ENABLE_AVX512_SIMD16

#if KNOB_SIMD16_WIDTH == 16

#if ENABLE_AVX512_EMULATION
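// Emulation mode: each 16-wide register is modeled as a pair of 8-wide AVX
// halves, and every 16-wide operation below is issued as two 8-wide
// operations on the lo and hi halves.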
struct simd16scalar
{
    __m256 lo;
    __m256 hi;
};
struct simd16scalard
{
    __m256d lo;
    __m256d hi;
};
struct simd16scalari
{
    __m256i lo;
    __m256i hi;
};
typedef uint16_t simd16mask;

#define _simd16_masklo(mask) ((mask) & 0xFF)
#define _simd16_maskhi(mask) (((mask) >> 8))
#define _simd16_setmask(hi, lo) (((hi) << 8) | (lo))

#else
typedef __m512 simd16scalar;
typedef __m512d simd16scalard;
typedef __m512i simd16scalari;
typedef __mmask16 simd16mask;
#endif//ENABLE_AVX512_EMULATION
#else
#error Unsupported vector width
#endif//KNOB_SIMD16_WIDTH == 16

OSALIGN(union, KNOB_SIMD16_BYTES) simd16vector
{
    simd16scalar v[4];
    struct
    {
        simd16scalar x, y, z, w;
    };

    simd16scalar& operator[] (const int i) { return v[i]; }
    const simd16scalar& operator[] (const int i) const { return v[i]; }
};
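// A simd16vector carries four 16-wide components; v[i] and the named members
// alias, so e.g. vec[0] and vec.x refer to the same simd16scalar.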

#if ENABLE_AVX512_EMULATION

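// The SIMD16_EMU_AVX512_<N> helpers stamp out a 16-wide wrapper over an
// 8-wide intrinsic taking N arguments, applying it independently to the lo
// and hi halves. For example, SIMD16_EMU_AVX512_2(simd16scalar,
// _simd16_add_ps, _mm256_add_ps) expands to a two-argument 16-wide
// _simd16_add_ps.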
#define SIMD16_EMU_AVX512_0(type, func, intrin) \
INLINE type func()\
{\
    type result;\
\
    result.lo = intrin();\
    result.hi = intrin();\
\
    return result;\
}

#define SIMD16_EMU_AVX512_1(type, func, intrin) \
INLINE type func(type a)\
{\
    type result;\
\
    result.lo = intrin(a.lo);\
    result.hi = intrin(a.hi);\
\
    return result;\
}

#define SIMD16_EMU_AVX512_2(type, func, intrin) \
INLINE type func(type a, type b)\
{\
    type result;\
\
    result.lo = intrin(a.lo, b.lo);\
    result.hi = intrin(a.hi, b.hi);\
\
    return result;\
}

#define SIMD16_EMU_AVX512_3(type, func, intrin) \
INLINE type func(type a, type b, type c)\
{\
    type result;\
\
    result.lo = intrin(a.lo, b.lo, c.lo);\
    result.hi = intrin(a.hi, b.hi, c.hi);\
\
    return result;\
}

SIMD16_EMU_AVX512_0(simd16scalar, _simd16_setzero_ps, _mm256_setzero_ps)
SIMD16_EMU_AVX512_0(simd16scalari, _simd16_setzero_si, _mm256_setzero_si256)

INLINE simd16scalar _simd16_set1_ps(float a)
{
    simd16scalar result;

    result.lo = _mm256_set1_ps(a);
    result.hi = _mm256_set1_ps(a);

    return result;
}

INLINE simd16scalari _simd16_set1_epi8(char a)
{
    simd16scalari result;

    result.lo = _mm256_set1_epi8(a);
    result.hi = _mm256_set1_epi8(a);

    return result;
}

INLINE simd16scalari _simd16_set1_epi32(int a)
{
    simd16scalari result;

    result.lo = _mm256_set1_epi32(a);
    result.hi = _mm256_set1_epi32(a);

    return result;
}

INLINE simd16scalar _simd16_set_ps(float e15, float e14, float e13, float e12, float e11, float e10, float e9, float e8, float e7, float e6, float e5, float e4, float e3, float e2, float e1, float e0)
{
    simd16scalar result;

    result.lo = _mm256_set_ps(e7, e6, e5, e4, e3, e2, e1, e0);
    result.hi = _mm256_set_ps(e15, e14, e13, e12, e11, e10, e9, e8);

    return result;
}

INLINE simd16scalari _simd16_set_epi32(int e15, int e14, int e13, int e12, int e11, int e10, int e9, int e8, int e7, int e6, int e5, int e4, int e3, int e2, int e1, int e0)
{
    simd16scalari result;

    result.lo = _mm256_set_epi32(e7, e6, e5, e4, e3, e2, e1, e0);
    result.hi = _mm256_set_epi32(e15, e14, e13, e12, e11, e10, e9, e8);

    return result;
}

INLINE simd16scalar _simd16_set_ps(float e7, float e6, float e5, float e4, float e3, float e2, float e1, float e0)
{
    simd16scalar result;

    result.lo = _mm256_set_ps(e7, e6, e5, e4, e3, e2, e1, e0);
    result.hi = _mm256_set_ps(e7, e6, e5, e4, e3, e2, e1, e0);

    return result;
}

INLINE simd16scalari _simd16_set_epi32(int e7, int e6, int e5, int e4, int e3, int e2, int e1, int e0)
{
    simd16scalari result;

    result.lo = _mm256_set_epi32(e7, e6, e5, e4, e3, e2, e1, e0);
    result.hi = _mm256_set_epi32(e7, e6, e5, e4, e3, e2, e1, e0);

    return result;
}

INLINE simd16scalar _simd16_load_ps(float const *m)
{
    simd16scalar result;

    float const *n = reinterpret_cast<float const *>(reinterpret_cast<uint8_t const *>(m) + sizeof(result.lo));

    result.lo = _mm256_load_ps(m);
    result.hi = _mm256_load_ps(n);

    return result;
}

INLINE simd16scalar _simd16_loadu_ps(float const *m)
{
    simd16scalar result;

    float const *n = reinterpret_cast<float const *>(reinterpret_cast<uint8_t const *>(m) + sizeof(result.lo));

    result.lo = _mm256_loadu_ps(m);
    result.hi = _mm256_loadu_ps(n);

    return result;
}

INLINE simd16scalar _simd16_load1_ps(float const *m)
{
    simd16scalar result;

    result.lo = _mm256_broadcast_ss(m);
    result.hi = _mm256_broadcast_ss(m);

    return result;
}

INLINE simd16scalari _simd16_load_si(simd16scalari const *m)
{
    simd16scalari result;

    result.lo = _mm256_load_si256(&m[0].lo);
    result.hi = _mm256_load_si256(&m[0].hi);

    return result;
}

INLINE simd16scalari _simd16_loadu_si(simd16scalari const *m)
{
    simd16scalari result;

    result.lo = _mm256_loadu_si256(&m[0].lo);
    result.hi = _mm256_loadu_si256(&m[0].hi);

    return result;
}

INLINE simd16scalar _simd16_broadcast_ss(float const *m)
{
    simd16scalar result;

    result.lo = _mm256_broadcast_ss(m);
    result.hi = _mm256_broadcast_ss(m);

    return result;
}

INLINE simd16scalar _simd16_broadcast_ps(__m128 const *m)
{
    simd16scalar result;

    result.lo = _mm256_broadcast_ps(m);
    result.hi = _mm256_broadcast_ps(m);

    return result;
}

INLINE void _simd16_store_ps(float *m, simd16scalar a)
{
    float *n = reinterpret_cast<float *>(reinterpret_cast<uint8_t *>(m) + sizeof(a.lo));

    _mm256_store_ps(m, a.lo);
    _mm256_store_ps(n, a.hi);
}

INLINE void _simd16_maskstore_ps(float *m, simd16scalari mask, simd16scalar a)
{
    float *n = reinterpret_cast<float *>(reinterpret_cast<uint8_t *>(m) + sizeof(a.lo));

    _mm256_maskstore_ps(m, mask.lo, a.lo);
    _mm256_maskstore_ps(n, mask.hi, a.hi);
}

INLINE void _simd16_store_si(simd16scalari *m, simd16scalari a)
{
    _mm256_store_si256(&m[0].lo, a.lo);
    _mm256_store_si256(&m[0].hi, a.hi);
}

INLINE simdscalar _simd16_extract_ps(simd16scalar a, int imm8)
{
    switch (imm8)
    {
    case 0:
        return a.lo;
    case 1:
        return a.hi;
    }
    return _simd_set1_ps(0.0f);
}

INLINE simdscalari _simd16_extract_si(simd16scalari a, int imm8)
{
    switch (imm8)
    {
    case 0:
        return a.lo;
    case 1:
        return a.hi;
    }
    return _simd_set1_epi32(0);
}

INLINE simd16scalar _simd16_insert_ps(simd16scalar a, simdscalar b, int imm8)
{
    switch (imm8)
    {
    case 0:
        a.lo = b;
        break;
    case 1:
        a.hi = b;
        break;
    }
    return a;
}

INLINE simd16scalari _simd16_insert_si(simd16scalari a, simdscalari b, int imm8)
{
    switch (imm8)
    {
    case 0:
        a.lo = b;
        break;
    case 1:
        a.hi = b;
        break;
    }
    return a;
}

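// _mm256_blend_ps requires a compile-time immediate, so the 16-bit blend mask
// is carried as a template parameter and split into one 8-bit immediate per
// half.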
template <simd16mask mask>
INLINE simd16scalar _simd16_blend_ps_temp(simd16scalar a, simd16scalar b)
{
    simd16scalar result;

    result.lo = _mm256_blend_ps(a.lo, b.lo, _simd16_masklo(mask));
    result.hi = _mm256_blend_ps(a.hi, b.hi, _simd16_maskhi(mask));

    return result;
}

#define _simd16_blend_ps(a, b, mask) _simd16_blend_ps_temp<mask>(a, b)

SIMD16_EMU_AVX512_3(simd16scalar, _simd16_blendv_ps, _mm256_blendv_ps)

INLINE simd16scalari _simd16_blendv_epi32(simd16scalari a, simd16scalari b, const simd16scalar mask)
{
    simd16scalari result;

    result.lo = _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(a.lo), _mm256_castsi256_ps(b.lo), mask.lo));
    result.hi = _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(a.hi), _mm256_castsi256_ps(b.hi), mask.hi));

    return result;
}

INLINE simd16scalari _simd16_blendv_epi32(simd16scalari a, simd16scalari b, const simd16scalari mask)
{
    simd16scalari result;

    result.lo = _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(a.lo), _mm256_castsi256_ps(b.lo), _mm256_castsi256_ps(mask.lo)));
    result.hi = _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(a.hi), _mm256_castsi256_ps(b.hi), _mm256_castsi256_ps(mask.hi)));

    return result;
}

SIMD16_EMU_AVX512_2(simd16scalar, _simd16_mul_ps, _mm256_mul_ps)
SIMD16_EMU_AVX512_2(simd16scalar, _simd16_add_ps, _mm256_add_ps)
SIMD16_EMU_AVX512_2(simd16scalar, _simd16_sub_ps, _mm256_sub_ps)
SIMD16_EMU_AVX512_1(simd16scalar, _simd16_rsqrt_ps, _mm256_rsqrt_ps)
SIMD16_EMU_AVX512_2(simd16scalar, _simd16_min_ps, _mm256_min_ps)
SIMD16_EMU_AVX512_2(simd16scalar, _simd16_max_ps, _mm256_max_ps)

INLINE simd16mask _simd16_movemask_ps(simd16scalar a)
{
    simd16mask mask;

    reinterpret_cast<uint8_t *>(&mask)[0] = _mm256_movemask_ps(a.lo);
    reinterpret_cast<uint8_t *>(&mask)[1] = _mm256_movemask_ps(a.hi);

    return mask;
}

INLINE simd16mask _simd16_movemask_pd(simd16scalard a)
{
    simd16mask mask;

    reinterpret_cast<uint8_t *>(&mask)[0] = _mm256_movemask_pd(a.lo);
    reinterpret_cast<uint8_t *>(&mask)[1] = _mm256_movemask_pd(a.hi);

    return mask;
}

INLINE simd16mask _simd16_movemask_epi8(simd16scalari a)
{
    simd16mask mask;

    reinterpret_cast<uint8_t *>(&mask)[0] = _mm256_movemask_epi8(a.lo);
    reinterpret_cast<uint8_t *>(&mask)[1] = _mm256_movemask_epi8(a.hi);

    return mask;
}

INLINE simd16scalari _simd16_cvtps_epi32(simd16scalar a)
{
    simd16scalari result;

    result.lo = _mm256_cvtps_epi32(a.lo);
    result.hi = _mm256_cvtps_epi32(a.hi);

    return result;
}

INLINE simd16scalari _simd16_cvttps_epi32(simd16scalar a)
{
    simd16scalari result;

    result.lo = _mm256_cvttps_epi32(a.lo);
    result.hi = _mm256_cvttps_epi32(a.hi);

    return result;
}

INLINE simd16scalar _simd16_cvtepi32_ps(simd16scalari a)
{
    simd16scalar result;

    result.lo = _mm256_cvtepi32_ps(a.lo);
    result.hi = _mm256_cvtepi32_ps(a.hi);

    return result;
}

template <int comp>
INLINE simd16scalar _simd16_cmp_ps(simd16scalar a, simd16scalar b)
{
    simd16scalar result;

    result.lo = _mm256_cmp_ps(a.lo, b.lo, comp);
    result.hi = _mm256_cmp_ps(a.hi, b.hi, comp);

    return result;
}

#define _simd16_cmplt_ps(a, b) _simd16_cmp_ps<_CMP_LT_OQ>(a, b)
#define _simd16_cmpgt_ps(a, b) _simd16_cmp_ps<_CMP_GT_OQ>(a, b)
#define _simd16_cmpneq_ps(a, b) _simd16_cmp_ps<_CMP_NEQ_OQ>(a, b)
#define _simd16_cmpeq_ps(a, b) _simd16_cmp_ps<_CMP_EQ_OQ>(a, b)
#define _simd16_cmpge_ps(a, b) _simd16_cmp_ps<_CMP_GE_OQ>(a, b)
#define _simd16_cmple_ps(a, b) _simd16_cmp_ps<_CMP_LE_OQ>(a, b)

SIMD16_EMU_AVX512_2(simd16scalar, _simd16_and_ps, _simd_and_ps)
SIMD16_EMU_AVX512_2(simd16scalar, _simd16_andnot_ps, _simd_andnot_ps)
SIMD16_EMU_AVX512_2(simd16scalar, _simd16_or_ps, _simd_or_ps)
SIMD16_EMU_AVX512_2(simd16scalar, _simd16_xor_ps, _simd_xor_ps)

SIMD16_EMU_AVX512_1(simd16scalar, _simd16_rcp_ps, _simd_rcp_ps)
SIMD16_EMU_AVX512_2(simd16scalar, _simd16_div_ps, _simd_div_ps)

INLINE simd16scalar _simd16_castsi_ps(simd16scalari a)
{
    return *reinterpret_cast<simd16scalar *>(&a);
}

INLINE simd16scalari _simd16_castps_si(simd16scalar a)
{
    return *reinterpret_cast<simd16scalari *>(&a);
}

INLINE simd16scalard _simd16_castsi_pd(simd16scalari a)
{
    return *reinterpret_cast<simd16scalard *>(&a);
}

INLINE simd16scalari _simd16_castpd_si(simd16scalard a)
{
    return *reinterpret_cast<simd16scalari *>(&a);
}

INLINE simd16scalar _simd16_castpd_ps(simd16scalard a)
{
    return *reinterpret_cast<simd16scalar *>(&a);
}

INLINE simd16scalard _simd16_castps_pd(simd16scalar a)
{
    return *reinterpret_cast<simd16scalard *>(&a);
}

template <int mode>
INLINE simd16scalar _simd16_round_ps_temp(simd16scalar a)
{
    simd16scalar result;

    result.lo = _mm256_round_ps(a.lo, mode);
    result.hi = _mm256_round_ps(a.hi, mode);

    return result;
}

#define _simd16_round_ps(a, mode) _simd16_round_ps_temp<mode>(a)

SIMD16_EMU_AVX512_2(simd16scalari, _simd16_mul_epi32, _simd_mul_epi32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_mullo_epi32, _simd_mullo_epi32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_sub_epi32, _simd_sub_epi32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_sub_epi64, _simd_sub_epi64)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_min_epi32, _simd_min_epi32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_max_epi32, _simd_max_epi32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_min_epu32, _simd_min_epu32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_max_epu32, _simd_max_epu32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_add_epi32, _simd_add_epi32)

SIMD16_EMU_AVX512_2(simd16scalari, _simd16_and_si, _simd_and_si)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_andnot_si, _simd_andnot_si)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_or_si, _simd_or_si)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_xor_si, _simd_xor_si)

SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpeq_epi32, _simd_cmpeq_epi32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpgt_epi32, _simd_cmpgt_epi32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmplt_epi32, _simd_cmplt_epi32)

INLINE int _simd16_testz_ps(simd16scalar a, simd16scalar b)
{
    int lo = _mm256_testz_ps(a.lo, b.lo);
    int hi = _mm256_testz_ps(a.hi, b.hi);

    return lo & hi;
}
SIMD16_EMU_AVX512_2(simd16scalar, _simd16_unpacklo_ps, _simd_unpacklo_ps)
SIMD16_EMU_AVX512_2(simd16scalar, _simd16_unpackhi_ps, _simd_unpackhi_ps)
SIMD16_EMU_AVX512_2(simd16scalard, _simd16_unpacklo_pd, _simd_unpacklo_pd)
SIMD16_EMU_AVX512_2(simd16scalard, _simd16_unpackhi_pd, _simd_unpackhi_pd)

SIMD16_EMU_AVX512_2(simd16scalari, _simd16_unpacklo_epi8, _simd_unpacklo_epi8)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_unpackhi_epi8, _simd_unpackhi_epi8)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_unpacklo_epi16, _simd_unpacklo_epi16)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_unpackhi_epi16, _simd_unpackhi_epi16)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_unpacklo_epi32, _simd_unpacklo_epi32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_unpackhi_epi32, _simd_unpackhi_epi32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_unpacklo_epi64, _simd_unpacklo_epi64)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_unpackhi_epi64, _simd_unpackhi_epi64)

template <int imm8>
INLINE simd16scalari _simd16_slli_epi32_temp(simd16scalari a)
{
    simd16scalari result;

    result.lo = _simd_slli_epi32(a.lo, imm8);
    result.hi = _simd_slli_epi32(a.hi, imm8);

    return result;
}

#define _simd16_slli_epi32(a, imm8) _simd16_slli_epi32_temp<imm8>(a)

template <int imm8>
INLINE simd16scalari _simd16_srai_epi32_temp(simd16scalari a)
{
    simd16scalari result;

    result.lo = _simd_srai_epi32(a.lo, imm8);
    result.hi = _simd_srai_epi32(a.hi, imm8);

    return result;
}

#define _simd16_srai_epi32(a, imm8) _simd16_srai_epi32_temp<imm8>(a)

template <int imm8>
INLINE simd16scalari _simd16_srli_epi32_temp(simd16scalari a)
{
    simd16scalari result;

    result.lo = _simd_srli_epi32(a.lo, imm8);
    result.hi = _simd_srli_epi32(a.hi, imm8);

    return result;
}

#define _simd16_srli_epi32(a, imm8) _simd16_srli_epi32_temp<imm8>(a)

SIMD16_EMU_AVX512_3(simd16scalar, _simd16_fmadd_ps, _simd_fmadd_ps)
SIMD16_EMU_AVX512_3(simd16scalar, _simd16_fmsub_ps, _simd_fmsub_ps)

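// Gathers likewise take the scale as a template parameter, since the
// underlying gather intrinsics require a compile-time constant scale.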
template <int scale>
INLINE simd16scalar _simd16_i32gather_ps_temp(const float *m, simd16scalari index)
{
    simd16scalar result;

    result.lo = _simd_i32gather_ps(m, index.lo, scale);
    result.hi = _simd_i32gather_ps(m, index.hi, scale);

    return result;
}

#define _simd16_i32gather_ps(m, index, scale) _simd16_i32gather_ps_temp<scale>(m, index)

template <int scale>
INLINE simd16scalar _simd16_mask_i32gather_ps_temp(simd16scalar a, const float *m, simd16scalari index, simd16scalari mask)
{
    simd16scalar result;

    result.lo = _simd_mask_i32gather_ps(a.lo, m, index.lo, _simd_castsi_ps(mask.lo), scale);
    result.hi = _simd_mask_i32gather_ps(a.hi, m, index.hi, _simd_castsi_ps(mask.hi), scale);

    return result;
}

#define _simd16_mask_i32gather_ps(a, m, index, mask, scale) _simd16_mask_i32gather_ps_temp<scale>(a, m, index, mask)

SIMD16_EMU_AVX512_2(simd16scalari, _simd16_shuffle_epi8, _simd_shuffle_epi8)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_adds_epu8, _simd_adds_epu8)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_subs_epu8, _simd_subs_epu8)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_add_epi8, _simd_add_epi8)
SIMD16_EMU_AVX512_1(simd16scalari, _simd16_abs_epi32, _simd_abs_epi32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpeq_epi64, _simd_cmpeq_epi64)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpgt_epi64, _simd_cmpgt_epi64)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpeq_epi16, _simd_cmpeq_epi16)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpgt_epi16, _simd_cmpgt_epi16)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpeq_epi8, _simd_cmpeq_epi8)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpgt_epi8, _simd_cmpgt_epi8)

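// Emulated 16-lane permute: permute each source half by the low three index
// bits, then use the remaining index bit (lane index > 7) to select between
// the two partial results.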
INLINE simd16scalar _simd16_permute_ps(simd16scalar a, simd16scalari i)
{
    simd16scalar result;

    const simdscalari mask = _simd_set1_epi32(7);

    simdscalar lolo = _simd_permute_ps(a.lo, _simd_and_si(i.lo, mask));
    simdscalar lohi = _simd_permute_ps(a.hi, _simd_and_si(i.lo, mask));

    simdscalar hilo = _simd_permute_ps(a.lo, _simd_and_si(i.hi, mask));
    simdscalar hihi = _simd_permute_ps(a.hi, _simd_and_si(i.hi, mask));

    result.lo = _simd_blendv_ps(lolo, lohi, _simd_castsi_ps(_simd_cmpgt_epi32(i.lo, mask)));
    result.hi = _simd_blendv_ps(hilo, hihi, _simd_castsi_ps(_simd_cmpgt_epi32(i.hi, mask)));

    return result;
}

INLINE simd16scalari _simd16_permute_epi32(simd16scalari a, simd16scalari i)
{
    return _simd16_castps_si(_simd16_permute_ps(_simd16_castsi_ps(a), i));
}

SIMD16_EMU_AVX512_2(simd16scalari, _simd16_srlv_epi32, _simd_srlv_epi32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_sllv_epi32, _simd_sllv_epi32)

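// Emulated 128-bit lane shuffles: the four quarter selectors packed in imm8
// are remapped into one 8-wide permute2f128-style immediate per result half.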
template <int imm8>
INLINE simd16scalar _simd16_permute2f128_ps_temp(simd16scalar a, simd16scalar b)
{
    simd16scalar result;

    result.lo = _simd_permute2f128_ps(a.lo, a.hi, ((imm8 & 0x03) << 0) | ((imm8 & 0x0C) << 2));
    result.hi = _simd_permute2f128_ps(b.lo, b.hi, ((imm8 & 0x30) >> 4) | ((imm8 & 0xC0) >> 2));

    return result;
}

#define _simd16_permute2f128_ps(a, b, imm8) _simd16_permute2f128_ps_temp<imm8>(a, b)

template <int imm8>
INLINE simd16scalard _simd16_permute2f128_pd_temp(simd16scalard a, simd16scalard b)
{
    simd16scalard result;

    result.lo = _simd_permute2f128_pd(a.lo, a.hi, ((imm8 & 0x03) << 0) | ((imm8 & 0x0C) << 2));
    result.hi = _simd_permute2f128_pd(b.lo, b.hi, ((imm8 & 0x30) >> 4) | ((imm8 & 0xC0) >> 2));

    return result;
}

#define _simd16_permute2f128_pd(a, b, imm8) _simd16_permute2f128_pd_temp<imm8>(a, b)

template <int imm8>
INLINE simd16scalari _simd16_permute2f128_si_temp(simd16scalari a, simd16scalari b)
{
    simd16scalari result;

    result.lo = _simd_permute2f128_si(a.lo, a.hi, ((imm8 & 0x03) << 0) | ((imm8 & 0x0C) << 2));
    result.hi = _simd_permute2f128_si(b.lo, b.hi, ((imm8 & 0x30) >> 4) | ((imm8 & 0xC0) >> 2));

    return result;
}

#define _simd16_permute2f128_si(a, b, imm8) _simd16_permute2f128_si_temp<imm8>(a, b)

template <int imm8>
INLINE simd16scalar _simd16_shuffle_ps_temp(simd16scalar a, simd16scalar b)
{
    simd16scalar result;

    result.lo = _simd_shuffle_ps(a.lo, b.lo, imm8);
    result.hi = _simd_shuffle_ps(a.hi, b.hi, imm8);

    return result;
}

#define _simd16_shuffle_ps(a, b, imm8) _simd16_shuffle_ps_temp<imm8>(a, b)

template <int imm8>
INLINE simd16scalard _simd16_shuffle_pd_temp(simd16scalard a, simd16scalard b)
{
    simd16scalard result;

    result.lo = _simd_shuffle_pd(a.lo, b.lo, (imm8 & 15));
    result.hi = _simd_shuffle_pd(a.hi, b.hi, (imm8 >> 4));

    return result;
}

#define _simd16_shuffle_pd(a, b, imm8) _simd16_shuffle_pd_temp<imm8>(a, b)

template <int imm8>
INLINE simd16scalari _simd16_shuffle_epi32_temp(simd16scalari a, simd16scalari b)
{
    return _simd16_castps_si(_simd16_shuffle_ps(_simd16_castsi_ps(a), _simd16_castsi_ps(b), imm8));
}

#define _simd16_shuffle_epi32(a, b, imm8) _simd16_shuffle_epi32_temp<imm8>(a, b)

template <int imm8>
INLINE simd16scalari _simd16_shuffle_epi64_temp(simd16scalari a, simd16scalari b)
{
    return _simd16_castpd_si(_simd16_shuffle_pd(_simd16_castsi_pd(a), _simd16_castsi_pd(b), imm8));
}

#define _simd16_shuffle_epi64(a, b, imm8) _simd16_shuffle_epi64_temp<imm8>(a, b)

INLINE simd16scalari _simd16_cvtepu8_epi16(simdscalari a)
{
    simd16scalari result;

    result.lo = _simd_cvtepu8_epi16(_mm256_extractf128_si256(a, 0));
    result.hi = _simd_cvtepu8_epi16(_mm256_extractf128_si256(a, 1));

    return result;
}

INLINE simd16scalari _simd16_cvtepu8_epi32(__m128i a)
{
    simd16scalari result;

    result.lo = _simd_cvtepu8_epi32(a);
    result.hi = _simd_cvtepu8_epi32(_mm_srli_si128(a, 8));

    return result;
}

INLINE simd16scalari _simd16_cvtepu16_epi32(simdscalari a)
{
    simd16scalari result;

    result.lo = _simd_cvtepu16_epi32(_mm256_extractf128_si256(a, 0));
    result.hi = _simd_cvtepu16_epi32(_mm256_extractf128_si256(a, 1));

    return result;
}

SIMD16_EMU_AVX512_2(simd16scalari, _simd16_packus_epi16, _simd_packus_epi16)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_packs_epi16, _simd_packs_epi16)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_packus_epi32, _simd_packus_epi32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_packs_epi32, _simd_packs_epi32)

INLINE simd16mask _simd16_int2mask(int mask)
{
    return mask;
}

INLINE int _simd16_mask2int(simd16mask mask)
{
    return mask;
}

INLINE simd16mask _simd16_cmplt_ps_mask(simd16scalar a, simd16scalar b)
{
    return _simd16_movemask_ps(_simd16_cmplt_ps(a, b));
}

// convert bitmask to vector mask
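// (lane i of the result is all ones iff bit i of mask is set, e.g. vMask16(1)
// enables only lane 0)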
INLINE simd16scalar vMask16(int32_t mask)
{
    simd16scalari temp = _simd16_set1_epi32(mask);

    simd16scalari bits = _simd16_set_epi32(0x8000, 0x4000, 0x2000, 0x1000, 0x0800, 0x0400, 0x0200, 0x0100, 0x0080, 0x0040, 0x0020, 0x0010, 0x0008, 0x0004, 0x0002, 0x0001);

    simd16scalari result = _simd16_cmplt_epi32(_simd16_setzero_si(), _simd16_and_si(temp, bits));

    return _simd16_castsi_ps(result);
}

#else

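// Native AVX512 path: compare masks live in k-registers (__mmask16); this
// helper converts a conventional all-ones/all-zeros vector mask into one.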
INLINE simd16mask _simd16_scalari2mask(simd16scalari mask)
{
    return _mm512_cmpneq_epu32_mask(mask, _mm512_setzero_epi32());
}

#if 0
INLINE simd16mask _simd16_scalard2mask(simd16scalard mask)
{
    return _mm512_cmpneq_epu64_mask(mask, _mm512_setzero_epi64());
}
#endif

#define _simd16_setzero_ps _mm512_setzero_ps
#define _simd16_setzero_si _mm512_setzero_si512
#define _simd16_set1_ps _mm512_set1_ps
#define _simd16_set1_epi8 _mm512_set1_epi8
#define _simd16_set1_epi32 _mm512_set1_epi32

INLINE simd16scalar _simd16_set_ps(float e15, float e14, float e13, float e12, float e11, float e10, float e9, float e8, float e7, float e6, float e5, float e4, float e3, float e2, float e1, float e0)
{
    return _mm512_set_ps(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0);
}

INLINE simd16scalari _simd16_set_epi32(int e15, int e14, int e13, int e12, int e11, int e10, int e9, int e8, int e7, int e6, int e5, int e4, int e3, int e2, int e1, int e0)
{
    return _mm512_set_epi32(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0);
}

INLINE simd16scalar _simd16_set_ps(float e7, float e6, float e5, float e4, float e3, float e2, float e1, float e0)
{
    return _mm512_set_ps(e7, e6, e5, e4, e3, e2, e1, e0, e7, e6, e5, e4, e3, e2, e1, e0);
}

INLINE simd16scalari _simd16_set_epi32(int e7, int e6, int e5, int e4, int e3, int e2, int e1, int e0)
{
    return _mm512_set_epi32(e7, e6, e5, e4, e3, e2, e1, e0, e7, e6, e5, e4, e3, e2, e1, e0);
}

#define _simd16_load_ps _mm512_load_ps
#define _simd16_loadu_ps _mm512_loadu_ps
#if 1
#define _simd16_load1_ps _simd16_broadcast_ss
#endif
#define _simd16_load_si _mm512_load_si512
#define _simd16_loadu_si _mm512_loadu_si512
#define _simd16_broadcast_ss(m) _mm512_extload_ps(m, _MM_UPCONV_PS_NONE, _MM_BROADCAST_1X16, 0)
#define _simd16_broadcast_ps(m) _mm512_extload_ps(m, _MM_UPCONV_PS_NONE, _MM_BROADCAST_4X16, 0)
#define _simd16_store_ps _mm512_store_ps
#define _simd16_store_si _mm512_store_si512
#define _simd16_extract_ps(a, imm8) _mm256_castsi256_ps(_mm512_extracti64x4_epi64(_mm512_castps_si512(a), imm8))
#define _simd16_extract_si _mm512_extracti64x4_epi64
#define _simd16_insert_ps(a, b, imm8) _mm512_castsi512_ps(_mm512_inserti64x4(_mm512_castps_si512(a), _mm256_castps_si256(b), imm8))
#define _simd16_insert_si _mm512_inserti64x4

INLINE void _simd16_maskstore_ps(float *m, simd16scalari mask, simd16scalar a)
{
    simd16mask k = _simd16_scalari2mask(mask);

    _mm512_mask_store_ps(m, k, a);
}

#define _simd16_blend_ps(a, b, mask) _mm512_mask_blend_ps(mask, a, b)

INLINE simd16scalar _simd16_blendv_ps(simd16scalar a, simd16scalar b, const simd16scalar mask)
{
    simd16mask k = _simd16_scalari2mask(_mm512_castps_si512(mask));

    return _mm512_mask_blend_ps(k, a, b);
}

INLINE simd16scalari _simd16_blendv_epi32(simd16scalari a, simd16scalari b, const simd16scalar mask)
{
    simd16mask k = _simd16_scalari2mask(_mm512_castps_si512(mask));

    return _mm512_mask_blend_epi32(k, a, b);
}

INLINE simd16scalari _simd16_blendv_epi32(simd16scalari a, simd16scalari b, const simd16scalari mask)
{
    simd16mask k = _simd16_scalari2mask(mask);

    return _mm512_mask_blend_epi32(k, a, b);
}

#define _simd16_mul_ps _mm512_mul_ps
#define _simd16_add_ps _mm512_add_ps
#define _simd16_sub_ps _mm512_sub_ps
#define _simd16_rsqrt_ps _mm512_rsqrt14_ps
#define _simd16_min_ps _mm512_min_ps
#define _simd16_max_ps _mm512_max_ps

INLINE simd16mask _simd16_movemask_ps(simd16scalar a)
{
    return _simd16_scalari2mask(_mm512_castps_si512(a));
}

#if 0
INLINE simd16mask _simd16_movemask_pd(simd16scalard a)
{
    return _simd16_scalard2mask(_mm512_castpd_si512(a));
}
#endif

#if 0
INLINE int _simd16_movemask_epi8(simd16scalari a)
{
    return _simd16_scalari2mask(a);
}
#endif

#define _simd16_cvtps_epi32 _mm512_cvtps_epi32
#define _simd16_cvttps_epi32 _mm512_cvttps_epi32
#define _simd16_cvtepi32_ps _mm512_cvtepi32_ps

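// AVX512 compares return a k-register rather than a vector, so expand the
// mask back to an all-ones/all-zeros vector to preserve the AVX2-style
// interface.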
template <int comp>
INLINE simd16scalar _simd16_cmp_ps_temp(simd16scalar a, simd16scalar b)
{
    simd16mask k = _mm512_cmp_ps_mask(a, b, comp);

    return _mm512_castsi512_ps(_mm512_mask_blend_epi32(k, _mm512_setzero_epi32(), _mm512_set1_epi32(0xFFFFFFFF)));
}

#define _simd16_cmp_ps(a, b, comp) _simd16_cmp_ps_temp<comp>(a, b)

#define _simd16_cmplt_ps(a, b) _simd16_cmp_ps<_CMP_LT_OQ>(a, b)
#define _simd16_cmpgt_ps(a, b) _simd16_cmp_ps<_CMP_GT_OQ>(a, b)
#define _simd16_cmpneq_ps(a, b) _simd16_cmp_ps<_CMP_NEQ_OQ>(a, b)
#define _simd16_cmpeq_ps(a, b) _simd16_cmp_ps<_CMP_EQ_OQ>(a, b)
#define _simd16_cmpge_ps(a, b) _simd16_cmp_ps<_CMP_GE_OQ>(a, b)
#define _simd16_cmple_ps(a, b) _simd16_cmp_ps<_CMP_LE_OQ>(a, b)

#define _simd16_castsi_ps _mm512_castsi512_ps
#define _simd16_castps_si _mm512_castps_si512
#define _simd16_castsi_pd _mm512_castsi512_pd
#define _simd16_castpd_si _mm512_castpd_si512
#define _simd16_castpd_ps _mm512_castpd_ps
#define _simd16_castps_pd _mm512_castps_pd

#define _simd16_and_ps _mm512_and_ps
#define _simd16_andnot_ps _mm512_andnot_ps
#define _simd16_or_ps _mm512_or_ps
#define _simd16_xor_ps _mm512_xor_ps

template <int mode>
INLINE simd16scalar _simd16_round_ps_temp(simd16scalar a)
{
    return _mm512_roundscale_ps(a, mode);
}

#define _simd16_round_ps(a, mode) _simd16_round_ps_temp<mode>(a)

#define _simd16_mul_epi32 _mm512_mul_epi32
#define _simd16_mullo_epi32 _mm512_mullo_epi32
#define _simd16_sub_epi32 _mm512_sub_epi32
#define _simd16_sub_epi64 _mm512_sub_epi64
#define _simd16_min_epi32 _mm512_min_epi32
#define _simd16_max_epi32 _mm512_max_epi32
#define _simd16_min_epu32 _mm512_min_epu32
#define _simd16_max_epu32 _mm512_max_epu32
#define _simd16_add_epi32 _mm512_add_epi32

#define _simd16_and_si _mm512_and_si512
#define _simd16_andnot_si _mm512_andnot_si512
#define _simd16_or_si _mm512_or_si512
#define _simd16_xor_si _mm512_xor_si512

INLINE simd16scalari _simd16_cmpeq_epi32(simd16scalari a, simd16scalari b)
{
    simd16mask k = _mm512_cmpeq_epi32_mask(a, b);

    return _mm512_mask_blend_epi32(k, _mm512_setzero_epi32(), _mm512_set1_epi32(0xFFFFFFFF));
}

INLINE simd16scalari _simd16_cmpgt_epi32(simd16scalari a, simd16scalari b)
{
    simd16mask k = _mm512_cmpgt_epi32_mask(a, b);

    return _mm512_mask_blend_epi32(k, _mm512_setzero_epi32(), _mm512_set1_epi32(0xFFFFFFFF));
}

INLINE simd16scalari _simd16_cmplt_epi32(simd16scalari a, simd16scalari b)
{
    simd16mask k = _mm512_cmplt_epi32_mask(a, b);

    return _mm512_mask_blend_epi32(k, _mm512_setzero_epi32(), _mm512_set1_epi32(0xFFFFFFFF));
}

#if 0
INLINE int _simd16_testz_ps(simd16scalar a, simd16scalar b)
{
    int lo = _mm256_testz_ps(a.lo, b.lo);
    int hi = _mm256_testz_ps(a.hi, b.hi);

    return lo & hi;
}

#endif

#define _simd16_unpacklo_ps _mm512_unpacklo_ps
#define _simd16_unpackhi_ps _mm512_unpackhi_ps
#define _simd16_unpacklo_pd _mm512_unpacklo_pd
#define _simd16_unpackhi_pd _mm512_unpackhi_pd
#define _simd16_unpacklo_epi8 _mm512_unpacklo_epi8
#define _simd16_unpackhi_epi8 _mm512_unpackhi_epi8
#define _simd16_unpacklo_epi16 _mm512_unpacklo_epi16
#define _simd16_unpackhi_epi16 _mm512_unpackhi_epi16
#define _simd16_unpacklo_epi32 _mm512_unpacklo_epi32
#define _simd16_unpackhi_epi32 _mm512_unpackhi_epi32
#define _simd16_unpacklo_epi64 _mm512_unpacklo_epi64
#define _simd16_unpackhi_epi64 _mm512_unpackhi_epi64
#define _simd16_slli_epi32 _mm512_slli_epi32
#define _simd16_srli_epi32 _mm512_srli_epi32
#define _simd16_srai_epi32 _mm512_srai_epi32
#define _simd16_fmadd_ps _mm512_fmadd_ps
#define _simd16_fmsub_ps _mm512_fmsub_ps
#define _simd16_adds_epu8 _mm512_adds_epu8
#define _simd16_subs_epu8 _mm512_subs_epu8
#define _simd16_add_epi8 _mm512_add_epi8
#define _simd16_shuffle_epi8 _mm512_shuffle_epi8

#define _simd16_i32gather_ps(m, index, scale) _mm512_i32gather_ps(index, m, scale)

template <int scale>
INLINE simd16scalar _simd16_mask_i32gather_ps_temp(simd16scalar a, const float *m, simd16scalari index, simd16scalari mask)
{
    __mmask16 k = _mm512_cmpneq_epi32_mask(mask, _mm512_setzero_si512());

    return _mm512_mask_i32gather_ps(a, k, index, m, scale);
}

#define _simd16_mask_i32gather_ps(a, m, index, mask, scale) _simd16_mask_i32gather_ps_temp<scale>(a, m, index, mask)

#define _simd16_abs_epi32 _mm512_abs_epi32

INLINE simd16scalari _simd16_cmpeq_epi64(simd16scalari a, simd16scalari b)
{
    __mmask8 k = _mm512_cmpeq_epi64_mask(a, b);

    return _mm512_mask_blend_epi64(k, _mm512_setzero_si512(), _mm512_set1_epi32(0xFFFFFFFF));
}

INLINE simd16scalari _simd16_cmpgt_epi64(simd16scalari a, simd16scalari b)
{
    __mmask8 k = _mm512_cmpgt_epi64_mask(a, b);

    return _mm512_mask_blend_epi64(k, _mm512_setzero_si512(), _mm512_set1_epi32(0xFFFFFFFF));
}

INLINE simd16scalari _simd16_cmpeq_epi16(simd16scalari a, simd16scalari b)
{
    __mmask32 k = _mm512_cmpeq_epi16_mask(a, b);

    return _mm512_mask_blend_epi16(k, _mm512_setzero_si512(), _mm512_set1_epi32(0xFFFFFFFF));
}

INLINE simd16scalari _simd16_cmpgt_epi16(simd16scalari a, simd16scalari b)
{
    __mmask32 k = _mm512_cmpgt_epi16_mask(a, b);

    return _mm512_mask_blend_epi16(k, _mm512_setzero_si512(), _mm512_set1_epi32(0xFFFFFFFF));
}

INLINE simd16scalari _simd16_cmpeq_epi8(simd16scalari a, simd16scalari b)
{
    __mmask64 k = _mm512_cmpeq_epi8_mask(a, b);

    return _mm512_mask_blend_epi8(k, _mm512_setzero_si512(), _mm512_set1_epi32(0xFFFFFFFF));
}

INLINE simd16scalari _simd16_cmpgt_epi8(simd16scalari a, simd16scalari b)
{
    __mmask64 k = _mm512_cmpgt_epi8_mask(a, b);

    return _mm512_mask_blend_epi8(k, _mm512_setzero_si512(), _mm512_set1_epi32(0xFFFFFFFF));
}

#define _simd16_permute_ps(a, i) _mm512_permutexvar_ps(i, a)
#define _simd16_permute_epi32(a, i) _mm512_permutexvar_epi32(i, a)
#define _simd16_sllv_epi32 _mm512_sllv_epi32
#define _simd16_srlv_epi32 _mm512_srlv_epi32
#define _simd16_permute2f128_ps _mm512_shuffle_f32x4
#define _simd16_permute2f128_pd _mm512_shuffle_f64x2
#define _simd16_permute2f128_si _mm512_shuffle_i32x4
#define _simd16_shuffle_ps _mm512_shuffle_ps
#define _simd16_shuffle_pd _mm512_shuffle_pd
#define _simd16_cvtepu8_epi16 _mm512_cvtepu8_epi16
#define _simd16_cvtepu8_epi32 _mm512_cvtepu8_epi32
#define _simd16_cvtepu16_epi32 _mm512_cvtepu16_epi32
#define _simd16_packus_epi16 _mm512_packus_epi16
#define _simd16_packs_epi16 _mm512_packs_epi16
#define _simd16_packus_epi32 _mm512_packus_epi32
#define _simd16_packs_epi32 _mm512_packs_epi32

template <int imm8>
INLINE simd16scalari _simd16_shuffle_epi32_temp(simd16scalari a, simd16scalari b)
{
    return _simd16_castps_si(_simd16_shuffle_ps(_simd16_castsi_ps(a), _simd16_castsi_ps(b), imm8));
}

#define _simd16_shuffle_epi32(a, b, imm8) _simd16_shuffle_epi32_temp<imm8>(a, b)

template <int imm8>
INLINE simd16scalari _simd16_shuffle_epi64_temp(simd16scalari a, simd16scalari b)
{
    return _simd16_castpd_si(_simd16_shuffle_pd(_simd16_castsi_pd(a), _simd16_castsi_pd(b), imm8));
}

#define _simd16_shuffle_epi64(a, b, imm8) _simd16_shuffle_epi64_temp<imm8>(a, b)

INLINE simd16mask _simd16_int2mask(int mask)
{
    return _mm512_int2mask(mask);
}

INLINE int _simd16_mask2int(simd16mask mask)
{
    return _mm512_mask2int(mask);
}

INLINE simd16mask _simd16_cmplt_ps_mask(simd16scalar a, simd16scalar b)
{
    return _mm512_cmplt_ps_mask(a, b);
}

// convert bitmask to vector mask
INLINE simd16scalar vMask16(int32_t mask)
{
    simd16scalari temp = _simd16_set1_epi32(mask);

    simd16scalari bits = _simd16_set_epi32(0x8000, 0x4000, 0x2000, 0x1000, 0x0800, 0x0400, 0x0200, 0x0100, 0x0080, 0x0040, 0x0020, 0x0010, 0x0008, 0x0004, 0x0002, 0x0001);

    simd16scalari result = _simd16_cmplt_epi32(_simd16_setzero_si(), _simd16_and_si(temp, bits));

    return _simd16_castsi_ps(result);
}

#endif//ENABLE_AVX512_EMULATION

#endif//ENABLE_AVX512_SIMD16

#endif//__SWR_SIMD16INTRIN_H__