etnaviv: increment the resource seqno in resource_changed
[mesa.git] / src / gallium / drivers / swr / rasterizer / common / simd16intrin.h
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 ****************************************************************************/
23
24 #ifndef __SWR_SIMD16INTRIN_H__
25 #define __SWR_SIMD16INTRIN_H__
26
27 #if ENABLE_AVX512_SIMD16
28
29 #if KNOB_SIMD16_WIDTH == 16
30
31 #if ENABLE_AVX512_EMULATION
// 16-wide SIMD register types. In the emulated path each 512-bit value is a
// pair of 256-bit AVX halves: lo holds elements 0-7, hi holds elements 8-15.
struct simd16scalar
{
    __m256 lo;
    __m256 hi;
};
struct simd16scalard
{
    __m256d lo;
    __m256d hi;
};
struct simd16scalari
{
    __m256i lo;
    __m256i hi;
};
// One bit per 32-bit lane, 16 lanes total (lo half = bits 0-7).
typedef uint16_t simd16mask;

#else
typedef __m512 simd16scalar;
typedef __m512d simd16scalard;
typedef __m512i simd16scalari;
typedef __mmask16 simd16mask;
#endif//ENABLE_AVX512_EMULATION
#else
#error Unsupported vector width
#endif//KNOB_SIMD16_WIDTH == 16

// Split/combine helpers for the 16-bit lane mask: low byte maps to the lo
// half of the register, high byte to the hi half.
#define _simd16_masklo(mask) ((mask) & 0xFF)
#define _simd16_maskhi(mask) (((mask) >> 8) & 0xFF)
#define _simd16_setmask(hi, lo) (((hi) << 8) | (lo))

// Use __vectorcall on MSVC so wide vector arguments are passed in registers.
#if defined(_WIN32)
#define SIMDAPI __vectorcall
#else
#define SIMDAPI
#endif
68
// Four-component (x, y, z, w) vector of 16-wide registers, aligned to the
// full SIMD16 register size. The anonymous struct aliases v[0..3] so both
// named and indexed access are available.
OSALIGN(union, KNOB_SIMD16_BYTES) simd16vector
{
    simd16scalar v[4];
    struct
    {
        simd16scalar x, y, z, w;
    };

    simd16scalar& operator[] (const int i) { return v[i]; }
    const simd16scalar& operator[] (const int i) const { return v[i]; }
};
80
81 #if ENABLE_AVX512_EMULATION
82
// Generator macros that emulate one 16-wide operation with two 8-wide
// operations, applying the wrapped intrinsic independently to each 256-bit
// half. The numeric suffix (_0.._3) is the operand count of the intrinsic.
#define SIMD16_EMU_AVX512_0(type, func, intrin) \
INLINE type SIMDAPI func()\
{\
    type result;\
\
    result.lo = intrin();\
    result.hi = intrin();\
\
    return result;\
}

#define SIMD16_EMU_AVX512_1(type, func, intrin) \
INLINE type SIMDAPI func(type a)\
{\
    type result;\
\
    result.lo = intrin(a.lo);\
    result.hi = intrin(a.hi);\
\
    return result;\
}

#define SIMD16_EMU_AVX512_2(type, func, intrin) \
INLINE type SIMDAPI func(type a, type b)\
{\
    type result;\
\
    result.lo = intrin(a.lo, b.lo);\
    result.hi = intrin(a.hi, b.hi);\
\
    return result;\
}

#define SIMD16_EMU_AVX512_3(type, func, intrin) \
INLINE type SIMDAPI func(type a, type b, type c)\
{\
    type result;\
\
    result.lo = intrin(a.lo, b.lo, c.lo);\
    result.hi = intrin(a.hi, b.hi, c.hi);\
\
    return result;\
}
126
// Zero-initialized float / integer 16-wide registers.
SIMD16_EMU_AVX512_0(simd16scalar, _simd16_setzero_ps, _mm256_setzero_ps)
SIMD16_EMU_AVX512_0(simd16scalari, _simd16_setzero_si, _mm256_setzero_si256)
129
130 INLINE simd16scalar SIMDAPI _simd16_set1_ps(float a)
131 {
132 simd16scalar result;
133
134 result.lo = _mm256_set1_ps(a);
135 result.hi = _mm256_set1_ps(a);
136
137 return result;
138 }
139
// Broadcast a single byte into all 64 byte lanes.
INLINE simd16scalari SIMDAPI _simd16_set1_epi8(char a)
{
    simd16scalari result;

    result.lo = _mm256_set1_epi8(a);
    result.hi = _mm256_set1_epi8(a);

    return result;
}

// Broadcast a single 32-bit int into all 16 lanes.
INLINE simd16scalari SIMDAPI _simd16_set1_epi32(int a)
{
    simd16scalari result;

    result.lo = _mm256_set1_epi32(a);
    result.hi = _mm256_set1_epi32(a);

    return result;
}
159
// Set all 16 float lanes; e15 is the highest lane (hi half), e0 the lowest.
INLINE simd16scalar SIMDAPI _simd16_set_ps(float e15, float e14, float e13, float e12, float e11, float e10, float e9, float e8, float e7, float e6, float e5, float e4, float e3, float e2, float e1, float e0)
{
    simd16scalar result;

    result.lo = _mm256_set_ps(e7, e6, e5, e4, e3, e2, e1, e0);
    result.hi = _mm256_set_ps(e15, e14, e13, e12, e11, e10, e9, e8);

    return result;
}

// Set all 16 int lanes; e15 is the highest lane, e0 the lowest.
INLINE simd16scalari SIMDAPI _simd16_set_epi32(int e15, int e14, int e13, int e12, int e11, int e10, int e9, int e8, int e7, int e6, int e5, int e4, int e3, int e2, int e1, int e0)
{
    simd16scalari result;

    result.lo = _mm256_set_epi32(e7, e6, e5, e4, e3, e2, e1, e0);
    result.hi = _mm256_set_epi32(e15, e14, e13, e12, e11, e10, e9, e8);

    return result;
}

// 8-argument overload: replicates the same 8 floats into both halves.
INLINE simd16scalar SIMDAPI _simd16_set_ps(float e7, float e6, float e5, float e4, float e3, float e2, float e1, float e0)
{
    simd16scalar result;

    result.lo = _mm256_set_ps(e7, e6, e5, e4, e3, e2, e1, e0);
    result.hi = _mm256_set_ps(e7, e6, e5, e4, e3, e2, e1, e0);

    return result;
}

// 8-argument overload: replicates the same 8 ints into both halves.
INLINE simd16scalari SIMDAPI _simd16_set_epi32(int e7, int e6, int e5, int e4, int e3, int e2, int e1, int e0)
{
    simd16scalari result;

    result.lo = _mm256_set_epi32(e7, e6, e5, e4, e3, e2, e1, e0);
    result.hi = _mm256_set_epi32(e7, e6, e5, e4, e3, e2, e1, e0);

    return result;
}
199
// Aligned load of 16 floats; n points 32 bytes past m (the hi half).
INLINE simd16scalar SIMDAPI _simd16_load_ps(float const *m)
{
    simd16scalar result;

    float const *n = reinterpret_cast<float const *>(reinterpret_cast<uint8_t const *>(m) + sizeof(result.lo));

    result.lo = _mm256_load_ps(m);
    result.hi = _mm256_load_ps(n);

    return result;
}

// Unaligned load of 16 floats.
INLINE simd16scalar SIMDAPI _simd16_loadu_ps(float const *m)
{
    simd16scalar result;

    float const *n = reinterpret_cast<float const *>(reinterpret_cast<uint8_t const *>(m) + sizeof(result.lo));

    result.lo = _mm256_loadu_ps(m);
    result.hi = _mm256_loadu_ps(n);

    return result;
}
223
224 INLINE simd16scalar SIMDAPI _simd16_load1_ps(float const *m)
225 {
226 simd16scalar result;
227
228 result.lo = _mm256_broadcast_ss(m);
229 result.hi = _mm256_broadcast_ss(m);
230
231 return result;
232 }
233
// Aligned integer load: the two 256-bit halves are stored contiguously.
INLINE simd16scalari SIMDAPI _simd16_load_si(simd16scalari const *m)
{
    simd16scalari result;

    result.lo = _mm256_load_si256(&m[0].lo);
    result.hi = _mm256_load_si256(&m[0].hi);

    return result;
}

// Unaligned integer load.
INLINE simd16scalari SIMDAPI _simd16_loadu_si(simd16scalari const *m)
{
    simd16scalari result;

    result.lo = _mm256_loadu_si256(&m[0].lo);
    result.hi = _mm256_loadu_si256(&m[0].hi);

    return result;
}
253
// Broadcast a single float from memory into all 16 lanes.
INLINE simd16scalar SIMDAPI _simd16_broadcast_ss(float const *m)
{
    simd16scalar result;

    result.lo = _mm256_broadcast_ss(m);
    result.hi = _mm256_broadcast_ss(m);

    return result;
}

// Broadcast a 128-bit block of 4 floats into all four 128-bit lanes.
INLINE simd16scalar SIMDAPI _simd16_broadcast_ps(__m128 const *m)
{
    simd16scalar result;

    result.lo = _mm256_broadcast_ps(m);
    result.hi = _mm256_broadcast_ps(m);

    return result;
}
273
// Aligned store of 16 floats; n points 32 bytes past m (the hi half).
INLINE void SIMDAPI _simd16_store_ps(float *m, simd16scalar a)
{
    float *n = reinterpret_cast<float *>(reinterpret_cast<uint8_t *>(m) + sizeof(a.lo));

    _mm256_store_ps(m, a.lo);
    _mm256_store_ps(n, a.hi);
}

// Per-lane masked store: lanes whose mask MSB is set are written.
INLINE void SIMDAPI _simd16_maskstore_ps(float *m, simd16scalari mask, simd16scalar a)
{
    float *n = reinterpret_cast<float *>(reinterpret_cast<uint8_t *>(m) + sizeof(a.lo));

    _mm256_maskstore_ps(m, mask.lo, a.lo);
    _mm256_maskstore_ps(n, mask.hi, a.hi);
}

// Aligned integer store of both halves.
INLINE void SIMDAPI _simd16_store_si(simd16scalari *m, simd16scalari a)
{
    _mm256_store_si256(&m[0].lo, a.lo);
    _mm256_store_si256(&m[0].hi, a.hi);
}
295
// Extract the 8-wide half selected by imm8 (0 = lo, 1 = hi); any other
// value returns zero.
INLINE simdscalar SIMDAPI _simd16_extract_ps(simd16scalar a, int imm8)
{
    switch (imm8)
    {
    case 0:
        return a.lo;
    case 1:
        return a.hi;
    }
    return _simd_set1_ps(0.0f);
}

// Integer variant of _simd16_extract_ps.
INLINE simdscalari SIMDAPI _simd16_extract_si(simd16scalari a, int imm8)
{
    switch (imm8)
    {
    case 0:
        return a.lo;
    case 1:
        return a.hi;
    }
    return _simd_set1_epi32(0);
}

// Replace the half selected by imm8 with b; out-of-range imm8 is a no-op.
INLINE simd16scalar SIMDAPI _simd16_insert_ps(simd16scalar a, simdscalar b, int imm8)
{
    switch (imm8)
    {
    case 0:
        a.lo = b;
        break;
    case 1:
        a.hi = b;
        break;
    }
    return a;
}

// Integer variant of _simd16_insert_ps.
INLINE simd16scalari SIMDAPI _simd16_insert_si(simd16scalari a, simdscalari b, int imm8)
{
    switch (imm8)
    {
    case 0:
        a.lo = b;
        break;
    case 1:
        a.hi = b;
        break;
    }
    return a;
}
347
// Immediate blend: the 16-bit mask is split so each 8-bit half feeds the
// corresponding _mm256_blend_ps immediate. A template is used because the
// AVX intrinsic requires a compile-time constant.
template <simd16mask mask>
INLINE simd16scalar SIMDAPI _simd16_blend_ps_temp(simd16scalar a, simd16scalar b)
{
    simd16scalar result;

    result.lo = _mm256_blend_ps(a.lo, b.lo, _simd16_masklo(mask));
    result.hi = _mm256_blend_ps(a.hi, b.hi, _simd16_maskhi(mask));

    return result;
}

#define _simd16_blend_ps(a, b, mask) _simd16_blend_ps_temp<mask>(a, b)

// Variable blend on float lanes (selects b where mask lane MSB is set).
SIMD16_EMU_AVX512_3(simd16scalar, _simd16_blendv_ps, _mm256_blendv_ps)
362
// Variable blend of 32-bit int lanes using a float-typed mask; the integer
// data is punned through float so _mm256_blendv_ps can be used.
INLINE simd16scalari SIMDAPI _simd16_blendv_epi32(simd16scalari a, simd16scalari b, const simd16scalar mask)
{
    simd16scalari result;

    result.lo = _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(a.lo), _mm256_castsi256_ps(b.lo), mask.lo));
    result.hi = _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(a.hi), _mm256_castsi256_ps(b.hi), mask.hi));

    return result;
}

// Overload taking an integer-typed mask.
INLINE simd16scalari SIMDAPI _simd16_blendv_epi32(simd16scalari a, simd16scalari b, const simd16scalari mask)
{
    simd16scalari result;

    result.lo = _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(a.lo), _mm256_castsi256_ps(b.lo), _mm256_castsi256_ps(mask.lo)));
    result.hi = _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(a.hi), _mm256_castsi256_ps(b.hi), _mm256_castsi256_ps(mask.hi)));

    return result;
}
382
// Per-lane float arithmetic, emitted from the 2-operand generator.
SIMD16_EMU_AVX512_2(simd16scalar, _simd16_mul_ps, _mm256_mul_ps)
SIMD16_EMU_AVX512_2(simd16scalar, _simd16_add_ps, _mm256_add_ps)
SIMD16_EMU_AVX512_2(simd16scalar, _simd16_sub_ps, _mm256_sub_ps)
SIMD16_EMU_AVX512_1(simd16scalar, _simd16_rsqrt_ps, _mm256_rsqrt_ps)
SIMD16_EMU_AVX512_2(simd16scalar, _simd16_min_ps, _mm256_min_ps)
SIMD16_EMU_AVX512_2(simd16scalar, _simd16_max_ps, _mm256_max_ps)
389
// Gather the sign bits of 16 float lanes into a 16-bit mask; hi half
// occupies bits 8-15.
INLINE simd16mask SIMDAPI _simd16_movemask_ps(simd16scalar a)
{
    simdmask mask_lo = _mm256_movemask_ps(a.lo);
    simdmask mask_hi = _mm256_movemask_ps(a.hi);

    return static_cast<simd16mask>(mask_lo) | (static_cast<simd16mask>(mask_hi) << 8);
}

// Double variant: each half contributes 4 bits, so hi shifts by 4.
INLINE simd16mask SIMDAPI _simd16_movemask_pd(simd16scalard a)
{
    simdmask mask_lo = _mm256_movemask_pd(a.lo);
    simdmask mask_hi = _mm256_movemask_pd(a.hi);

    return static_cast<simd16mask>(mask_lo) | (static_cast<simd16mask>(mask_hi) << 4);
}

// Byte variant: 32 bits per half, combined into a 64-bit mask.
INLINE uint64_t SIMDAPI _simd16_movemask_epi8(simd16scalari a)
{
    uint32_t mask_lo = _mm256_movemask_epi8(a.lo);
    uint32_t mask_hi = _mm256_movemask_epi8(a.hi);

    return static_cast<uint64_t>(mask_lo) | (static_cast<uint64_t>(mask_hi) << 32);
}
413
414 INLINE simd16scalari SIMDAPI _simd16_cvtps_epi32(simd16scalar a)
415 {
416 simd16scalari result;
417
418 result.lo = _mm256_cvtps_epi32(a.lo);
419 result.hi = _mm256_cvtps_epi32(a.hi);
420
421 return result;
422 }
423
// Convert 16 floats to 32-bit ints with truncation toward zero.
INLINE simd16scalari SIMDAPI _simd16_cvttps_epi32(simd16scalar a)
{
    simd16scalari result;

    result.lo = _mm256_cvttps_epi32(a.lo);
    result.hi = _mm256_cvttps_epi32(a.hi);

    return result;
}

// Convert 16 signed 32-bit ints to floats.
INLINE simd16scalar SIMDAPI _simd16_cvtepi32_ps(simd16scalari a)
{
    simd16scalar result;

    result.lo = _mm256_cvtepi32_ps(a.lo);
    result.hi = _mm256_cvtepi32_ps(a.hi);

    return result;
}
443
// Per-lane float compare producing all-ones / all-zero lanes. A template is
// used because _mm256_cmp_ps requires a compile-time predicate immediate.
template <int comp>
INLINE simd16scalar SIMDAPI _simd16_cmp_ps_temp(simd16scalar a, simd16scalar b)
{
    simd16scalar result;

    result.lo = _mm256_cmp_ps(a.lo, b.lo, comp);
    result.hi = _mm256_cmp_ps(a.hi, b.hi, comp);

    return result;
}

#define _simd16_cmp_ps(a, b, comp) _simd16_cmp_ps_temp<comp>(a, b)

// Ordered, non-signaling comparison shorthands.
#define _simd16_cmplt_ps(a, b) _simd16_cmp_ps(a, b, _CMP_LT_OQ)
#define _simd16_cmpgt_ps(a, b) _simd16_cmp_ps(a, b, _CMP_GT_OQ)
#define _simd16_cmpneq_ps(a, b) _simd16_cmp_ps(a, b, _CMP_NEQ_OQ)
#define _simd16_cmpeq_ps(a, b) _simd16_cmp_ps(a, b, _CMP_EQ_OQ)
#define _simd16_cmpge_ps(a, b) _simd16_cmp_ps(a, b, _CMP_GE_OQ)
#define _simd16_cmple_ps(a, b) _simd16_cmp_ps(a, b, _CMP_LE_OQ)

// Bitwise float ops and reciprocal/divide, forwarded to the 8-wide layer.
SIMD16_EMU_AVX512_2(simd16scalar, _simd16_and_ps, _simd_and_ps)
SIMD16_EMU_AVX512_2(simd16scalar, _simd16_andnot_ps, _simd_andnot_ps)
SIMD16_EMU_AVX512_2(simd16scalar, _simd16_or_ps, _simd_or_ps)
SIMD16_EMU_AVX512_2(simd16scalar, _simd16_xor_ps, _simd_xor_ps)

SIMD16_EMU_AVX512_1(simd16scalar, _simd16_rcp_ps, _simd_rcp_ps)
SIMD16_EMU_AVX512_2(simd16scalar, _simd16_div_ps, _simd_div_ps)
471
// Bit-pattern reinterpreting casts between the emulated register types.
// NOTE(review): these type-pun via reinterpret_cast, which is formally a
// strict-aliasing violation; relies on identical struct layout — verify the
// build uses -fno-strict-aliasing or equivalent.
INLINE simd16scalar SIMDAPI _simd16_castsi_ps(simd16scalari a)
{
    return *reinterpret_cast<simd16scalar *>(&a);
}

INLINE simd16scalari SIMDAPI _simd16_castps_si(simd16scalar a)
{
    return *reinterpret_cast<simd16scalari *>(&a);
}

INLINE simd16scalard SIMDAPI _simd16_castsi_pd(simd16scalari a)
{
    return *reinterpret_cast<simd16scalard *>(&a);
}

INLINE simd16scalari SIMDAPI _simd16_castpd_si(simd16scalard a)
{
    return *reinterpret_cast<simd16scalari *>(&a);
}

INLINE simd16scalar SIMDAPI _simd16_castpd_ps(simd16scalard a)
{
    return *reinterpret_cast<simd16scalar *>(&a);
}

INLINE simd16scalard SIMDAPI _simd16_castps_pd(simd16scalar a)
{
    return *reinterpret_cast<simd16scalard *>(&a);
}
501
// Per-lane rounding; templated because _mm256_round_ps needs an immediate.
template <int mode>
INLINE simd16scalar SIMDAPI _simd16_round_ps_temp(simd16scalar a)
{
    simd16scalar result;

    result.lo = _mm256_round_ps(a.lo, mode);
    result.hi = _mm256_round_ps(a.hi, mode);

    return result;
}

#define _simd16_round_ps(a, mode) _simd16_round_ps_temp<mode>(a)

// Integer arithmetic, min/max, bitwise and compare ops forwarded to the
// 8-wide _simd_* layer via the 2-operand generator.
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_mul_epi32, _simd_mul_epi32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_mullo_epi32, _simd_mullo_epi32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_sub_epi32, _simd_sub_epi32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_sub_epi64, _simd_sub_epi64)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_min_epi32, _simd_min_epi32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_max_epi32, _simd_max_epi32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_min_epu32, _simd_min_epu32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_max_epu32, _simd_max_epu32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_add_epi32, _simd_add_epi32)

SIMD16_EMU_AVX512_2(simd16scalari, _simd16_and_si, _simd_and_si)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_andnot_si, _simd_andnot_si)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_or_si, _simd_or_si)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_xor_si, _simd_xor_si)

SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpeq_epi32, _simd_cmpeq_epi32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpgt_epi32, _simd_cmpgt_epi32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmplt_epi32, _simd_cmplt_epi32)
533
// Returns nonzero only when (a & b) has no sign bits set in either half.
INLINE int SIMDAPI _simd16_testz_ps(simd16scalar a, simd16scalar b)
{
    int lo = _simd_testz_ps(a.lo, b.lo);
    int hi = _simd_testz_ps(a.hi, b.hi);

    return lo & hi;
}

// NOTE(review): this macro shadows the function-style _simd16_cmplt_epi32
// generated above for all uses after this point, implementing a < b as
// b > a with swapped operands.
#define _simd16_cmplt_epi32(a, b) _simd16_cmpgt_epi32(b, a)
543
// Interleave (unpack) ops; each acts within its own 256-bit half.
SIMD16_EMU_AVX512_2(simd16scalar, _simd16_unpacklo_ps, _simd_unpacklo_ps)
SIMD16_EMU_AVX512_2(simd16scalar, _simd16_unpackhi_ps, _simd_unpackhi_ps)
SIMD16_EMU_AVX512_2(simd16scalard, _simd16_unpacklo_pd, _simd_unpacklo_pd)
SIMD16_EMU_AVX512_2(simd16scalard, _simd16_unpackhi_pd, _simd_unpackhi_pd)

SIMD16_EMU_AVX512_2(simd16scalari, _simd16_unpacklo_epi8, _simd_unpacklo_epi8)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_unpackhi_epi8, _simd_unpackhi_epi8)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_unpacklo_epi16, _simd_unpacklo_epi16)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_unpackhi_epi16, _simd_unpackhi_epi16)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_unpacklo_epi32, _simd_unpacklo_epi32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_unpackhi_epi32, _simd_unpackhi_epi32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_unpacklo_epi64, _simd_unpacklo_epi64)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_unpackhi_epi64, _simd_unpackhi_epi64)
557
// Immediate-count shifts; templated because the underlying intrinsics take
// a compile-time immediate.

// Logical left shift of each 32-bit lane by imm8.
template <int imm8>
INLINE simd16scalari SIMDAPI _simd16_slli_epi32_temp(simd16scalari a)
{
    simd16scalari result;

    result.lo = _simd_slli_epi32(a.lo, imm8);
    result.hi = _simd_slli_epi32(a.hi, imm8);

    return result;
}

#define _simd16_slli_epi32(a, imm8) _simd16_slli_epi32_temp<imm8>(a)

// Arithmetic (sign-extending) right shift of each 32-bit lane by imm8.
template <int imm8>
INLINE simd16scalari SIMDAPI _simd16_srai_epi32_temp(simd16scalari a)
{
    simd16scalari result;

    result.lo = _simd_srai_epi32(a.lo, imm8);
    result.hi = _simd_srai_epi32(a.hi, imm8);

    return result;
}

#define _simd16_srai_epi32(a, imm8) _simd16_srai_epi32_temp<imm8>(a)

// Logical (zero-filling) right shift of each 32-bit lane by imm8.
template <int imm8>
INLINE simd16scalari SIMDAPI _simd16_srli_epi32_temp(simd16scalari a)
{
    simd16scalari result;

    result.lo = _simd_srli_epi32(a.lo, imm8);
    result.hi = _simd_srli_epi32(a.hi, imm8);

    return result;
}

#define _simd16_srli_epi32(a, imm8) _simd16_srli_epi32_temp<imm8>(a)
596
// Fused multiply-add / multiply-subtract (a * b +/- c).
SIMD16_EMU_AVX512_3(simd16scalar, _simd16_fmadd_ps, _simd_fmadd_ps)
SIMD16_EMU_AVX512_3(simd16scalar, _simd16_fmsub_ps, _simd_fmsub_ps)

// Gather 16 floats from m at byte offsets index * scale; templated because
// scale must be a compile-time constant.
template <int scale>
INLINE simd16scalar SIMDAPI _simd16_i32gather_ps_temp(const float *m, simd16scalari index)
{
    simd16scalar result;

    result.lo = _simd_i32gather_ps(m, index.lo, scale);
    result.hi = _simd_i32gather_ps(m, index.hi, scale);

    return result;
}

#define _simd16_i32gather_ps(m, index, scale) _simd16_i32gather_ps_temp<scale>(m, index)

// Masked gather: lanes with mask unset keep the corresponding lane of a.
template <int scale>
INLINE simd16scalar SIMDAPI _simd16_mask_i32gather_ps_temp(simd16scalar a, const float *m, simd16scalari index, simd16scalari mask)
{
    simd16scalar result;

    result.lo = _simd_mask_i32gather_ps(a.lo, m, index.lo, _simd_castsi_ps(mask.lo), scale);
    result.hi = _simd_mask_i32gather_ps(a.hi, m, index.hi, _simd_castsi_ps(mask.hi), scale);

    return result;
}

#define _simd16_mask_i32gather_ps(a, m, index, mask, scale) _simd16_mask_i32gather_ps_temp<scale>(a, m, index, mask)
625
// Byte shuffle, saturating byte arithmetic, absolute value and the
// remaining integer compares, all forwarded half-by-half.
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_shuffle_epi8, _simd_shuffle_epi8)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_adds_epu8, _simd_adds_epu8)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_subs_epu8, _simd_subs_epu8)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_add_epi8, _simd_add_epi8)
SIMD16_EMU_AVX512_1(simd16scalari, _simd16_abs_epi32, _simd_abs_epi32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpeq_epi64, _simd_cmpeq_epi64)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpgt_epi64, _simd_cmpgt_epi64)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpeq_epi16, _simd_cmpeq_epi16)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpgt_epi16, _simd_cmpgt_epi16)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpeq_epi8, _simd_cmpeq_epi8)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpgt_epi8, _simd_cmpgt_epi8)
637
// Full 16-lane permute of a by per-lane indices i (0-15). Each destination
// half is built twice — once sourcing from a.lo and once from a.hi, using
// index & 7 within the half — then the correct source is chosen per lane by
// whether the index is > 7 (i.e. refers to the hi half).
INLINE simd16scalar SIMDAPI _simd16_permute_ps(simd16scalar a, simd16scalari i)
{
    simd16scalar result;

    const simdscalari mask = _simd_set1_epi32(7);

    simdscalar lolo = _simd_permute_ps(a.lo, _simd_and_si(i.lo, mask));
    simdscalar lohi = _simd_permute_ps(a.hi, _simd_and_si(i.lo, mask));

    simdscalar hilo = _simd_permute_ps(a.lo, _simd_and_si(i.hi, mask));
    simdscalar hihi = _simd_permute_ps(a.hi, _simd_and_si(i.hi, mask));

    result.lo = _simd_blendv_ps(lolo, lohi, _simd_castsi_ps(_simd_cmpgt_epi32(i.lo, mask)));
    result.hi = _simd_blendv_ps(hilo, hihi, _simd_castsi_ps(_simd_cmpgt_epi32(i.hi, mask)));

    return result;
}

// Integer permute, implemented by punning through the float permute.
INLINE simd16scalari SIMDAPI _simd16_permute_epi32(simd16scalari a, simd16scalari i)
{
    return _simd16_castps_si(_simd16_permute_ps(_simd16_castsi_ps(a), i));
}
660
// Variable-count per-lane shifts.
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_srlv_epi32, _simd_srlv_epi32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_sllv_epi32, _simd_sllv_epi32)

// 128-bit-lane permutes. Each 2-bit field of imm8 selects one of the four
// 128-bit lanes; the field values are remapped into the immediates of two
// 256-bit permutes (lo half sourced from a, hi half from b).
template <int imm8>
INLINE simd16scalar SIMDAPI _simd16_permute2f128_ps_temp(simd16scalar a, simd16scalar b)
{
    simd16scalar result;

    result.lo = _simd_permute2f128_ps(a.lo, a.hi, ((imm8 & 0x03) << 0) | ((imm8 & 0x0C) << 2));
    result.hi = _simd_permute2f128_ps(b.lo, b.hi, ((imm8 & 0x30) >> 4) | ((imm8 & 0xC0) >> 2));

    return result;
}

#define _simd16_permute2f128_ps(a, b, imm8) _simd16_permute2f128_ps_temp<imm8>(a, b)

// Double variant of the 128-bit-lane permute.
template <int imm8>
INLINE simd16scalard SIMDAPI _simd16_permute2f128_pd_temp(simd16scalard a, simd16scalard b)
{
    simd16scalard result;

    result.lo = _simd_permute2f128_pd(a.lo, a.hi, ((imm8 & 0x03) << 0) | ((imm8 & 0x0C) << 2));
    result.hi = _simd_permute2f128_pd(b.lo, b.hi, ((imm8 & 0x30) >> 4) | ((imm8 & 0xC0) >> 2));

    return result;
}

#define _simd16_permute2f128_pd(a, b, imm8) _simd16_permute2f128_pd_temp<imm8>(a, b)

// Integer variant of the 128-bit-lane permute.
template <int imm8>
INLINE simd16scalari SIMDAPI _simd16_permute2f128_si_temp(simd16scalari a, simd16scalari b)
{
    simd16scalari result;

    result.lo = _simd_permute2f128_si(a.lo, a.hi, ((imm8 & 0x03) << 0) | ((imm8 & 0x0C) << 2));
    result.hi = _simd_permute2f128_si(b.lo, b.hi, ((imm8 & 0x30) >> 4) | ((imm8 & 0xC0) >> 2));

    return result;
}

#define _simd16_permute2f128_si(a, b, imm8) _simd16_permute2f128_si_temp<imm8>(a, b)
702
// Shuffle floats within each 128-bit lane using the same imm8 per half.
template <int imm8>
INLINE simd16scalar SIMDAPI _simd16_shuffle_ps_temp(simd16scalar a, simd16scalar b)
{
    simd16scalar result;

    result.lo = _simd_shuffle_ps(a.lo, b.lo, imm8);
    result.hi = _simd_shuffle_ps(a.hi, b.hi, imm8);

    return result;
}

#define _simd16_shuffle_ps(a, b, imm8) _simd16_shuffle_ps_temp<imm8>(a, b)

// Double shuffle: the low 4 imm8 bits drive the lo half, the high 4 the hi.
template <int imm8>
INLINE simd16scalard SIMDAPI _simd16_shuffle_pd_temp(simd16scalard a, simd16scalard b)
{
    simd16scalard result;

    result.lo = _simd_shuffle_pd(a.lo, b.lo, (imm8 & 15));
    result.hi = _simd_shuffle_pd(a.hi, b.hi, (imm8 >> 4));

    return result;
}

#define _simd16_shuffle_pd(a, b, imm8) _simd16_shuffle_pd_temp<imm8>(a, b)

// 32-bit integer shuffle, punned through the float shuffle.
template <int imm8>
INLINE simd16scalari SIMDAPI _simd16_shuffle_epi32_temp(simd16scalari a, simd16scalari b)
{
    return _simd16_castps_si(_simd16_shuffle_ps(_simd16_castsi_ps(a), _simd16_castsi_ps(b), imm8));
}

#define _simd16_shuffle_epi32(a, b, imm8) _simd16_shuffle_epi32_temp<imm8>(a, b)

// 64-bit integer shuffle, punned through the double shuffle.
template <int imm8>
INLINE simd16scalari SIMDAPI _simd16_shuffle_epi64_temp(simd16scalari a, simd16scalari b)
{
    return _simd16_castpd_si(_simd16_shuffle_pd(_simd16_castsi_pd(a), _simd16_castsi_pd(b), imm8));
}

#define _simd16_shuffle_epi64(a, b, imm8) _simd16_shuffle_epi64_temp<imm8>(a, b)
744
// Zero-extend 32 unsigned bytes to 16-bit lanes, one 128-bit chunk per half.
INLINE simd16scalari SIMDAPI _simd16_cvtepu8_epi16(simdscalari a)
{
    simd16scalari result;

    result.lo = _simd_cvtepu8_epi16(_mm256_extractf128_si256(a, 0));
    result.hi = _simd_cvtepu8_epi16(_mm256_extractf128_si256(a, 1));

    return result;
}

// Zero-extend 16 unsigned bytes to 32-bit lanes; hi half consumes bytes 8-15.
INLINE simd16scalari SIMDAPI _simd16_cvtepu8_epi32(__m128i a)
{
    simd16scalari result;

    result.lo = _simd_cvtepu8_epi32(a);
    result.hi = _simd_cvtepu8_epi32(_mm_srli_si128(a, 8));

    return result;
}

// Zero-extend 16 unsigned 16-bit values to 32-bit lanes.
INLINE simd16scalari SIMDAPI _simd16_cvtepu16_epi32(simdscalari a)
{
    simd16scalari result;

    result.lo = _simd_cvtepu16_epi32(_mm256_extractf128_si256(a, 0));
    result.hi = _simd_cvtepu16_epi32(_mm256_extractf128_si256(a, 1));

    return result;
}
774
// Saturating narrowing packs, applied half-by-half.
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_packus_epi16, _simd_packus_epi16)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_packs_epi16, _simd_packs_epi16)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_packus_epi32, _simd_packus_epi32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_packs_epi32, _simd_packs_epi32)

// In the emulated path the mask is already a plain integer, so this is an
// identity conversion.
INLINE simd16mask SIMDAPI _simd16_int2mask(int mask)
{
    return mask;
}
784
785 INLINE int SIMDAPI SIMDAPI _simd16_mask2int(simd16mask mask)
786 {
787 return mask;
788 }
789
// Compare a < b per lane and compress the result to a 16-bit mask.
INLINE simd16mask SIMDAPI _simd16_cmplt_ps_mask(simd16scalar a, simd16scalar b)
{
    return _simd16_movemask_ps(_simd16_cmplt_ps(a, b));
}

// convert bitmask to vector mask: lane i becomes all-ones iff bit i of
// mask is set (via a per-lane test against a one-hot bit table).
INLINE simd16scalar SIMDAPI vMask16(int32_t mask)
{
    simd16scalari temp = _simd16_set1_epi32(mask);

    simd16scalari bits = _simd16_set_epi32(0x8000, 0x4000, 0x2000, 0x1000, 0x0800, 0x0400, 0x0200, 0x0100, 0x0080, 0x0040, 0x0020, 0x0010, 0x0008, 0x0004, 0x0002, 0x0001);

    simd16scalari result = _simd16_cmplt_epi32(_simd16_setzero_si(), _simd16_and_si(temp, bits));

    return _simd16_castsi_ps(result);
}
806
807 #else
808
// Native AVX-512 path: derive a lane mask from a vector, one bit per
// nonzero 32-bit lane.
INLINE simd16mask SIMDAPI _simd16_scalari2mask(simd16scalari mask)
{
    return _mm512_cmpneq_epu32_mask(mask, _mm512_setzero_epi32());
}

// Double variant: one bit per nonzero 64-bit lane.
INLINE simd16mask SIMDAPI _simd16_scalard2mask(simd16scalard mask)
{
    return _mm512_cmpneq_epu64_mask(_mm512_castpd_si512(mask), _mm512_setzero_si512());
}
818
// Direct aliases onto the 512-bit intrinsics.
#define _simd16_setzero_ps _mm512_setzero_ps
#define _simd16_setzero_si _mm512_setzero_si512
#define _simd16_set1_ps _mm512_set1_ps
#define _simd16_set1_epi8 _mm512_set1_epi8
#define _simd16_set1_epi32 _mm512_set1_epi32

// Set all 16 float lanes; e15 is the highest lane, e0 the lowest.
INLINE simd16scalar SIMDAPI _simd16_set_ps(float e15, float e14, float e13, float e12, float e11, float e10, float e9, float e8, float e7, float e6, float e5, float e4, float e3, float e2, float e1, float e0)
{
    return _mm512_set_ps(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0);
}

// Set all 16 int lanes; e15 is the highest lane, e0 the lowest.
INLINE simd16scalari SIMDAPI _simd16_set_epi32(int e15, int e14, int e13, int e12, int e11, int e10, int e9, int e8, int e7, int e6, int e5, int e4, int e3, int e2, int e1, int e0)
{
    return _mm512_set_ps(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0);
}

// 8-argument overload: replicates the same 8 floats into both halves.
INLINE simd16scalar SIMDAPI _simd16_set_ps(float e7, float e6, float e5, float e4, float e3, float e2, float e1, float e0)
{
    return _mm512_set_ps(e7, e6, e5, e4, e3, e2, e1, e0, e7, e6, e5, e4, e3, e2, e1, e0);
}

// 8-argument overload: replicates the same 8 ints into both halves.
INLINE simd16scalari SIMDAPI _simd16_set_epi32(int e7, int e6, int e5, int e4, int e3, int e2, int e1, int e0)
{
    return _mm512_set_epi32(e7, e6, e5, e4, e3, e2, e1, e0, e7, e6, e5, e4, e3, e2, e1, e0);
}

#define _simd16_load_ps _mm512_load_ps
#define _simd16_loadu_ps _mm512_loadu_ps
#if 1
#define _simd16_load1_ps _simd16_broadcast_ss
#endif
#define _simd16_load_si _mm512_load_si512
#define _simd16_loadu_si _mm512_loadu_si512
// NOTE(review): _mm512_extload_ps is a KNC-era intrinsic — verify it is
// available (or replace with _mm512_broadcastss_ps / _mm512_broadcast_f32x4)
// on AVX-512F-only toolchains.
#define _simd16_broadcast_ss(m) _mm512_extload_ps(m, _MM_UPCONV_PS_NONE, _MM_BROADCAST_1X16, 0)
#define _simd16_broadcast_ps(m) _mm512_extload_ps(m, _MM_UPCONV_PS_NONE, _MM_BROADCAST_4X16, 0)
#define _simd16_store_ps _mm512_store_ps
#define _simd16_store_si _mm512_store_si512
// Half extraction/insertion implemented via 64-bit-granular 256-bit moves.
#define _simd16_extract_ps(a, imm8) _mm256_castsi256_ps(_mm512_extracti64x4_epi64(_mm512_castps_si512(a), imm8))
#define _simd16_extract_si _mm512_extracti64x4_epi64
#define _simd16_insert_ps(a, b, imm8) _mm512_castsi512_ps(_mm512_inserti64x4(_mm512_castps_si512(a), _mm256_castps_si256(b), imm8))
#define _simd16_insert_si _mm512_inserti64x4
860
// Masked store: only lanes whose vector-mask lane is nonzero are written.
INLINE void SIMDAPI _simd16_maskstore_ps(float *m, simd16scalari mask, simd16scalar a)
{
    simd16mask k = _simd16_scalari2mask(mask);

    _mm512_mask_store_ps(m, k, a);
}

#define _simd16_blend_ps(a, b, mask) _mm512_mask_blend_ps(mask, a, b)

// Variable blend with a float-typed vector mask (selects b where nonzero).
INLINE simd16scalar SIMDAPI _simd16_blendv_ps(simd16scalar a, simd16scalar b, const simd16scalar mask)
{
    simd16mask k = _simd16_scalari2mask(_mm512_castps_si512(mask));

    return _mm512_mask_blend_ps(k, a, b);
}

// Integer blend with a float-typed vector mask.
INLINE simd16scalari SIMDAPI _simd16_blendv_epi32(simd16scalari a, simd16scalari b, const simd16scalar mask)
{
    simd16mask k = _simd16_scalari2mask(_mm512_castps_si512(mask));

    return _mm512_mask_blend_epi32(k, a, b);
}

// Integer blend with an integer-typed vector mask.
INLINE simd16scalari SIMDAPI _simd16_blendv_epi32(simd16scalari a, simd16scalari b, const simd16scalari mask)
{
    simd16mask k = _simd16_scalari2mask(mask);

    return _mm512_mask_blend_epi32(k, a, b);
}
890
// Float arithmetic aliases.
#define _simd16_mul_ps _mm512_mul_ps
#define _simd16_div_ps _mm512_div_ps
#define _simd16_add_ps _mm512_add_ps
#define _simd16_sub_ps _mm512_sub_ps
#define _simd16_rsqrt_ps _mm512_rsqrt14_ps
#define _simd16_min_ps _mm512_min_ps
#define _simd16_max_ps _mm512_max_ps

// movemask analogue: nonzero lanes (not sign bits) produce mask bits here.
INLINE simd16mask SIMDAPI _simd16_movemask_ps(simd16scalar a)
{
    return _simd16_scalari2mask(_mm512_castps_si512(a));
}

INLINE simd16mask SIMDAPI _simd16_movemask_pd(simd16scalard a)
{
    return _simd16_scalard2mask(a);
}

#if 0
INLINE int SIMDAPI _simd16_movemask_epi8(simd16scalari a)
{
    return _simd16_scalar2mask(a);
}
#endif

#define _simd16_cvtps_epi32 _mm512_cvtps_epi32
#define _simd16_cvttps_epi32 _mm512_cvttps_epi32
#define _simd16_cvtepi32_ps _mm512_cvtepi32_ps

// Compare producing all-ones / all-zero lanes, expanded from the k-mask
// so the result matches the emulated path's vector-mask convention.
template <int comp>
INLINE simd16scalar SIMDAPI _simd16_cmp_ps_temp(simd16scalar a, simd16scalar b)
{
    simd16mask k = _mm512_cmp_ps_mask(a, b, comp);

    return _mm512_castsi512_ps(_mm512_mask_blend_epi32(k, _mm512_setzero_epi32(), _mm512_set1_epi32(0xFFFFFFFF)))
}

#define _simd16_cmp_ps(a, b, comp) _simd16_cmp_ps_temp<comp>(a, b)

#define _simd16_cmplt_ps(a, b) _simd16_cmp_ps(a, b, _CMP_LT_OQ)
#define _simd16_cmpgt_ps(a, b) _simd16_cmp_ps(a, b, _CMP_GT_OQ)
#define _simd16_cmpneq_ps(a, b) _simd16_cmp_ps(a, b, _CMP_NEQ_OQ)
#define _simd16_cmpeq_ps(a, b) _simd16_cmp_ps(a, b, _CMP_EQ_OQ)
#define _simd16_cmpge_ps(a, b) _simd16_cmp_ps(a, b, _CMP_GE_OQ)
#define _simd16_cmple_ps(a, b) _simd16_cmp_ps(a, b, _CMP_LE_OQ)
936
// Reinterpreting casts map directly to the 512-bit cast intrinsics.
#define _simd16_castsi_ps _mm512_castsi512_ps
#define _simd16_castps_si _mm512_castps_si512
#define _simd16_castsi_pd _mm512_castsi512_pd
#define _simd16_castpd_si _mm512_castpd_si512
#define _simd16_castpd_ps _mm512_castpd_ps
#define _simd16_castps_pd _mm512_castps_pd

#define _simd16_and_ps _mm512_and_ps
#define _simd16_andnot_ps _mm512_andnot_ps
#define _simd16_or_ps _mm512_or_ps
#define _simd16_xor_ps _mm512_xor_ps

// AVX-512 has no round_ps; roundscale with the same mode bits is equivalent.
template <int mode>
INLINE simd16scalar SIMDAPI _simd16_round_ps_temp(simd16scalar a)
{
    return _mm512_roundscale_ps(a, mode);
}

#define _simd16_round_ps(a, mode) _simd16_round_ps_temp<mode>(a)

#define _simd16_mul_epi32 _mm512_mul_epi32
#define _simd16_mullo_epi32 _mm512_mullo_epi32
#define _simd16_sub_epi32 _mm512_sub_epi32
#define _simd16_sub_epi64 _mm512_sub_epi64
#define _simd16_min_epi32 _mm512_min_epi32
#define _simd16_max_epi32 _mm512_max_epi32
#define _simd16_min_epu32 _mm512_min_epu32
#define _simd16_max_epu32 _mm512_max_epu32
#define _simd16_add_epi32 _mm512_add_epi32

#define _simd16_and_si _mm512_and_si512
#define _simd16_andnot_si _mm512_andnot_si512
#define _simd16_or_si _mm512_or_si512
#define _simd16_xor_si _mm512_xor_si512
971
972 INLINE simd16scalari SIMDAPI _simd16_cmpeq_epi32(simd16scalari a, simd16scalari b)
973 {
974 simd16mask k = _mm512_cmpeq_epi32_mask(a, b);
975
976 return _mm512_mask_blend_epi32(k, _mm512_setzero_epi32(), _mm512_set1_epi32(0xFFFFFFFF));
977 }
978
979 INLINE simd16scalari SIMDAPI _simd16_cmpgt_epi32(simd16scalari a, simd16scalari b)
980 {
981 simd16mask k = _mm512_cmpgt_epi32_mask(a, b);
982
983 return _mm512_mask_blend_epi32(k, _mm512_setzero_epi32(), _mm512_set1_epi32(0xFFFFFFFF));
984 }
985
986 INLINE simd16scalari SIMDAPI _simd16_cmplt_epi32(simd16scalari a, simd16scalari b)
987 {
988 simd16mask k = _mm512_cmplt_epi32_mask(a, b);
989
990 return _mm512_mask_blend_epi32(k, _mm512_setzero_epi32(), _mm512_set1_epi32(0xFFFFFFFF));
991 }
992
993 INLINE int SIMDAPI _simd16_testz_ps(simd16scalar a, simd16scalar b)
994 {
995 int lo = _simd_testz_ps(_simd16_extract_ps(a, 0), _simd16_extract_ps(b, 0));
996 int hi = _simd_testz_ps(_simd16_extract_ps(a, 1), _simd16_extract_ps(b, 1));
997
998 return lo & hi;
999 }
1000
// element interleave wrappers
#define _simd16_unpacklo_ps _mm512_unpacklo_ps
#define _simd16_unpackhi_ps _mm512_unpackhi_ps
#define _simd16_unpacklo_pd _mm512_unpacklo_pd
#define _simd16_unpackhi_pd _mm512_unpackhi_pd
#define _simd16_unpacklo_epi8 _mm512_unpacklo_epi8
#define _simd16_unpackhi_epi8 _mm512_unpackhi_epi8
#define _simd16_unpacklo_epi16 _mm512_unpacklo_epi16
#define _simd16_unpackhi_epi16 _mm512_unpackhi_epi16
#define _simd16_unpacklo_epi32 _mm512_unpacklo_epi32
#define _simd16_unpackhi_epi32 _mm512_unpackhi_epi32
#define _simd16_unpacklo_epi64 _mm512_unpacklo_epi64
#define _simd16_unpackhi_epi64 _mm512_unpackhi_epi64

// immediate shifts, fused multiply-add/sub, saturating byte arithmetic
#define _simd16_slli_epi32 _mm512_slli_epi32
#define _simd16_srli_epi32 _mm512_srli_epi32
#define _simd16_srai_epi32 _mm512_srai_epi32
#define _simd16_fmadd_ps _mm512_fmadd_ps
#define _simd16_fmsub_ps _mm512_fmsub_ps
#define _simd16_adds_epu8 _mm512_adds_epu8
#define _simd16_subs_epu8 _mm512_subs_epu8
#define _simd16_add_epi8 _mm512_add_epi8
#define _simd16_shuffle_epi8 _mm512_shuffle_epi8

// (the duplicate _simd16_fmadd_ps/_simd16_fmsub_ps definitions that
// previously followed here were redundant and have been removed)
1025
// Gather: the wrapper swaps (base, index) to keep AVX2-style argument
// order; the AVX-512 intrinsic takes the index vector first.
#define _simd16_i32gather_ps(m, index, scale) _mm512_i32gather_ps(index, m, scale)

//////////////////////////////////////////////////////////////////////////
/// Masked gather: converts an AVX2-style vector mask into an AVX-512
/// predicate (lane loads when the mask lane is nonzero), then gathers;
/// lanes with a clear predicate bit keep their value from 'a'.
/// NOTE(review): AVX2 _mm256_mask_i32gather_ps keys off the mask lane's
/// sign bit, while this keys off "any bit set" — equivalent only for
/// all-ones/all-zeros masks; confirm callers never pass partial lanes.
template <int scale>
INLINE simd16scalar SIMDAPI _simd16_mask_i32gather_ps_temp(simd16scalar a, const float *m, simd16scalari index, simd16scalari mask)
{
    __mmask16 k = _mm512_cmpneq_epi32_mask(mask, _mm512_setzero_si512());

    return _mm512_mask_i32gather_ps(a, k, index, m, scale);
}

#define _simd16_mask_i32gather_ps(a, m, index, mask, scale) _simd16_mask_i32gather_ps_temp<scale>(a, m, index, mask)

#define _simd16_abs_epi32 _mm512_abs_epi32
1039
1040 INLINE simd16scalari SIMDAPI _simd16_cmpeq_epi64(simd16scalari a, simd16scalari b)
1041 {
1042 __mmask8 k = _mm512_cmpeq_epi64_mask(a, b);
1043
1044 return _mm512_mask_blend_epi64(k, _mm512_setzero_si512(), _mm512_set1_epi32(0xFFFFFFFF));
1045 }
1046
1047 INLINE simd16scalari SIMDAPI _simd16_cmpgt_epi64(simd16scalari a, simd16scalari b)
1048 {
1049 __mmask8 k = _mm512_cmpgt_epi64_mask(a, b);
1050
1051 return _mm512_mask_blend_epi64(k, _mm512_setzero_si512(), _mm512_set1_epi32(0xFFFFFFFF));
1052 }
1053
1054 INLINE simd16scalari SIMDAPI _simd16_cmpeq_epi16(simd16scalari a, simd16scalari b)
1055 {
1056 __mmask32 k = _mm512_cmpeq_epi16_mask(a, b);
1057
1058 return _mm512_mask_blend_epi16(k, _mm512_setzero_si512(), _mm512_set1_epi32(0xFFFFFFFF));
1059 }
1060
1061 INLINE simd16scalari SIMDAPI _simd16_cmpgt_epi16(simd16scalari a, simd16scalari b)
1062 {
1063 __mmask32 k = _mm512_cmpgt_epi16_mask(a, b);
1064
1065 return _mm512_mask_blend_epi16(k, _mm512_setzero_si512(), _mm512_set1_epi32(0xFFFFFFFF));
1066 }
1067
1068 INLINE simd16scalari SIMDAPI _simd16_cmpeq_epi8(simd16scalari a, simd16scalari b)
1069 {
1070 __mmask64 k = _mm512_cmpeq_epi8_mask(a, b);
1071
1072 return _mm512_mask_blend_epi8(k, _mm512_setzero_si512(), _mm512_set1_epi32(0xFFFFFFFF));
1073 }
1074
1075 INLINE simd16scalari SIMDAPI _simd16_cmpgt_epi8(simd16scalari a, simd16scalari b)
1076 {
1077 __mmask64 k = _mm512_cmpgt_epi8_mask(a, b);
1078
1079 return _mm512_mask_blend_epi8(k, _mm512_setzero_si512(), _mm512_set1_epi32(0xFFFFFFFF));
1080 }
1081
// permutes: the intrinsic takes the index vector FIRST, so the wrapper
// swaps arguments to preserve AVX2-style (data, index) order
#define _simd16_permute_ps(a, i) _mm512_permutexvar_ps(i, a)
#define _simd16_permute_epi32(a, i) _mm512_permutexvar_epi32(i, a)

// BUG FIX: these two were transposed — _simd16_sllv_epi32 (shift LEFT
// logical, variable counts) previously expanded to _mm512_srlv_epi32
// (shift RIGHT) and vice versa.
#define _simd16_sllv_epi32 _mm512_sllv_epi32
#define _simd16_srlv_epi32 _mm512_srlv_epi32

// 128-bit-lane permutes and per-lane shuffles
#define _simd16_permute2f128_ps _mm512_shuffle_f32x4
#define _simd16_permute2f128_pd _mm512_shuffle_f64x2
#define _simd16_permute2f128_si _mm512_shuffle_i32x4
#define _simd16_shuffle_ps _mm512_shuffle_ps
#define _simd16_shuffle_pd _mm512_shuffle_pd

// widening zero-extension and narrowing saturating packs
#define _simd16_cvtepu8_epi16 _mm512_cvtepu8_epi16
#define _simd16_cvtepu8_epi32 _mm512_cvtepu8_epi32
#define _simd16_cvtepu16_epi32 _mm512_cvtepu16_epi32
#define _simd16_packus_epi16 _mm512_packus_epi16
#define _simd16_packs_epi16 _mm512_packs_epi16
#define _simd16_packus_epi32 _mm512_packus_epi32
#define _simd16_packs_epi32 _mm512_packs_epi32
1098
1099 template <int imm8>
1100 INLINE simd16scalari SIMDAPI _simd16_shuffle_epi32_temp(simd16scalari a, simd16scalari b)
1101 {
1102 return _simd16_castps_si(_simd16_shuffle_ps(_simd16_castsi_ps(a), _simd16_castsi_ps(b), imm8));
1103 }
1104
1105 #define _simd16_shuffle_epi32(a, b, imm8) _simd16_shuffle_epi32_temp<imm8>(a, b)
1106
1107 template <int imm8>
1108 INLINE simd16scalari SIMDAPI _simd16_shuffle_epi64_temp(simd16scalari a, simd16scalari b)
1109 {
1110 return _simd16_castpd_si(_simd16_shuffle_pd(_simd16_castsi_pd(a), _simd16_castsi_pd(b), imm8));
1111 }
1112
1113 #define _simd16_shuffle_epi64(a, b, imm8) _simd16_shuffle_epi64_temp<imm8>(a, b)
1114
// wrap a scalar integer bitmask as an AVX-512 predicate mask
INLINE simd16mask SIMDAPI _simd16_int2mask(int mask)
{
    return _mm512_int2mask(mask);
}
1119
// extract an AVX-512 predicate mask back into a scalar integer bitmask
INLINE int SIMDAPI _simd16_mask2int(simd16mask mask)
{
    return _mm512_mask2int(mask);
}
1124
// per-lane a < b, returned directly as a predicate mask
// (no expansion to a full vector result)
INLINE simd16mask SIMDAPI _simd16_cmplt_ps_mask(simd16scalar a, simd16scalar b)
{
    return _mm512_cmplt_ps_mask(a, b);
}
1129
1130 // convert bitmask to vector mask
1131 INLINE simd16scalar SIMDAPI vMask16(int32_t mask)
1132 {
1133 simd16scalari temp = _simd16_set1_epi32(mask);
1134
1135 simd16scalari bits = _simd16_set_epi32(0x8000, 0x4000, 0x2000, 0x1000, 0x0800, 0x0400, 0x0200, 0x0100, 0x0080, 0x0040, 0x0020, 0x0010, 0x0008, 0x0004, 0x0002, 0x0001);
1136
1137 simd16scalari result = _simd16_cmplt_epi32(_simd16_setzero_si(), _simd16_and_si(temp, bits));
1138
1139 return _simd16_castsi_ps(result);
1140 }
1141
1142 #endif//ENABLE_AVX512_EMULATION
1143
1144 #endif//ENABLE_AVX512_SIMD16
1145
#endif//__SWR_SIMD16INTRIN_H__