/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/

#ifndef __SWR_SIMD16INTRIN_H__
#define __SWR_SIMD16INTRIN_H__

#if ENABLE_AVX512_SIMD16

#if KNOB_SIMD16_WIDTH == 16

#if ENABLE_AVX512_EMULATION
struct simd16scalar
{
    __m256 lo;
    __m256 hi;
};
struct simd16scalard
{
    __m256d lo;
    __m256d hi;
};
struct simd16scalari
{
    __m256i lo;
    __m256i hi;
};
typedef uint16_t simd16mask;

#define _simd16_masklo(mask) ((mask) & 0xFF)
#define _simd16_maskhi(mask) (((mask) >> 8))
#define _simd16_setmask(hi, lo) (((hi) << 8) | (lo))
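
// Illustrative example: with mask = 0xA5F0, _simd16_masklo(mask) yields 0xF0
// (the low eight lanes) and _simd16_maskhi(mask) yields 0xA5 (the high eight);
// _simd16_setmask(0xA5, 0xF0) reassembles the original 16-bit value.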

#else
typedef __m512 simd16scalar;
typedef __m512d simd16scalard;
typedef __m512i simd16scalari;
typedef __mmask16 simd16mask;
#endif//ENABLE_AVX512_EMULATION
#else
#error Unsupported vector width
#endif//KNOB_SIMD16_WIDTH == 16

OSALIGN(union, KNOB_SIMD16_BYTES) simd16vector
{
    simd16scalar v[4];
    struct
    {
        simd16scalar x, y, z, w;
    };

    simd16scalar& operator[] (const int i) { return v[i]; }
    const simd16scalar& operator[] (const int i) const { return v[i]; }
};
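
// The union gives both named and indexed access to the same storage: for a
// simd16vector v, v.x aliases v[0] and v.w aliases v[3].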

#if ENABLE_AVX512_EMULATION

#define SIMD16_EMU_AVX512_0(type, func, intrin) \
INLINE type func()\
{\
    type result;\
\
    result.lo = intrin();\
    result.hi = intrin();\
\
    return result;\
}

#define SIMD16_EMU_AVX512_1(type, func, intrin) \
INLINE type func(type a)\
{\
    type result;\
\
    result.lo = intrin(a.lo);\
    result.hi = intrin(a.hi);\
\
    return result;\
}

#define SIMD16_EMU_AVX512_2(type, func, intrin) \
INLINE type func(type a, type b)\
{\
    type result;\
\
    result.lo = intrin(a.lo, b.lo);\
    result.hi = intrin(a.hi, b.hi);\
\
    return result;\
}

#define SIMD16_EMU_AVX512_3(type, func, intrin) \
INLINE type func(type a, type b, type c)\
{\
    type result;\
\
    result.lo = intrin(a.lo, b.lo, c.lo);\
    result.hi = intrin(a.hi, b.hi, c.hi);\
\
    return result;\
}
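
// For reference, an instantiation such as
//
//     SIMD16_EMU_AVX512_2(simd16scalar, _simd16_add_ps, _mm256_add_ps)
//
// expands to a 16-wide add built from two 8-wide AVX adds:
//
//     INLINE simd16scalar _simd16_add_ps(simd16scalar a, simd16scalar b)
//     {
//         simd16scalar result;
//         result.lo = _mm256_add_ps(a.lo, b.lo);
//         result.hi = _mm256_add_ps(a.hi, b.hi);
//         return result;
//     }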

SIMD16_EMU_AVX512_0(simd16scalar, _simd16_setzero_ps, _mm256_setzero_ps)
SIMD16_EMU_AVX512_0(simd16scalari, _simd16_setzero_si, _mm256_setzero_si256)

INLINE simd16scalar _simd16_set1_ps(float a)
{
    simd16scalar result;

    result.lo = _mm256_set1_ps(a);
    result.hi = _mm256_set1_ps(a);

    return result;
}

INLINE simd16scalari _simd16_set1_epi8(char a)
{
    simd16scalari result;

    result.lo = _mm256_set1_epi8(a);
    result.hi = _mm256_set1_epi8(a);

    return result;
}

INLINE simd16scalari _simd16_set1_epi32(int a)
{
    simd16scalari result;

    result.lo = _mm256_set1_epi32(a);
    result.hi = _mm256_set1_epi32(a);

    return result;
}

INLINE simd16scalar _simd16_set_ps(float e15, float e14, float e13, float e12, float e11, float e10, float e9, float e8, float e7, float e6, float e5, float e4, float e3, float e2, float e1, float e0)
{
    simd16scalar result;

    result.lo = _mm256_set_ps(e7, e6, e5, e4, e3, e2, e1, e0);
    result.hi = _mm256_set_ps(e15, e14, e13, e12, e11, e10, e9, e8);

    return result;
}

INLINE simd16scalari _simd16_set_epi32(int e15, int e14, int e13, int e12, int e11, int e10, int e9, int e8, int e7, int e6, int e5, int e4, int e3, int e2, int e1, int e0)
{
    simd16scalari result;

    result.lo = _mm256_set_epi32(e7, e6, e5, e4, e3, e2, e1, e0);
    result.hi = _mm256_set_epi32(e15, e14, e13, e12, e11, e10, e9, e8);

    return result;
}

INLINE simd16scalar _simd16_set_ps(float e7, float e6, float e5, float e4, float e3, float e2, float e1, float e0)
{
    simd16scalar result;

    result.lo = _mm256_set_ps(e7, e6, e5, e4, e3, e2, e1, e0);
    result.hi = _mm256_set_ps(e7, e6, e5, e4, e3, e2, e1, e0);

    return result;
}

INLINE simd16scalari _simd16_set_epi32(int e7, int e6, int e5, int e4, int e3, int e2, int e1, int e0)
{
    simd16scalari result;

    result.lo = _mm256_set_epi32(e7, e6, e5, e4, e3, e2, e1, e0);
    result.hi = _mm256_set_epi32(e7, e6, e5, e4, e3, e2, e1, e0);

    return result;
}

INLINE simd16scalar _simd16_load_ps(float const *m)
{
    simd16scalar result;

    float const *n = reinterpret_cast<float const *>(reinterpret_cast<uint8_t const *>(m) + sizeof(result.lo));

    result.lo = _mm256_load_ps(m);
    result.hi = _mm256_load_ps(n);

    return result;
}

INLINE simd16scalar _simd16_loadu_ps(float const *m)
{
    simd16scalar result;

    float const *n = reinterpret_cast<float const *>(reinterpret_cast<uint8_t const *>(m) + sizeof(result.lo));

    result.lo = _mm256_loadu_ps(m);
    result.hi = _mm256_loadu_ps(n);

    return result;
}

INLINE simd16scalar _simd16_load1_ps(float const *m)
{
    simd16scalar result;

    result.lo = _mm256_broadcast_ss(m);
    result.hi = _mm256_broadcast_ss(m);

    return result;
}

INLINE simd16scalari _simd16_load_si(simd16scalari const *m)
{
    simd16scalari result;

    result.lo = _mm256_load_si256(&m[0].lo);
    result.hi = _mm256_load_si256(&m[0].hi);

    return result;
}

INLINE simd16scalari _simd16_loadu_si(simd16scalari const *m)
{
    simd16scalari result;

    result.lo = _mm256_loadu_si256(&m[0].lo);
    result.hi = _mm256_loadu_si256(&m[0].hi);

    return result;
}

INLINE simd16scalar _simd16_broadcast_ss(float const *m)
{
    simd16scalar result;

    result.lo = _mm256_broadcast_ss(m);
    result.hi = _mm256_broadcast_ss(m);

    return result;
}

INLINE simd16scalar _simd16_broadcast_ps(__m128 const *m)
{
    simd16scalar result;

    result.lo = _mm256_broadcast_ps(m);
    result.hi = _mm256_broadcast_ps(m);

    return result;
}

INLINE void _simd16_store_ps(float *m, simd16scalar a)
{
    float *n = reinterpret_cast<float *>(reinterpret_cast<uint8_t *>(m) + sizeof(a.lo));

    _mm256_store_ps(m, a.lo);
    _mm256_store_ps(n, a.hi);
}

INLINE void _simd16_maskstore_ps(float *m, simd16scalari mask, simd16scalar a)
{
    float *n = reinterpret_cast<float *>(reinterpret_cast<uint8_t *>(m) + sizeof(a.lo));

    _mm256_maskstore_ps(m, mask.lo, a.lo);
    _mm256_maskstore_ps(n, mask.hi, a.hi);
}

INLINE void _simd16_store_si(simd16scalari *m, simd16scalari a)
{
    _mm256_store_si256(&m[0].lo, a.lo);
    _mm256_store_si256(&m[0].hi, a.hi);
}

INLINE simdscalar _simd16_extract_ps(simd16scalar a, int imm8)
{
    switch (imm8)
    {
    case 0:
        return a.lo;
    case 1:
        return a.hi;
    }
    return _simd_set1_ps(0.0f);
}

INLINE simdscalari _simd16_extract_si(simd16scalari a, int imm8)
{
    switch (imm8)
    {
    case 0:
        return a.lo;
    case 1:
        return a.hi;
    }
    return _simd_set1_epi32(0);
}

INLINE simd16scalar _simd16_insert_ps(simd16scalar a, simdscalar b, int imm8)
{
    switch (imm8)
    {
    case 0:
        a.lo = b;
        break;
    case 1:
        a.hi = b;
        break;
    }
    return a;
}

INLINE simd16scalari _simd16_insert_si(simd16scalari a, simdscalari b, int imm8)
{
    switch (imm8)
    {
    case 0:
        a.lo = b;
        break;
    case 1:
        a.hi = b;
        break;
    }
    return a;
}

template <simd16mask mask>
INLINE simd16scalar _simd16_blend_ps_temp(simd16scalar a, simd16scalar b)
{
    simd16scalar result;

    result.lo = _mm256_blend_ps(a.lo, b.lo, _simd16_masklo(mask));
    result.hi = _mm256_blend_ps(a.hi, b.hi, _simd16_maskhi(mask));

    return result;
}

#define _simd16_blend_ps(a, b, mask) _simd16_blend_ps_temp<mask>(a, b)
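
// _mm256_blend_ps requires its selector to be a compile-time immediate, so the
// macro above routes the mask through a template parameter instead of a
// runtime function argument.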

SIMD16_EMU_AVX512_3(simd16scalar, _simd16_blendv_ps, _mm256_blendv_ps)

INLINE simd16scalari _simd16_blendv_epi32(simd16scalari a, simd16scalari b, const simd16scalar mask)
{
    simd16scalari result;

    result.lo = _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(a.lo), _mm256_castsi256_ps(b.lo), mask.lo));
    result.hi = _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(a.hi), _mm256_castsi256_ps(b.hi), mask.hi));

    return result;
}

INLINE simd16scalari _simd16_blendv_epi32(simd16scalari a, simd16scalari b, const simd16scalari mask)
{
    simd16scalari result;

    result.lo = _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(a.lo), _mm256_castsi256_ps(b.lo), _mm256_castsi256_ps(mask.lo)));
    result.hi = _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(a.hi), _mm256_castsi256_ps(b.hi), _mm256_castsi256_ps(mask.hi)));

    return result;
}

SIMD16_EMU_AVX512_2(simd16scalar, _simd16_mul_ps, _mm256_mul_ps)
SIMD16_EMU_AVX512_2(simd16scalar, _simd16_add_ps, _mm256_add_ps)
SIMD16_EMU_AVX512_2(simd16scalar, _simd16_sub_ps, _mm256_sub_ps)
SIMD16_EMU_AVX512_1(simd16scalar, _simd16_rsqrt_ps, _mm256_rsqrt_ps)
SIMD16_EMU_AVX512_2(simd16scalar, _simd16_min_ps, _mm256_min_ps)
SIMD16_EMU_AVX512_2(simd16scalar, _simd16_max_ps, _mm256_max_ps)

INLINE simd16mask _simd16_movemask_ps(simd16scalar a)
{
    simd16mask mask;

    reinterpret_cast<uint8_t *>(&mask)[0] = _mm256_movemask_ps(a.lo);
    reinterpret_cast<uint8_t *>(&mask)[1] = _mm256_movemask_ps(a.hi);

    return mask;
}

INLINE simd16mask _simd16_movemask_pd(simd16scalard a)
{
    simd16mask mask;

    reinterpret_cast<uint8_t *>(&mask)[0] = _mm256_movemask_pd(a.lo);
    reinterpret_cast<uint8_t *>(&mask)[1] = _mm256_movemask_pd(a.hi);

    return mask;
}

INLINE simd16mask _simd16_movemask_epi8(simd16scalari a)
{
    simd16mask mask;

    reinterpret_cast<uint8_t *>(&mask)[0] = _mm256_movemask_epi8(a.lo);
    reinterpret_cast<uint8_t *>(&mask)[1] = _mm256_movemask_epi8(a.hi);

    return mask;
}

INLINE simd16scalari _simd16_cvtps_epi32(simd16scalar a)
{
    simd16scalari result;

    result.lo = _mm256_cvtps_epi32(a.lo);
    result.hi = _mm256_cvtps_epi32(a.hi);

    return result;
}

INLINE simd16scalari _simd16_cvttps_epi32(simd16scalar a)
{
    simd16scalari result;

    result.lo = _mm256_cvttps_epi32(a.lo);
    result.hi = _mm256_cvttps_epi32(a.hi);

    return result;
}

INLINE simd16scalar _simd16_cvtepi32_ps(simd16scalari a)
{
    simd16scalar result;

    result.lo = _mm256_cvtepi32_ps(a.lo);
    result.hi = _mm256_cvtepi32_ps(a.hi);

    return result;
}

template <int comp>
INLINE simd16scalar _simd16_cmp_ps(simd16scalar a, simd16scalar b)
{
    simd16scalar result;

    result.lo = _mm256_cmp_ps(a.lo, b.lo, comp);
    result.hi = _mm256_cmp_ps(a.hi, b.hi, comp);

    return result;
}

#define _simd16_cmplt_ps(a, b) _simd16_cmp_ps<_CMP_LT_OQ>(a, b)
#define _simd16_cmpgt_ps(a, b) _simd16_cmp_ps<_CMP_GT_OQ>(a, b)
#define _simd16_cmpneq_ps(a, b) _simd16_cmp_ps<_CMP_NEQ_OQ>(a, b)
#define _simd16_cmpeq_ps(a, b) _simd16_cmp_ps<_CMP_EQ_OQ>(a, b)
#define _simd16_cmpge_ps(a, b) _simd16_cmp_ps<_CMP_GE_OQ>(a, b)
#define _simd16_cmple_ps(a, b) _simd16_cmp_ps<_CMP_LE_OQ>(a, b)

SIMD16_EMU_AVX512_2(simd16scalar, _simd16_and_ps, _mm256_and_ps)
SIMD16_EMU_AVX512_2(simd16scalar, _simd16_or_ps, _mm256_or_ps)
SIMD16_EMU_AVX512_1(simd16scalar, _simd16_rcp_ps, _mm256_rcp_ps)
SIMD16_EMU_AVX512_2(simd16scalar, _simd16_div_ps, _mm256_div_ps)

INLINE simd16scalar _simd16_castsi_ps(simd16scalari a)
{
    return *reinterpret_cast<simd16scalar *>(&a);
}

INLINE simd16scalari _simd16_castps_si(simd16scalar a)
{
    return *reinterpret_cast<simd16scalari *>(&a);
}

INLINE simd16scalard _simd16_castsi_pd(simd16scalari a)
{
    return *reinterpret_cast<simd16scalard *>(&a);
}

INLINE simd16scalari _simd16_castpd_si(simd16scalard a)
{
    return *reinterpret_cast<simd16scalari *>(&a);
}

INLINE simd16scalar _simd16_castpd_ps(simd16scalard a)
{
    return *reinterpret_cast<simd16scalar *>(&a);
}

INLINE simd16scalard _simd16_castps_pd(simd16scalar a)
{
    return *reinterpret_cast<simd16scalard *>(&a);
}

SIMD16_EMU_AVX512_2(simd16scalar, _simd16_andnot_ps, _mm256_andnot_ps)

template <int mode>
INLINE simd16scalar _simd16_round_ps_temp(simd16scalar a)
{
    simd16scalar result;

    result.lo = _mm256_round_ps(a.lo, mode);
    result.hi = _mm256_round_ps(a.hi, mode);

    return result;
}

#define _simd16_round_ps(a, mode) _simd16_round_ps_temp<mode>(a)
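
// Illustrative usage, for a simd16scalar x: round every lane to the nearest
// integer using the standard <immintrin.h> rounding-control flags:
//
//     simd16scalar r = _simd16_round_ps(x, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);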

SIMD16_EMU_AVX512_2(simd16scalari, _simd16_mul_epi32, _mm256_mul_epi32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_mullo_epi32, _mm256_mullo_epi32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_sub_epi32, _mm256_sub_epi32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_sub_epi64, _mm256_sub_epi64)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_min_epi32, _mm256_min_epi32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_max_epi32, _mm256_max_epi32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_min_epu32, _mm256_min_epu32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_max_epu32, _mm256_max_epu32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_add_epi32, _mm256_add_epi32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_and_si, _simd_and_si)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_andnot_si, _simd_andnot_si)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_or_si, _simd_or_si)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_xor_si, _simd_xor_si)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpeq_epi32, _mm256_cmpeq_epi32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpgt_epi32, _mm256_cmpgt_epi32)

INLINE int _simd16_testz_ps(simd16scalar a, simd16scalar b)
{
    int lo = _mm256_testz_ps(a.lo, b.lo);
    int hi = _mm256_testz_ps(a.hi, b.hi);

    return lo & hi;
}

#define _simd16_cmplt_epi32(a, b) _simd16_cmpgt_epi32(b, a)

SIMD16_EMU_AVX512_2(simd16scalari, _simd16_unpacklo_epi32, _simd_unpacklo_epi32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_unpackhi_epi32, _simd_unpackhi_epi32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_unpacklo_epi64, _simd_unpacklo_epi64)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_unpackhi_epi64, _simd_unpackhi_epi64)

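// The shift-by-immediate wrappers below use the same template-parameter trick
// as _simd16_blend_ps so that imm8 stays a compile-time constant.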
template <int imm8>
INLINE simd16scalari _simd16_slli_epi32_temp(simd16scalari a)
{
    simd16scalari result;

    result.lo = _simd_slli_epi32(a.lo, imm8);
    result.hi = _simd_slli_epi32(a.hi, imm8);

    return result;
}

#define _simd16_slli_epi32(a, imm8) _simd16_slli_epi32_temp<imm8>(a)

template <int imm8>
INLINE simd16scalari _simd16_srai_epi32_temp(simd16scalari a)
{
    simd16scalari result;

    result.lo = _simd_srai_epi32(a.lo, imm8);
    result.hi = _simd_srai_epi32(a.hi, imm8);

    return result;
}

#define _simd16_srai_epi32(a, imm8) _simd16_srai_epi32_temp<imm8>(a)

template <int imm8>
INLINE simd16scalari _simd16_srli_epi32_temp(simd16scalari a)
{
    simd16scalari result;

    result.lo = _simd_srli_epi32(a.lo, imm8);
    result.hi = _simd_srli_epi32(a.hi, imm8);

    return result;
}

#define _simd16_srli_epi32(a, imm8) _simd16_srli_epi32_temp<imm8>(a)

SIMD16_EMU_AVX512_3(simd16scalar, _simd16_fmadd_ps, _mm256_fmadd_ps)
SIMD16_EMU_AVX512_3(simd16scalar, _simd16_fmsub_ps, _mm256_fmsub_ps)

SIMD16_EMU_AVX512_2(simd16scalari, _simd16_shuffle_epi8, _mm256_shuffle_epi8)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_adds_epu8, _mm256_adds_epu8)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_subs_epu8, _mm256_subs_epu8)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_add_epi8, _mm256_add_epi8)

template <int imm8>
INLINE simd16scalar _simd16_i32gather_ps_temp(float const *m, simd16scalari a)
{
    simd16scalar result;

    result.lo = _mm256_i32gather_ps(m, a.lo, imm8);
    result.hi = _mm256_i32gather_ps(m, a.hi, imm8);

    return result;
}

#define _simd16_i32gather_ps(m, a, imm8) _simd16_i32gather_ps_temp<imm8>(m, a)
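
// Illustrative semantics: lane i of the gather result is loaded from the
// address (const char *)m + a[i] * imm8, i.e. sixteen independent 32-bit
// loads with a byte scale of imm8.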

SIMD16_EMU_AVX512_1(simd16scalari, _simd16_abs_epi32, _mm256_abs_epi32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpeq_epi64, _mm256_cmpeq_epi64)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpgt_epi64, _mm256_cmpgt_epi64)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpeq_epi16, _mm256_cmpeq_epi16)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpgt_epi16, _mm256_cmpgt_epi16)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpeq_epi8, _mm256_cmpeq_epi8)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpgt_epi8, _mm256_cmpgt_epi8)

INLINE simd16scalar _simd16_permute_ps(simd16scalar a, simd16scalari i)
{
    simd16scalar result;

    const simdscalari mask = _simd_set1_epi32(7);

    simdscalar lolo = _simd_permute_ps(a.lo, _simd_and_si(i.lo, mask));
    simdscalar lohi = _simd_permute_ps(a.hi, _simd_and_si(i.lo, mask));

    simdscalar hilo = _simd_permute_ps(a.lo, _simd_and_si(i.hi, mask));
    simdscalar hihi = _simd_permute_ps(a.hi, _simd_and_si(i.hi, mask));

    result.lo = _simd_blendv_ps(lolo, lohi, _simd_castsi_ps(_simd_cmpgt_epi32(i.lo, mask)));
    result.hi = _simd_blendv_ps(hilo, hihi, _simd_castsi_ps(_simd_cmpgt_epi32(i.hi, mask)));

    return result;
}

INLINE simd16scalari _simd16_permute_epi32(simd16scalari a, simd16scalari i)
{
    return _simd16_castps_si(_simd16_permute_ps(_simd16_castsi_ps(a), i));
}

SIMD16_EMU_AVX512_2(simd16scalari, _simd16_srlv_epi32, _mm256_srlv_epi32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_sllv_epi32, _mm256_sllv_epi32)

template <int imm8>
INLINE simd16scalar _simd16_permute2f128_ps_temp(simd16scalar a, simd16scalar b)
{
    simd16scalar result;

    result.lo = _simd_permute2f128_ps(a.lo, a.hi, ((imm8 & 0x03) << 0) | ((imm8 & 0x0C) << 2));
    result.hi = _simd_permute2f128_ps(b.lo, b.hi, ((imm8 & 0x30) >> 4) | ((imm8 & 0xC0) >> 2));

    return result;
}

#define _simd16_permute2f128_ps(a, b, imm8) _simd16_permute2f128_ps_temp<imm8>(a, b)
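
// Each 2-bit field of imm8 selects one of four 128-bit lanes, matching the
// _mm512_shuffle_f32x4 convention: the low two fields pick result.lo's lanes
// from a, the high two pick result.hi's lanes from b. The bit twiddling above
// repacks those fields into the selector layout of the 256-bit permute.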

template <int imm8>
INLINE simd16scalard _simd16_permute2f128_pd_temp(simd16scalard a, simd16scalard b)
{
    simd16scalard result;

    result.lo = _simd_permute2f128_pd(a.lo, a.hi, ((imm8 & 0x03) << 0) | ((imm8 & 0x0C) << 2));
    result.hi = _simd_permute2f128_pd(b.lo, b.hi, ((imm8 & 0x30) >> 4) | ((imm8 & 0xC0) >> 2));

    return result;
}

#define _simd16_permute2f128_pd(a, b, imm8) _simd16_permute2f128_pd_temp<imm8>(a, b)

template <int imm8>
INLINE simd16scalari _simd16_permute2f128_si_temp(simd16scalari a, simd16scalari b)
{
    simd16scalari result;

    result.lo = _simd_permute2f128_si(a.lo, a.hi, ((imm8 & 0x03) << 0) | ((imm8 & 0x0C) << 2));
    result.hi = _simd_permute2f128_si(b.lo, b.hi, ((imm8 & 0x30) >> 4) | ((imm8 & 0xC0) >> 2));

    return result;
}

#define _simd16_permute2f128_si(a, b, imm8) _simd16_permute2f128_si_temp<imm8>(a, b)

template <int imm8>
INLINE simd16scalar _simd16_shuffle_ps_temp(simd16scalar a, simd16scalar b)
{
    simd16scalar result;

    result.lo = _simd_shuffle_ps(a.lo, b.lo, imm8);
    result.hi = _simd_shuffle_ps(a.hi, b.hi, imm8);

    return result;
}

#define _simd16_shuffle_ps(a, b, imm8) _simd16_shuffle_ps_temp<imm8>(a, b)
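
// Illustrative usage, for simd16scalar values a and b:
//
//     simd16scalar r = _simd16_shuffle_ps(a, b, _MM_SHUFFLE(3, 2, 1, 0));
//
// As with _mm256_shuffle_ps, the selector is applied independently within
// each 128-bit lane.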

template <int imm8>
INLINE simd16scalard _simd16_shuffle_pd_temp(simd16scalard a, simd16scalard b)
{
    simd16scalard result;

    result.lo = _simd_shuffle_pd(a.lo, b.lo, (imm8 & 15));
    result.hi = _simd_shuffle_pd(a.hi, b.hi, (imm8 >> 4));

    return result;
}

#define _simd16_shuffle_pd(a, b, imm8) _simd16_shuffle_pd_temp<imm8>(a, b)

template <int imm8>
INLINE simd16scalari _simd16_shuffle_epi32_temp(simd16scalari a, simd16scalari b)
{
    return _simd16_castps_si(_simd16_shuffle_ps(_simd16_castsi_ps(a), _simd16_castsi_ps(b), imm8));
}

#define _simd16_shuffle_epi32(a, b, imm8) _simd16_shuffle_epi32_temp<imm8>(a, b)

template <int imm8>
INLINE simd16scalari _simd16_shuffle_epi64_temp(simd16scalari a, simd16scalari b)
{
    return _simd16_castpd_si(_simd16_shuffle_pd(_simd16_castsi_pd(a), _simd16_castsi_pd(b), imm8));
}

#define _simd16_shuffle_epi64(a, b, imm8) _simd16_shuffle_epi64_temp<imm8>(a, b)

INLINE simd16mask _simd16_int2mask(int mask)
{
    return mask;
}

INLINE int _simd16_mask2int(simd16mask mask)
{
    return mask;
}

INLINE simd16mask _simd16_cmplt_ps_mask(simd16scalar a, simd16scalar b)
{
    return _simd16_movemask_ps(_simd16_cmplt_ps(a, b));
}

// convert bitmask to vector mask
INLINE simd16scalar vMask16(int32_t mask)
{
    simd16scalari temp = _simd16_set1_epi32(mask);

    simd16scalari bits = _simd16_set_epi32(0x8000, 0x4000, 0x2000, 0x1000, 0x0800, 0x0400, 0x0200, 0x0100, 0x0080, 0x0040, 0x0020, 0x0010, 0x0008, 0x0004, 0x0002, 0x0001);

    simd16scalari result = _simd16_cmplt_epi32(_simd16_setzero_si(), _simd16_and_si(temp, bits));

    return _simd16_castsi_ps(result);
}
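
// Illustrative example: vMask16(0x5) yields a vector mask with lanes 0 and 2
// set to all ones and every other lane zero.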

#else

INLINE simd16mask _simd16_scalari2mask(simd16scalari mask)
{
    return _mm512_cmpneq_epu32_mask(mask, _mm512_setzero_epi32());
}

#if 0
INLINE simd16mask _simd16_scalard2mask(simd16scalard mask)
{
    return _mm512_cmpneq_epu64_mask(mask, _mm512_setzero_epi64());
}
#endif

#define _simd16_setzero_ps _mm512_setzero_ps
#define _simd16_setzero_si _mm512_setzero_si512
#define _simd16_set1_ps _mm512_set1_ps
#define _simd16_set1_epi8 _mm512_set1_epi8
#define _simd16_set1_epi32 _mm512_set1_epi32

INLINE simd16scalar _simd16_set_ps(float e15, float e14, float e13, float e12, float e11, float e10, float e9, float e8, float e7, float e6, float e5, float e4, float e3, float e2, float e1, float e0)
{
    return _mm512_set_ps(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0);
}

INLINE simd16scalari _simd16_set_epi32(int e15, int e14, int e13, int e12, int e11, int e10, int e9, int e8, int e7, int e6, int e5, int e4, int e3, int e2, int e1, int e0)
{
    return _mm512_set_epi32(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0);
}

INLINE simd16scalar _simd16_set_ps(float e7, float e6, float e5, float e4, float e3, float e2, float e1, float e0)
{
    return _mm512_set_ps(e7, e6, e5, e4, e3, e2, e1, e0, e7, e6, e5, e4, e3, e2, e1, e0);
}

INLINE simd16scalari _simd16_set_epi32(int e7, int e6, int e5, int e4, int e3, int e2, int e1, int e0)
{
    return _mm512_set_epi32(e7, e6, e5, e4, e3, e2, e1, e0, e7, e6, e5, e4, e3, e2, e1, e0);
}

#define _simd16_load_ps _mm512_load_ps
#define _simd16_loadu_ps _mm512_loadu_ps
#if 1
#define _simd16_load1_ps _simd16_broadcast_ss
#endif
#define _simd16_load_si _mm512_load_si512
#define _simd16_loadu_si _mm512_loadu_si512
#define _simd16_broadcast_ss(m) _mm512_extload_ps(m, _MM_UPCONV_PS_NONE, _MM_BROADCAST_1X16, 0)
#define _simd16_broadcast_ps(m) _mm512_extload_ps(m, _MM_UPCONV_PS_NONE, _MM_BROADCAST_4X16, 0)
#define _simd16_store_ps _mm512_store_ps
#define _simd16_store_si _mm512_store_si512
#define _simd16_extract_ps _mm512_extractf32x8_ps
#define _simd16_extract_si _mm512_extracti32x8_epi32
#define _simd16_insert_ps _mm512_insertf32x8
#define _simd16_insert_si _mm512_inserti32x8

INLINE void _simd16_maskstore_ps(float *m, simd16scalari mask, simd16scalar a)
{
    simd16mask k = _simd16_scalari2mask(mask);

    _mm512_mask_store_ps(m, k, a);
}

#define _simd16_blend_ps(a, b, mask) _mm512_mask_blend_ps(mask, a, b)

INLINE simd16scalar _simd16_blendv_ps(simd16scalar a, simd16scalar b, const simd16scalar mask)
{
    simd16mask k = _simd16_scalari2mask(_mm512_castps_si512(mask));

    return _mm512_mask_blend_ps(k, a, b);
}

INLINE simd16scalari _simd16_blendv_epi32(simd16scalari a, simd16scalari b, const simd16scalar mask)
{
    simd16mask k = _simd16_scalari2mask(_mm512_castps_si512(mask));

    return _mm512_mask_blend_epi32(k, a, b);
}

INLINE simd16scalari _simd16_blendv_epi32(simd16scalari a, simd16scalari b, const simd16scalari mask)
{
    simd16mask k = _simd16_scalari2mask(mask);

    return _mm512_mask_blend_epi32(k, a, b);
}

#define _simd16_mul_ps _mm512_mul_ps
#define _simd16_add_ps _mm512_add_ps
#define _simd16_sub_ps _mm512_sub_ps
#define _simd16_rsqrt_ps _mm512_rsqrt14_ps
#define _simd16_min_ps _mm512_min_ps
#define _simd16_max_ps _mm512_max_ps

INLINE simd16mask _simd16_movemask_ps(simd16scalar a)
{
    return _simd16_scalari2mask(_mm512_castps_si512(a));
}

#if 0
INLINE simd16mask _simd16_movemask_pd(simd16scalard a)
{
    return _simd16_scalard2mask(_mm512_castpd_si512(a));
}
#endif

#if 0
INLINE int _simd16_movemask_epi8(simd16scalari a)
{
    return _simd16_scalari2mask(a);
}
#endif

#define _simd16_cvtps_epi32 _mm512_cvtps_epi32
#define _simd16_cvttps_epi32 _mm512_cvttps_epi32
#define _simd16_cvtepi32_ps _mm512_cvtepi32_ps

template <int comp>
INLINE simd16scalar _simd16_cmp_ps_temp(simd16scalar a, simd16scalar b)
{
    simd16mask k = _mm512_cmp_ps_mask(a, b, comp);

    return _mm512_castsi512_ps(_mm512_mask_blend_epi32(k, _mm512_setzero_epi32(), _mm512_set1_epi32(0xFFFFFFFF)));
}

#define _simd16_cmp_ps(a, b, comp) _simd16_cmp_ps_temp<comp>(a, b)

#define _simd16_cmplt_ps(a, b) _simd16_cmp_ps(a, b, _CMP_LT_OQ)
#define _simd16_cmpgt_ps(a, b) _simd16_cmp_ps(a, b, _CMP_GT_OQ)
#define _simd16_cmpneq_ps(a, b) _simd16_cmp_ps(a, b, _CMP_NEQ_OQ)
#define _simd16_cmpeq_ps(a, b) _simd16_cmp_ps(a, b, _CMP_EQ_OQ)
#define _simd16_cmpge_ps(a, b) _simd16_cmp_ps(a, b, _CMP_GE_OQ)
#define _simd16_cmple_ps(a, b) _simd16_cmp_ps(a, b, _CMP_LE_OQ)

#define _simd16_castsi_ps _mm512_castsi512_ps
#define _simd16_castps_si _mm512_castps_si512
#define _simd16_castsi_pd _mm512_castsi512_pd
#define _simd16_castpd_si _mm512_castpd_si512
#define _simd16_castpd_ps _mm512_castpd_ps
#define _simd16_castps_pd _mm512_castps_pd

#define _simd16_andnot_ps _mm512_andnot_ps

template <int mode>
INLINE simd16scalar _simd16_round_ps_temp(simd16scalar a)
{
    return _mm512_roundscale_ps(a, mode);
}

#define _simd16_round_ps(a, mode) _simd16_round_ps_temp<mode>(a)

#define _simd16_mul_epi32 _mm512_mul_epi32
#define _simd16_mullo_epi32 _mm512_mullo_epi32
#define _simd16_sub_epi32 _mm512_sub_epi32
#define _simd16_sub_epi64 _mm512_sub_epi64
#define _simd16_min_epi32 _mm512_min_epi32
#define _simd16_max_epi32 _mm512_max_epi32
#define _simd16_min_epu32 _mm512_min_epu32
#define _simd16_max_epu32 _mm512_max_epu32
#define _simd16_add_epi32 _mm512_add_epi32
#define _simd16_and_si _mm512_and_si512
#define _simd16_andnot_si _mm512_andnot_si512
#define _simd16_or_si _mm512_or_si512
#define _simd16_xor_si _mm512_xor_si512

INLINE simd16scalari _simd16_cmpeq_epi32(simd16scalari a, simd16scalari b)
{
    simd16mask k = _mm512_cmpeq_epi32_mask(a, b);

    return _mm512_mask_blend_epi32(k, _mm512_setzero_epi32(), _mm512_set1_epi32(0xFFFFFFFF));
}

INLINE simd16scalari _simd16_cmpgt_epi32(simd16scalari a, simd16scalari b)
{
    simd16mask k = _mm512_cmpgt_epi32_mask(a, b);

    return _mm512_mask_blend_epi32(k, _mm512_setzero_epi32(), _mm512_set1_epi32(0xFFFFFFFF));
}

INLINE simd16scalari _simd16_cmplt_epi32(simd16scalari a, simd16scalari b)
{
    simd16mask k = _mm512_cmplt_epi32_mask(a, b);

    return _mm512_mask_blend_epi32(k, _mm512_setzero_epi32(), _mm512_set1_epi32(0xFFFFFFFF));
}

#if 0
INLINE int _simd16_testz_ps(simd16scalar a, simd16scalar b)
{
    int lo = _mm256_testz_ps(a.lo, b.lo);
    int hi = _mm256_testz_ps(a.hi, b.hi);

    return lo & hi;
}

#endif

#define _simd16_unpacklo_epi32 _mm512_unpacklo_epi32
#define _simd16_unpackhi_epi32 _mm512_unpackhi_epi32
#define _simd16_unpacklo_epi64 _mm512_unpacklo_epi64
#define _simd16_unpackhi_epi64 _mm512_unpackhi_epi64
#define _simd16_slli_epi32 _mm512_slli_epi32
#define _simd16_srli_epi32 _mm512_srli_epi32
#define _simd16_srai_epi32 _mm512_srai_epi32
#define _simd16_fmadd_ps _mm512_fmadd_ps
#define _simd16_fmsub_ps _mm512_fmsub_ps
#define _simd16_adds_epu8 _mm512_adds_epu8
#define _simd16_subs_epu8 _mm512_subs_epu8
#define _simd16_add_epi8 _mm512_add_epi8
#define _simd16_shuffle_epi8 _mm512_shuffle_epi8

#define _simd16_i32gather_ps(m, index, scale) _mm512_i32gather_ps(index, m, scale)

#define _simd16_abs_epi32 _mm512_abs_epi32

INLINE simd16scalari _simd16_cmpeq_epi64(simd16scalari a, simd16scalari b)
{
    __mmask8 k = _mm512_cmpeq_epi64_mask(a, b);

    return _mm512_mask_blend_epi64(k, _mm512_setzero_si512(), _mm512_set1_epi32(0xFFFFFFFF));
}

INLINE simd16scalari _simd16_cmpgt_epi64(simd16scalari a, simd16scalari b)
{
    __mmask8 k = _mm512_cmpgt_epi64_mask(a, b);

    return _mm512_mask_blend_epi64(k, _mm512_setzero_si512(), _mm512_set1_epi32(0xFFFFFFFF));
}

INLINE simd16scalari _simd16_cmpeq_epi16(simd16scalari a, simd16scalari b)
{
    __mmask32 k = _mm512_cmpeq_epi16_mask(a, b);

    return _mm512_mask_blend_epi16(k, _mm512_setzero_si512(), _mm512_set1_epi32(0xFFFFFFFF));
}

INLINE simd16scalari _simd16_cmpgt_epi16(simd16scalari a, simd16scalari b)
{
    __mmask32 k = _mm512_cmpgt_epi16_mask(a, b);

    return _mm512_mask_blend_epi16(k, _mm512_setzero_si512(), _mm512_set1_epi32(0xFFFFFFFF));
}

INLINE simd16scalari _simd16_cmpeq_epi8(simd16scalari a, simd16scalari b)
{
    __mmask64 k = _mm512_cmpeq_epi8_mask(a, b);

    return _mm512_mask_blend_epi8(k, _mm512_setzero_si512(), _mm512_set1_epi32(0xFFFFFFFF));
}

INLINE simd16scalari _simd16_cmpgt_epi8(simd16scalari a, simd16scalari b)
{
    __mmask64 k = _mm512_cmpgt_epi8_mask(a, b);

    return _mm512_mask_blend_epi8(k, _mm512_setzero_si512(), _mm512_set1_epi32(0xFFFFFFFF));
}

#define _simd16_permute_ps(a, i) _mm512_permutexvar_ps(i, a)
#define _simd16_permute_epi32(a, i) _mm512_permutexvar_epi32(i, a)
#define _simd16_sllv_epi32 _mm512_sllv_epi32
#define _simd16_srlv_epi32 _mm512_srlv_epi32
#define _simd16_permute2f128_ps _mm512_shuffle_f32x4
#define _simd16_permute2f128_pd _mm512_shuffle_f64x2
#define _simd16_permute2f128_si _mm512_shuffle_i32x4
#define _simd16_shuffle_ps _mm512_shuffle_ps
#define _simd16_shuffle_pd _mm512_shuffle_pd

template <int imm8>
INLINE simd16scalari _simd16_shuffle_epi32_temp(simd16scalari a, simd16scalari b)
{
    return _simd16_castps_si(_simd16_shuffle_ps(_simd16_castsi_ps(a), _simd16_castsi_ps(b), imm8));
}

#define _simd16_shuffle_epi32(a, b, imm8) _simd16_shuffle_epi32_temp<imm8>(a, b)

template <int imm8>
INLINE simd16scalari _simd16_shuffle_epi64_temp(simd16scalari a, simd16scalari b)
{
    return _simd16_castpd_si(_simd16_shuffle_pd(_simd16_castsi_pd(a), _simd16_castsi_pd(b), imm8));
}

#define _simd16_shuffle_epi64(a, b, imm8) _simd16_shuffle_epi64_temp<imm8>(a, b)

INLINE simd16mask _simd16_int2mask(int mask)
{
    return _mm512_int2mask(mask);
}

INLINE int _simd16_mask2int(simd16mask mask)
{
    return _mm512_mask2int(mask);
}

INLINE simd16mask _simd16_cmplt_ps_mask(simd16scalar a, simd16scalar b)
{
    return _mm512_cmplt_ps_mask(a, b);
}

// convert bitmask to vector mask
INLINE simd16scalar vMask16(int32_t mask)
{
    simd16scalari temp = _simd16_set1_epi32(mask);

    simd16scalari bits = _simd16_set_epi32(0x8000, 0x4000, 0x2000, 0x1000, 0x0800, 0x0400, 0x0200, 0x0100, 0x0080, 0x0040, 0x0020, 0x0010, 0x0008, 0x0004, 0x0002, 0x0001);

    simd16scalari result = _simd16_cmplt_epi32(_simd16_setzero_si(), _simd16_and_si(temp, bits));

    return _simd16_castsi_ps(result);
}

#endif//ENABLE_AVX512_EMULATION

#endif//ENABLE_AVX512_SIMD16

#endif//__SWR_SIMD16INTRIN_H__