swr/rast: Switch intrinsic usage to SIMDLib
[mesa.git] / src / gallium / drivers / swr / rasterizer / common / simdlib_512_emu.inl
1 /****************************************************************************
2 * Copyright (C) 2017 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 ****************************************************************************/
23 #if !defined(__SIMD_LIB_AVX_HPP__)
24 #error Do not include this file directly, use "simdlib.hpp" instead.
25 #endif
26
27 //============================================================================
28 // SIMD16 AVX (1) implementation
29 //============================================================================
30
31 static const int TARGET_SIMD_WIDTH = 8;
32 using SIMD128T = SIMD128Impl::AVXImpl;
33
// Emits `Float op(Float a)`: runs SIMD256T::op on each v8 half of `a` and
// returns the two partial results combined into one Float.
#define SIMD_WRAPPER_1(op)                                  \
    static SIMDINLINE Float SIMDCALL op(Float a)            \
    {                                                       \
        const auto lo = SIMD256T::op(a.v8[0]);              \
        const auto hi = SIMD256T::op(a.v8[1]);              \
        return Float{lo, hi};                               \
    }
43
// Emits `Float op(Float a, Float b)`: runs SIMD256T::op on the matching v8
// halves of `a` and `b` and returns both results combined into one Float.
#define SIMD_WRAPPER_2(op)                                  \
    static SIMDINLINE Float SIMDCALL op(Float a, Float b)   \
    {                                                       \
        const auto lo = SIMD256T::op(a.v8[0], b.v8[0]);     \
        const auto hi = SIMD256T::op(a.v8[1], b.v8[1]);     \
        return Float{lo, hi};                               \
    }
53
// Emits `template<int ImmT> Float op(Float a, Float b)` where the immediate
// is split between the halves: bits [7:0] of ImmT drive half 0, and the next
// eight bits (ImmT >> TARGET_SIMD_WIDTH) drive half 1.
#define SIMD_WRAPPER_2I(op)                                                                             \
    template<int ImmT>                                                                                  \
    static SIMDINLINE Float SIMDCALL op(Float a, Float b)                                               \
    {                                                                                                   \
        const auto lo = SIMD256T::template op<0xFF & ImmT>(a.v8[0], b.v8[0]);                           \
        const auto hi = SIMD256T::template op<0xFF & (ImmT >> TARGET_SIMD_WIDTH)>(a.v8[1], b.v8[1]);    \
        return Float{lo, hi};                                                                           \
    }
64
// Emits `template<int ImmT> Float op(Float a, Float b)` where the very same
// immediate ImmT is handed, unmodified, to SIMD256T::op for both halves.
#define SIMD_WRAPPER_2I_1(op)                                               \
    template<int ImmT>                                                      \
    static SIMDINLINE Float SIMDCALL op(Float a, Float b)                   \
    {                                                                       \
        const auto lo = SIMD256T::template op<ImmT>(a.v8[0], b.v8[0]);      \
        const auto hi = SIMD256T::template op<ImmT>(a.v8[1], b.v8[1]);      \
        return Float{lo, hi};                                               \
    }
75
// Emits `Float op(Float a, Float b, Float c)`: runs SIMD256T::op on the
// matching v8 halves of the three operands and combines both results.
#define SIMD_WRAPPER_3(op)                                              \
    static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c)      \
    {                                                                   \
        const auto lo = SIMD256T::op(a.v8[0], b.v8[0], c.v8[0]);        \
        const auto hi = SIMD256T::op(a.v8[1], b.v8[1], c.v8[1]);        \
        return Float{lo, hi};                                           \
    }
85
// Integer counterpart of SIMD_WRAPPER_1: emits `Integer op(Integer a)` that
// runs SIMD256T::op on each v8 half and combines the two results.
#define SIMD_IWRAPPER_1(op)                                 \
    static SIMDINLINE Integer SIMDCALL op(Integer a)        \
    {                                                       \
        const auto lo = SIMD256T::op(a.v8[0]);              \
        const auto hi = SIMD256T::op(a.v8[1]);              \
        return Integer{lo, hi};                             \
    }
95
// Integer counterpart of SIMD_WRAPPER_2: emits `Integer op(Integer a, Integer b)`
// that runs SIMD256T::op on matching v8 halves and combines the two results.
#define SIMD_IWRAPPER_2(op)                                         \
    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)     \
    {                                                               \
        const auto lo = SIMD256T::op(a.v8[0], b.v8[0]);             \
        const auto hi = SIMD256T::op(a.v8[1], b.v8[1]);             \
        return Integer{lo, hi};                                     \
    }
105
// Emits `template<int ImmT> Integer op(Integer a, Integer b)` where the
// immediate is split between the halves: bits [7:0] of ImmT drive half 0, and
// the next eight bits (ImmT >> TARGET_SIMD_WIDTH) drive half 1.
#define SIMD_IWRAPPER_2I(op)                                                                            \
    template<int ImmT>                                                                                  \
    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)                                         \
    {                                                                                                   \
        const auto lo = SIMD256T::template op<0xFF & ImmT>(a.v8[0], b.v8[0]);                           \
        const auto hi = SIMD256T::template op<0xFF & (ImmT >> TARGET_SIMD_WIDTH)>(a.v8[1], b.v8[1]);    \
        return Integer{lo, hi};                                                                         \
    }
116
// Emits `template<int ImmT> Integer op(Integer a, Integer b)` where the very
// same immediate ImmT is handed, unmodified, to SIMD256T::op for both halves.
#define SIMD_IWRAPPER_2I_1(op)                                              \
    template<int ImmT>                                                      \
    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)             \
    {                                                                       \
        const auto lo = SIMD256T::template op<ImmT>(a.v8[0], b.v8[0]);      \
        const auto hi = SIMD256T::template op<ImmT>(a.v8[1], b.v8[1]);      \
        return Integer{lo, hi};                                             \
    }
127
// Emits `template<int ImmT> Integer op(Integer a, Integer b)` where the
// immediate is split into nibbles: bits [3:0] of ImmT drive half 0 and
// bits [7:4] (ImmT >> 4) drive half 1.
#define SIMD_IWRAPPER_2I_2(op)                                                          \
    template<int ImmT>                                                                  \
    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)                         \
    {                                                                                   \
        const auto lo = SIMD256T::template op<0xF & ImmT>(a.v8[0], b.v8[0]);            \
        const auto hi = SIMD256T::template op<0xF & (ImmT >> 4)>(a.v8[1], b.v8[1]);     \
        return Integer{lo, hi};                                                         \
    }
138
// Integer counterpart of SIMD_WRAPPER_3: emits
// `Integer op(Integer a, Integer b, Integer c)` that runs SIMD256T::op on the
// matching v8 halves of the three operands and combines both results.
#define SIMD_IWRAPPER_3(op)                                                 \
    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b, Integer c)  \
    {                                                                       \
        const auto lo = SIMD256T::op(a.v8[0], b.v8[0], c.v8[0]);            \
        const auto hi = SIMD256T::op(a.v8[1], b.v8[1], c.v8[1]);            \
        return Integer{lo, hi};                                             \
    }
148
149 //-----------------------------------------------------------------------
150 // Single precision floating point arithmetic operations
151 //-----------------------------------------------------------------------
152 SIMD_WRAPPER_2(add_ps); // return a + b
153 SIMD_WRAPPER_2(div_ps); // return a / b
154 SIMD_WRAPPER_3(fmadd_ps); // return (a * b) + c
155 SIMD_WRAPPER_3(fmsub_ps); // return (a * b) - c
156 SIMD_WRAPPER_2(max_ps); // return (a > b) ? a : b
157 SIMD_WRAPPER_2(min_ps); // return (a < b) ? a : b
158 SIMD_WRAPPER_2(mul_ps); // return a * b
159 SIMD_WRAPPER_1(rcp_ps); // return 1.0f / a
160 SIMD_WRAPPER_1(rsqrt_ps); // return 1.0f / sqrt(a)
161 SIMD_WRAPPER_2(sub_ps); // return a - b
162
163 template <RoundMode RMT>
164 static SIMDINLINE Float SIMDCALL round_ps(Float a)
165 {
166 return Float
167 {
168 SIMD256T::template round_ps<RMT>(a.v8[0]),
169 SIMD256T::template round_ps<RMT>(a.v8[1]),
170 };
171 }
172
173 static SIMDINLINE Float SIMDCALL ceil_ps(Float a) { return round_ps<RoundMode::CEIL_NOEXC>(a); }
174 static SIMDINLINE Float SIMDCALL floor_ps(Float a) { return round_ps<RoundMode::FLOOR_NOEXC>(a); }
175
176 //-----------------------------------------------------------------------
177 // Integer (various width) arithmetic operations
178 //-----------------------------------------------------------------------
179 SIMD_IWRAPPER_1(abs_epi32); // return absolute_value(a) (int32)
180 SIMD_IWRAPPER_2(add_epi32); // return a + b (int32)
181 SIMD_IWRAPPER_2(add_epi8); // return a + b (int8)
182 SIMD_IWRAPPER_2(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
183 SIMD_IWRAPPER_2(max_epi32); // return (a > b) ? a : b (int32)
184 SIMD_IWRAPPER_2(max_epu32); // return (a > b) ? a : b (uint32)
185 SIMD_IWRAPPER_2(min_epi32); // return (a < b) ? a : b (int32)
186 SIMD_IWRAPPER_2(min_epu32); // return (a < b) ? a : b (uint32)
187 SIMD_IWRAPPER_2(mul_epi32); // return a * b (int32)
188
189 // return (a * b) & 0xFFFFFFFF
190 //
191 // Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers,
192 // and store the low 32 bits of the intermediate integers in dst.
193 SIMD_IWRAPPER_2(mullo_epi32);
194 SIMD_IWRAPPER_2(sub_epi32); // return a - b (int32)
195 SIMD_IWRAPPER_2(sub_epi64); // return a - b (int64)
196 SIMD_IWRAPPER_2(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8)
197
198 //-----------------------------------------------------------------------
199 // Logical operations
200 //-----------------------------------------------------------------------
201 SIMD_WRAPPER_2(and_ps); // return a & b (float treated as int)
202 SIMD_IWRAPPER_2(and_si); // return a & b (int)
203 SIMD_WRAPPER_2(andnot_ps); // return (~a) & b (float treated as int)
204 SIMD_IWRAPPER_2(andnot_si); // return (~a) & b (int)
205 SIMD_WRAPPER_2(or_ps); // return a | b (float treated as int)
206 SIMD_IWRAPPER_2(or_si); // return a | b (int)
207 SIMD_WRAPPER_2(xor_ps); // return a ^ b (float treated as int)
208 SIMD_IWRAPPER_2(xor_si); // return a ^ b (int)
209
210
211 //-----------------------------------------------------------------------
212 // Shift operations
213 //-----------------------------------------------------------------------
214 template<int ImmT>
215 static SIMDINLINE Integer SIMDCALL slli_epi32(Integer a) // return a << ImmT
216 {
217 return Integer
218 {
219 SIMD256T::template slli_epi32<ImmT>(a.v8[0]),
220 SIMD256T::template slli_epi32<ImmT>(a.v8[1]),
221 };
222 }
223
224 SIMD_IWRAPPER_2(sllv_epi32); // return a << b (uint32)
225
226 template<int ImmT>
227 static SIMDINLINE Integer SIMDCALL srai_epi32(Integer a) // return a >> ImmT (int32)
228 {
229 return Integer
230 {
231 SIMD256T::template srai_epi32<ImmT>(a.v8[0]),
232 SIMD256T::template srai_epi32<ImmT>(a.v8[1]),
233 };
234 }
235
236 template<int ImmT>
237 static SIMDINLINE Integer SIMDCALL srli_epi32(Integer a) // return a >> ImmT (uint32)
238 {
239 return Integer
240 {
241 SIMD256T::template srli_epi32<ImmT>(a.v8[0]),
242 SIMD256T::template srli_epi32<ImmT>(a.v8[1]),
243 };
244 }
245
246 template<int ImmT> // for each 128-bit lane:
247 static SIMDINLINE Integer SIMDCALL srli_si(Integer a) // return a >> (ImmT*8) (uint)
248 {
249 return Integer
250 {
251 SIMD256T::template srli_si<ImmT>(a.v8[0]),
252 SIMD256T::template srli_si<ImmT>(a.v8[1]),
253 };
254 }
255 template<int ImmT>
256 static SIMDINLINE Float SIMDCALL srlisi_ps(Float a) // same as srli_si, but with Float cast to int
257 {
258 return Float
259 {
260 SIMD256T::template srlisi_ps<ImmT>(a.v8[0]),
261 SIMD256T::template srlisi_ps<ImmT>(a.v8[1]),
262 };
263 }
264
265 SIMD_IWRAPPER_2(srlv_epi32); // return a >> b (uint32)
266
267 //-----------------------------------------------------------------------
268 // Conversion operations
269 //-----------------------------------------------------------------------
270 static SIMDINLINE Float SIMDCALL castpd_ps(Double a) // return *(Float*)(&a)
271 {
272 return Float
273 {
274 SIMD256T::castpd_ps(a.v8[0]),
275 SIMD256T::castpd_ps(a.v8[1]),
276 };
277 }
278
279 static SIMDINLINE Integer SIMDCALL castps_si(Float a) // return *(Integer*)(&a)
280 {
281 return Integer
282 {
283 SIMD256T::castps_si(a.v8[0]),
284 SIMD256T::castps_si(a.v8[1]),
285 };
286 }
287
288 static SIMDINLINE Double SIMDCALL castsi_pd(Integer a) // return *(Double*)(&a)
289 {
290 return Double
291 {
292 SIMD256T::castsi_pd(a.v8[0]),
293 SIMD256T::castsi_pd(a.v8[1]),
294 };
295 }
296
297 static SIMDINLINE Double SIMDCALL castps_pd(Float a) // return *(Double*)(&a)
298 {
299 return Double
300 {
301 SIMD256T::castps_pd(a.v8[0]),
302 SIMD256T::castps_pd(a.v8[1]),
303 };
304 }
305
306 static SIMDINLINE Float SIMDCALL castsi_ps(Integer a) // return *(Float*)(&a)
307 {
308 return Float
309 {
310 SIMD256T::castsi_ps(a.v8[0]),
311 SIMD256T::castsi_ps(a.v8[1]),
312 };
313 }
314
315 static SIMDINLINE Float SIMDCALL cvtepi32_ps(Integer a) // return (float)a (int32 --> float)
316 {
317 return Float
318 {
319 SIMD256T::cvtepi32_ps(a.v8[0]),
320 SIMD256T::cvtepi32_ps(a.v8[1]),
321 };
322 }
323
324 static SIMDINLINE Integer SIMDCALL cvtepu8_epi16(SIMD256Impl::Integer a) // return (int16)a (uint8 --> int16)
325 {
326 return Integer
327 {
328 SIMD256T::cvtepu8_epi16(a.v4[0]),
329 SIMD256T::cvtepu8_epi16(a.v4[1]),
330 };
331 }
332
333 static SIMDINLINE Integer SIMDCALL cvtepu8_epi32(SIMD256Impl::Integer a) // return (int32)a (uint8 --> int32)
334 {
335 return Integer
336 {
337 SIMD256T::cvtepu8_epi32(a.v4[0]),
338 SIMD256T::cvtepu8_epi32(SIMD128T::template srli_si<8>(a.v4[0])),
339 };
340 }
341
342 static SIMDINLINE Integer SIMDCALL cvtepu16_epi32(SIMD256Impl::Integer a) // return (int32)a (uint16 --> int32)
343 {
344 return Integer
345 {
346 SIMD256T::cvtepu16_epi32(a.v4[0]),
347 SIMD256T::cvtepu16_epi32(a.v4[1]),
348 };
349 }
350
351 static SIMDINLINE Integer SIMDCALL cvtepu16_epi64(SIMD256Impl::Integer a) // return (int64)a (uint16 --> int64)
352 {
353 return Integer
354 {
355 SIMD256T::cvtepu16_epi64(a.v4[0]),
356 SIMD256T::cvtepu16_epi64(SIMD128T::template srli_si<8>(a.v4[0])),
357 };
358 }
359
360 static SIMDINLINE Integer SIMDCALL cvtepu32_epi64(SIMD256Impl::Integer a) // return (int64)a (uint32 --> int64)
361 {
362 return Integer
363 {
364 SIMD256T::cvtepu32_epi64(a.v4[0]),
365 SIMD256T::cvtepu32_epi64(a.v4[1]),
366 };
367 }
368
369 static SIMDINLINE Integer SIMDCALL cvtps_epi32(Float a) // return (int32)a (float --> int32)
370 {
371 return Integer
372 {
373 SIMD256T::cvtps_epi32(a.v8[0]),
374 SIMD256T::cvtps_epi32(a.v8[1]),
375 };
376 }
377
378 static SIMDINLINE Integer SIMDCALL cvttps_epi32(Float a) // return (int32)a (rnd_to_zero(float) --> int32)
379 {
380 return Integer
381 {
382 SIMD256T::cvtps_epi32(a.v8[0]),
383 SIMD256T::cvtps_epi32(a.v8[1]),
384 };
385 }
386
387 //-----------------------------------------------------------------------
388 // Comparison operations
389 //-----------------------------------------------------------------------
390 template<CompareType CmpTypeT>
391 static SIMDINLINE Float SIMDCALL cmp_ps(Float a, Float b) // return a (CmpTypeT) b
392 {
393 return Float
394 {
395 SIMD256T::template cmp_ps<CmpTypeT>(a.v8[0], b.v8[0]),
396 SIMD256T::template cmp_ps<CmpTypeT>(a.v8[1], b.v8[1]),
397 };
398 }
399 static SIMDINLINE Float SIMDCALL cmplt_ps(Float a, Float b) { return cmp_ps<CompareType::LT_OQ>(a, b); }
400 static SIMDINLINE Float SIMDCALL cmpgt_ps(Float a, Float b) { return cmp_ps<CompareType::GT_OQ>(a, b); }
401 static SIMDINLINE Float SIMDCALL cmpneq_ps(Float a, Float b) { return cmp_ps<CompareType::NEQ_OQ>(a, b); }
402 static SIMDINLINE Float SIMDCALL cmpeq_ps(Float a, Float b) { return cmp_ps<CompareType::EQ_OQ>(a, b); }
403 static SIMDINLINE Float SIMDCALL cmpge_ps(Float a, Float b) { return cmp_ps<CompareType::GE_OQ>(a, b); }
404 static SIMDINLINE Float SIMDCALL cmple_ps(Float a, Float b) { return cmp_ps<CompareType::LE_OQ>(a, b); }
405
406 template<CompareType CmpTypeT>
407 static SIMDINLINE Mask SIMDCALL cmp_ps_mask(Float a, Float b)
408 {
409 return static_cast<Mask>(movemask_ps(cmp_ps<CmpTypeT>(a, b)));
410 }
411
412
413 SIMD_IWRAPPER_2(cmpeq_epi8); // return a == b (int8)
414 SIMD_IWRAPPER_2(cmpeq_epi16); // return a == b (int16)
415 SIMD_IWRAPPER_2(cmpeq_epi32); // return a == b (int32)
416 SIMD_IWRAPPER_2(cmpeq_epi64); // return a == b (int64)
417 SIMD_IWRAPPER_2(cmpgt_epi8); // return a > b (int8)
418 SIMD_IWRAPPER_2(cmpgt_epi16); // return a > b (int16)
419 SIMD_IWRAPPER_2(cmpgt_epi32); // return a > b (int32)
420 SIMD_IWRAPPER_2(cmpgt_epi64); // return a > b (int64)
421 SIMD_IWRAPPER_2(cmplt_epi32); // return a < b (int32)
422
423 static SIMDINLINE bool SIMDCALL testz_ps(Float a, Float b) // return all_lanes_zero(a & b) ? 1 : 0 (float)
424 {
425 return 0 != (SIMD256T::testz_ps(a.v8[0], b.v8[0]) &
426 SIMD256T::testz_ps(a.v8[1], b.v8[1]));
427 }
428
429 static SIMDINLINE int SIMDCALL testz_si(Integer a, Integer b) // return all_lanes_zero(a & b) ? 1 : 0 (int)
430 {
431 return 0 != (SIMD256T::testz_si(a.v8[0], b.v8[0]) &
432 SIMD256T::testz_si(a.v8[1], b.v8[1]));
433 }
434
435 //-----------------------------------------------------------------------
436 // Blend / shuffle / permute operations
437 //-----------------------------------------------------------------------
438 SIMD_WRAPPER_2I(blend_ps); // return ImmT ? b : a (float)
439 SIMD_IWRAPPER_2I(blend_epi32); // return ImmT ? b : a (int32)
440 SIMD_WRAPPER_3(blendv_ps); // return mask ? b : a (float)
441 static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer a, Integer b, Float mask) // return mask ? b : a (int)
442 {
443 return Integer
444 {
445 SIMD256T::blendv_epi32(a.v8[0], b.v8[0], mask.v8[0]),
446 SIMD256T::blendv_epi32(a.v8[1], b.v8[1], mask.v8[1]),
447 };
448 }
449
450 static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer a, Integer b, Integer mask) // return mask ? b : a (int)
451 {
452 return Integer
453 {
454 SIMD256T::blendv_epi32(a.v8[0], b.v8[0], mask.v8[0]),
455 SIMD256T::blendv_epi32(a.v8[1], b.v8[1], mask.v8[1]),
456 };
457 }
458
459 static SIMDINLINE Float SIMDCALL broadcast_ss(float const *p) // return *p (all elements in vector get same value)
460 {
461 float f = *p;
462 return Float
463 {
464 SIMD256T::set1_ps(f),
465 SIMD256T::set1_ps(f),
466 };
467 }
468
469 template<int imm>
470 static SIMDINLINE SIMD256Impl::Float SIMDCALL extract_ps(Float a)
471 {
472 SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
473 return a.v8[imm];
474 }
475
476 template<int imm>
477 static SIMDINLINE SIMD256Impl::Double SIMDCALL extract_pd(Double a)
478 {
479 SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
480 return a.v8[imm];
481 }
482
483 template<int imm>
484 static SIMDINLINE SIMD256Impl::Integer SIMDCALL extract_si(Integer a)
485 {
486 SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
487 return a.v8[imm];
488 }
489
490 template<int imm>
491 static SIMDINLINE Float SIMDCALL insert_ps(Float a, SIMD256Impl::Float b)
492 {
493 SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
494 a.v8[imm] = b;
495 return a;
496 }
497
498 template<int imm>
499 static SIMDINLINE Double SIMDCALL insert_pd(Double a, SIMD256Impl::Double b)
500 {
501 SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
502 a.v8[imm] = b;
503 return a;
504 }
505
506 template<int imm>
507 static SIMDINLINE Integer SIMDCALL insert_si(Integer a, SIMD256Impl::Integer b)
508 {
509 SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
510 a.v8[imm] = b;
511 return a;
512 }
513
514 SIMD_IWRAPPER_2(packs_epi16); // See documentation for _mm256_packs_epi16 and _mm512_packs_epi16
515 SIMD_IWRAPPER_2(packs_epi32); // See documentation for _mm256_packs_epi32 and _mm512_packs_epi32
516 SIMD_IWRAPPER_2(packus_epi16); // See documentation for _mm256_packus_epi16 and _mm512_packus_epi16
517 SIMD_IWRAPPER_2(packus_epi32); // See documentation for _mm256_packus_epi32 and _mm512_packus_epi32
518
519 static SIMDINLINE Integer SIMDCALL permute_epi32(Integer a, Integer swiz) // return a[swiz[i]] for each 32-bit lane i (int32)
520 {
521 Integer result;
522
523 // Ugly slow implementation
524 uint32_t const *pA = reinterpret_cast<uint32_t const*>(&a);
525 uint32_t const *pSwiz = reinterpret_cast<uint32_t const*>(&swiz);
526 uint32_t *pResult = reinterpret_cast<uint32_t *>(&result);
527
528 for (uint32_t i = 0; i < SIMD_WIDTH; ++i)
529 {
530 pResult[i] = pA[0xF & pSwiz[i]];
531 }
532
533 return result;
534 }
535
536 static SIMDINLINE Float SIMDCALL permute_ps(Float a, Integer swiz) // return a[swiz[i]] for each 32-bit lane i (float)
537 {
538 Float result;
539
540 // Ugly slow implementation
541 float const *pA = reinterpret_cast<float const*>(&a);
542 uint32_t const *pSwiz = reinterpret_cast<uint32_t const*>(&swiz);
543 float *pResult = reinterpret_cast<float *>(&result);
544
545 for (uint32_t i = 0; i < SIMD_WIDTH; ++i)
546 {
547 pResult[i] = pA[0xF & pSwiz[i]];
548 }
549
550 return result;
551 }
552
553 // All of the 512-bit permute2f128_XX intrinsics do the following:
554 //
555 // SELECT4(src, control) {
556 // CASE(control[1:0])
557 // 0: tmp[127:0] : = src[127:0]
558 // 1 : tmp[127:0] : = src[255:128]
559 // 2 : tmp[127:0] : = src[383:256]
560 // 3 : tmp[127:0] : = src[511:384]
561 // ESAC
562 // RETURN tmp[127:0]
563 // }
564 //
565 // dst[127:0] : = SELECT4(a[511:0], imm8[1:0])
566 // dst[255:128] : = SELECT4(a[511:0], imm8[3:2])
567 // dst[383:256] : = SELECT4(b[511:0], imm8[5:4])
568 // dst[511:384] : = SELECT4(b[511:0], imm8[7:6])
569 // dst[MAX:512] : = 0
570 //
571 // Since the 256-bit AVX instructions use a 4-bit control field (instead
572 // of 2-bit for AVX512), we need to expand the control bits sent to the
573 // AVX instructions for emulation.
574 //
575 template <int shuf>
576 static SIMDINLINE Float SIMDCALL permute2f128_ps(Float a, Float b)
577 {
578 return Float
579 {
580 SIMD256T::template permute2f128_ps<((shuf & 0x03) << 0) | ((shuf & 0x0C) << 2)>(a.v8[0], a.v8[1]),
581 SIMD256T::template permute2f128_ps<((shuf & 0x30) >> 4) | ((shuf & 0xC0) >> 2)>(b.v8[0], b.v8[1]),
582 };
583 }
584
585 template <int shuf>
586 static SIMDINLINE Double SIMDCALL permute2f128_pd(Double a, Double b)
587 {
588 return Double
589 {
590 SIMD256T::template permute2f128_pd<((shuf & 0x03) << 0) | ((shuf & 0x0C) << 2)>(a.v8[0], a.v8[1]),
591 SIMD256T::template permute2f128_pd<((shuf & 0x30) >> 4) | ((shuf & 0xC0) >> 2)>(b.v8[0], b.v8[1]),
592 };
593 }
594
595 template <int shuf>
596 static SIMDINLINE Integer SIMDCALL permute2f128_si(Integer a, Integer b)
597 {
598 return Integer
599 {
600 SIMD256T::template permute2f128_si<((shuf & 0x03) << 0) | ((shuf & 0x0C) << 2)>(a.v8[0], a.v8[1]),
601 SIMD256T::template permute2f128_si<((shuf & 0x30) >> 4) | ((shuf & 0xC0) >> 2)>(b.v8[0], b.v8[1]),
602 };
603 }
604
605 SIMD_IWRAPPER_2I_1(shuffle_epi32);
606 SIMD_IWRAPPER_2I_2(shuffle_epi64);
607 SIMD_IWRAPPER_2(shuffle_epi8);
608 SIMD_WRAPPER_2I_1(shuffle_pd);
609 SIMD_WRAPPER_2I_1(shuffle_ps);
610 SIMD_IWRAPPER_2(unpackhi_epi16);
611 SIMD_IWRAPPER_2(unpackhi_epi32);
612 SIMD_IWRAPPER_2(unpackhi_epi64);
613 SIMD_IWRAPPER_2(unpackhi_epi8);
614 SIMD_WRAPPER_2(unpackhi_pd);
615 SIMD_WRAPPER_2(unpackhi_ps);
616 SIMD_IWRAPPER_2(unpacklo_epi16);
617 SIMD_IWRAPPER_2(unpacklo_epi32);
618 SIMD_IWRAPPER_2(unpacklo_epi64);
619 SIMD_IWRAPPER_2(unpacklo_epi8);
620 SIMD_WRAPPER_2(unpacklo_pd);
621 SIMD_WRAPPER_2(unpacklo_ps);
622
623 //-----------------------------------------------------------------------
624 // Load / store operations
625 //-----------------------------------------------------------------------
626 template<ScaleFactor ScaleT>
627 static SIMDINLINE Float SIMDCALL i32gather_ps(float const* p, Integer idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
628 {
629 return Float
630 {
631 SIMD256T::template i32gather_ps<ScaleT>(p, idx.v8[0]),
632 SIMD256T::template i32gather_ps<ScaleT>(p, idx.v8[1]),
633 };
634 }
635
636 static SIMDINLINE Float SIMDCALL load1_ps(float const *p) // return *p (broadcast 1 value to all elements)
637 {
638 return broadcast_ss(p);
639 }
640
641 static SIMDINLINE Float SIMDCALL load_ps(float const *p) // return *p (loads SIMD width elements from memory)
642 {
643 return Float
644 {
645 SIMD256T::load_ps(p),
646 SIMD256T::load_ps(p + TARGET_SIMD_WIDTH)
647 };
648 }
649
650 static SIMDINLINE Integer SIMDCALL load_si(Integer const *p) // return *p
651 {
652 return Integer
653 {
654 SIMD256T::load_si(&p->v8[0]),
655 SIMD256T::load_si(&p->v8[1]),
656 };
657 }
658
659 static SIMDINLINE Float SIMDCALL loadu_ps(float const *p) // return *p (same as load_ps but allows for unaligned mem)
660 {
661 return Float
662 {
663 SIMD256T::loadu_ps(p),
664 SIMD256T::loadu_ps(p + TARGET_SIMD_WIDTH)
665 };
666 }
667
668 static SIMDINLINE Integer SIMDCALL loadu_si(Integer const *p) // return *p (same as load_si but allows for unaligned mem)
669 {
670 return Integer
671 {
672 SIMD256T::loadu_si(&p->v8[0]),
673 SIMD256T::loadu_si(&p->v8[1]),
674 };
675 }
676
677 // for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
678 template<ScaleFactor ScaleT>
679 static SIMDINLINE Float SIMDCALL mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask)
680 {
681 return Float
682 {
683 SIMD256T::template mask_i32gather_ps<ScaleT>(old.v8[0], p, idx.v8[0], mask.v8[0]),
684 SIMD256T::template mask_i32gather_ps<ScaleT>(old.v8[1], p, idx.v8[1], mask.v8[1]),
685 };
686 }
687
688 static SIMDINLINE void SIMDCALL maskstore_ps(float *p, Integer mask, Float src)
689 {
690 SIMD256T::maskstore_ps(p, mask.v8[0], src.v8[0]);
691 SIMD256T::maskstore_ps(p + TARGET_SIMD_WIDTH, mask.v8[1], src.v8[1]);
692 }
693
694 static SIMDINLINE uint64_t SIMDCALL movemask_epi8(Integer a)
695 {
696 uint64_t mask = static_cast<uint64_t>(SIMD256T::movemask_epi8(a.v8[0]));
697 mask |= static_cast<uint64_t>(SIMD256T::movemask_epi8(a.v8[1])) << (TARGET_SIMD_WIDTH * 4);
698
699 return mask;
700 }
701
702 static SIMDINLINE uint32_t SIMDCALL movemask_pd(Double a)
703 {
704 uint32_t mask = static_cast<uint32_t>(SIMD256T::movemask_pd(a.v8[0]));
705 mask |= static_cast<uint32_t>(SIMD256T::movemask_pd(a.v8[1])) << (TARGET_SIMD_WIDTH / 2);
706
707 return mask;
708 }
709 static SIMDINLINE uint32_t SIMDCALL movemask_ps(Float a)
710 {
711 uint32_t mask = static_cast<uint32_t>(SIMD256T::movemask_ps(a.v8[0]));
712 mask |= static_cast<uint32_t>(SIMD256T::movemask_ps(a.v8[1])) << TARGET_SIMD_WIDTH;
713
714 return mask;
715 }
716
717 static SIMDINLINE Integer SIMDCALL set1_epi32(int i) // return i (all elements are same value)
718 {
719 return Integer
720 {
721 SIMD256T::set1_epi32(i),
722 SIMD256T::set1_epi32(i)
723 };
724 }
725
726 static SIMDINLINE Integer SIMDCALL set1_epi8(char i) // return i (all elements are same value)
727 {
728 return Integer
729 {
730 SIMD256T::set1_epi8(i),
731 SIMD256T::set1_epi8(i)
732 };
733 }
734
735 static SIMDINLINE Float SIMDCALL set1_ps(float f) // return f (all elements are same value)
736 {
737 return Float
738 {
739 SIMD256T::set1_ps(f),
740 SIMD256T::set1_ps(f)
741 };
742 }
743
744 static SIMDINLINE Float SIMDCALL setzero_ps() // return 0 (float)
745 {
746 return Float
747 {
748 SIMD256T::setzero_ps(),
749 SIMD256T::setzero_ps()
750 };
751 }
752
753 static SIMDINLINE Integer SIMDCALL setzero_si() // return 0 (integer)
754 {
755 return Integer
756 {
757 SIMD256T::setzero_si(),
758 SIMD256T::setzero_si()
759 };
760 }
761
762 static SIMDINLINE void SIMDCALL store_ps(float *p, Float a) // *p = a (stores all elements contiguously in memory)
763 {
764 SIMD256T::store_ps(p, a.v8[0]);
765 SIMD256T::store_ps(p + TARGET_SIMD_WIDTH, a.v8[1]);
766 }
767
768 static SIMDINLINE void SIMDCALL store_si(Integer *p, Integer a) // *p = a
769 {
770 SIMD256T::store_si(&p->v8[0], a.v8[0]);
771 SIMD256T::store_si(&p->v8[1], a.v8[1]);
772 }
773
774 static SIMDINLINE void SIMDCALL stream_ps(float *p, Float a) // *p = a (same as store_ps, but doesn't keep memory in cache)
775 {
776 SIMD256T::stream_ps(p, a.v8[0]);
777 SIMD256T::stream_ps(p + TARGET_SIMD_WIDTH, a.v8[1]);
778 }
779
780 static SIMDINLINE Integer SIMDCALL set_epi32(
781 int i15, int i14, int i13, int i12, int i11, int i10, int i9, int i8,
782 int i7, int i6, int i5, int i4, int i3, int i2, int i1, int i0)
783 {
784 return Integer
785 {
786 SIMD256T::set_epi32(
787 i7, i6, i5, i4, i3, i2, i1, i0),
788 SIMD256T::set_epi32(
789 i15, i14, i13, i12, i11, i10, i9, i8)
790 };
791 }
792
793 static SIMDINLINE Integer SIMDCALL set_epi32(
794 int i7, int i6, int i5, int i4, int i3, int i2, int i1, int i0)
795 {
796 return set_epi32(
797 0, 0, 0, 0, 0, 0, 0, 0,
798 i7, i6, i5, i4, i3, i2, i1, i0);
799 }
800
801 static SIMDINLINE Float SIMDCALL set_ps(
802 float i15, float i14, float i13, float i12, float i11, float i10, float i9, float i8,
803 float i7, float i6, float i5, float i4, float i3, float i2, float i1, float i0)
804 {
805 return Float
806 {
807 SIMD256T::set_ps(
808 i7, i6, i5, i4, i3, i2, i1, i0),
809 SIMD256T::set_ps(
810 i15, i14, i13, i12, i11, i10, i9, i8)
811 };
812 }
813
814 static SIMDINLINE Float SIMDCALL set_ps(
815 float i7, float i6, float i5, float i4, float i3, float i2, float i1, float i0)
816 {
817 return set_ps(
818 0, 0, 0, 0, 0, 0, 0, 0,
819 i7, i6, i5, i4, i3, i2, i1, i0);
820 }
821
822 static SIMDINLINE Float SIMDCALL vmask_ps(int32_t mask)
823 {
824 Integer vec = set1_epi32(mask);
825 const Integer bit = set_epi32(
826 0x8000, 0x4000, 0x2000, 0x1000, 0x0800, 0x0400, 0x0200, 0x0100,
827 0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01);
828 vec = and_si(vec, bit);
829 vec = cmplt_epi32(setzero_si(), vec);
830 return castsi_ps(vec);
831 }
832
// Drop every helper macro defined at the top of this fragment so none of them
// leaks into whatever is included next. SIMD_IWRAPPER_2I_2 is part of that
// set and has to be removed as well.
#undef SIMD_WRAPPER_1
#undef SIMD_WRAPPER_2
#undef SIMD_WRAPPER_2I
#undef SIMD_WRAPPER_2I_1
#undef SIMD_WRAPPER_3
#undef SIMD_IWRAPPER_1
#undef SIMD_IWRAPPER_2
#undef SIMD_IWRAPPER_2I
#undef SIMD_IWRAPPER_2I_1
#undef SIMD_IWRAPPER_2I_2
#undef SIMD_IWRAPPER_3