Revert "swr/rast: Archrast codegen updates"
[mesa.git] / src / gallium / drivers / swr / rasterizer / common / simdlib_512_emu.inl
1 /****************************************************************************
2 * Copyright (C) 2017 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 ****************************************************************************/
23 #if !defined(__SIMD_LIB_AVX_HPP__)
24 #error Do not include this file directly, use "simdlib.hpp" instead.
25 #endif
26
//============================================================================
// SIMD16 AVX (1) implementation
//
// Every SIMD16 operation in this file is assembled from two SIMD256T
// operations, one per 256-bit half (v8[0] and v8[1]) of the wide value.
//============================================================================

// Lane count of one half; used below as the float-pointer offset and the
// mask-bit offset of the upper half.
static const int TARGET_SIMD_WIDTH = 8;
// 128-bit implementation used for the byte shifts in the cvtepu* helpers below.
using SIMD128T                     = SIMD128Impl::AVXImpl;
33
// Defines a unary Float operation `op` that applies SIMD256T::op to each half of `a`.
#define SIMD_WRAPPER_1(op) \
    static SIMDINLINE Float SIMDCALL op(Float const& a) \
    { \
        auto const lo = SIMD256T::op(a.v8[0]); \
        auto const hi = SIMD256T::op(a.v8[1]); \
        return Float{lo, hi}; \
    }
42
// Defines a binary Float operation `op` that applies SIMD256T::op to the matching
// halves of `a` and `b`.
#define SIMD_WRAPPER_2(op) \
    static SIMDINLINE Float SIMDCALL op(Float const& a, Float const& b) \
    { \
        auto const lo = SIMD256T::op(a.v8[0], b.v8[0]); \
        auto const hi = SIMD256T::op(a.v8[1], b.v8[1]); \
        return Float{lo, hi}; \
    }
51
// Defines a binary Float operation `op` with an immediate that is split per half:
// the low 8 bits of ImmT go to the lower half, the next 8 bits (ImmT shifted right
// by TARGET_SIMD_WIDTH) go to the upper half.
#define SIMD_WRAPPER_2I(op) \
    template <int ImmT> \
    static SIMDINLINE Float SIMDCALL op(Float const& a, Float const& b) \
    { \
        auto const lo = SIMD256T::template op<0xFF & ImmT>(a.v8[0], b.v8[0]); \
        auto const hi = \
            SIMD256T::template op<0xFF & (ImmT >> TARGET_SIMD_WIDTH)>(a.v8[1], b.v8[1]); \
        return Float{lo, hi}; \
    }
61
// Defines a binary Float operation `op` whose immediate ImmT is passed unchanged to
// both halves.
#define SIMD_WRAPPER_2I_1(op) \
    template <int ImmT> \
    static SIMDINLINE Float SIMDCALL op(Float const& a, Float const& b) \
    { \
        auto const lo = SIMD256T::template op<ImmT>(a.v8[0], b.v8[0]); \
        auto const hi = SIMD256T::template op<ImmT>(a.v8[1], b.v8[1]); \
        return Float{lo, hi}; \
    }
71
// Defines a ternary Float operation `op` that applies SIMD256T::op to the matching
// halves of `a`, `b` and `c`.
#define SIMD_WRAPPER_3(op) \
    static SIMDINLINE Float SIMDCALL op(Float const& a, Float const& b, Float const& c) \
    { \
        auto const lo = SIMD256T::op(a.v8[0], b.v8[0], c.v8[0]); \
        auto const hi = SIMD256T::op(a.v8[1], b.v8[1], c.v8[1]); \
        return Float{lo, hi}; \
    }
80
// Defines a unary Integer operation `op` that applies SIMD256T::op to each half of `a`.
#define SIMD_IWRAPPER_1(op) \
    static SIMDINLINE Integer SIMDCALL op(Integer const& a) \
    { \
        auto const lo = SIMD256T::op(a.v8[0]); \
        auto const hi = SIMD256T::op(a.v8[1]); \
        return Integer{lo, hi}; \
    }
89
// Defines a binary Integer operation `op` that applies SIMD256T::op to the matching
// halves of `a` and `b`.
#define SIMD_IWRAPPER_2(op) \
    static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b) \
    { \
        auto const lo = SIMD256T::op(a.v8[0], b.v8[0]); \
        auto const hi = SIMD256T::op(a.v8[1], b.v8[1]); \
        return Integer{lo, hi}; \
    }
98
// Defines a binary Integer operation `op` with an immediate that is split per half:
// the low 8 bits of ImmT go to the lower half, the next 8 bits (ImmT shifted right
// by TARGET_SIMD_WIDTH) go to the upper half.
#define SIMD_IWRAPPER_2I(op) \
    template <int ImmT> \
    static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b) \
    { \
        auto const lo = SIMD256T::template op<0xFF & ImmT>(a.v8[0], b.v8[0]); \
        auto const hi = \
            SIMD256T::template op<0xFF & (ImmT >> TARGET_SIMD_WIDTH)>(a.v8[1], b.v8[1]); \
        return Integer{lo, hi}; \
    }
108
// Defines a binary Integer operation `op` whose immediate ImmT is passed unchanged to
// both halves.
#define SIMD_IWRAPPER_2I_1(op) \
    template <int ImmT> \
    static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b) \
    { \
        auto const lo = SIMD256T::template op<ImmT>(a.v8[0], b.v8[0]); \
        auto const hi = SIMD256T::template op<ImmT>(a.v8[1], b.v8[1]); \
        return Integer{lo, hi}; \
    }
118
// Defines a binary Integer operation `op` with an immediate that is split into two
// 4-bit fields: bits [3:0] of ImmT go to the lower half, bits [7:4] to the upper half.
#define SIMD_IWRAPPER_2I_2(op) \
    template <int ImmT> \
    static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b) \
    { \
        auto const lo = SIMD256T::template op<0xF & ImmT>(a.v8[0], b.v8[0]); \
        auto const hi = SIMD256T::template op<0xF & (ImmT >> 4)>(a.v8[1], b.v8[1]); \
        return Integer{lo, hi}; \
    }
128
// Defines a ternary Integer operation `op` that applies SIMD256T::op to the matching
// halves of `a`, `b` and `c`.
#define SIMD_IWRAPPER_3(op) \
    static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b, Integer const& c) \
    { \
        auto const lo = SIMD256T::op(a.v8[0], b.v8[0], c.v8[0]); \
        auto const hi = SIMD256T::op(a.v8[1], b.v8[1], c.v8[1]); \
        return Integer{lo, hi}; \
    }
137
138 //-----------------------------------------------------------------------
139 // Single precision floating point arithmetic operations
140 //-----------------------------------------------------------------------
// Each line instantiates a wrapper that forwards to the same-named SIMD256T
// operation once per 256-bit half.
SIMD_WRAPPER_2(add_ps); // return a + b
SIMD_WRAPPER_2(div_ps); // return a / b
SIMD_WRAPPER_3(fmadd_ps); // return (a * b) + c
SIMD_WRAPPER_3(fmsub_ps); // return (a * b) - c
SIMD_WRAPPER_2(max_ps); // return (a > b) ? a : b
SIMD_WRAPPER_2(min_ps); // return (a < b) ? a : b
SIMD_WRAPPER_2(mul_ps); // return a * b
SIMD_WRAPPER_1(rcp_ps); // return 1.0f / a
SIMD_WRAPPER_1(rsqrt_ps); // return 1.0f / sqrt(a)
SIMD_WRAPPER_2(sub_ps); // return a - b
151
152 template <RoundMode RMT>
153 static SIMDINLINE Float SIMDCALL round_ps(Float const& a)
154 {
155 return Float{
156 SIMD256T::template round_ps<RMT>(a.v8[0]),
157 SIMD256T::template round_ps<RMT>(a.v8[1]),
158 };
159 }
160
161 static SIMDINLINE Float SIMDCALL ceil_ps(Float const& a)
162 {
163 return round_ps<RoundMode::CEIL_NOEXC>(a);
164 }
165 static SIMDINLINE Float SIMDCALL floor_ps(Float const& a)
166 {
167 return round_ps<RoundMode::FLOOR_NOEXC>(a);
168 }
169
170 //-----------------------------------------------------------------------
171 // Integer (various width) arithmetic operations
172 //-----------------------------------------------------------------------
// Each line instantiates a wrapper that forwards to the same-named SIMD256T
// operation once per 256-bit half.
SIMD_IWRAPPER_1(abs_epi32); // return absolute_value(a) (int32)
SIMD_IWRAPPER_2(add_epi32); // return a + b (int32)
SIMD_IWRAPPER_2(add_epi8); // return a + b (int8)
SIMD_IWRAPPER_2(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
SIMD_IWRAPPER_2(max_epi32); // return (a > b) ? a : b (int32)
SIMD_IWRAPPER_2(max_epu32); // return (a > b) ? a : b (uint32)
SIMD_IWRAPPER_2(min_epi32); // return (a < b) ? a : b (int32)
SIMD_IWRAPPER_2(min_epu32); // return (a < b) ? a : b (uint32)
// NOTE(review): if SIMD256T::mul_epi32 follows _mm256_mul_epi32 it multiplies the
// low signed 32 bits of each 64-bit element into a 64-bit product rather than
// doing a lane-wise int32 multiply (that is mullo_epi32 below) - confirm.
SIMD_IWRAPPER_2(mul_epi32); // return a * b (int32)

// return (a * b) & 0xFFFFFFFF
//
// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers,
// and store the low 32 bits of the intermediate integers in dst.
SIMD_IWRAPPER_2(mullo_epi32);
SIMD_IWRAPPER_2(sub_epi32); // return a - b (int32)
SIMD_IWRAPPER_2(sub_epi64); // return a - b (int64)
SIMD_IWRAPPER_2(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8)
191
192 //-----------------------------------------------------------------------
193 // Logical operations
194 //-----------------------------------------------------------------------
// Bitwise operations, forwarded per 256-bit half. The *_ps variants take and return
// Float, the *_si variants take and return Integer.
SIMD_WRAPPER_2(and_ps); // return a & b (float treated as int)
SIMD_IWRAPPER_2(and_si); // return a & b (int)
SIMD_WRAPPER_2(andnot_ps); // return (~a) & b (float treated as int)
SIMD_IWRAPPER_2(andnot_si); // return (~a) & b (int)
SIMD_WRAPPER_2(or_ps); // return a | b (float treated as int)
SIMD_IWRAPPER_2(or_si); // return a | b (int)
SIMD_WRAPPER_2(xor_ps); // return a ^ b (float treated as int)
SIMD_IWRAPPER_2(xor_si); // return a ^ b (int)
203
204 //-----------------------------------------------------------------------
205 // Shift operations
206 //-----------------------------------------------------------------------
207 template <int ImmT>
208 static SIMDINLINE Integer SIMDCALL slli_epi32(Integer const& a) // return a << ImmT
209 {
210 return Integer{
211 SIMD256T::template slli_epi32<ImmT>(a.v8[0]),
212 SIMD256T::template slli_epi32<ImmT>(a.v8[1]),
213 };
214 }
215
// Per-lane variable shift, forwarded to SIMD256T::sllv_epi32 for each half.
SIMD_IWRAPPER_2(sllv_epi32); // return a << b (uint32)
217
218 template <int ImmT>
219 static SIMDINLINE Integer SIMDCALL srai_epi32(Integer const& a) // return a >> ImmT (int32)
220 {
221 return Integer{
222 SIMD256T::template srai_epi32<ImmT>(a.v8[0]),
223 SIMD256T::template srai_epi32<ImmT>(a.v8[1]),
224 };
225 }
226
227 template <int ImmT>
228 static SIMDINLINE Integer SIMDCALL srli_epi32(Integer const& a) // return a >> ImmT (uint32)
229 {
230 return Integer{
231 SIMD256T::template srli_epi32<ImmT>(a.v8[0]),
232 SIMD256T::template srli_epi32<ImmT>(a.v8[1]),
233 };
234 }
235
236 template <int ImmT> // for each 128-bit lane:
237 static SIMDINLINE Integer SIMDCALL srli_si(Integer const& a) // return a >> (ImmT*8) (uint)
238 {
239 return Integer{
240 SIMD256T::template srli_si<ImmT>(a.v8[0]),
241 SIMD256T::template srli_si<ImmT>(a.v8[1]),
242 };
243 }
244 template <int ImmT>
245 static SIMDINLINE Float SIMDCALL
246 srlisi_ps(Float const& a) // same as srli_si, but with Float cast to int
247 {
248 return Float{
249 SIMD256T::template srlisi_ps<ImmT>(a.v8[0]),
250 SIMD256T::template srlisi_ps<ImmT>(a.v8[1]),
251 };
252 }
253
// Per-lane variable shift, forwarded to SIMD256T::srlv_epi32 for each half.
SIMD_IWRAPPER_2(srlv_epi32); // return a >> b (uint32)
255
256 //-----------------------------------------------------------------------
257 // Conversion operations
258 //-----------------------------------------------------------------------
259 static SIMDINLINE Float SIMDCALL castpd_ps(Double const& a) // return *(Float*)(&a)
260 {
261 return Float{
262 SIMD256T::castpd_ps(a.v8[0]),
263 SIMD256T::castpd_ps(a.v8[1]),
264 };
265 }
266
267 static SIMDINLINE Integer SIMDCALL castps_si(Float const& a) // return *(Integer*)(&a)
268 {
269 return Integer{
270 SIMD256T::castps_si(a.v8[0]),
271 SIMD256T::castps_si(a.v8[1]),
272 };
273 }
274
275 static SIMDINLINE Double SIMDCALL castsi_pd(Integer const& a) // return *(Double*)(&a)
276 {
277 return Double{
278 SIMD256T::castsi_pd(a.v8[0]),
279 SIMD256T::castsi_pd(a.v8[1]),
280 };
281 }
282
283 static SIMDINLINE Double SIMDCALL castps_pd(Float const& a) // return *(Double*)(&a)
284 {
285 return Double{
286 SIMD256T::castps_pd(a.v8[0]),
287 SIMD256T::castps_pd(a.v8[1]),
288 };
289 }
290
291 static SIMDINLINE Float SIMDCALL castsi_ps(Integer const& a) // return *(Float*)(&a)
292 {
293 return Float{
294 SIMD256T::castsi_ps(a.v8[0]),
295 SIMD256T::castsi_ps(a.v8[1]),
296 };
297 }
298
299 static SIMDINLINE Float SIMDCALL
300 cvtepi32_ps(Integer const& a) // return (float)a (int32 --> float)
301 {
302 return Float{
303 SIMD256T::cvtepi32_ps(a.v8[0]),
304 SIMD256T::cvtepi32_ps(a.v8[1]),
305 };
306 }
307
308 static SIMDINLINE Integer SIMDCALL
309 cvtepu8_epi16(SIMD256Impl::Integer const& a) // return (int16)a (uint8 --> int16)
310 {
311 return Integer{
312 SIMD256T::cvtepu8_epi16(a.v4[0]),
313 SIMD256T::cvtepu8_epi16(a.v4[1]),
314 };
315 }
316
317 static SIMDINLINE Integer SIMDCALL
318 cvtepu8_epi32(SIMD256Impl::Integer const& a) // return (int32)a (uint8 --> int32)
319 {
320 return Integer{
321 SIMD256T::cvtepu8_epi32(a.v4[0]),
322 SIMD256T::cvtepu8_epi32(SIMD128T::template srli_si<8>(a.v4[0])),
323 };
324 }
325
326 static SIMDINLINE Integer SIMDCALL
327 cvtepu16_epi32(SIMD256Impl::Integer const& a) // return (int32)a (uint16 --> int32)
328 {
329 return Integer{
330 SIMD256T::cvtepu16_epi32(a.v4[0]),
331 SIMD256T::cvtepu16_epi32(a.v4[1]),
332 };
333 }
334
335 static SIMDINLINE Integer SIMDCALL
336 cvtepu16_epi64(SIMD256Impl::Integer const& a) // return (int64)a (uint16 --> int64)
337 {
338 return Integer{
339 SIMD256T::cvtepu16_epi64(a.v4[0]),
340 SIMD256T::cvtepu16_epi64(SIMD128T::template srli_si<8>(a.v4[0])),
341 };
342 }
343
344 static SIMDINLINE Integer SIMDCALL
345 cvtepu32_epi64(SIMD256Impl::Integer const& a) // return (int64)a (uint32 --> int64)
346 {
347 return Integer{
348 SIMD256T::cvtepu32_epi64(a.v4[0]),
349 SIMD256T::cvtepu32_epi64(a.v4[1]),
350 };
351 }
352
353 static SIMDINLINE Integer SIMDCALL
354 cvtps_epi32(Float const& a) // return (int32)a (float --> int32)
355 {
356 return Integer{
357 SIMD256T::cvtps_epi32(a.v8[0]),
358 SIMD256T::cvtps_epi32(a.v8[1]),
359 };
360 }
361
362 static SIMDINLINE Integer SIMDCALL
363 cvttps_epi32(Float const& a) // return (int32)a (rnd_to_zero(float) --> int32)
364 {
365 return Integer{
366 SIMD256T::cvtps_epi32(a.v8[0]),
367 SIMD256T::cvtps_epi32(a.v8[1]),
368 };
369 }
370
371 //-----------------------------------------------------------------------
372 // Comparison operations
373 //-----------------------------------------------------------------------
374 template <CompareType CmpTypeT>
375 static SIMDINLINE Float SIMDCALL cmp_ps(Float const& a, Float const& b) // return a (CmpTypeT) b
376 {
377 return Float{
378 SIMD256T::template cmp_ps<CmpTypeT>(a.v8[0], b.v8[0]),
379 SIMD256T::template cmp_ps<CmpTypeT>(a.v8[1], b.v8[1]),
380 };
381 }
382 static SIMDINLINE Float SIMDCALL cmplt_ps(Float const& a, Float const& b)
383 {
384 return cmp_ps<CompareType::LT_OQ>(a, b);
385 }
386 static SIMDINLINE Float SIMDCALL cmpgt_ps(Float const& a, Float const& b)
387 {
388 return cmp_ps<CompareType::GT_OQ>(a, b);
389 }
390 static SIMDINLINE Float SIMDCALL cmpneq_ps(Float const& a, Float const& b)
391 {
392 return cmp_ps<CompareType::NEQ_OQ>(a, b);
393 }
394 static SIMDINLINE Float SIMDCALL cmpeq_ps(Float const& a, Float const& b)
395 {
396 return cmp_ps<CompareType::EQ_OQ>(a, b);
397 }
398 static SIMDINLINE Float SIMDCALL cmpge_ps(Float const& a, Float const& b)
399 {
400 return cmp_ps<CompareType::GE_OQ>(a, b);
401 }
402 static SIMDINLINE Float SIMDCALL cmple_ps(Float const& a, Float const& b)
403 {
404 return cmp_ps<CompareType::LE_OQ>(a, b);
405 }
406
407 template <CompareType CmpTypeT>
408 static SIMDINLINE Mask SIMDCALL cmp_ps_mask(Float const& a, Float const& b)
409 {
410 return static_cast<Mask>(movemask_ps(cmp_ps<CmpTypeT>(a, b)));
411 }
412
// Integer comparisons, forwarded to the same-named SIMD256T operation per half.
SIMD_IWRAPPER_2(cmpeq_epi8); // return a == b (int8)
SIMD_IWRAPPER_2(cmpeq_epi16); // return a == b (int16)
SIMD_IWRAPPER_2(cmpeq_epi32); // return a == b (int32)
SIMD_IWRAPPER_2(cmpeq_epi64); // return a == b (int64)
SIMD_IWRAPPER_2(cmpgt_epi8); // return a > b (int8)
SIMD_IWRAPPER_2(cmpgt_epi16); // return a > b (int16)
SIMD_IWRAPPER_2(cmpgt_epi32); // return a > b (int32)
SIMD_IWRAPPER_2(cmpgt_epi64); // return a > b (int64)
SIMD_IWRAPPER_2(cmplt_epi32); // return a < b (int32)
422
423 static SIMDINLINE bool SIMDCALL
424 testz_ps(Float const& a, Float const& b) // return all_lanes_zero(a & b) ? 1 : 0 (float)
425 {
426 return 0 != (SIMD256T::testz_ps(a.v8[0], b.v8[0]) & SIMD256T::testz_ps(a.v8[1], b.v8[1]));
427 }
428
429 static SIMDINLINE bool SIMDCALL
430 testz_si(Integer const& a, Integer const& b) // return all_lanes_zero(a & b) ? 1 : 0 (int)
431 {
432 return 0 != (SIMD256T::testz_si(a.v8[0], b.v8[0]) & SIMD256T::testz_si(a.v8[1], b.v8[1]));
433 }
434
435 //-----------------------------------------------------------------------
436 // Blend / shuffle / permute operations
437 //-----------------------------------------------------------------------
// The immediate blends split ImmT per half: low 8 bits control the lower half, the
// next 8 bits the upper half. blendv_ps takes its mask as a third vector argument.
SIMD_WRAPPER_2I(blend_ps); // return ImmT ? b : a (float)
SIMD_IWRAPPER_2I(blend_epi32); // return ImmT ? b : a (int32)
SIMD_WRAPPER_3(blendv_ps); // return mask ? b : a (float)
441 static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer const& a,
442 Integer const& b,
443 Float const& mask) // return mask ? b : a (int)
444 {
445 return Integer{
446 SIMD256T::blendv_epi32(a.v8[0], b.v8[0], mask.v8[0]),
447 SIMD256T::blendv_epi32(a.v8[1], b.v8[1], mask.v8[1]),
448 };
449 }
450
451 static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer const& a,
452 Integer const& b,
453 Integer const& mask) // return mask ? b : a (int)
454 {
455 return Integer{
456 SIMD256T::blendv_epi32(a.v8[0], b.v8[0], mask.v8[0]),
457 SIMD256T::blendv_epi32(a.v8[1], b.v8[1], mask.v8[1]),
458 };
459 }
460
461 static SIMDINLINE Float SIMDCALL
462 broadcast_ss(float const* p) // return *p (all elements in vector get same value)
463 {
464 float f = *p;
465 return Float{
466 SIMD256T::set1_ps(f),
467 SIMD256T::set1_ps(f),
468 };
469 }
470
471 template <int imm>
472 static SIMDINLINE SIMD256Impl::Float SIMDCALL extract_ps(Float const& a)
473 {
474 SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
475 return a.v8[imm];
476 }
477
478 template <int imm>
479 static SIMDINLINE SIMD256Impl::Double SIMDCALL extract_pd(Double const& a)
480 {
481 SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
482 return a.v8[imm];
483 }
484
485 template <int imm>
486 static SIMDINLINE SIMD256Impl::Integer SIMDCALL extract_si(Integer const& a)
487 {
488 SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
489 return a.v8[imm];
490 }
491
492 template <int imm>
493 static SIMDINLINE Float SIMDCALL insert_ps(Float const& a, SIMD256Impl::Float const& b)
494 {
495 SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
496 Float r = a;
497 r.v8[imm] = b;
498 return r;
499 }
500
501 template <int imm>
502 static SIMDINLINE Double SIMDCALL insert_pd(Double const& a, SIMD256Impl::Double const& b)
503 {
504 SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
505 Double r = a;
506 r.v8[imm] = b;
507 return r;
508 }
509
510 template <int imm>
511 static SIMDINLINE Integer SIMDCALL insert_si(Integer const& a, SIMD256Impl::Integer const& b)
512 {
513 SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
514 Integer r = a;
515 r.v8[imm] = b;
516 return r;
517 }
518
// Pack operations, forwarded to the same-named SIMD256T operation per half.
SIMD_IWRAPPER_2(packs_epi16); // See documentation for _mm256_packs_epi16 and _mm512_packs_epi16
SIMD_IWRAPPER_2(packs_epi32); // See documentation for _mm256_packs_epi32 and _mm512_packs_epi32
SIMD_IWRAPPER_2(packus_epi16); // See documentation for _mm256_packus_epi16 and _mm512_packus_epi16
SIMD_IWRAPPER_2(packus_epi32); // See documentation for _mm256_packus_epi32 and _mm512_packus_epi32
523
524 template <int ImmT>
525 static SIMDINLINE Float SIMDCALL permute_ps(Float const& a)
526 {
527 return Float{
528 SIMD256T::template permute_ps<ImmT>(a.v8[0]),
529 SIMD256T::template permute_ps<ImmT>(a.v8[1]),
530 };
531 }
532
533 static SIMDINLINE Integer SIMDCALL permute_epi32(
534 Integer const& a, Integer const& swiz) // return a[swiz[i]] for each 32-bit lane i (int32)
535 {
536 return castps_si(permute_ps(castsi_ps(a), swiz));
537 }
538
539 static SIMDINLINE Float SIMDCALL
540 permute_ps(Float const& a, Integer const& swiz) // return a[swiz[i]] for each 32-bit lane i (float)
541 {
542 const auto mask = SIMD256T::set1_epi32(7);
543
544 auto lolo = SIMD256T::permute_ps(a.v8[0], SIMD256T::and_si(swiz.v8[0], mask));
545 auto lohi = SIMD256T::permute_ps(a.v8[1], SIMD256T::and_si(swiz.v8[0], mask));
546
547 auto hilo = SIMD256T::permute_ps(a.v8[0], SIMD256T::and_si(swiz.v8[1], mask));
548 auto hihi = SIMD256T::permute_ps(a.v8[1], SIMD256T::and_si(swiz.v8[1], mask));
549
550 return Float{
551 SIMD256T::blendv_ps(
552 lolo, lohi, SIMD256T::castsi_ps(SIMD256T::cmpgt_epi32(swiz.v8[0], mask))),
553 SIMD256T::blendv_ps(
554 hilo, hihi, SIMD256T::castsi_ps(SIMD256T::cmpgt_epi32(swiz.v8[1], mask))),
555 };
556 }
557
558 // All of the 512-bit permute2f128_XX intrinsics do the following:
559 //
560 // SELECT4(src, control) {
561 // CASE(control[1:0])
562 // 0: tmp[127:0] : = src[127:0]
563 // 1 : tmp[127:0] : = src[255:128]
564 // 2 : tmp[127:0] : = src[383:256]
565 // 3 : tmp[127:0] : = src[511:384]
566 // ESAC
567 // RETURN tmp[127:0]
568 // }
569 //
570 // dst[127:0] : = SELECT4(a[511:0], imm8[1:0])
571 // dst[255:128] : = SELECT4(a[511:0], imm8[3:2])
572 // dst[383:256] : = SELECT4(b[511:0], imm8[5:4])
573 // dst[511:384] : = SELECT4(b[511:0], imm8[7:6])
574 // dst[MAX:512] : = 0
575 //
576 // Since the 256-bit AVX instructions use a 4-bit control field (instead
577 // of 2-bit for AVX512), we need to expand the control bits sent to the
578 // AVX instructions for emulation.
579 //
580 template <int shuf>
581 static SIMDINLINE Float SIMDCALL permute2f128_ps(Float const& a, Float const& b)
582 {
583 return Float{
584 SIMD256T::template permute2f128_ps<((shuf & 0x03) << 0) | ((shuf & 0x0C) << 2)>(a.v8[0],
585 a.v8[1]),
586 SIMD256T::template permute2f128_ps<((shuf & 0x30) >> 4) | ((shuf & 0xC0) >> 2)>(b.v8[0],
587 b.v8[1]),
588 };
589 }
590
591 template <int shuf>
592 static SIMDINLINE Double SIMDCALL permute2f128_pd(Double const& a, Double const& b)
593 {
594 return Double{
595 SIMD256T::template permute2f128_pd<((shuf & 0x03) << 0) | ((shuf & 0x0C) << 2)>(a.v8[0],
596 a.v8[1]),
597 SIMD256T::template permute2f128_pd<((shuf & 0x30) >> 4) | ((shuf & 0xC0) >> 2)>(b.v8[0],
598 b.v8[1]),
599 };
600 }
601
602 template <int shuf>
603 static SIMDINLINE Integer SIMDCALL permute2f128_si(Integer const& a, Integer const& b)
604 {
605 return Integer{
606 SIMD256T::template permute2f128_si<((shuf & 0x03) << 0) | ((shuf & 0x0C) << 2)>(a.v8[0],
607 a.v8[1]),
608 SIMD256T::template permute2f128_si<((shuf & 0x30) >> 4) | ((shuf & 0xC0) >> 2)>(b.v8[0],
609 b.v8[1]),
610 };
611 }
612
// Shuffles and unpacks, forwarded to the same-named SIMD256T operation per half.
// The *_2I_1 shuffles pass their immediate unchanged to both halves; shuffle_epi64
// gives bits [3:0] of its immediate to the lower half and bits [7:4] to the upper.
SIMD_IWRAPPER_2I_1(shuffle_epi32);
SIMD_IWRAPPER_2I_2(shuffle_epi64);
SIMD_IWRAPPER_2(shuffle_epi8);
SIMD_WRAPPER_2I_1(shuffle_pd);
SIMD_WRAPPER_2I_1(shuffle_ps);
SIMD_IWRAPPER_2(unpackhi_epi16);
SIMD_IWRAPPER_2(unpackhi_epi32);
SIMD_IWRAPPER_2(unpackhi_epi64);
SIMD_IWRAPPER_2(unpackhi_epi8);
SIMD_WRAPPER_2(unpackhi_pd);
SIMD_WRAPPER_2(unpackhi_ps);
SIMD_IWRAPPER_2(unpacklo_epi16);
SIMD_IWRAPPER_2(unpacklo_epi32);
SIMD_IWRAPPER_2(unpacklo_epi64);
SIMD_IWRAPPER_2(unpacklo_epi8);
SIMD_WRAPPER_2(unpacklo_pd);
SIMD_WRAPPER_2(unpacklo_ps);
630
631 //-----------------------------------------------------------------------
632 // Load / store operations
633 //-----------------------------------------------------------------------
634 template <ScaleFactor ScaleT = ScaleFactor::SF_1>
635 static SIMDINLINE Float SIMDCALL
636 i32gather_ps(float const* p, Integer const& idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
637 {
638 return Float{
639 SIMD256T::template i32gather_ps<ScaleT>(p, idx.v8[0]),
640 SIMD256T::template i32gather_ps<ScaleT>(p, idx.v8[1]),
641 };
642 }
643
644 template <ScaleFactor ScaleT = ScaleFactor::SF_1>
645 static SIMDINLINE Float SIMDCALL
646 sw_i32gather_ps(float const* p, Integer const& idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
647 {
648 return Float{
649 SIMD256T::template sw_i32gather_ps<ScaleT>(p, idx.v8[0]),
650 SIMD256T::template sw_i32gather_ps<ScaleT>(p, idx.v8[1]),
651 };
652 }
653
654 static SIMDINLINE Float SIMDCALL
655 load1_ps(float const* p) // return *p (broadcast 1 value to all elements)
656 {
657 return broadcast_ss(p);
658 }
659
660 static SIMDINLINE Float SIMDCALL
661 load_ps(float const* p) // return *p (loads SIMD width elements from memory)
662 {
663 return Float{SIMD256T::load_ps(p), SIMD256T::load_ps(p + TARGET_SIMD_WIDTH)};
664 }
665
666 static SIMDINLINE Integer SIMDCALL load_si(Integer const* p) // return *p
667 {
668 return Integer{
669 SIMD256T::load_si(&p->v8[0]),
670 SIMD256T::load_si(&p->v8[1]),
671 };
672 }
673
674 static SIMDINLINE Float SIMDCALL
675 loadu_ps(float const* p) // return *p (same as load_ps but allows for unaligned mem)
676 {
677 return Float{SIMD256T::loadu_ps(p), SIMD256T::loadu_ps(p + TARGET_SIMD_WIDTH)};
678 }
679
680 static SIMDINLINE Integer SIMDCALL
681 loadu_si(Integer const* p) // return *p (same as load_si but allows for unaligned mem)
682 {
683 return Integer{
684 SIMD256T::loadu_si(&p->v8[0]),
685 SIMD256T::loadu_si(&p->v8[1]),
686 };
687 }
688
689 // for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
690 template <ScaleFactor ScaleT = ScaleFactor::SF_1>
691 static SIMDINLINE Float SIMDCALL
692 mask_i32gather_ps(Float const& old, float const* p, Integer const& idx, Float const& mask)
693 {
694 return Float{
695 SIMD256T::template mask_i32gather_ps<ScaleT>(old.v8[0], p, idx.v8[0], mask.v8[0]),
696 SIMD256T::template mask_i32gather_ps<ScaleT>(old.v8[1], p, idx.v8[1], mask.v8[1]),
697 };
698 }
699
700 template <ScaleFactor ScaleT = ScaleFactor::SF_1>
701 static SIMDINLINE Float SIMDCALL
702 sw_mask_i32gather_ps(Float const& old, float const* p, Integer const& idx, Float const& mask)
703 {
704 return Float{
705 SIMD256T::template sw_mask_i32gather_ps<ScaleT>(old.v8[0], p, idx.v8[0], mask.v8[0]),
706 SIMD256T::template sw_mask_i32gather_ps<ScaleT>(old.v8[1], p, idx.v8[1], mask.v8[1]),
707 };
708 }
709
710 static SIMDINLINE void SIMDCALL maskstore_ps(float* p, Integer const& mask, Float const& src)
711 {
712 SIMD256T::maskstore_ps(p, mask.v8[0], src.v8[0]);
713 SIMD256T::maskstore_ps(p + TARGET_SIMD_WIDTH, mask.v8[1], src.v8[1]);
714 }
715
716 static SIMDINLINE uint64_t SIMDCALL movemask_epi8(Integer const& a)
717 {
718 uint64_t mask = static_cast<uint64_t>(SIMD256T::movemask_epi8(a.v8[0]));
719 mask |= static_cast<uint64_t>(SIMD256T::movemask_epi8(a.v8[1])) << (TARGET_SIMD_WIDTH * 4);
720
721 return mask;
722 }
723
724 static SIMDINLINE uint32_t SIMDCALL movemask_pd(Double const& a)
725 {
726 uint32_t mask = static_cast<uint32_t>(SIMD256T::movemask_pd(a.v8[0]));
727 mask |= static_cast<uint32_t>(SIMD256T::movemask_pd(a.v8[1])) << (TARGET_SIMD_WIDTH / 2);
728
729 return mask;
730 }
731 static SIMDINLINE uint32_t SIMDCALL movemask_ps(Float const& a)
732 {
733 uint32_t mask = static_cast<uint32_t>(SIMD256T::movemask_ps(a.v8[0]));
734 mask |= static_cast<uint32_t>(SIMD256T::movemask_ps(a.v8[1])) << TARGET_SIMD_WIDTH;
735
736 return mask;
737 }
738
739 static SIMDINLINE Integer SIMDCALL set1_epi32(int i) // return i (all elements are same value)
740 {
741 return Integer{SIMD256T::set1_epi32(i), SIMD256T::set1_epi32(i)};
742 }
743
744 static SIMDINLINE Integer SIMDCALL set1_epi8(char i) // return i (all elements are same value)
745 {
746 return Integer{SIMD256T::set1_epi8(i), SIMD256T::set1_epi8(i)};
747 }
748
749 static SIMDINLINE Float SIMDCALL set1_ps(float f) // return f (all elements are same value)
750 {
751 return Float{SIMD256T::set1_ps(f), SIMD256T::set1_ps(f)};
752 }
753
754 static SIMDINLINE Float SIMDCALL setzero_ps() // return 0 (float)
755 {
756 return Float{SIMD256T::setzero_ps(), SIMD256T::setzero_ps()};
757 }
758
759 static SIMDINLINE Integer SIMDCALL setzero_si() // return 0 (integer)
760 {
761 return Integer{SIMD256T::setzero_si(), SIMD256T::setzero_si()};
762 }
763
764 static SIMDINLINE void SIMDCALL
765 store_ps(float* p, Float const& a) // *p = a (stores all elements contiguously in memory)
766 {
767 SIMD256T::store_ps(p, a.v8[0]);
768 SIMD256T::store_ps(p + TARGET_SIMD_WIDTH, a.v8[1]);
769 }
770
771 static SIMDINLINE void SIMDCALL store_si(Integer* p, Integer const& a) // *p = a
772 {
773 SIMD256T::store_si(&p->v8[0], a.v8[0]);
774 SIMD256T::store_si(&p->v8[1], a.v8[1]);
775 }
776
777 static SIMDINLINE void SIMDCALL
778 stream_ps(float* p, Float const& a) // *p = a (same as store_ps, but doesn't keep memory in cache)
779 {
780 SIMD256T::stream_ps(p, a.v8[0]);
781 SIMD256T::stream_ps(p + TARGET_SIMD_WIDTH, a.v8[1]);
782 }
783
784 static SIMDINLINE Integer SIMDCALL set_epi32(int i15,
785 int i14,
786 int i13,
787 int i12,
788 int i11,
789 int i10,
790 int i9,
791 int i8,
792 int i7,
793 int i6,
794 int i5,
795 int i4,
796 int i3,
797 int i2,
798 int i1,
799 int i0)
800 {
801 return Integer{SIMD256T::set_epi32(i7, i6, i5, i4, i3, i2, i1, i0),
802 SIMD256T::set_epi32(i15, i14, i13, i12, i11, i10, i9, i8)};
803 }
804
805 static SIMDINLINE Integer SIMDCALL
806 set_epi32(int i7, int i6, int i5, int i4, int i3, int i2, int i1, int i0)
807 {
808 return set_epi32(0, 0, 0, 0, 0, 0, 0, 0, i7, i6, i5, i4, i3, i2, i1, i0);
809 }
810
811 static SIMDINLINE Float SIMDCALL set_ps(float i15,
812 float i14,
813 float i13,
814 float i12,
815 float i11,
816 float i10,
817 float i9,
818 float i8,
819 float i7,
820 float i6,
821 float i5,
822 float i4,
823 float i3,
824 float i2,
825 float i1,
826 float i0)
827 {
828 return Float{SIMD256T::set_ps(i7, i6, i5, i4, i3, i2, i1, i0),
829 SIMD256T::set_ps(i15, i14, i13, i12, i11, i10, i9, i8)};
830 }
831
832 static SIMDINLINE Float SIMDCALL
833 set_ps(float i7, float i6, float i5, float i4, float i3, float i2, float i1, float i0)
834 {
835 return set_ps(0, 0, 0, 0, 0, 0, 0, 0, i7, i6, i5, i4, i3, i2, i1, i0);
836 }
837
838 static SIMDINLINE Float SIMDCALL vmask_ps(int32_t mask)
839 {
840 return Float{SIMD256T::vmask_ps(mask), SIMD256T::vmask_ps(mask >> TARGET_SIMD_WIDTH)};
841 }
842
// Remove every helper macro defined at the top of this file so none of them leaks
// into the including file. SIMD_IWRAPPER_2I_2 is defined above alongside the others
// and therefore has to be undefined here as well.
#undef SIMD_WRAPPER_1
#undef SIMD_WRAPPER_2
#undef SIMD_WRAPPER_2I
#undef SIMD_WRAPPER_2I_1
#undef SIMD_WRAPPER_3
#undef SIMD_IWRAPPER_1
#undef SIMD_IWRAPPER_2
#undef SIMD_IWRAPPER_2I
#undef SIMD_IWRAPPER_2I_1
#undef SIMD_IWRAPPER_2I_2
#undef SIMD_IWRAPPER_3