/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/
#ifndef __SWR_SIMDINTRIN_H__
#define __SWR_SIMDINTRIN_H__

#include <emmintrin.h>
#include <immintrin.h>
#include <xmmintrin.h>
#if KNOB_SIMD_WIDTH == 8
typedef __m256 simdscalar;
typedef __m256i simdscalari;
typedef uint8_t simdmask;
#else
#error Unsupported vector width
#endif
// simd vector
OSALIGNSIMD(union) simdvector
{
    simdscalar  v[4];
    struct
    {
        simdscalar x, y, z, w;
    };

    simdscalar&       operator[] (const int i)       { return v[i]; }
    const simdscalar& operator[] (const int i) const { return v[i]; }
};
#if KNOB_SIMD_WIDTH == 8
#define _simd128_maskstore_ps _mm_maskstore_ps
#define _simd_load_ps _mm256_load_ps
#define _simd_load1_ps _mm256_broadcast_ss
#define _simd_loadu_ps _mm256_loadu_ps
#define _simd_setzero_ps _mm256_setzero_ps
#define _simd_set1_ps _mm256_set1_ps
#define _simd_blend_ps _mm256_blend_ps
#define _simd_blendv_ps _mm256_blendv_ps
#define _simd_store_ps _mm256_store_ps
#define _simd_mul_ps _mm256_mul_ps
#define _simd_add_ps _mm256_add_ps
#define _simd_sub_ps _mm256_sub_ps
#define _simd_rsqrt_ps _mm256_rsqrt_ps
#define _simd_min_ps _mm256_min_ps
#define _simd_max_ps _mm256_max_ps
#define _simd_movemask_ps _mm256_movemask_ps
#define _simd_cvtps_epi32 _mm256_cvtps_epi32
#define _simd_cvttps_epi32 _mm256_cvttps_epi32
#define _simd_cvtepi32_ps _mm256_cvtepi32_ps
#define _simd_cmplt_ps(a, b) _mm256_cmp_ps(a, b, _CMP_LT_OQ)
#define _simd_cmpgt_ps(a, b) _mm256_cmp_ps(a, b, _CMP_GT_OQ)
#define _simd_cmpneq_ps(a, b) _mm256_cmp_ps(a, b, _CMP_NEQ_OQ)
#define _simd_cmpeq_ps(a, b) _mm256_cmp_ps(a, b, _CMP_EQ_OQ)
#define _simd_cmpge_ps(a, b) _mm256_cmp_ps(a, b, _CMP_GE_OQ)
#define _simd_cmple_ps(a, b) _mm256_cmp_ps(a, b, _CMP_LE_OQ)
#define _simd_cmp_ps(a, b, imm) _mm256_cmp_ps(a, b, imm)
#define _simd_and_ps _mm256_and_ps
#define _simd_or_ps _mm256_or_ps

#define _simd_rcp_ps _mm256_rcp_ps
#define _simd_div_ps _mm256_div_ps
#define _simd_castsi_ps _mm256_castsi256_ps
#define _simd_andnot_ps _mm256_andnot_ps
#define _simd_round_ps _mm256_round_ps
#define _simd_castpd_ps _mm256_castpd_ps
#define _simd_broadcast_ps(a) _mm256_broadcast_ps((const __m128*)(a))
#define _simd_stream_ps _mm256_stream_ps

#define _simd_load_sd _mm256_load_sd
#define _simd_movemask_pd _mm256_movemask_pd
#define _simd_castsi_pd _mm256_castsi256_pd
// emulated integer simd
#define SIMD_EMU_EPI(func, intrin) \
INLINE \
__m256i func(__m256i a, __m256i b)\
{\
    __m128i aHi = _mm256_extractf128_si256(a, 1);\
    __m128i bHi = _mm256_extractf128_si256(b, 1);\
    __m128i aLo = _mm256_castsi256_si128(a);\
    __m128i bLo = _mm256_castsi256_si128(b);\
\
    __m128i subLo = intrin(aLo, bLo);\
    __m128i subHi = intrin(aHi, bHi);\
\
    __m256i result = _mm256_castsi128_si256(subLo);\
            result = _mm256_insertf128_si256(result, subHi, 1);\
\
    return result;\
}
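
// A minimal usage sketch (not part of the original header): SIMD_EMU_EPI
// stamps out a 256-bit integer op from a 128-bit intrinsic by splitting the
// operands into low/high halves, applying the intrinsic per half, and
// reassembling. For example, SIMD_EMU_EPI(_simdemu_add_epi32, _mm_add_epi32)
// expands to a function equivalent to:
//
//     INLINE __m256i _simdemu_add_epi32(__m256i a, __m256i b)
//     {
//         __m128i lo = _mm_add_epi32(_mm256_castsi256_si128(a),
//                                    _mm256_castsi256_si128(b));
//         __m128i hi = _mm_add_epi32(_mm256_extractf128_si256(a, 1),
//                                    _mm256_extractf128_si256(b, 1));
//         return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1);
//     }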
#if (KNOB_ARCH == KNOB_ARCH_AVX)
INLINE
__m256 _simdemu_permute_ps(__m256 a, __m256i b)
{
    __m128 aHi = _mm256_extractf128_ps(a, 1);
    __m128i bHi = _mm256_extractf128_si256(b, 1);
    __m128 aLo = _mm256_castps256_ps128(a);
    __m128i bLo = _mm256_castsi256_si128(b);

    __m128i indexHi = _mm_cmpgt_epi32(bLo, _mm_set1_epi32(3));
    __m128 resLow = _mm_permutevar_ps(aLo, _mm_and_si128(bLo, _mm_set1_epi32(0x3)));
    __m128 resHi = _mm_permutevar_ps(aHi, _mm_and_si128(bLo, _mm_set1_epi32(0x3)));
    __m128 blendLowRes = _mm_blendv_ps(resLow, resHi, _mm_castsi128_ps(indexHi));

    indexHi = _mm_cmpgt_epi32(bHi, _mm_set1_epi32(3));
    resLow = _mm_permutevar_ps(aLo, _mm_and_si128(bHi, _mm_set1_epi32(0x3)));
    resHi = _mm_permutevar_ps(aHi, _mm_and_si128(bHi, _mm_set1_epi32(0x3)));
    __m128 blendHiRes = _mm_blendv_ps(resLow, resHi, _mm_castsi128_ps(indexHi));

    __m256 result = _mm256_castps128_ps256(blendLowRes);
    result = _mm256_insertf128_ps(result, blendHiRes, 1);
    return result;
}
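
// Illustrative note (not in the original header): this emulates AVX2's
// _mm256_permutevar8x32_ps on AVX1. Each 128-bit half is permuted by the
// low two bits of every index, then bit 2 of each index selects between
// the two permuted halves, giving a full cross-lane permute. E.g. with
// a = {0,1,2,3,4,5,6,7} and b = {7,6,5,4,3,2,1,0}, the result is a reversed.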
INLINE
__m256i _simdemu_permute_epi32(__m256i a, __m256i b)
{
    return _mm256_castps_si256(_simdemu_permute_ps(_mm256_castsi256_ps(a), b));
}
INLINE
__m256i _simdemu_srlv_epi32(__m256i vA, __m256i vCount)
{
    int32_t aHi, aLow, countHi, countLow;
    __m128i vAHi = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA), 1));
    __m128i vALow = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA), 0));
    __m128i vCountHi = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vCount), 1));
    __m128i vCountLow = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vCount), 0));

    aHi = _mm_extract_epi32(vAHi, 0);
    countHi = _mm_extract_epi32(vCountHi, 0);
    aHi >>= countHi;
    vAHi = _mm_insert_epi32(vAHi, aHi, 0);

    aLow = _mm_extract_epi32(vALow, 0);
    countLow = _mm_extract_epi32(vCountLow, 0);
    aLow >>= countLow;
    vALow = _mm_insert_epi32(vALow, aLow, 0);

    aHi = _mm_extract_epi32(vAHi, 1);
    countHi = _mm_extract_epi32(vCountHi, 1);
    aHi >>= countHi;
    vAHi = _mm_insert_epi32(vAHi, aHi, 1);

    aLow = _mm_extract_epi32(vALow, 1);
    countLow = _mm_extract_epi32(vCountLow, 1);
    aLow >>= countLow;
    vALow = _mm_insert_epi32(vALow, aLow, 1);

    aHi = _mm_extract_epi32(vAHi, 2);
    countHi = _mm_extract_epi32(vCountHi, 2);
    aHi >>= countHi;
    vAHi = _mm_insert_epi32(vAHi, aHi, 2);

    aLow = _mm_extract_epi32(vALow, 2);
    countLow = _mm_extract_epi32(vCountLow, 2);
    aLow >>= countLow;
    vALow = _mm_insert_epi32(vALow, aLow, 2);

    aHi = _mm_extract_epi32(vAHi, 3);
    countHi = _mm_extract_epi32(vCountHi, 3);
    aHi >>= countHi;
    vAHi = _mm_insert_epi32(vAHi, aHi, 3);

    aLow = _mm_extract_epi32(vALow, 3);
    countLow = _mm_extract_epi32(vCountLow, 3);
    aLow >>= countLow;
    vALow = _mm_insert_epi32(vALow, aLow, 3);

    __m256i ret = _mm256_set1_epi32(0);
    ret = _mm256_insertf128_si256(ret, vAHi, 1);
    ret = _mm256_insertf128_si256(ret, vALow, 0);
    return ret;
}
INLINE
__m256i _simdemu_sllv_epi32(__m256i vA, __m256i vCount)
{
    int32_t aHi, aLow, countHi, countLow;
    __m128i vAHi = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA), 1));
    __m128i vALow = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA), 0));
    __m128i vCountHi = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vCount), 1));
    __m128i vCountLow = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vCount), 0));

    aHi = _mm_extract_epi32(vAHi, 0);
    countHi = _mm_extract_epi32(vCountHi, 0);
    aHi <<= countHi;
    vAHi = _mm_insert_epi32(vAHi, aHi, 0);

    aLow = _mm_extract_epi32(vALow, 0);
    countLow = _mm_extract_epi32(vCountLow, 0);
    aLow <<= countLow;
    vALow = _mm_insert_epi32(vALow, aLow, 0);

    aHi = _mm_extract_epi32(vAHi, 1);
    countHi = _mm_extract_epi32(vCountHi, 1);
    aHi <<= countHi;
    vAHi = _mm_insert_epi32(vAHi, aHi, 1);

    aLow = _mm_extract_epi32(vALow, 1);
    countLow = _mm_extract_epi32(vCountLow, 1);
    aLow <<= countLow;
    vALow = _mm_insert_epi32(vALow, aLow, 1);

    aHi = _mm_extract_epi32(vAHi, 2);
    countHi = _mm_extract_epi32(vCountHi, 2);
    aHi <<= countHi;
    vAHi = _mm_insert_epi32(vAHi, aHi, 2);

    aLow = _mm_extract_epi32(vALow, 2);
    countLow = _mm_extract_epi32(vCountLow, 2);
    aLow <<= countLow;
    vALow = _mm_insert_epi32(vALow, aLow, 2);

    aHi = _mm_extract_epi32(vAHi, 3);
    countHi = _mm_extract_epi32(vCountHi, 3);
    aHi <<= countHi;
    vAHi = _mm_insert_epi32(vAHi, aHi, 3);

    aLow = _mm_extract_epi32(vALow, 3);
    countLow = _mm_extract_epi32(vCountLow, 3);
    aLow <<= countLow;
    vALow = _mm_insert_epi32(vALow, aLow, 3);

    __m256i ret = _mm256_set1_epi32(0);
    ret = _mm256_insertf128_si256(ret, vAHi, 1);
    ret = _mm256_insertf128_si256(ret, vALow, 0);
    return ret;
}
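
// Illustrative note (not in the original header): these emulate AVX2's
// per-lane variable shifts (_mm256_srlv_epi32 / _mm256_sllv_epi32) by
// extracting each 32-bit lane, shifting it by the matching lane of the
// count vector, and inserting it back. For example:
//
//     __m256i v = _mm256_set1_epi32(16);
//     __m256i n = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
//     __m256i r = _simdemu_srlv_epi32(v, n); // lanes (low to high): 16,8,4,2,1,0,0,0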
#define _simd_mul_epi32 _simdemu_mul_epi32
#define _simd_mullo_epi32 _simdemu_mullo_epi32
#define _simd_sub_epi32 _simdemu_sub_epi32
#define _simd_sub_epi64 _simdemu_sub_epi64
#define _simd_min_epi32 _simdemu_min_epi32
#define _simd_min_epu32 _simdemu_min_epu32
#define _simd_max_epi32 _simdemu_max_epi32
#define _simd_max_epu32 _simdemu_max_epu32
#define _simd_add_epi32 _simdemu_add_epi32
#define _simd_and_si _simdemu_and_si
#define _simd_andnot_si _simdemu_andnot_si
#define _simd_cmpeq_epi32 _simdemu_cmpeq_epi32
#define _simd_cmplt_epi32 _simdemu_cmplt_epi32
#define _simd_cmpgt_epi32 _simdemu_cmpgt_epi32
#define _simd_or_si _simdemu_or_si
#define _simd_xor_si _simdemu_xor_si
#define _simd_castps_si _mm256_castps_si256
#define _simd_adds_epu8 _simdemu_adds_epu8
#define _simd_subs_epu8 _simdemu_subs_epu8
#define _simd_add_epi8 _simdemu_add_epi8
#define _simd_cmpeq_epi64 _simdemu_cmpeq_epi64
#define _simd_cmpgt_epi64 _simdemu_cmpgt_epi64
#define _simd_cmpgt_epi8 _simdemu_cmpgt_epi8
#define _simd_cmpeq_epi8 _simdemu_cmpeq_epi8
#define _simd_cmpgt_epi16 _simdemu_cmpgt_epi16
#define _simd_cmpeq_epi16 _simdemu_cmpeq_epi16
#define _simd_movemask_epi8 _simdemu_movemask_epi8
#define _simd_permute_ps _simdemu_permute_ps
#define _simd_permute_epi32 _simdemu_permute_epi32
#define _simd_srlv_epi32 _simdemu_srlv_epi32
#define _simd_sllv_epi32 _simdemu_sllv_epi32
SIMD_EMU_EPI(_simdemu_mul_epi32, _mm_mul_epi32)
SIMD_EMU_EPI(_simdemu_mullo_epi32, _mm_mullo_epi32)
SIMD_EMU_EPI(_simdemu_sub_epi32, _mm_sub_epi32)
SIMD_EMU_EPI(_simdemu_sub_epi64, _mm_sub_epi64)
SIMD_EMU_EPI(_simdemu_min_epi32, _mm_min_epi32)
SIMD_EMU_EPI(_simdemu_min_epu32, _mm_min_epu32)
SIMD_EMU_EPI(_simdemu_max_epi32, _mm_max_epi32)
SIMD_EMU_EPI(_simdemu_max_epu32, _mm_max_epu32)
SIMD_EMU_EPI(_simdemu_add_epi32, _mm_add_epi32)
SIMD_EMU_EPI(_simdemu_and_si, _mm_and_si128)
SIMD_EMU_EPI(_simdemu_andnot_si, _mm_andnot_si128)
SIMD_EMU_EPI(_simdemu_cmpeq_epi32, _mm_cmpeq_epi32)
SIMD_EMU_EPI(_simdemu_cmplt_epi32, _mm_cmplt_epi32)
SIMD_EMU_EPI(_simdemu_cmpgt_epi32, _mm_cmpgt_epi32)
SIMD_EMU_EPI(_simdemu_or_si, _mm_or_si128)
SIMD_EMU_EPI(_simdemu_xor_si, _mm_xor_si128)
SIMD_EMU_EPI(_simdemu_adds_epu8, _mm_adds_epu8)
SIMD_EMU_EPI(_simdemu_subs_epu8, _mm_subs_epu8)
SIMD_EMU_EPI(_simdemu_add_epi8, _mm_add_epi8)
SIMD_EMU_EPI(_simdemu_cmpeq_epi64, _mm_cmpeq_epi64)
SIMD_EMU_EPI(_simdemu_cmpgt_epi64, _mm_cmpgt_epi64)
SIMD_EMU_EPI(_simdemu_cmpgt_epi8, _mm_cmpgt_epi8)
SIMD_EMU_EPI(_simdemu_cmpeq_epi8, _mm_cmpeq_epi8)
SIMD_EMU_EPI(_simdemu_cmpgt_epi16, _mm_cmpgt_epi16)
SIMD_EMU_EPI(_simdemu_cmpeq_epi16, _mm_cmpeq_epi16)
#define _simd_unpacklo_epi32(a, b) _mm256_castps_si256(_mm256_unpacklo_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)))
#define _simd_unpackhi_epi32(a, b) _mm256_castps_si256(_mm256_unpackhi_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)))
#define _simd_unpacklo_epi64(a, b) _mm256_castpd_si256(_mm256_unpacklo_pd(_mm256_castsi256_pd(a), _mm256_castsi256_pd(b)))
#define _simd_unpackhi_epi64(a, b) _mm256_castpd_si256(_mm256_unpackhi_pd(_mm256_castsi256_pd(a), _mm256_castsi256_pd(b)))

#define _simd_slli_epi32(a,i) _simdemu_slli_epi32(a,i)
#define _simd_srai_epi32(a,i) _simdemu_srai_epi32(a,i)
#define _simd_srli_epi32(a,i) _simdemu_srli_epi32(a,i)
#define _simd_srlisi_ps(a,i) _mm256_castsi256_ps(_simdemu_srli_si128<i>(_mm256_castps_si256(a)))
#define _simd128_fmadd_ps _mm_fmaddemu_ps
#define _simd_fmadd_ps _mm_fmaddemu256_ps
#define _simd_fmsub_ps _mm_fmsubemu256_ps
#define _simd_shuffle_epi8 _simdemu_shuffle_epi8
SIMD_EMU_EPI(_simdemu_shuffle_epi8, _mm_shuffle_epi8)
INLINE
__m128 _mm_fmaddemu_ps(__m128 a, __m128 b, __m128 c)
{
    __m128 res = _mm_mul_ps(a, b);
    res = _mm_add_ps(res, c);
    return res;
}

INLINE
__m256 _mm_fmaddemu256_ps(__m256 a, __m256 b, __m256 c)
{
    __m256 res = _mm256_mul_ps(a, b);
    res = _mm256_add_ps(res, c);
    return res;
}

INLINE
__m256 _mm_fmsubemu256_ps(__m256 a, __m256 b, __m256 c)
{
    __m256 res = _mm256_mul_ps(a, b);
    res = _mm256_sub_ps(res, c);
    return res;
}
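
// Illustrative note (not in the original header): these emulated fmadd/fmsub
// round twice (once after the multiply, once after the add/subtract), while
// a hardware FMA rounds once, so results can differ from _mm256_fmadd_ps in
// the last ULP. A usage sketch:
//
//     __m256 a = _mm256_set1_ps(1.0f);
//     __m256 b = _mm256_set1_ps(2.0f);
//     __m256 c = _mm256_set1_ps(3.0f);
//     __m256 r = _mm_fmaddemu256_ps(a, b, c); // each lane == 5.0f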
INLINE
__m256 _simd_i32gather_ps(const float* pBase, __m256i vOffsets, const int scale)
{
    uint32_t *pOffsets = (uint32_t*)&vOffsets;
    simdscalar vResult;
    float* pResult = (float*)&vResult;
    for (uint32_t i = 0; i < KNOB_SIMD_WIDTH; ++i)
    {
        uint32_t offset = pOffsets[i];
        offset = offset * scale;
        pResult[i] = *(float*)(((const uint8_t*)pBase + offset));
    }

    return vResult;
}
INLINE
__m256 _simd_mask_i32gather_ps(__m256 vSrc, const float* pBase, __m256i vOffsets, __m256 vMask, const int scale)
{
    uint32_t *pOffsets = (uint32_t*)&vOffsets;
    simdscalar vResult = vSrc;
    float* pResult = (float*)&vResult;
    DWORD index;
    uint32_t mask = _simd_movemask_ps(vMask);
    while (_BitScanForward(&index, mask))
    {
        mask &= ~(1 << index);
        uint32_t offset = pOffsets[index];
        offset = offset * scale;
        pResult[index] = *(float*)(((const uint8_t*)pBase + offset));
    }

    return vResult;
}
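
// A usage sketch (not part of the original header): gather eight floats at
// byte offsets scaled from per-lane indices. With scale == 4 the offsets
// behave as element indices into pBase:
//
//     __m256i idx = _mm256_set_epi32(14, 12, 10, 8, 6, 4, 2, 0);
//     __m256  v   = _simd_i32gather_ps(pBase, idx, 4); // pBase[0], pBase[2], ...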
INLINE
__m256i _simd_abs_epi32(__m256i a)
{
    __m128i aHi = _mm256_extractf128_si256(a, 1);
    __m128i aLo = _mm256_castsi256_si128(a);
    __m128i absLo = _mm_abs_epi32(aLo);
    __m128i absHi = _mm_abs_epi32(aHi);
    __m256i result = _mm256_castsi128_si256(absLo);
    result = _mm256_insertf128_si256(result, absHi, 1);
    return result;
}
INLINE
int _simdemu_movemask_epi8(__m256i a)
{
    __m128i aHi = _mm256_extractf128_si256(a, 1);
    __m128i aLo = _mm256_castsi256_si128(a);

    int resHi = _mm_movemask_epi8(aHi);
    int resLo = _mm_movemask_epi8(aLo);

    return (resHi << 16) | resLo;
}
INLINE
__m256i _simd_cvtepu8_epi32(__m128i a)
{
    __m128i resultlo = _mm_cvtepu8_epi32(a);
    __m128i resulthi = _mm_shuffle_epi8(a, _mm_set_epi32(0x80808007, 0x80808006, 0x80808005, 0x80808004));

    __m256i result = _mm256_castsi128_si256(resultlo);

    return _mm256_insertf128_si256(result, resulthi, 1);
}
INLINE
__m256i _simd_cvtepu16_epi32(__m128i a)
{
    __m128i resultlo = _mm_cvtepu16_epi32(a);
    __m128i resulthi = _mm_shuffle_epi8(a, _mm_set_epi32(0x80800F0E, 0x80800D0C, 0x80800B0A, 0x80800908));

    __m256i result = _mm256_castsi128_si256(resultlo);

    return _mm256_insertf128_si256(result, resulthi, 1);
}
INLINE
__m256i _simd_packus_epi32(__m256i a, __m256i b)
{
    __m128i alo = _mm256_extractf128_si256(a, 0);
    __m128i ahi = _mm256_extractf128_si256(a, 1);

    __m128i blo = _mm256_extractf128_si256(b, 0);
    __m128i bhi = _mm256_extractf128_si256(b, 1);

    __m128i resultlo = _mm_packus_epi32(alo, blo);
    __m128i resulthi = _mm_packus_epi32(ahi, bhi);

    __m256i result = _mm256_castsi128_si256(resultlo);

    return _mm256_insertf128_si256(result, resulthi, 1);
}
INLINE
__m256i _simd_packs_epi32(__m256i a, __m256i b)
{
    __m128i alo = _mm256_extractf128_si256(a, 0);
    __m128i ahi = _mm256_extractf128_si256(a, 1);

    __m128i blo = _mm256_extractf128_si256(b, 0);
    __m128i bhi = _mm256_extractf128_si256(b, 1);

    __m128i resultlo = _mm_packs_epi32(alo, blo);
    __m128i resulthi = _mm_packs_epi32(ahi, bhi);

    __m256i result = _mm256_castsi128_si256(resultlo);

    return _mm256_insertf128_si256(result, resulthi, 1);
}

#else
#define _simd_mul_epi32 _mm256_mul_epi32
#define _simd_mullo_epi32 _mm256_mullo_epi32
#define _simd_sub_epi32 _mm256_sub_epi32
#define _simd_sub_epi64 _mm256_sub_epi64
#define _simd_min_epi32 _mm256_min_epi32
#define _simd_max_epi32 _mm256_max_epi32
#define _simd_min_epu32 _mm256_min_epu32
#define _simd_max_epu32 _mm256_max_epu32
#define _simd_add_epi32 _mm256_add_epi32
#define _simd_and_si _mm256_and_si256
#define _simd_andnot_si _mm256_andnot_si256
#define _simd_cmpeq_epi32 _mm256_cmpeq_epi32
#define _simd_cmplt_epi32(a,b) _mm256_cmpgt_epi32(b,a)
#define _simd_cmpgt_epi32(a,b) _mm256_cmpgt_epi32(a,b)
#define _simd_or_si _mm256_or_si256
#define _simd_xor_si _mm256_xor_si256
#define _simd_castps_si _mm256_castps_si256

#define _simd_unpacklo_epi32 _mm256_unpacklo_epi32
#define _simd_unpackhi_epi32 _mm256_unpackhi_epi32
#define _simd_unpacklo_epi64 _mm256_unpacklo_epi64
#define _simd_unpackhi_epi64 _mm256_unpackhi_epi64

#define _simd_srli_si(a,i) _simdemu_srli_si128<i>(a)
#define _simd_slli_epi32 _mm256_slli_epi32
#define _simd_srai_epi32 _mm256_srai_epi32
#define _simd_srli_epi32 _mm256_srli_epi32
#define _simd_srlisi_ps(a,i) _mm256_castsi256_ps(_simdemu_srli_si128<i>(_mm256_castps_si256(a)))
#define _simd128_fmadd_ps _mm_fmadd_ps
#define _simd_fmadd_ps _mm256_fmadd_ps
#define _simd_fmsub_ps _mm256_fmsub_ps
#define _simd_shuffle_epi8 _mm256_shuffle_epi8
#define _simd_adds_epu8 _mm256_adds_epu8
#define _simd_subs_epu8 _mm256_subs_epu8
#define _simd_add_epi8 _mm256_add_epi8
#define _simd_i32gather_ps _mm256_i32gather_ps
#define _simd_mask_i32gather_ps _mm256_mask_i32gather_ps
#define _simd_abs_epi32 _mm256_abs_epi32

#define _simd_cmpeq_epi64 _mm256_cmpeq_epi64
#define _simd_cmpgt_epi64 _mm256_cmpgt_epi64
#define _simd_cmpgt_epi8 _mm256_cmpgt_epi8
#define _simd_cmpeq_epi8 _mm256_cmpeq_epi8
#define _simd_cmpgt_epi16 _mm256_cmpgt_epi16
#define _simd_cmpeq_epi16 _mm256_cmpeq_epi16
#define _simd_movemask_epi8 _mm256_movemask_epi8
#define _simd_permute_ps _mm256_permutevar8x32_ps
#define _simd_permute_epi32 _mm256_permutevar8x32_epi32
#define _simd_srlv_epi32 _mm256_srlv_epi32
#define _simd_sllv_epi32 _mm256_sllv_epi32
#define _simd_cvtepu8_epi32 _mm256_cvtepu8_epi32
#define _simd_cvtepu16_epi32 _mm256_cvtepu16_epi32
#define _simd_packus_epi32 _mm256_packus_epi32
#define _simd_packs_epi32 _mm256_packs_epi32

#endif
#define _simd_permute2f128_ps _mm256_permute2f128_ps
#define _simd_permute2f128_pd _mm256_permute2f128_pd
#define _simd_permute2f128_si _mm256_permute2f128_si256
#define _simd_shuffle_ps _mm256_shuffle_ps
#define _simd_shuffle_pd _mm256_shuffle_pd
#define _simd_shuffle_epi32(a, b, imm8) _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b), imm8))
#define _simd_shuffle_epi64(a, b, imm8) _mm256_castpd_si256(_mm256_shuffle_pd(_mm256_castsi256_pd(a), _mm256_castsi256_pd(b), imm8))
#define _simd_set1_epi32 _mm256_set1_epi32
#define _simd_set_epi32 _mm256_set_epi32
#define _simd_set1_epi8 _mm256_set1_epi8
#define _simd_setzero_si _mm256_setzero_si256
#define _simd_cvttps_epi32 _mm256_cvttps_epi32
#define _simd_store_si _mm256_store_si256
#define _simd_broadcast_ss _mm256_broadcast_ss
#define _simd_maskstore_ps _mm256_maskstore_ps
#define _simd_load_si _mm256_load_si256
#define _simd_loadu_si _mm256_loadu_si256
#define _simd_sub_ps _mm256_sub_ps
#define _simd_testz_ps _mm256_testz_ps
#define _simd_xor_ps _mm256_xor_ps
INLINE
simdscalari _simd_blendv_epi32(simdscalari a, simdscalari b, simdscalar mask)
{
    return _simd_castps_si(_simd_blendv_ps(_simd_castsi_ps(a), _simd_castsi_ps(b), mask));
}
INLINE
simdscalari _simd_blendv_epi32(simdscalari a, simdscalari b, simdscalari mask)
{
    return _simd_castps_si(_simd_blendv_ps(_simd_castsi_ps(a), _simd_castsi_ps(b), _simd_castsi_ps(mask)));
}
// convert bitmask to vector mask
INLINE
simdscalar vMask(int32_t mask)
{
    __m256i vec = _mm256_set1_epi32(mask);
    const __m256i bit = _mm256_set_epi32(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01);
    vec = _simd_and_si(vec, bit);
    vec = _simd_cmplt_epi32(_mm256_setzero_si256(), vec);
    return _simd_castsi_ps(vec);
}
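
// A usage sketch (not part of the original header): vMask expands a scalar
// lane bitmask into a full-width vector mask, the inverse of
// _simd_movemask_ps. E.g. vMask(0x81) produces a mask with lanes 0 and 7
// all-ones and the rest zero, suitable for _simd_blendv_ps or
// _simd_maskstore_ps.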
INLINE
void _simd_mov(simdscalar &r, unsigned int rlane, simdscalar& s, unsigned int slane)
{
    OSALIGNSIMD(float) rArray[KNOB_SIMD_WIDTH], sArray[KNOB_SIMD_WIDTH];
    _mm256_store_ps(rArray, r);
    _mm256_store_ps(sArray, s);
    rArray[rlane] = sArray[slane];
    r = _mm256_load_ps(rArray);
}
INLINE __m256i _simdemu_slli_epi32(__m256i a, uint32_t i)
{
    __m128i aHi = _mm256_extractf128_si256(a, 1);
    __m128i aLo = _mm256_castsi256_si128(a);

    __m128i resHi = _mm_slli_epi32(aHi, i);
    __m128i resLo = _mm_slli_epi32(aLo, i);

    __m256i result = _mm256_castsi128_si256(resLo);
    result = _mm256_insertf128_si256(result, resHi, 1);

    return result;
}
INLINE __m256i _simdemu_srai_epi32(__m256i a, uint32_t i)
{
    __m128i aHi = _mm256_extractf128_si256(a, 1);
    __m128i aLo = _mm256_castsi256_si128(a);

    __m128i resHi = _mm_srai_epi32(aHi, i);
    __m128i resLo = _mm_srai_epi32(aLo, i);

    __m256i result = _mm256_castsi128_si256(resLo);
    result = _mm256_insertf128_si256(result, resHi, 1);

    return result;
}
INLINE __m256i _simdemu_srli_epi32(__m256i a, uint32_t i)
{
    __m128i aHi = _mm256_extractf128_si256(a, 1);
    __m128i aLo = _mm256_castsi256_si128(a);

    __m128i resHi = _mm_srli_epi32(aHi, i);
    __m128i resLo = _mm_srli_epi32(aLo, i);

    __m256i result = _mm256_castsi128_si256(resLo);
    result = _mm256_insertf128_si256(result, resHi, 1);

    return result;
}
INLINE
void _simdvec_transpose(simdvector &v)
{
    SWR_ASSERT(false, "Need to implement 8 wide version");
}

#else
#error Unsupported vector width
#endif
// Populates a simdvector from a vector. So p = xyzw becomes xxxx yyyy zzzz wwww.
INLINE
void _simdvec_load_ps(simdvector& r, const float *p)
{
    r[0] = _simd_set1_ps(p[0]);
    r[1] = _simd_set1_ps(p[1]);
    r[2] = _simd_set1_ps(p[2]);
    r[3] = _simd_set1_ps(p[3]);
}
INLINE
void _simdvec_mov(simdvector& r, const simdscalar& s)
{
    r[0] = s;
    r[1] = s;
    r[2] = s;
    r[3] = s;
}

INLINE
void _simdvec_mov(simdvector& r, const simdvector& v)
{
    r[0] = v[0];
    r[1] = v[1];
    r[2] = v[2];
    r[3] = v[3];
}
// just move a lane from the source simdvector to dest simdvector
INLINE
void _simdvec_mov(simdvector &r, unsigned int rlane, simdvector& s, unsigned int slane)
{
    _simd_mov(r[0], rlane, s[0], slane);
    _simd_mov(r[1], rlane, s[1], slane);
    _simd_mov(r[2], rlane, s[2], slane);
    _simd_mov(r[3], rlane, s[3], slane);
}
INLINE
void _simdvec_dp3_ps(simdscalar& r, const simdvector& v0, const simdvector& v1)
{
    simdscalar tmp;
    r   = _simd_mul_ps(v0[0], v1[0]);   // (v0.x*v1.x)

    tmp = _simd_mul_ps(v0[1], v1[1]);   // (v0.y*v1.y)
    r   = _simd_add_ps(r, tmp);         // (v0.x*v1.x) + (v0.y*v1.y)

    tmp = _simd_mul_ps(v0[2], v1[2]);   // (v0.z*v1.z)
    r   = _simd_add_ps(r, tmp);         // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z)
}
INLINE
void _simdvec_dp4_ps(simdscalar& r, const simdvector& v0, const simdvector& v1)
{
    simdscalar tmp;
    r   = _simd_mul_ps(v0[0], v1[0]);   // (v0.x*v1.x)

    tmp = _simd_mul_ps(v0[1], v1[1]);   // (v0.y*v1.y)
    r   = _simd_add_ps(r, tmp);         // (v0.x*v1.x) + (v0.y*v1.y)

    tmp = _simd_mul_ps(v0[2], v1[2]);   // (v0.z*v1.z)
    r   = _simd_add_ps(r, tmp);         // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z)

    tmp = _simd_mul_ps(v0[3], v1[3]);   // (v0.w*v1.w)
    r   = _simd_add_ps(r, tmp);         // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z) + (v0.w*v1.w)
}
INLINE
simdscalar _simdvec_rcp_length_ps(const simdvector& v)
{
    simdscalar length;
    _simdvec_dp4_ps(length, v, v);
    return _simd_rsqrt_ps(length);
}
INLINE
void _simdvec_normalize_ps(simdvector& r, const simdvector& v)
{
    simdscalar vecLength;
    vecLength = _simdvec_rcp_length_ps(v);

    r[0] = _simd_mul_ps(v[0], vecLength);
    r[1] = _simd_mul_ps(v[1], vecLength);
    r[2] = _simd_mul_ps(v[2], vecLength);
    r[3] = _simd_mul_ps(v[3], vecLength);
}
INLINE
void _simdvec_mul_ps(simdvector& r, const simdvector& v, const simdscalar& s)
{
    r[0] = _simd_mul_ps(v[0], s);
    r[1] = _simd_mul_ps(v[1], s);
    r[2] = _simd_mul_ps(v[2], s);
    r[3] = _simd_mul_ps(v[3], s);
}
INLINE
void _simdvec_mul_ps(simdvector& r, const simdvector& v0, const simdvector& v1)
{
    r[0] = _simd_mul_ps(v0[0], v1[0]);
    r[1] = _simd_mul_ps(v0[1], v1[1]);
    r[2] = _simd_mul_ps(v0[2], v1[2]);
    r[3] = _simd_mul_ps(v0[3], v1[3]);
}
INLINE
void _simdvec_add_ps(simdvector& r, const simdvector& v0, const simdvector& v1)
{
    r[0] = _simd_add_ps(v0[0], v1[0]);
    r[1] = _simd_add_ps(v0[1], v1[1]);
    r[2] = _simd_add_ps(v0[2], v1[2]);
    r[3] = _simd_add_ps(v0[3], v1[3]);
}
INLINE
void _simdvec_min_ps(simdvector& r, const simdvector& v0, const simdscalar& s)
{
    r[0] = _simd_min_ps(v0[0], s);
    r[1] = _simd_min_ps(v0[1], s);
    r[2] = _simd_min_ps(v0[2], s);
    r[3] = _simd_min_ps(v0[3], s);
}
INLINE
void _simdvec_max_ps(simdvector& r, const simdvector& v0, const simdscalar& s)
{
    r[0] = _simd_max_ps(v0[0], s);
    r[1] = _simd_max_ps(v0[1], s);
    r[2] = _simd_max_ps(v0[2], s);
    r[3] = _simd_max_ps(v0[3], s);
}
// Matrix4x4 * Vector4
//   outVec.x = (m00 * v.x) + (m01 * v.y) + (m02 * v.z) + (m03 * v.w)
//   outVec.y = (m10 * v.x) + (m11 * v.y) + (m12 * v.z) + (m13 * v.w)
//   outVec.z = (m20 * v.x) + (m21 * v.y) + (m22 * v.z) + (m23 * v.w)
//   outVec.w = (m30 * v.x) + (m31 * v.y) + (m32 * v.z) + (m33 * v.w)
INLINE
void _simd_mat4x4_vec4_multiply(
    simdvector& result,
    const float *pMatrix,
    const simdvector& v)
{
    simdscalar m;
    simdscalar r0;
    simdscalar r1;

    m = _simd_load1_ps(pMatrix + 0*4 + 0);  // m[row][0]
    r0 = _simd_mul_ps(m, v[0]);             // (m0 * v.x)
    m = _simd_load1_ps(pMatrix + 0*4 + 1);  // m[row][1]
    r1 = _simd_mul_ps(m, v[1]);             // (m1 * v.y)
    r0 = _simd_add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y)
    m = _simd_load1_ps(pMatrix + 0*4 + 2);  // m[row][2]
    r1 = _simd_mul_ps(m, v[2]);             // (m2 * v.z)
    r0 = _simd_add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
    m = _simd_load1_ps(pMatrix + 0*4 + 3);  // m[row][3]
    r1 = _simd_mul_ps(m, v[3]);             // (m3 * v.w)
    r0 = _simd_add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * v.w)
    result[0] = r0;

    m = _simd_load1_ps(pMatrix + 1*4 + 0);  // m[row][0]
    r0 = _simd_mul_ps(m, v[0]);             // (m0 * v.x)
    m = _simd_load1_ps(pMatrix + 1*4 + 1);  // m[row][1]
    r1 = _simd_mul_ps(m, v[1]);             // (m1 * v.y)
    r0 = _simd_add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y)
    m = _simd_load1_ps(pMatrix + 1*4 + 2);  // m[row][2]
    r1 = _simd_mul_ps(m, v[2]);             // (m2 * v.z)
    r0 = _simd_add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
    m = _simd_load1_ps(pMatrix + 1*4 + 3);  // m[row][3]
    r1 = _simd_mul_ps(m, v[3]);             // (m3 * v.w)
    r0 = _simd_add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * v.w)
    result[1] = r0;

    m = _simd_load1_ps(pMatrix + 2*4 + 0);  // m[row][0]
    r0 = _simd_mul_ps(m, v[0]);             // (m0 * v.x)
    m = _simd_load1_ps(pMatrix + 2*4 + 1);  // m[row][1]
    r1 = _simd_mul_ps(m, v[1]);             // (m1 * v.y)
    r0 = _simd_add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y)
    m = _simd_load1_ps(pMatrix + 2*4 + 2);  // m[row][2]
    r1 = _simd_mul_ps(m, v[2]);             // (m2 * v.z)
    r0 = _simd_add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
    m = _simd_load1_ps(pMatrix + 2*4 + 3);  // m[row][3]
    r1 = _simd_mul_ps(m, v[3]);             // (m3 * v.w)
    r0 = _simd_add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * v.w)
    result[2] = r0;

    m = _simd_load1_ps(pMatrix + 3*4 + 0);  // m[row][0]
    r0 = _simd_mul_ps(m, v[0]);             // (m0 * v.x)
    m = _simd_load1_ps(pMatrix + 3*4 + 1);  // m[row][1]
    r1 = _simd_mul_ps(m, v[1]);             // (m1 * v.y)
    r0 = _simd_add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y)
    m = _simd_load1_ps(pMatrix + 3*4 + 2);  // m[row][2]
    r1 = _simd_mul_ps(m, v[2]);             // (m2 * v.z)
    r0 = _simd_add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
    m = _simd_load1_ps(pMatrix + 3*4 + 3);  // m[row][3]
    r1 = _simd_mul_ps(m, v[3]);             // (m3 * v.w)
    r0 = _simd_add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * v.w)
    result[3] = r0;
}
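
// A usage sketch (not part of the original header): with positions laid out
// SoA (all x's in pos[0], all y's in pos[1], ...), one call transforms
// KNOB_SIMD_WIDTH vertices by a row-major 4x4 matrix ('xfMatrix' is a
// hypothetical float[16]):
//
//     simdvector clip;
//     _simd_mat4x4_vec4_multiply(clip, xfMatrix, pos);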
// Matrix4x4 * Vector3 - Direction Vector where w = 0.
//   outVec.x = (m00 * v.x) + (m01 * v.y) + (m02 * v.z) + (m03 * 0)
//   outVec.y = (m10 * v.x) + (m11 * v.y) + (m12 * v.z) + (m13 * 0)
//   outVec.z = (m20 * v.x) + (m21 * v.y) + (m22 * v.z) + (m23 * 0)
//   outVec.w = (m30 * v.x) + (m31 * v.y) + (m32 * v.z) + (m33 * 0)
INLINE
void _simd_mat3x3_vec3_w0_multiply(
    simdvector& result,
    const float *pMatrix,
    const simdvector& v)
{
    simdscalar m;
    simdscalar r0;
    simdscalar r1;

    m = _simd_load1_ps(pMatrix + 0*4 + 0);  // m[row][0]
    r0 = _simd_mul_ps(m, v[0]);             // (m0 * v.x)
    m = _simd_load1_ps(pMatrix + 0*4 + 1);  // m[row][1]
    r1 = _simd_mul_ps(m, v[1]);             // (m1 * v.y)
    r0 = _simd_add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y)
    m = _simd_load1_ps(pMatrix + 0*4 + 2);  // m[row][2]
    r1 = _simd_mul_ps(m, v[2]);             // (m2 * v.z)
    r0 = _simd_add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
    result[0] = r0;

    m = _simd_load1_ps(pMatrix + 1*4 + 0);  // m[row][0]
    r0 = _simd_mul_ps(m, v[0]);             // (m0 * v.x)
    m = _simd_load1_ps(pMatrix + 1*4 + 1);  // m[row][1]
    r1 = _simd_mul_ps(m, v[1]);             // (m1 * v.y)
    r0 = _simd_add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y)
    m = _simd_load1_ps(pMatrix + 1*4 + 2);  // m[row][2]
    r1 = _simd_mul_ps(m, v[2]);             // (m2 * v.z)
    r0 = _simd_add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
    result[1] = r0;

    m = _simd_load1_ps(pMatrix + 2*4 + 0);  // m[row][0]
    r0 = _simd_mul_ps(m, v[0]);             // (m0 * v.x)
    m = _simd_load1_ps(pMatrix + 2*4 + 1);  // m[row][1]
    r1 = _simd_mul_ps(m, v[1]);             // (m1 * v.y)
    r0 = _simd_add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y)
    m = _simd_load1_ps(pMatrix + 2*4 + 2);  // m[row][2]
    r1 = _simd_mul_ps(m, v[2]);             // (m2 * v.z)
    r0 = _simd_add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
    result[2] = r0;

    result[3] = _simd_setzero_ps();
}
// Matrix4x4 * Vector3 - Position vector where w = 1.
//   outVec.x = (m00 * v.x) + (m01 * v.y) + (m02 * v.z) + (m03 * 1)
//   outVec.y = (m10 * v.x) + (m11 * v.y) + (m12 * v.z) + (m13 * 1)
//   outVec.z = (m20 * v.x) + (m21 * v.y) + (m22 * v.z) + (m23 * 1)
//   outVec.w = (m30 * v.x) + (m31 * v.y) + (m32 * v.z) + (m33 * 1)
INLINE
void _simd_mat4x4_vec3_w1_multiply(
    simdvector& result,
    const float *pMatrix,
    const simdvector& v)
{
    simdscalar m;
    simdscalar r0;
    simdscalar r1;

    m = _simd_load1_ps(pMatrix + 0*4 + 0);  // m[row][0]
    r0 = _simd_mul_ps(m, v[0]);             // (m0 * v.x)
    m = _simd_load1_ps(pMatrix + 0*4 + 1);  // m[row][1]
    r1 = _simd_mul_ps(m, v[1]);             // (m1 * v.y)
    r0 = _simd_add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y)
    m = _simd_load1_ps(pMatrix + 0*4 + 2);  // m[row][2]
    r1 = _simd_mul_ps(m, v[2]);             // (m2 * v.z)
    r0 = _simd_add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
    m = _simd_load1_ps(pMatrix + 0*4 + 3);  // m[row][3]
    r0 = _simd_add_ps(r0, m);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * 1)
    result[0] = r0;

    m = _simd_load1_ps(pMatrix + 1*4 + 0);  // m[row][0]
    r0 = _simd_mul_ps(m, v[0]);             // (m0 * v.x)
    m = _simd_load1_ps(pMatrix + 1*4 + 1);  // m[row][1]
    r1 = _simd_mul_ps(m, v[1]);             // (m1 * v.y)
    r0 = _simd_add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y)
    m = _simd_load1_ps(pMatrix + 1*4 + 2);  // m[row][2]
    r1 = _simd_mul_ps(m, v[2]);             // (m2 * v.z)
    r0 = _simd_add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
    m = _simd_load1_ps(pMatrix + 1*4 + 3);  // m[row][3]
    r0 = _simd_add_ps(r0, m);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * 1)
    result[1] = r0;

    m = _simd_load1_ps(pMatrix + 2*4 + 0);  // m[row][0]
    r0 = _simd_mul_ps(m, v[0]);             // (m0 * v.x)
    m = _simd_load1_ps(pMatrix + 2*4 + 1);  // m[row][1]
    r1 = _simd_mul_ps(m, v[1]);             // (m1 * v.y)
    r0 = _simd_add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y)
    m = _simd_load1_ps(pMatrix + 2*4 + 2);  // m[row][2]
    r1 = _simd_mul_ps(m, v[2]);             // (m2 * v.z)
    r0 = _simd_add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
    m = _simd_load1_ps(pMatrix + 2*4 + 3);  // m[row][3]
    r0 = _simd_add_ps(r0, m);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * 1)
    result[2] = r0;

    m = _simd_load1_ps(pMatrix + 3*4 + 0);  // m[row][0]
    r0 = _simd_mul_ps(m, v[0]);             // (m0 * v.x)
    m = _simd_load1_ps(pMatrix + 3*4 + 1);  // m[row][1]
    r1 = _simd_mul_ps(m, v[1]);             // (m1 * v.y)
    r0 = _simd_add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y)
    m = _simd_load1_ps(pMatrix + 3*4 + 2);  // m[row][2]
    r1 = _simd_mul_ps(m, v[2]);             // (m2 * v.z)
    r0 = _simd_add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
    m = _simd_load1_ps(pMatrix + 3*4 + 3);  // m[row][3]
    result[3] = _simd_add_ps(r0, m);        // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * 1)
}
// Matrix4x3 * Vector3 - Position vector where w = 1.
//   outVec.x = (m00 * v.x) + (m01 * v.y) + (m02 * v.z) + (m03 * 1)
//   outVec.y = (m10 * v.x) + (m11 * v.y) + (m12 * v.z) + (m13 * 1)
//   outVec.z = (m20 * v.x) + (m21 * v.y) + (m22 * v.z) + (m23 * 1)
//   outVec.w = 1
INLINE
void _simd_mat4x3_vec3_w1_multiply(
    simdvector& result,
    const float *pMatrix,
    const simdvector& v)
{
    simdscalar m;
    simdscalar r0;
    simdscalar r1;

    m = _simd_load1_ps(pMatrix + 0*4 + 0);  // m[row][0]
    r0 = _simd_mul_ps(m, v[0]);             // (m0 * v.x)
    m = _simd_load1_ps(pMatrix + 0*4 + 1);  // m[row][1]
    r1 = _simd_mul_ps(m, v[1]);             // (m1 * v.y)
    r0 = _simd_add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y)
    m = _simd_load1_ps(pMatrix + 0*4 + 2);  // m[row][2]
    r1 = _simd_mul_ps(m, v[2]);             // (m2 * v.z)
    r0 = _simd_add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
    m = _simd_load1_ps(pMatrix + 0*4 + 3);  // m[row][3]
    r0 = _simd_add_ps(r0, m);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * 1)
    result[0] = r0;

    m = _simd_load1_ps(pMatrix + 1*4 + 0);  // m[row][0]
    r0 = _simd_mul_ps(m, v[0]);             // (m0 * v.x)
    m = _simd_load1_ps(pMatrix + 1*4 + 1);  // m[row][1]
    r1 = _simd_mul_ps(m, v[1]);             // (m1 * v.y)
    r0 = _simd_add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y)
    m = _simd_load1_ps(pMatrix + 1*4 + 2);  // m[row][2]
    r1 = _simd_mul_ps(m, v[2]);             // (m2 * v.z)
    r0 = _simd_add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
    m = _simd_load1_ps(pMatrix + 1*4 + 3);  // m[row][3]
    r0 = _simd_add_ps(r0, m);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * 1)
    result[1] = r0;

    m = _simd_load1_ps(pMatrix + 2*4 + 0);  // m[row][0]
    r0 = _simd_mul_ps(m, v[0]);             // (m0 * v.x)
    m = _simd_load1_ps(pMatrix + 2*4 + 1);  // m[row][1]
    r1 = _simd_mul_ps(m, v[1]);             // (m1 * v.y)
    r0 = _simd_add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y)
    m = _simd_load1_ps(pMatrix + 2*4 + 2);  // m[row][2]
    r1 = _simd_mul_ps(m, v[2]);             // (m2 * v.z)
    r0 = _simd_add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
    m = _simd_load1_ps(pMatrix + 2*4 + 3);  // m[row][3]
    r0 = _simd_add_ps(r0, m);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * 1)
    result[2] = r0;

    result[3] = _simd_set1_ps(1.0f);
}
//////////////////////////////////////////////////////////////////////////
/// @brief Compute plane equation vA * vX + vB * vY + vC
INLINE simdscalar vplaneps(simdscalar vA, simdscalar vB, simdscalar vC, simdscalar &vX, simdscalar &vY)
{
    simdscalar vOut = _simd_fmadd_ps(vA, vX, vC);
    vOut = _simd_fmadd_ps(vB, vY, vOut);
    return vOut;
}
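
// A usage sketch (not part of the original header): evaluating a plane
// equation z = a*x + b*y + c for eight pixels at once, e.g. to step an
// interpolant across a tile; 'a', 'b', 'c' are hypothetical setup values:
//
//     simdscalar vX = _mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0);  // pixel x coords
//     simdscalar vY = _simd_set1_ps(0.0f);                    // pixel y coords
//     simdscalar vZ = vplaneps(_simd_set1_ps(a), _simd_set1_ps(b),
//                              _simd_set1_ps(c), vX, vY);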
//////////////////////////////////////////////////////////////////////////
/// @brief Compute plane equation vA * vX + vB * vY + vC
INLINE __m128 vplaneps128(__m128 vA, __m128 vB, __m128 vC, __m128 &vX, __m128 &vY)
{
    __m128 vOut = _simd128_fmadd_ps(vA, vX, vC);
    vOut = _simd128_fmadd_ps(vB, vY, vOut);
    return vOut;
}
//////////////////////////////////////////////////////////////////////////
/// @brief Interpolates a single component.
/// @param vI - barycentric I
/// @param vJ - barycentric J
/// @param pInterpBuffer - pointer to attribute barycentric coeffs
template<UINT Attrib, UINT Comp, UINT numComponents = 4>
static INLINE simdscalar InterpolateComponent(simdscalar vI, simdscalar vJ, const float *pInterpBuffer)
{
    const float *pInterpA = &pInterpBuffer[Attrib * 3 * numComponents + 0 + Comp];
    const float *pInterpB = &pInterpBuffer[Attrib * 3 * numComponents + numComponents + Comp];
    const float *pInterpC = &pInterpBuffer[Attrib * 3 * numComponents + numComponents * 2 + Comp];

    simdscalar vA = _simd_broadcast_ss(pInterpA);
    simdscalar vB = _simd_broadcast_ss(pInterpB);
    simdscalar vC = _simd_broadcast_ss(pInterpC);

    simdscalar vk = _simd_sub_ps(_simd_sub_ps(_simd_set1_ps(1.0f), vI), vJ);
    vC = _simd_mul_ps(vk, vC);

    return vplaneps(vA, vB, vC, vI, vJ);
}
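
// Illustrative note (not in the original header): this uses the barycentric
// identity k = 1 - i - j, so the interpolated value is A*i + B*j + C*(1-i-j),
// evaluated via vplaneps with C pre-scaled by k. E.g. interpolating
// component 0 of attribute 2 across eight samples:
//
//     simdscalar vAttr = InterpolateComponent<2, 0>(vI, vJ, pInterpBuffer);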
//////////////////////////////////////////////////////////////////////////
/// @brief Interpolates a single component.
/// @param vI - barycentric I
/// @param vJ - barycentric J
/// @param pInterpBuffer - pointer to attribute barycentric coeffs
template<UINT Attrib, UINT Comp, UINT numComponents = 4>
static INLINE __m128 InterpolateComponent(__m128 vI, __m128 vJ, const float *pInterpBuffer)
{
    const float *pInterpA = &pInterpBuffer[Attrib * 3 * numComponents + 0 + Comp];
    const float *pInterpB = &pInterpBuffer[Attrib * 3 * numComponents + numComponents + Comp];
    const float *pInterpC = &pInterpBuffer[Attrib * 3 * numComponents + numComponents * 2 + Comp];

    __m128 vA = _mm_broadcast_ss(pInterpA);
    __m128 vB = _mm_broadcast_ss(pInterpB);
    __m128 vC = _mm_broadcast_ss(pInterpC);

    __m128 vk = _mm_sub_ps(_mm_sub_ps(_mm_set1_ps(1.0f), vI), vJ);
    vC = _mm_mul_ps(vk, vC);

    return vplaneps128(vA, vB, vC, vI, vJ);
}
static INLINE __m128 _simd128_abs_ps(__m128 a)
{
    __m128i ai = _mm_castps_si128(a);
    return _mm_castsi128_ps(_mm_and_si128(ai, _mm_set1_epi32(0x7fffffff)));
}
static INLINE simdscalar _simd_abs_ps(simdscalar a)
{
    simdscalari ai = _simd_castps_si(a);
    return _simd_castsi_ps(_simd_and_si(ai, _simd_set1_epi32(0x7fffffff)));
}
INLINE
UINT pdep_u32(UINT a, UINT mask)
{
#if KNOB_ARCH >= KNOB_ARCH_AVX2
    return _pdep_u32(a, mask);
#else
    UINT result = 0;

    // copied from http://wm.ite.pl/articles/pdep-soft-emu.html
    // using bsf instead of funky loop
    DWORD maskIndex;
    while (_BitScanForward(&maskIndex, mask))
    {
        // 1. isolate lowest set bit of mask
        const UINT lowest = 1 << maskIndex;

        // 2. populate LSB from src
        const UINT LSB = (UINT)((int)(a << 31) >> 31);

        // 3. copy bit from mask
        result |= LSB & lowest;

        // 4. clear lowest bit
        mask &= ~lowest;

        // 5. prepare for next iteration
        a >>= 1;
    }

    return result;
#endif
}
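
// A usage sketch (not part of the original header): pdep_u32 deposits the
// low bits of 'a' into the set-bit positions of 'mask', matching BMI2 PDEP:
//
//     pdep_u32(0b101, 0b11010) == 0b10010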
INLINE
UINT pext_u32(UINT a, UINT mask)
{
#if KNOB_ARCH >= KNOB_ARCH_AVX2
    return _pext_u32(a, mask);
#else
    UINT result = 0;
    DWORD maskIndex;
    uint32_t currentBit = 0;
    while (_BitScanForward(&maskIndex, mask))
    {
        // 1. isolate lowest set bit of mask
        const UINT lowest = 1 << maskIndex;

        // 2. copy bit from mask
        result |= ((a & lowest) > 0) << currentBit++;

        // 3. clear lowest bit
        mask &= ~lowest;
    }

    return result;
#endif
}
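
// A usage sketch (not part of the original header): pext_u32 extracts the
// bits of 'a' selected by 'mask' and packs them into the low bits of the
// result, matching BMI2 PEXT:
//
//     pext_u32(0b10010, 0b11010) == 0b101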
#if ENABLE_AVX512_SIMD16
#include "simd16intrin.h"
#endif//ENABLE_AVX512_SIMD16

#endif//__SWR_SIMDINTRIN_H__