1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
22 ****************************************************************************/
24 #ifndef __SWR_SIMDINTRIN_H__
25 #define __SWR_SIMDINTRIN_H__
31 #include <emmintrin.h>
32 #include <immintrin.h>
33 #include <xmmintrin.h>
35 #if KNOB_SIMD_WIDTH == 8
36 typedef __m256 simdscalar
;
37 typedef __m256i simdscalari
;
38 typedef uint8_t simdmask
;
40 #error Unsupported vector width
44 OSALIGNSIMD(union) simdvector
49 simdscalar x
, y
, z
, w
;
52 simdscalar
& operator[] (const int i
) { return v
[i
]; }
53 const simdscalar
& operator[] (const int i
) const { return v
[i
]; }
56 #if KNOB_SIMD_WIDTH == 8
57 #define _simd128_maskstore_ps _mm_maskstore_ps
58 #define _simd_load_ps _mm256_load_ps
59 #define _simd_load1_ps _mm256_broadcast_ss
60 #define _simd_loadu_ps _mm256_loadu_ps
61 #define _simd_setzero_ps _mm256_setzero_ps
62 #define _simd_set1_ps _mm256_set1_ps
63 #define _simd_blend_ps _mm256_blend_ps
64 #define _simd_blendv_ps _mm256_blendv_ps
65 #define _simd_store_ps _mm256_store_ps
66 #define _simd_mul_ps _mm256_mul_ps
67 #define _simd_add_ps _mm256_add_ps
68 #define _simd_sub_ps _mm256_sub_ps
69 #define _simd_rsqrt_ps _mm256_rsqrt_ps
70 #define _simd_min_ps _mm256_min_ps
71 #define _simd_max_ps _mm256_max_ps
72 #define _simd_movemask_ps _mm256_movemask_ps
73 #define _simd_cvtps_epi32 _mm256_cvtps_epi32
74 #define _simd_cvttps_epi32 _mm256_cvttps_epi32
75 #define _simd_cvtepi32_ps _mm256_cvtepi32_ps
76 #define _simd_cmplt_ps(a, b) _mm256_cmp_ps(a, b, _CMP_LT_OQ)
77 #define _simd_cmpgt_ps(a, b) _mm256_cmp_ps(a, b, _CMP_GT_OQ)
78 #define _simd_cmpneq_ps(a, b) _mm256_cmp_ps(a, b, _CMP_NEQ_OQ)
79 #define _simd_cmpeq_ps(a, b) _mm256_cmp_ps(a, b, _CMP_EQ_OQ)
80 #define _simd_cmpge_ps(a, b) _mm256_cmp_ps(a, b, _CMP_GE_OQ)
81 #define _simd_cmple_ps(a, b) _mm256_cmp_ps(a, b, _CMP_LE_OQ)
82 #define _simd_cmp_ps(a, b, imm) _mm256_cmp_ps(a, b, imm)
83 #define _simd_and_ps _mm256_and_ps
84 #define _simd_or_ps _mm256_or_ps
86 #define _simd_rcp_ps _mm256_rcp_ps
87 #define _simd_div_ps _mm256_div_ps
88 #define _simd_castsi_ps _mm256_castsi256_ps
89 #define _simd_andnot_ps _mm256_andnot_ps
90 #define _simd_round_ps _mm256_round_ps
91 #define _simd_castpd_ps _mm256_castpd_ps
92 #define _simd_broadcast_ps(a) _mm256_broadcast_ps((const __m128*)(a))
93 #define _simd_stream_ps _mm256_stream_ps
95 #define _simd_load_sd _mm256_load_sd
96 #define _simd_movemask_pd _mm256_movemask_pd
97 #define _simd_castsi_pd _mm256_castsi256_pd
// emulated integer simd: AVX1 lacks 256-bit integer ops, so generate a
// function that splits the 256-bit inputs into 128-bit halves, applies the
// given SSE intrinsic to each half, and reassembles the result.
#define SIMD_EMU_EPI(func, intrin) \
    INLINE \
    __m256i func(__m256i a, __m256i b)\
    {\
        __m128i aHi = _mm256_extractf128_si256(a, 1);\
        __m128i bHi = _mm256_extractf128_si256(b, 1);\
        __m128i aLo = _mm256_castsi256_si128(a);\
        __m128i bLo = _mm256_castsi256_si128(b);\
\
        __m128i subLo = intrin(aLo, bLo);\
        __m128i subHi = intrin(aHi, bHi);\
\
        __m256i result = _mm256_castsi128_si256(subLo);\
        result = _mm256_insertf128_si256(result, subHi, 1);\
\
        return result;\
    }
118 #if (KNOB_ARCH == KNOB_ARCH_AVX)
120 __m256
_simdemu_permute_ps(__m256 a
, __m256i b
)
122 __m128 aHi
= _mm256_extractf128_ps(a
, 1);
123 __m128i bHi
= _mm256_extractf128_si256(b
, 1);
124 __m128 aLo
= _mm256_castps256_ps128(a
);
125 __m128i bLo
= _mm256_castsi256_si128(b
);
127 __m128i indexHi
= _mm_cmpgt_epi32(bLo
, _mm_set1_epi32(3));
128 __m128 resLow
= _mm_permutevar_ps(aLo
, _mm_and_si128(bLo
, _mm_set1_epi32(0x3)));
129 __m128 resHi
= _mm_permutevar_ps(aHi
, _mm_and_si128(bLo
, _mm_set1_epi32(0x3)));
130 __m128 blendLowRes
= _mm_blendv_ps(resLow
, resHi
, _mm_castsi128_ps(indexHi
));
132 indexHi
= _mm_cmpgt_epi32(bHi
, _mm_set1_epi32(3));
133 resLow
= _mm_permutevar_ps(aLo
, _mm_and_si128(bHi
, _mm_set1_epi32(0x3)));
134 resHi
= _mm_permutevar_ps(aHi
, _mm_and_si128(bHi
, _mm_set1_epi32(0x3)));
135 __m128 blendHiRes
= _mm_blendv_ps(resLow
, resHi
, _mm_castsi128_ps(indexHi
));
137 __m256 result
= _mm256_castps128_ps256(blendLowRes
);
138 result
= _mm256_insertf128_ps(result
, blendHiRes
, 1);
144 __m256i
_simdemu_permute_epi32(__m256i a
, __m256i b
)
146 return _mm256_castps_si256(_simdemu_permute_ps(_mm256_castsi256_ps(a
), b
));
150 __m256i
_simdemu_srlv_epi32(__m256i vA
, __m256i vCount
)
152 int32_t aHi
, aLow
, countHi
, countLow
;
153 __m128i vAHi
= _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA
), 1));
154 __m128i vALow
= _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA
), 0));
155 __m128i vCountHi
= _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vCount
), 1));
156 __m128i vCountLow
= _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vCount
), 0));
158 aHi
= _mm_extract_epi32(vAHi
, 0);
159 countHi
= _mm_extract_epi32(vCountHi
, 0);
161 vAHi
= _mm_insert_epi32(vAHi
, aHi
, 0);
163 aLow
= _mm_extract_epi32(vALow
, 0);
164 countLow
= _mm_extract_epi32(vCountLow
, 0);
166 vALow
= _mm_insert_epi32(vALow
, aLow
, 0);
168 aHi
= _mm_extract_epi32(vAHi
, 1);
169 countHi
= _mm_extract_epi32(vCountHi
, 1);
171 vAHi
= _mm_insert_epi32(vAHi
, aHi
, 1);
173 aLow
= _mm_extract_epi32(vALow
, 1);
174 countLow
= _mm_extract_epi32(vCountLow
, 1);
176 vALow
= _mm_insert_epi32(vALow
, aLow
, 1);
178 aHi
= _mm_extract_epi32(vAHi
, 2);
179 countHi
= _mm_extract_epi32(vCountHi
, 2);
181 vAHi
= _mm_insert_epi32(vAHi
, aHi
, 2);
183 aLow
= _mm_extract_epi32(vALow
, 2);
184 countLow
= _mm_extract_epi32(vCountLow
, 2);
186 vALow
= _mm_insert_epi32(vALow
, aLow
, 2);
188 aHi
= _mm_extract_epi32(vAHi
, 3);
189 countHi
= _mm_extract_epi32(vCountHi
, 3);
191 vAHi
= _mm_insert_epi32(vAHi
, aHi
, 3);
193 aLow
= _mm_extract_epi32(vALow
, 3);
194 countLow
= _mm_extract_epi32(vCountLow
, 3);
196 vALow
= _mm_insert_epi32(vALow
, aLow
, 3);
198 __m256i ret
= _mm256_set1_epi32(0);
199 ret
= _mm256_insertf128_si256(ret
, vAHi
, 1);
200 ret
= _mm256_insertf128_si256(ret
, vALow
, 0);
206 __m256i
_simdemu_sllv_epi32(__m256i vA
, __m256i vCount
)
208 int32_t aHi
, aLow
, countHi
, countLow
;
209 __m128i vAHi
= _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA
), 1));
210 __m128i vALow
= _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA
), 0));
211 __m128i vCountHi
= _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vCount
), 1));
212 __m128i vCountLow
= _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vCount
), 0));
214 aHi
= _mm_extract_epi32(vAHi
, 0);
215 countHi
= _mm_extract_epi32(vCountHi
, 0);
217 vAHi
= _mm_insert_epi32(vAHi
, aHi
, 0);
219 aLow
= _mm_extract_epi32(vALow
, 0);
220 countLow
= _mm_extract_epi32(vCountLow
, 0);
222 vALow
= _mm_insert_epi32(vALow
, aLow
, 0);
224 aHi
= _mm_extract_epi32(vAHi
, 1);
225 countHi
= _mm_extract_epi32(vCountHi
, 1);
227 vAHi
= _mm_insert_epi32(vAHi
, aHi
, 1);
229 aLow
= _mm_extract_epi32(vALow
, 1);
230 countLow
= _mm_extract_epi32(vCountLow
, 1);
232 vALow
= _mm_insert_epi32(vALow
, aLow
, 1);
234 aHi
= _mm_extract_epi32(vAHi
, 2);
235 countHi
= _mm_extract_epi32(vCountHi
, 2);
237 vAHi
= _mm_insert_epi32(vAHi
, aHi
, 2);
239 aLow
= _mm_extract_epi32(vALow
, 2);
240 countLow
= _mm_extract_epi32(vCountLow
, 2);
242 vALow
= _mm_insert_epi32(vALow
, aLow
, 2);
244 aHi
= _mm_extract_epi32(vAHi
, 3);
245 countHi
= _mm_extract_epi32(vCountHi
, 3);
247 vAHi
= _mm_insert_epi32(vAHi
, aHi
, 3);
249 aLow
= _mm_extract_epi32(vALow
, 3);
250 countLow
= _mm_extract_epi32(vCountLow
, 3);
252 vALow
= _mm_insert_epi32(vALow
, aLow
, 3);
254 __m256i ret
= _mm256_set1_epi32(0);
255 ret
= _mm256_insertf128_si256(ret
, vAHi
, 1);
256 ret
= _mm256_insertf128_si256(ret
, vALow
, 0);
// On AVX (no AVX2), 256-bit integer operations are emulated on 128-bit
// halves: route the generic _simd_* names to the _simdemu_* helpers.
#define _simd_mul_epi32 _simdemu_mul_epi32
#define _simd_mullo_epi32 _simdemu_mullo_epi32
#define _simd_sub_epi32 _simdemu_sub_epi32
#define _simd_sub_epi64 _simdemu_sub_epi64
#define _simd_min_epi32 _simdemu_min_epi32
#define _simd_min_epu32 _simdemu_min_epu32
#define _simd_max_epi32 _simdemu_max_epi32
#define _simd_max_epu32 _simdemu_max_epu32
#define _simd_add_epi32 _simdemu_add_epi32
#define _simd_and_si _simdemu_and_si
#define _simd_andnot_si _simdemu_andnot_si
#define _simd_cmpeq_epi32 _simdemu_cmpeq_epi32
#define _simd_cmplt_epi32 _simdemu_cmplt_epi32
#define _simd_cmpgt_epi32 _simdemu_cmpgt_epi32
#define _simd_or_si _simdemu_or_si
#define _simd_xor_si _simdemu_xor_si
#define _simd_castps_si _mm256_castps_si256
#define _simd_adds_epu8 _simdemu_adds_epu8
#define _simd_subs_epu8 _simdemu_subs_epu8
#define _simd_add_epi8 _simdemu_add_epi8
#define _simd_cmpeq_epi64 _simdemu_cmpeq_epi64
#define _simd_cmpgt_epi64 _simdemu_cmpgt_epi64
#define _simd_cmpgt_epi8 _simdemu_cmpgt_epi8
#define _simd_cmpeq_epi8 _simdemu_cmpeq_epi8
#define _simd_cmpgt_epi16 _simdemu_cmpgt_epi16
#define _simd_cmpeq_epi16 _simdemu_cmpeq_epi16
#define _simd_movemask_epi8 _simdemu_movemask_epi8
#define _simd_permute_ps _simdemu_permute_ps
#define _simd_permute_epi32 _simdemu_permute_epi32
#define _simd_srlv_epi32 _simdemu_srlv_epi32
#define _simd_sllv_epi32 _simdemu_sllv_epi32
292 SIMD_EMU_EPI(_simdemu_mul_epi32
, _mm_mul_epi32
)
293 SIMD_EMU_EPI(_simdemu_mullo_epi32
, _mm_mullo_epi32
)
294 SIMD_EMU_EPI(_simdemu_sub_epi32
, _mm_sub_epi32
)
295 SIMD_EMU_EPI(_simdemu_sub_epi64
, _mm_sub_epi64
)
296 SIMD_EMU_EPI(_simdemu_min_epi32
, _mm_min_epi32
)
297 SIMD_EMU_EPI(_simdemu_min_epu32
, _mm_min_epu32
)
298 SIMD_EMU_EPI(_simdemu_max_epi32
, _mm_max_epi32
)
299 SIMD_EMU_EPI(_simdemu_max_epu32
, _mm_max_epu32
)
300 SIMD_EMU_EPI(_simdemu_add_epi32
, _mm_add_epi32
)
301 SIMD_EMU_EPI(_simdemu_and_si
, _mm_and_si128
)
302 SIMD_EMU_EPI(_simdemu_andnot_si
, _mm_andnot_si128
)
303 SIMD_EMU_EPI(_simdemu_cmpeq_epi32
, _mm_cmpeq_epi32
)
304 SIMD_EMU_EPI(_simdemu_cmplt_epi32
, _mm_cmplt_epi32
)
305 SIMD_EMU_EPI(_simdemu_cmpgt_epi32
, _mm_cmpgt_epi32
)
306 SIMD_EMU_EPI(_simdemu_or_si
, _mm_or_si128
)
307 SIMD_EMU_EPI(_simdemu_xor_si
, _mm_xor_si128
)
308 SIMD_EMU_EPI(_simdemu_adds_epu8
, _mm_adds_epu8
)
309 SIMD_EMU_EPI(_simdemu_subs_epu8
, _mm_subs_epu8
)
310 SIMD_EMU_EPI(_simdemu_add_epi8
, _mm_add_epi8
)
311 SIMD_EMU_EPI(_simdemu_cmpeq_epi64
, _mm_cmpeq_epi64
)
312 SIMD_EMU_EPI(_simdemu_cmpgt_epi64
, _mm_cmpgt_epi64
)
313 SIMD_EMU_EPI(_simdemu_cmpgt_epi8
, _mm_cmpgt_epi8
)
314 SIMD_EMU_EPI(_simdemu_cmpeq_epi8
, _mm_cmpeq_epi8
)
315 SIMD_EMU_EPI(_simdemu_cmpgt_epi16
, _mm_cmpgt_epi16
)
316 SIMD_EMU_EPI(_simdemu_cmpeq_epi16
, _mm_cmpeq_epi16
)
317 SIMD_EMU_EPI(_simdemu_unpacklo_epi8
, _mm_unpacklo_epi8
)
318 SIMD_EMU_EPI(_simdemu_unpackhi_epi8
, _mm_unpackhi_epi8
)
319 SIMD_EMU_EPI(_simdemu_unpacklo_epi16
, _mm_unpacklo_epi16
)
320 SIMD_EMU_EPI(_simdemu_unpackhi_epi16
, _mm_unpackhi_epi16
)
322 #define _simd_unpacklo_epi8 _simdemu_unpacklo_epi8
323 #define _simd_unpackhi_epi8 _simdemu_unpackhi_epi8
324 #define _simd_unpacklo_epi16 _simdemu_unpacklo_epi16
325 #define _simd_unpackhi_epi16 _simdemu_unpackhi_epi16
326 #define _simd_unpacklo_epi32(a, b) _mm256_castps_si256(_mm256_unpacklo_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)))
327 #define _simd_unpackhi_epi32(a, b) _mm256_castps_si256(_mm256_unpackhi_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)))
328 #define _simd_unpacklo_epi64(a, b) _mm256_castpd_si256(_mm256_unpacklo_pd(_mm256_castsi256_pd(a), _mm256_castsi256_pd(b)))
329 #define _simd_unpackhi_epi64(a, b) _mm256_castpd_si256(_mm256_unpackhi_pd(_mm256_castsi256_pd(a), _mm256_castsi256_pd(b)))
331 #define _simd_slli_epi32(a,i) _simdemu_slli_epi32(a,i)
332 #define _simd_srai_epi32(a,i) _simdemu_srai_epi32(a,i)
333 #define _simd_srli_epi32(a,i) _simdemu_srli_epi32(a,i)
334 #define _simd_srlisi_ps(a,i) _mm256_castsi256_ps(_simdemu_srli_si128<i>(_mm256_castps_si256(a)))
336 #define _simd128_fmadd_ps _mm_fmaddemu_ps
337 #define _simd_fmadd_ps _mm_fmaddemu256_ps
338 #define _simd_fmsub_ps _mm_fmsubemu256_ps
339 #define _simd_shuffle_epi8 _simdemu_shuffle_epi8
340 SIMD_EMU_EPI(_simdemu_shuffle_epi8
, _mm_shuffle_epi8
)
343 __m128
_mm_fmaddemu_ps(__m128 a
, __m128 b
, __m128 c
)
345 __m128 res
= _mm_mul_ps(a
, b
);
346 res
= _mm_add_ps(res
, c
);
351 __m256
_mm_fmaddemu256_ps(__m256 a
, __m256 b
, __m256 c
)
353 __m256 res
= _mm256_mul_ps(a
, b
);
354 res
= _mm256_add_ps(res
, c
);
359 __m256
_mm_fmsubemu256_ps(__m256 a
, __m256 b
, __m256 c
)
361 __m256 res
= _mm256_mul_ps(a
, b
);
362 res
= _mm256_sub_ps(res
, c
);
367 __m256
_simd_i32gather_ps(const float* pBase
, __m256i vOffsets
, const int scale
)
369 uint32_t *pOffsets
= (uint32_t*)&vOffsets
;
371 float* pResult
= (float*)&vResult
;
372 for (uint32_t i
= 0; i
< KNOB_SIMD_WIDTH
; ++i
)
374 uint32_t offset
= pOffsets
[i
];
375 offset
= offset
* scale
;
376 pResult
[i
] = *(float*)(((const uint8_t*)pBase
+ offset
));
383 __m256
_simd_mask_i32gather_ps(__m256 vSrc
, const float* pBase
, __m256i vOffsets
, __m256 vMask
, const int scale
)
385 uint32_t *pOffsets
= (uint32_t*)&vOffsets
;
386 simdscalar vResult
= vSrc
;
387 float* pResult
= (float*)&vResult
;
389 uint32_t mask
= _simd_movemask_ps(vMask
);
390 while (_BitScanForward(&index
, mask
))
392 mask
&= ~(1 << index
);
393 uint32_t offset
= pOffsets
[index
];
394 offset
= offset
* scale
;
395 pResult
[index
] = *(float*)(((const uint8_t*)pBase
+ offset
));
402 __m256i
_simd_abs_epi32(__m256i a
)
404 __m128i aHi
= _mm256_extractf128_si256(a
, 1);
405 __m128i aLo
= _mm256_castsi256_si128(a
);
406 __m128i absLo
= _mm_abs_epi32(aLo
);
407 __m128i absHi
= _mm_abs_epi32(aHi
);
408 __m256i result
= _mm256_castsi128_si256(absLo
);
409 result
= _mm256_insertf128_si256(result
, absHi
, 1);
414 int _simdemu_movemask_epi8(__m256i a
)
416 __m128i aHi
= _mm256_extractf128_si256(a
, 1);
417 __m128i aLo
= _mm256_castsi256_si128(a
);
419 int resHi
= _mm_movemask_epi8(aHi
);
420 int resLo
= _mm_movemask_epi8(aLo
);
422 return (resHi
<< 16) | resLo
;
426 __m256i
_simd_cvtepu8_epi32(__m128i a
)
428 __m128i resultlo
= _mm_cvtepu8_epi32(a
);
429 __m128i resulthi
= _mm_shuffle_epi8(a
, _mm_set_epi32(0x80808007, 0x80808006, 0x80808005, 0x80808004));
431 __m256i result
= _mm256_castsi128_si256(resultlo
);
433 return _mm256_insertf128_si256(result
, resulthi
, 1);
437 __m256i
_simd_cvtepu16_epi32(__m128i a
)
439 __m128i resultlo
= _mm_cvtepu16_epi32(a
);
440 __m128i resulthi
= _mm_shuffle_epi8(a
, _mm_set_epi32(0x80800F0E, 0x80800D0C, 0x80800B0A, 0x80800908));
442 __m256i result
= _mm256_castsi128_si256(resultlo
);
444 return _mm256_insertf128_si256(result
, resulthi
, 1);
448 __m256i
_simd_packus_epi32(__m256i a
, __m256i b
)
450 __m128i alo
= _mm256_extractf128_si256(a
, 0);
451 __m128i ahi
= _mm256_extractf128_si256(a
, 1);
453 __m128i blo
= _mm256_extractf128_si256(b
, 0);
454 __m128i bhi
= _mm256_extractf128_si256(b
, 1);
456 __m128i resultlo
= _mm_packus_epi32(alo
, blo
);
457 __m128i resulthi
= _mm_packus_epi32(ahi
, bhi
);
459 __m256i result
= _mm256_castsi128_si256(resultlo
);
461 return _mm256_insertf128_si256(result
, resulthi
, 1);
465 __m256i
_simd_packs_epi32(__m256i a
, __m256i b
)
467 __m128i alo
= _mm256_extractf128_si256(a
, 0);
468 __m128i ahi
= _mm256_extractf128_si256(a
, 1);
470 __m128i blo
= _mm256_extractf128_si256(b
, 0);
471 __m128i bhi
= _mm256_extractf128_si256(b
, 1);
473 __m128i resultlo
= _mm_packs_epi32(alo
, blo
);
474 __m128i resulthi
= _mm_packs_epi32(ahi
, bhi
);
476 __m256i result
= _mm256_castsi128_si256(resultlo
);
478 return _mm256_insertf128_si256(result
, resulthi
, 1);
483 #define _simd_mul_epi32 _mm256_mul_epi32
484 #define _simd_mullo_epi32 _mm256_mullo_epi32
485 #define _simd_sub_epi32 _mm256_sub_epi32
486 #define _simd_sub_epi64 _mm256_sub_epi64
487 #define _simd_min_epi32 _mm256_min_epi32
488 #define _simd_max_epi32 _mm256_max_epi32
489 #define _simd_min_epu32 _mm256_min_epu32
490 #define _simd_max_epu32 _mm256_max_epu32
491 #define _simd_add_epi32 _mm256_add_epi32
492 #define _simd_and_si _mm256_and_si256
493 #define _simd_andnot_si _mm256_andnot_si256
494 #define _simd_cmpeq_epi32 _mm256_cmpeq_epi32
495 #define _simd_cmplt_epi32(a,b) _mm256_cmpgt_epi32(b,a)
496 #define _simd_cmpgt_epi32(a,b) _mm256_cmpgt_epi32(a,b)
497 #define _simd_or_si _mm256_or_si256
498 #define _simd_xor_si _mm256_xor_si256
499 #define _simd_castps_si _mm256_castps_si256
501 #define _simd_unpacklo_epi8 _mm256_unpacklo_epi8
502 #define _simd_unpackhi_epi8 _mm256_unpackhi_epi8
503 #define _simd_unpacklo_epi16 _mm256_unpacklo_epi16
504 #define _simd_unpackhi_epi16 _mm256_unpackhi_epi16
505 #define _simd_unpacklo_epi32 _mm256_unpacklo_epi32
506 #define _simd_unpackhi_epi32 _mm256_unpackhi_epi32
507 #define _simd_unpacklo_epi64 _mm256_unpacklo_epi64
508 #define _simd_unpackhi_epi64 _mm256_unpackhi_epi64
510 #define _simd_srli_si(a,i) _simdemu_srli_si128<i>(a)
511 #define _simd_slli_epi32 _mm256_slli_epi32
512 #define _simd_srai_epi32 _mm256_srai_epi32
513 #define _simd_srli_epi32 _mm256_srli_epi32
514 #define _simd_srlisi_ps(a,i) _mm256_castsi256_ps(_simdemu_srli_si128<i>(_mm256_castps_si256(a)))
515 #define _simd128_fmadd_ps _mm_fmadd_ps
516 #define _simd_fmadd_ps _mm256_fmadd_ps
517 #define _simd_fmsub_ps _mm256_fmsub_ps
518 #define _simd_shuffle_epi8 _mm256_shuffle_epi8
519 #define _simd_adds_epu8 _mm256_adds_epu8
520 #define _simd_subs_epu8 _mm256_subs_epu8
521 #define _simd_add_epi8 _mm256_add_epi8
522 #define _simd_i32gather_ps _mm256_i32gather_ps
523 #define _simd_mask_i32gather_ps _mm256_mask_i32gather_ps
524 #define _simd_abs_epi32 _mm256_abs_epi32
526 #define _simd_cmpeq_epi64 _mm256_cmpeq_epi64
527 #define _simd_cmpgt_epi64 _mm256_cmpgt_epi64
528 #define _simd_cmpgt_epi8 _mm256_cmpgt_epi8
529 #define _simd_cmpeq_epi8 _mm256_cmpeq_epi8
530 #define _simd_cmpgt_epi16 _mm256_cmpgt_epi16
531 #define _simd_cmpeq_epi16 _mm256_cmpeq_epi16
532 #define _simd_movemask_epi8 _mm256_movemask_epi8
533 #define _simd_permute_ps _mm256_permutevar8x32_ps
534 #define _simd_permute_epi32 _mm256_permutevar8x32_epi32
535 #define _simd_srlv_epi32 _mm256_srlv_epi32
536 #define _simd_sllv_epi32 _mm256_sllv_epi32
537 #define _simd_cvtepu8_epi32 _mm256_cvtepu8_epi32
538 #define _simd_cvtepu16_epi32 _mm256_cvtepu16_epi32
539 #define _simd_packus_epi32 _mm256_packus_epi32
540 #define _simd_packs_epi32 _mm256_packs_epi32
// Operations common to both the AVX and AVX2 paths.
#define _simd_unpacklo_ps _mm256_unpacklo_ps
#define _simd_unpacklo_pd _mm256_unpacklo_pd
#define _simd_insertf128_ps _mm256_insertf128_ps
#define _simd_insertf128_pd _mm256_insertf128_pd
#define _simd_insertf128_si _mm256_insertf128_si256
#define _simd_extractf128_ps _mm256_extractf128_ps
#define _simd_extractf128_pd _mm256_extractf128_pd
#define _simd_extractf128_si _mm256_extractf128_si256
#define _simd_permute2f128_ps _mm256_permute2f128_ps
#define _simd_permute2f128_pd _mm256_permute2f128_pd
#define _simd_permute2f128_si _mm256_permute2f128_si256
#define _simd_shuffle_ps _mm256_shuffle_ps
#define _simd_shuffle_pd _mm256_shuffle_pd
// integer shuffles reuse the float/double shuffle instructions via casts
#define _simd_shuffle_epi32(a, b, imm8) _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b), imm8))
#define _simd_shuffle_epi64(a, b, imm8) _mm256_castps_si256(_mm256_shuffle_pd(_mm256_castsi256_pd(a), _mm256_castsi256_pd(b), imm8))
#define _simd_set1_epi32 _mm256_set1_epi32
#define _simd_set_epi32 _mm256_set_epi32
#define _simd_set1_epi8 _mm256_set1_epi8
#define _simd_setzero_si _mm256_setzero_si256
#define _simd_cvttps_epi32 _mm256_cvttps_epi32
#define _simd_store_si _mm256_store_si256
#define _simd_broadcast_ss _mm256_broadcast_ss
#define _simd_maskstore_ps _mm256_maskstore_ps
#define _simd_load_si _mm256_load_si256
#define _simd_loadu_si _mm256_loadu_si256
#define _simd_sub_ps _mm256_sub_ps
#define _simd_testz_ps _mm256_testz_ps
#define _simd_xor_ps _mm256_xor_ps
574 simdscalari
_simd_loadu2_si(const __m128i
*hiaddr
, const __m128i
*loaddr
)
576 __m128i lo
= _mm_loadu_si128(loaddr
);
577 __m128i hi
= _mm_loadu_si128(hiaddr
);
579 return _mm256_insertf128_si256(_mm256_castsi128_si256(lo
), (hi
), 1);
583 void _simd_storeu2_si(__m128i
*hiaddr
, __m128i
*loaddr
, simdscalari a
)
585 _mm_storeu_si128(loaddr
, _mm256_castsi256_si128(a
));
586 _mm_storeu_si128(hiaddr
, _mm256_extractf128_si256(a
, 1));
590 simdscalari
_simd_blendv_epi32(simdscalari a
, simdscalari b
, simdscalar mask
)
592 return _simd_castps_si(_simd_blendv_ps(_simd_castsi_ps(a
), _simd_castsi_ps(b
), mask
));
596 simdscalari
_simd_blendv_epi32(simdscalari a
, simdscalari b
, simdscalari mask
)
598 return _simd_castps_si(_simd_blendv_ps(_simd_castsi_ps(a
), _simd_castsi_ps(b
), _simd_castsi_ps(mask
)));
601 // convert bitmask to vector mask
603 simdscalar
vMask(int32_t mask
)
605 __m256i vec
= _mm256_set1_epi32(mask
);
606 const __m256i bit
= _mm256_set_epi32(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01);
607 vec
= _simd_and_si(vec
, bit
);
608 vec
= _simd_cmplt_epi32(_mm256_setzero_si256(), vec
);
609 return _simd_castsi_ps(vec
);
613 void _simd_mov(simdscalar
&r
, unsigned int rlane
, simdscalar
& s
, unsigned int slane
)
615 OSALIGNSIMD(float) rArray
[KNOB_SIMD_WIDTH
], sArray
[KNOB_SIMD_WIDTH
];
616 _mm256_store_ps(rArray
, r
);
617 _mm256_store_ps(sArray
, s
);
618 rArray
[rlane
] = sArray
[slane
];
619 r
= _mm256_load_ps(rArray
);
622 INLINE __m256i
_simdemu_slli_epi32(__m256i a
, uint32_t i
)
624 __m128i aHi
= _mm256_extractf128_si256(a
, 1);
625 __m128i aLo
= _mm256_castsi256_si128(a
);
627 __m128i resHi
= _mm_slli_epi32(aHi
, i
);
628 __m128i resLo
= _mm_slli_epi32(aLo
, i
);
630 __m256i result
= _mm256_castsi128_si256(resLo
);
631 result
= _mm256_insertf128_si256(result
, resHi
, 1);
636 INLINE __m256i
_simdemu_srai_epi32(__m256i a
, uint32_t i
)
638 __m128i aHi
= _mm256_extractf128_si256(a
, 1);
639 __m128i aLo
= _mm256_castsi256_si128(a
);
641 __m128i resHi
= _mm_srai_epi32(aHi
, i
);
642 __m128i resLo
= _mm_srai_epi32(aLo
, i
);
644 __m256i result
= _mm256_castsi128_si256(resLo
);
645 result
= _mm256_insertf128_si256(result
, resHi
, 1);
650 INLINE __m256i
_simdemu_srli_epi32(__m256i a
, uint32_t i
)
652 __m128i aHi
= _mm256_extractf128_si256(a
, 1);
653 __m128i aLo
= _mm256_castsi256_si128(a
);
655 __m128i resHi
= _mm_srli_epi32(aHi
, i
);
656 __m128i resLo
= _mm_srli_epi32(aLo
, i
);
658 __m256i result
= _mm256_castsi128_si256(resLo
);
659 result
= _mm256_insertf128_si256(result
, resHi
, 1);
665 void _simdvec_transpose(simdvector
&v
)
667 SWR_ASSERT(false, "Need to implement 8 wide version");
671 #error Unsupported vector width
674 // Populates a simdvector from a vector. So p = xyzw becomes xxxx yyyy zzzz wwww.
676 void _simdvec_load_ps(simdvector
& r
, const float *p
)
678 r
[0] = _simd_set1_ps(p
[0]);
679 r
[1] = _simd_set1_ps(p
[1]);
680 r
[2] = _simd_set1_ps(p
[2]);
681 r
[3] = _simd_set1_ps(p
[3]);
685 void _simdvec_mov(simdvector
& r
, const simdscalar
& s
)
694 void _simdvec_mov(simdvector
& r
, const simdvector
& v
)
703 // just move a lane from the source simdvector to dest simdvector
705 void _simdvec_mov(simdvector
&r
, unsigned int rlane
, simdvector
& s
, unsigned int slane
)
707 _simd_mov(r
[0], rlane
, s
[0], slane
);
708 _simd_mov(r
[1], rlane
, s
[1], slane
);
709 _simd_mov(r
[2], rlane
, s
[2], slane
);
710 _simd_mov(r
[3], rlane
, s
[3], slane
);
715 void _simdvec_dp3_ps(simdscalar
& r
, const simdvector
& v0
, const simdvector
& v1
)
718 r
= _simd_mul_ps(v0
[0], v1
[0]); // (v0.x*v1.x)
720 tmp
= _simd_mul_ps(v0
[1], v1
[1]); // (v0.y*v1.y)
721 r
= _simd_add_ps(r
, tmp
); // (v0.x*v1.x) + (v0.y*v1.y)
723 tmp
= _simd_mul_ps(v0
[2], v1
[2]); // (v0.z*v1.z)
724 r
= _simd_add_ps(r
, tmp
); // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z)
728 void _simdvec_dp4_ps(simdscalar
& r
, const simdvector
& v0
, const simdvector
& v1
)
731 r
= _simd_mul_ps(v0
[0], v1
[0]); // (v0.x*v1.x)
733 tmp
= _simd_mul_ps(v0
[1], v1
[1]); // (v0.y*v1.y)
734 r
= _simd_add_ps(r
, tmp
); // (v0.x*v1.x) + (v0.y*v1.y)
736 tmp
= _simd_mul_ps(v0
[2], v1
[2]); // (v0.z*v1.z)
737 r
= _simd_add_ps(r
, tmp
); // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z)
739 tmp
= _simd_mul_ps(v0
[3], v1
[3]); // (v0.w*v1.w)
740 r
= _simd_add_ps(r
, tmp
); // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z)
744 simdscalar
_simdvec_rcp_length_ps(const simdvector
& v
)
747 _simdvec_dp4_ps(length
, v
, v
);
748 return _simd_rsqrt_ps(length
);
752 void _simdvec_normalize_ps(simdvector
& r
, const simdvector
& v
)
754 simdscalar vecLength
;
755 vecLength
= _simdvec_rcp_length_ps(v
);
757 r
[0] = _simd_mul_ps(v
[0], vecLength
);
758 r
[1] = _simd_mul_ps(v
[1], vecLength
);
759 r
[2] = _simd_mul_ps(v
[2], vecLength
);
760 r
[3] = _simd_mul_ps(v
[3], vecLength
);
764 void _simdvec_mul_ps(simdvector
& r
, const simdvector
& v
, const simdscalar
& s
)
766 r
[0] = _simd_mul_ps(v
[0], s
);
767 r
[1] = _simd_mul_ps(v
[1], s
);
768 r
[2] = _simd_mul_ps(v
[2], s
);
769 r
[3] = _simd_mul_ps(v
[3], s
);
773 void _simdvec_mul_ps(simdvector
& r
, const simdvector
& v0
, const simdvector
& v1
)
775 r
[0] = _simd_mul_ps(v0
[0], v1
[0]);
776 r
[1] = _simd_mul_ps(v0
[1], v1
[1]);
777 r
[2] = _simd_mul_ps(v0
[2], v1
[2]);
778 r
[3] = _simd_mul_ps(v0
[3], v1
[3]);
782 void _simdvec_add_ps(simdvector
& r
, const simdvector
& v0
, const simdvector
& v1
)
784 r
[0] = _simd_add_ps(v0
[0], v1
[0]);
785 r
[1] = _simd_add_ps(v0
[1], v1
[1]);
786 r
[2] = _simd_add_ps(v0
[2], v1
[2]);
787 r
[3] = _simd_add_ps(v0
[3], v1
[3]);
791 void _simdvec_min_ps(simdvector
& r
, const simdvector
& v0
, const simdscalar
& s
)
793 r
[0] = _simd_min_ps(v0
[0], s
);
794 r
[1] = _simd_min_ps(v0
[1], s
);
795 r
[2] = _simd_min_ps(v0
[2], s
);
796 r
[3] = _simd_min_ps(v0
[3], s
);
800 void _simdvec_max_ps(simdvector
& r
, const simdvector
& v0
, const simdscalar
& s
)
802 r
[0] = _simd_max_ps(v0
[0], s
);
803 r
[1] = _simd_max_ps(v0
[1], s
);
804 r
[2] = _simd_max_ps(v0
[2], s
);
805 r
[3] = _simd_max_ps(v0
[3], s
);
808 // Matrix4x4 * Vector4
809 // outVec.x = (m00 * v.x) + (m01 * v.y) + (m02 * v.z) + (m03 * v.w)
810 // outVec.y = (m10 * v.x) + (m11 * v.y) + (m12 * v.z) + (m13 * v.w)
811 // outVec.z = (m20 * v.x) + (m21 * v.y) + (m22 * v.z) + (m23 * v.w)
812 // outVec.w = (m30 * v.x) + (m31 * v.y) + (m32 * v.z) + (m33 * v.w)
814 void _simd_mat4x4_vec4_multiply(
816 const float *pMatrix
,
823 m
= _simd_load1_ps(pMatrix
+ 0*4 + 0); // m[row][0]
824 r0
= _simd_mul_ps(m
, v
[0]); // (m00 * v.x)
825 m
= _simd_load1_ps(pMatrix
+ 0*4 + 1); // m[row][1]
826 r1
= _simd_mul_ps(m
, v
[1]); // (m1 * v.y)
827 r0
= _simd_add_ps(r0
, r1
); // (m0 * v.x) + (m1 * v.y)
828 m
= _simd_load1_ps(pMatrix
+ 0*4 + 2); // m[row][2]
829 r1
= _simd_mul_ps(m
, v
[2]); // (m2 * v.z)
830 r0
= _simd_add_ps(r0
, r1
); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
831 m
= _simd_load1_ps(pMatrix
+ 0*4 + 3); // m[row][3]
832 r1
= _simd_mul_ps(m
, v
[3]); // (m3 * v.z)
833 r0
= _simd_add_ps(r0
, r1
); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * v.w)
836 m
= _simd_load1_ps(pMatrix
+ 1*4 + 0); // m[row][0]
837 r0
= _simd_mul_ps(m
, v
[0]); // (m00 * v.x)
838 m
= _simd_load1_ps(pMatrix
+ 1*4 + 1); // m[row][1]
839 r1
= _simd_mul_ps(m
, v
[1]); // (m1 * v.y)
840 r0
= _simd_add_ps(r0
, r1
); // (m0 * v.x) + (m1 * v.y)
841 m
= _simd_load1_ps(pMatrix
+ 1*4 + 2); // m[row][2]
842 r1
= _simd_mul_ps(m
, v
[2]); // (m2 * v.z)
843 r0
= _simd_add_ps(r0
, r1
); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
844 m
= _simd_load1_ps(pMatrix
+ 1*4 + 3); // m[row][3]
845 r1
= _simd_mul_ps(m
, v
[3]); // (m3 * v.z)
846 r0
= _simd_add_ps(r0
, r1
); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * v.w)
849 m
= _simd_load1_ps(pMatrix
+ 2*4 + 0); // m[row][0]
850 r0
= _simd_mul_ps(m
, v
[0]); // (m00 * v.x)
851 m
= _simd_load1_ps(pMatrix
+ 2*4 + 1); // m[row][1]
852 r1
= _simd_mul_ps(m
, v
[1]); // (m1 * v.y)
853 r0
= _simd_add_ps(r0
, r1
); // (m0 * v.x) + (m1 * v.y)
854 m
= _simd_load1_ps(pMatrix
+ 2*4 + 2); // m[row][2]
855 r1
= _simd_mul_ps(m
, v
[2]); // (m2 * v.z)
856 r0
= _simd_add_ps(r0
, r1
); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
857 m
= _simd_load1_ps(pMatrix
+ 2*4 + 3); // m[row][3]
858 r1
= _simd_mul_ps(m
, v
[3]); // (m3 * v.z)
859 r0
= _simd_add_ps(r0
, r1
); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * v.w)
862 m
= _simd_load1_ps(pMatrix
+ 3*4 + 0); // m[row][0]
863 r0
= _simd_mul_ps(m
, v
[0]); // (m00 * v.x)
864 m
= _simd_load1_ps(pMatrix
+ 3*4 + 1); // m[row][1]
865 r1
= _simd_mul_ps(m
, v
[1]); // (m1 * v.y)
866 r0
= _simd_add_ps(r0
, r1
); // (m0 * v.x) + (m1 * v.y)
867 m
= _simd_load1_ps(pMatrix
+ 3*4 + 2); // m[row][2]
868 r1
= _simd_mul_ps(m
, v
[2]); // (m2 * v.z)
869 r0
= _simd_add_ps(r0
, r1
); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
870 m
= _simd_load1_ps(pMatrix
+ 3*4 + 3); // m[row][3]
871 r1
= _simd_mul_ps(m
, v
[3]); // (m3 * v.z)
872 r0
= _simd_add_ps(r0
, r1
); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * v.w)
876 // Matrix4x4 * Vector3 - Direction Vector where w = 0.
877 // outVec.x = (m00 * v.x) + (m01 * v.y) + (m02 * v.z) + (m03 * 0)
878 // outVec.y = (m10 * v.x) + (m11 * v.y) + (m12 * v.z) + (m13 * 0)
879 // outVec.z = (m20 * v.x) + (m21 * v.y) + (m22 * v.z) + (m23 * 0)
880 // outVec.w = (m30 * v.x) + (m31 * v.y) + (m32 * v.z) + (m33 * 0)
882 void _simd_mat3x3_vec3_w0_multiply(
884 const float *pMatrix
,
891 m
= _simd_load1_ps(pMatrix
+ 0*4 + 0); // m[row][0]
892 r0
= _simd_mul_ps(m
, v
[0]); // (m00 * v.x)
893 m
= _simd_load1_ps(pMatrix
+ 0*4 + 1); // m[row][1]
894 r1
= _simd_mul_ps(m
, v
[1]); // (m1 * v.y)
895 r0
= _simd_add_ps(r0
, r1
); // (m0 * v.x) + (m1 * v.y)
896 m
= _simd_load1_ps(pMatrix
+ 0*4 + 2); // m[row][2]
897 r1
= _simd_mul_ps(m
, v
[2]); // (m2 * v.z)
898 r0
= _simd_add_ps(r0
, r1
); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
901 m
= _simd_load1_ps(pMatrix
+ 1*4 + 0); // m[row][0]
902 r0
= _simd_mul_ps(m
, v
[0]); // (m00 * v.x)
903 m
= _simd_load1_ps(pMatrix
+ 1*4 + 1); // m[row][1]
904 r1
= _simd_mul_ps(m
, v
[1]); // (m1 * v.y)
905 r0
= _simd_add_ps(r0
, r1
); // (m0 * v.x) + (m1 * v.y)
906 m
= _simd_load1_ps(pMatrix
+ 1*4 + 2); // m[row][2]
907 r1
= _simd_mul_ps(m
, v
[2]); // (m2 * v.z)
908 r0
= _simd_add_ps(r0
, r1
); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
911 m
= _simd_load1_ps(pMatrix
+ 2*4 + 0); // m[row][0]
912 r0
= _simd_mul_ps(m
, v
[0]); // (m00 * v.x)
913 m
= _simd_load1_ps(pMatrix
+ 2*4 + 1); // m[row][1]
914 r1
= _simd_mul_ps(m
, v
[1]); // (m1 * v.y)
915 r0
= _simd_add_ps(r0
, r1
); // (m0 * v.x) + (m1 * v.y)
916 m
= _simd_load1_ps(pMatrix
+ 2*4 + 2); // m[row][2]
917 r1
= _simd_mul_ps(m
, v
[2]); // (m2 * v.z)
918 r0
= _simd_add_ps(r0
, r1
); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
921 result
[3] = _simd_setzero_ps();
924 // Matrix4x4 * Vector3 - Position vector where w = 1.
925 // outVec.x = (m00 * v.x) + (m01 * v.y) + (m02 * v.z) + (m03 * 1)
926 // outVec.y = (m10 * v.x) + (m11 * v.y) + (m12 * v.z) + (m13 * 1)
927 // outVec.z = (m20 * v.x) + (m21 * v.y) + (m22 * v.z) + (m23 * 1)
928 // outVec.w = (m30 * v.x) + (m31 * v.y) + (m32 * v.z) + (m33 * 1)
930 void _simd_mat4x4_vec3_w1_multiply(
932 const float *pMatrix
,
939 m
= _simd_load1_ps(pMatrix
+ 0*4 + 0); // m[row][0]
940 r0
= _simd_mul_ps(m
, v
[0]); // (m00 * v.x)
941 m
= _simd_load1_ps(pMatrix
+ 0*4 + 1); // m[row][1]
942 r1
= _simd_mul_ps(m
, v
[1]); // (m1 * v.y)
943 r0
= _simd_add_ps(r0
, r1
); // (m0 * v.x) + (m1 * v.y)
944 m
= _simd_load1_ps(pMatrix
+ 0*4 + 2); // m[row][2]
945 r1
= _simd_mul_ps(m
, v
[2]); // (m2 * v.z)
946 r0
= _simd_add_ps(r0
, r1
); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
947 m
= _simd_load1_ps(pMatrix
+ 0*4 + 3); // m[row][3]
948 r0
= _simd_add_ps(r0
, m
); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
951 m
= _simd_load1_ps(pMatrix
+ 1*4 + 0); // m[row][0]
952 r0
= _simd_mul_ps(m
, v
[0]); // (m00 * v.x)
953 m
= _simd_load1_ps(pMatrix
+ 1*4 + 1); // m[row][1]
954 r1
= _simd_mul_ps(m
, v
[1]); // (m1 * v.y)
955 r0
= _simd_add_ps(r0
, r1
); // (m0 * v.x) + (m1 * v.y)
956 m
= _simd_load1_ps(pMatrix
+ 1*4 + 2); // m[row][2]
957 r1
= _simd_mul_ps(m
, v
[2]); // (m2 * v.z)
958 r0
= _simd_add_ps(r0
, r1
); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
959 m
= _simd_load1_ps(pMatrix
+ 1*4 + 3); // m[row][3]
960 r0
= _simd_add_ps(r0
, m
); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
963 m
= _simd_load1_ps(pMatrix
+ 2*4 + 0); // m[row][0]
964 r0
= _simd_mul_ps(m
, v
[0]); // (m00 * v.x)
965 m
= _simd_load1_ps(pMatrix
+ 2*4 + 1); // m[row][1]
966 r1
= _simd_mul_ps(m
, v
[1]); // (m1 * v.y)
967 r0
= _simd_add_ps(r0
, r1
); // (m0 * v.x) + (m1 * v.y)
968 m
= _simd_load1_ps(pMatrix
+ 2*4 + 2); // m[row][2]
969 r1
= _simd_mul_ps(m
, v
[2]); // (m2 * v.z)
970 r0
= _simd_add_ps(r0
, r1
); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
971 m
= _simd_load1_ps(pMatrix
+ 2*4 + 3); // m[row][3]
972 r0
= _simd_add_ps(r0
, m
); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
975 m
= _simd_load1_ps(pMatrix
+ 3*4 + 0); // m[row][0]
976 r0
= _simd_mul_ps(m
, v
[0]); // (m00 * v.x)
977 m
= _simd_load1_ps(pMatrix
+ 3*4 + 1); // m[row][1]
978 r1
= _simd_mul_ps(m
, v
[1]); // (m1 * v.y)
979 r0
= _simd_add_ps(r0
, r1
); // (m0 * v.x) + (m1 * v.y)
980 m
= _simd_load1_ps(pMatrix
+ 3*4 + 2); // m[row][2]
981 r1
= _simd_mul_ps(m
, v
[2]); // (m2 * v.z)
982 r0
= _simd_add_ps(r0
, r1
); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
983 m
= _simd_load1_ps(pMatrix
+ 3*4 + 3); // m[row][3]
984 result
[3] = _simd_add_ps(r0
, m
); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
988 void _simd_mat4x3_vec3_w1_multiply(
990 const float *pMatrix
,
997 m
= _simd_load1_ps(pMatrix
+ 0*4 + 0); // m[row][0]
998 r0
= _simd_mul_ps(m
, v
[0]); // (m00 * v.x)
999 m
= _simd_load1_ps(pMatrix
+ 0*4 + 1); // m[row][1]
1000 r1
= _simd_mul_ps(m
, v
[1]); // (m1 * v.y)
1001 r0
= _simd_add_ps(r0
, r1
); // (m0 * v.x) + (m1 * v.y)
1002 m
= _simd_load1_ps(pMatrix
+ 0*4 + 2); // m[row][2]
1003 r1
= _simd_mul_ps(m
, v
[2]); // (m2 * v.z)
1004 r0
= _simd_add_ps(r0
, r1
); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
1005 m
= _simd_load1_ps(pMatrix
+ 0*4 + 3); // m[row][3]
1006 r0
= _simd_add_ps(r0
, m
); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
1009 m
= _simd_load1_ps(pMatrix
+ 1*4 + 0); // m[row][0]
1010 r0
= _simd_mul_ps(m
, v
[0]); // (m00 * v.x)
1011 m
= _simd_load1_ps(pMatrix
+ 1*4 + 1); // m[row][1]
1012 r1
= _simd_mul_ps(m
, v
[1]); // (m1 * v.y)
1013 r0
= _simd_add_ps(r0
, r1
); // (m0 * v.x) + (m1 * v.y)
1014 m
= _simd_load1_ps(pMatrix
+ 1*4 + 2); // m[row][2]
1015 r1
= _simd_mul_ps(m
, v
[2]); // (m2 * v.z)
1016 r0
= _simd_add_ps(r0
, r1
); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
1017 m
= _simd_load1_ps(pMatrix
+ 1*4 + 3); // m[row][3]
1018 r0
= _simd_add_ps(r0
, m
); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
1021 m
= _simd_load1_ps(pMatrix
+ 2*4 + 0); // m[row][0]
1022 r0
= _simd_mul_ps(m
, v
[0]); // (m00 * v.x)
1023 m
= _simd_load1_ps(pMatrix
+ 2*4 + 1); // m[row][1]
1024 r1
= _simd_mul_ps(m
, v
[1]); // (m1 * v.y)
1025 r0
= _simd_add_ps(r0
, r1
); // (m0 * v.x) + (m1 * v.y)
1026 m
= _simd_load1_ps(pMatrix
+ 2*4 + 2); // m[row][2]
1027 r1
= _simd_mul_ps(m
, v
[2]); // (m2 * v.z)
1028 r0
= _simd_add_ps(r0
, r1
); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
1029 m
= _simd_load1_ps(pMatrix
+ 2*4 + 3); // m[row][3]
1030 r0
= _simd_add_ps(r0
, m
); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
1032 result
[3] = _simd_set1_ps(1.0f
);
1035 //////////////////////////////////////////////////////////////////////////
1036 /// @brief Compute plane equation vA * vX + vB * vY + vC
1037 INLINE simdscalar
vplaneps(simdscalar vA
, simdscalar vB
, simdscalar vC
, simdscalar
&vX
, simdscalar
&vY
)
1039 simdscalar vOut
= _simd_fmadd_ps(vA
, vX
, vC
);
1040 vOut
= _simd_fmadd_ps(vB
, vY
, vOut
);
1044 //////////////////////////////////////////////////////////////////////////
1045 /// @brief Compute plane equation vA * vX + vB * vY + vC
1046 INLINE __m128
vplaneps128(__m128 vA
, __m128 vB
, __m128 vC
, __m128
&vX
, __m128
&vY
)
1048 __m128 vOut
= _simd128_fmadd_ps(vA
, vX
, vC
);
1049 vOut
= _simd128_fmadd_ps(vB
, vY
, vOut
);
1053 //////////////////////////////////////////////////////////////////////////
1054 /// @brief Interpolates a single component.
1055 /// @param vI - barycentric I
1056 /// @param vJ - barycentric J
1057 /// @param pInterpBuffer - pointer to attribute barycentric coeffs
1058 template<UINT Attrib
, UINT Comp
, UINT numComponents
= 4>
1059 static INLINE simdscalar
InterpolateComponent(simdscalar vI
, simdscalar vJ
, const float *pInterpBuffer
)
1061 const float *pInterpA
= &pInterpBuffer
[Attrib
* 3 * numComponents
+ 0 + Comp
];
1062 const float *pInterpB
= &pInterpBuffer
[Attrib
* 3 * numComponents
+ numComponents
+ Comp
];
1063 const float *pInterpC
= &pInterpBuffer
[Attrib
* 3 * numComponents
+ numComponents
* 2 + Comp
];
1065 simdscalar vA
= _simd_broadcast_ss(pInterpA
);
1066 simdscalar vB
= _simd_broadcast_ss(pInterpB
);
1067 simdscalar vC
= _simd_broadcast_ss(pInterpC
);
1069 simdscalar vk
= _simd_sub_ps(_simd_sub_ps(_simd_set1_ps(1.0f
), vI
), vJ
);
1070 vC
= _simd_mul_ps(vk
, vC
);
1072 return vplaneps(vA
, vB
, vC
, vI
, vJ
);
1075 //////////////////////////////////////////////////////////////////////////
1076 /// @brief Interpolates a single component.
1077 /// @param vI - barycentric I
1078 /// @param vJ - barycentric J
1079 /// @param pInterpBuffer - pointer to attribute barycentric coeffs
1080 template<UINT Attrib
, UINT Comp
, UINT numComponents
= 4>
1081 static INLINE __m128
InterpolateComponent(__m128 vI
, __m128 vJ
, const float *pInterpBuffer
)
1083 const float *pInterpA
= &pInterpBuffer
[Attrib
* 3 * numComponents
+ 0 + Comp
];
1084 const float *pInterpB
= &pInterpBuffer
[Attrib
* 3 * numComponents
+ numComponents
+ Comp
];
1085 const float *pInterpC
= &pInterpBuffer
[Attrib
* 3 * numComponents
+ numComponents
* 2 + Comp
];
1087 __m128 vA
= _mm_broadcast_ss(pInterpA
);
1088 __m128 vB
= _mm_broadcast_ss(pInterpB
);
1089 __m128 vC
= _mm_broadcast_ss(pInterpC
);
1091 __m128 vk
= _mm_sub_ps(_mm_sub_ps(_mm_set1_ps(1.0f
), vI
), vJ
);
1092 vC
= _mm_mul_ps(vk
, vC
);
1094 return vplaneps128(vA
, vB
, vC
, vI
, vJ
);
1097 static INLINE __m128
_simd128_abs_ps(__m128 a
)
1099 __m128i ai
= _mm_castps_si128(a
);
1100 return _mm_castsi128_ps(_mm_and_si128(ai
, _mm_set1_epi32(0x7fffffff)));
1103 static INLINE simdscalar
_simd_abs_ps(simdscalar a
)
1105 simdscalari ai
= _simd_castps_si(a
);
1106 return _simd_castsi_ps(_simd_and_si(ai
, _simd_set1_epi32(0x7fffffff)));
1110 UINT
pdep_u32(UINT a
, UINT mask
)
1112 #if KNOB_ARCH >= KNOB_ARCH_AVX2
1113 return _pdep_u32(a
, mask
);
1117 // copied from http://wm.ite.pl/articles/pdep-soft-emu.html
1118 // using bsf instead of funky loop
1120 while (_BitScanForward(&maskIndex
, mask
))
1122 // 1. isolate lowest set bit of mask
1123 const UINT lowest
= 1 << maskIndex
;
1125 // 2. populate LSB from src
1126 const UINT LSB
= (UINT
)((int)(a
<< 31) >> 31);
1128 // 3. copy bit from mask
1129 result
|= LSB
& lowest
;
1131 // 4. clear lowest bit
1134 // 5. prepare for next iteration
1143 UINT
pext_u32(UINT a
, UINT mask
)
1145 #if KNOB_ARCH >= KNOB_ARCH_AVX2
1146 return _pext_u32(a
, mask
);
1150 uint32_t currentBit
= 0;
1151 while (_BitScanForward(&maskIndex
, mask
))
1153 // 1. isolate lowest set bit of mask
1154 const UINT lowest
= 1 << maskIndex
;
1156 // 2. copy bit from mask
1157 result
|= ((a
& lowest
) > 0) << currentBit
++;
1159 // 3. clear lowest bit
1166 #if ENABLE_AVX512_SIMD16
1167 #include "simd16intrin.h"
1168 #endif//ENABLE_AVX512_SIMD16
1170 #endif//__SWR_SIMDINTRIN_H__