61c0c5461a39579da6d2937e6d9d1c4cf0bc5762
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
22 ****************************************************************************/
24 #ifndef __SWR_SIMDINTRIN_H__
25 #define __SWR_SIMDINTRIN_H__
31 #include <emmintrin.h>
32 #include <immintrin.h>
33 #include <xmmintrin.h>
35 #if KNOB_SIMD_WIDTH == 8
36 typedef __m256 simdscalar
;
37 typedef __m256i simdscalari
;
38 typedef uint8_t simdmask
;
40 #error Unsupported vector width
44 OSALIGNSIMD(union) simdvector
49 simdscalar x
, y
, z
, w
;
52 simdscalar
& operator[] (const int i
) { return v
[i
]; }
53 const simdscalar
& operator[] (const int i
) const { return v
[i
]; }
56 #if KNOB_SIMD_WIDTH == 8
57 #define _simd128_maskstore_ps _mm_maskstore_ps
58 #define _simd_load_ps _mm256_load_ps
59 #define _simd_load1_ps _mm256_broadcast_ss
60 #define _simd_loadu_ps _mm256_loadu_ps
61 #define _simd_setzero_ps _mm256_setzero_ps
62 #define _simd_set1_ps _mm256_set1_ps
63 #define _simd_blend_ps _mm256_blend_ps
64 #define _simd_blendv_ps _mm256_blendv_ps
65 #define _simd_store_ps _mm256_store_ps
66 #define _simd_mul_ps _mm256_mul_ps
67 #define _simd_add_ps _mm256_add_ps
68 #define _simd_sub_ps _mm256_sub_ps
69 #define _simd_rsqrt_ps _mm256_rsqrt_ps
70 #define _simd_min_ps _mm256_min_ps
71 #define _simd_max_ps _mm256_max_ps
72 #define _simd_movemask_ps _mm256_movemask_ps
73 #define _simd_cvtps_epi32 _mm256_cvtps_epi32
74 #define _simd_cvttps_epi32 _mm256_cvttps_epi32
75 #define _simd_cvtepi32_ps _mm256_cvtepi32_ps
76 #define _simd_cmplt_ps(a, b) _mm256_cmp_ps(a, b, _CMP_LT_OQ)
77 #define _simd_cmpgt_ps(a, b) _mm256_cmp_ps(a, b, _CMP_GT_OQ)
78 #define _simd_cmpneq_ps(a, b) _mm256_cmp_ps(a, b, _CMP_NEQ_OQ)
79 #define _simd_cmpeq_ps(a, b) _mm256_cmp_ps(a, b, _CMP_EQ_OQ)
80 #define _simd_cmpge_ps(a, b) _mm256_cmp_ps(a, b, _CMP_GE_OQ)
81 #define _simd_cmple_ps(a, b) _mm256_cmp_ps(a, b, _CMP_LE_OQ)
82 #define _simd_cmp_ps(a, b, imm) _mm256_cmp_ps(a, b, imm)
83 #define _simd_and_ps _mm256_and_ps
84 #define _simd_or_ps _mm256_or_ps
86 #define _simd_rcp_ps _mm256_rcp_ps
87 #define _simd_div_ps _mm256_div_ps
88 #define _simd_castsi_ps _mm256_castsi256_ps
89 #define _simd_andnot_ps _mm256_andnot_ps
90 #define _simd_round_ps _mm256_round_ps
91 #define _simd_castpd_ps _mm256_castpd_ps
92 #define _simd_broadcast_ps(a) _mm256_broadcast_ps((const __m128*)(a))
93 #define _simd_stream_ps _mm256_stream_ps
95 #define _simd_load_sd _mm256_load_sd
96 #define _simd_movemask_pd _mm256_movemask_pd
97 #define _simd_castsi_pd _mm256_castsi256_pd
99 // emulated integer simd
// Emulate a 256-bit integer binary op on AVX1 by splitting into two 128-bit
// halves, applying the SSE intrinsic to each, and recombining.
#define SIMD_EMU_EPI(func, intrin) \
INLINE \
__m256i func(__m256i a, __m256i b)\
{\
    __m128i aHi = _mm256_extractf128_si256(a, 1);\
    __m128i bHi = _mm256_extractf128_si256(b, 1);\
    __m128i aLo = _mm256_castsi256_si128(a);\
    __m128i bLo = _mm256_castsi256_si128(b);\
\
    __m128i subLo = intrin(aLo, bLo);\
    __m128i subHi = intrin(aHi, bHi);\
\
    __m256i result = _mm256_castsi128_si256(subLo);\
    result = _mm256_insertf128_si256(result, subHi, 1);\
\
    return result;\
}
118 #if (KNOB_ARCH == KNOB_ARCH_AVX)
120 __m256
_simdemu_permute_ps(__m256 a
, __m256i b
)
122 __m128 aHi
= _mm256_extractf128_ps(a
, 1);
123 __m128i bHi
= _mm256_extractf128_si256(b
, 1);
124 __m128 aLo
= _mm256_castps256_ps128(a
);
125 __m128i bLo
= _mm256_castsi256_si128(b
);
127 __m128i indexHi
= _mm_cmpgt_epi32(bLo
, _mm_set1_epi32(3));
128 __m128 resLow
= _mm_permutevar_ps(aLo
, _mm_and_si128(bLo
, _mm_set1_epi32(0x3)));
129 __m128 resHi
= _mm_permutevar_ps(aHi
, _mm_and_si128(bLo
, _mm_set1_epi32(0x3)));
130 __m128 blendLowRes
= _mm_blendv_ps(resLow
, resHi
, _mm_castsi128_ps(indexHi
));
132 indexHi
= _mm_cmpgt_epi32(bHi
, _mm_set1_epi32(3));
133 resLow
= _mm_permutevar_ps(aLo
, _mm_and_si128(bHi
, _mm_set1_epi32(0x3)));
134 resHi
= _mm_permutevar_ps(aHi
, _mm_and_si128(bHi
, _mm_set1_epi32(0x3)));
135 __m128 blendHiRes
= _mm_blendv_ps(resLow
, resHi
, _mm_castsi128_ps(indexHi
));
137 __m256 result
= _mm256_castps128_ps256(blendLowRes
);
138 result
= _mm256_insertf128_ps(result
, blendHiRes
, 1);
144 __m256i
_simdemu_permute_epi32(__m256i a
, __m256i b
)
146 return _mm256_castps_si256(_simdemu_permute_ps(_mm256_castsi256_ps(a
), b
));
150 __m256i
_simdemu_srlv_epi32(__m256i vA
, __m256i vCount
)
152 int32_t aHi
, aLow
, countHi
, countLow
;
153 __m128i vAHi
= _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA
), 1));
154 __m128i vALow
= _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA
), 0));
155 __m128i vCountHi
= _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vCount
), 1));
156 __m128i vCountLow
= _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vCount
), 0));
158 aHi
= _mm_extract_epi32(vAHi
, 0);
159 countHi
= _mm_extract_epi32(vCountHi
, 0);
161 vAHi
= _mm_insert_epi32(vAHi
, aHi
, 0);
163 aLow
= _mm_extract_epi32(vALow
, 0);
164 countLow
= _mm_extract_epi32(vCountLow
, 0);
166 vALow
= _mm_insert_epi32(vALow
, aLow
, 0);
168 aHi
= _mm_extract_epi32(vAHi
, 1);
169 countHi
= _mm_extract_epi32(vCountHi
, 1);
171 vAHi
= _mm_insert_epi32(vAHi
, aHi
, 1);
173 aLow
= _mm_extract_epi32(vALow
, 1);
174 countLow
= _mm_extract_epi32(vCountLow
, 1);
176 vALow
= _mm_insert_epi32(vALow
, aLow
, 1);
178 aHi
= _mm_extract_epi32(vAHi
, 2);
179 countHi
= _mm_extract_epi32(vCountHi
, 2);
181 vAHi
= _mm_insert_epi32(vAHi
, aHi
, 2);
183 aLow
= _mm_extract_epi32(vALow
, 2);
184 countLow
= _mm_extract_epi32(vCountLow
, 2);
186 vALow
= _mm_insert_epi32(vALow
, aLow
, 2);
188 aHi
= _mm_extract_epi32(vAHi
, 3);
189 countHi
= _mm_extract_epi32(vCountHi
, 3);
191 vAHi
= _mm_insert_epi32(vAHi
, aHi
, 3);
193 aLow
= _mm_extract_epi32(vALow
, 3);
194 countLow
= _mm_extract_epi32(vCountLow
, 3);
196 vALow
= _mm_insert_epi32(vALow
, aLow
, 3);
198 __m256i ret
= _mm256_set1_epi32(0);
199 ret
= _mm256_insertf128_si256(ret
, vAHi
, 1);
200 ret
= _mm256_insertf128_si256(ret
, vALow
, 0);
206 __m256i
_simdemu_sllv_epi32(__m256i vA
, __m256i vCount
)
208 int32_t aHi
, aLow
, countHi
, countLow
;
209 __m128i vAHi
= _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA
), 1));
210 __m128i vALow
= _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA
), 0));
211 __m128i vCountHi
= _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vCount
), 1));
212 __m128i vCountLow
= _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vCount
), 0));
214 aHi
= _mm_extract_epi32(vAHi
, 0);
215 countHi
= _mm_extract_epi32(vCountHi
, 0);
217 vAHi
= _mm_insert_epi32(vAHi
, aHi
, 0);
219 aLow
= _mm_extract_epi32(vALow
, 0);
220 countLow
= _mm_extract_epi32(vCountLow
, 0);
222 vALow
= _mm_insert_epi32(vALow
, aLow
, 0);
224 aHi
= _mm_extract_epi32(vAHi
, 1);
225 countHi
= _mm_extract_epi32(vCountHi
, 1);
227 vAHi
= _mm_insert_epi32(vAHi
, aHi
, 1);
229 aLow
= _mm_extract_epi32(vALow
, 1);
230 countLow
= _mm_extract_epi32(vCountLow
, 1);
232 vALow
= _mm_insert_epi32(vALow
, aLow
, 1);
234 aHi
= _mm_extract_epi32(vAHi
, 2);
235 countHi
= _mm_extract_epi32(vCountHi
, 2);
237 vAHi
= _mm_insert_epi32(vAHi
, aHi
, 2);
239 aLow
= _mm_extract_epi32(vALow
, 2);
240 countLow
= _mm_extract_epi32(vCountLow
, 2);
242 vALow
= _mm_insert_epi32(vALow
, aLow
, 2);
244 aHi
= _mm_extract_epi32(vAHi
, 3);
245 countHi
= _mm_extract_epi32(vCountHi
, 3);
247 vAHi
= _mm_insert_epi32(vAHi
, aHi
, 3);
249 aLow
= _mm_extract_epi32(vALow
, 3);
250 countLow
= _mm_extract_epi32(vCountLow
, 3);
252 vALow
= _mm_insert_epi32(vALow
, aLow
, 3);
254 __m256i ret
= _mm256_set1_epi32(0);
255 ret
= _mm256_insertf128_si256(ret
, vAHi
, 1);
256 ret
= _mm256_insertf128_si256(ret
, vALow
, 0);
260 #define _simd_mul_epi32 _simdemu_mul_epi32
261 #define _simd_mullo_epi32 _simdemu_mullo_epi32
262 #define _simd_sub_epi32 _simdemu_sub_epi32
263 #define _simd_sub_epi64 _simdemu_sub_epi64
264 #define _simd_min_epi32 _simdemu_min_epi32
265 #define _simd_min_epu32 _simdemu_min_epu32
266 #define _simd_max_epi32 _simdemu_max_epi32
267 #define _simd_max_epu32 _simdemu_max_epu32
268 #define _simd_add_epi32 _simdemu_add_epi32
269 #define _simd_and_si _simdemu_and_si
270 #define _simd_andnot_si _simdemu_andnot_si
271 #define _simd_cmpeq_epi32 _simdemu_cmpeq_epi32
272 #define _simd_cmplt_epi32 _simdemu_cmplt_epi32
273 #define _simd_cmpgt_epi32 _simdemu_cmpgt_epi32
274 #define _simd_or_si _simdemu_or_si
275 #define _simd_xor_si _simdemu_xor_si
276 #define _simd_castps_si _mm256_castps_si256
277 #define _simd_adds_epu8 _simdemu_adds_epu8
278 #define _simd_subs_epu8 _simdemu_subs_epu8
279 #define _simd_add_epi8 _simdemu_add_epi8
280 #define _simd_cmpeq_epi64 _simdemu_cmpeq_epi64
281 #define _simd_cmpgt_epi64 _simdemu_cmpgt_epi64
282 #define _simd_cmpgt_epi8 _simdemu_cmpgt_epi8
283 #define _simd_cmpeq_epi8 _simdemu_cmpeq_epi8
284 #define _simd_cmpgt_epi16 _simdemu_cmpgt_epi16
285 #define _simd_cmpeq_epi16 _simdemu_cmpeq_epi16
286 #define _simd_movemask_epi8 _simdemu_movemask_epi8
287 #define _simd_permute_ps _simdemu_permute_ps
288 #define _simd_permute_epi32 _simdemu_permute_epi32
289 #define _simd_srlv_epi32 _simdemu_srlv_epi32
290 #define _simd_sllv_epi32 _simdemu_sllv_epi32
292 SIMD_EMU_EPI(_simdemu_mul_epi32
, _mm_mul_epi32
)
293 SIMD_EMU_EPI(_simdemu_mullo_epi32
, _mm_mullo_epi32
)
294 SIMD_EMU_EPI(_simdemu_sub_epi32
, _mm_sub_epi32
)
295 SIMD_EMU_EPI(_simdemu_sub_epi64
, _mm_sub_epi64
)
296 SIMD_EMU_EPI(_simdemu_min_epi32
, _mm_min_epi32
)
297 SIMD_EMU_EPI(_simdemu_min_epu32
, _mm_min_epu32
)
298 SIMD_EMU_EPI(_simdemu_max_epi32
, _mm_max_epi32
)
299 SIMD_EMU_EPI(_simdemu_max_epu32
, _mm_max_epu32
)
300 SIMD_EMU_EPI(_simdemu_add_epi32
, _mm_add_epi32
)
301 SIMD_EMU_EPI(_simdemu_and_si
, _mm_and_si128
)
302 SIMD_EMU_EPI(_simdemu_andnot_si
, _mm_andnot_si128
)
303 SIMD_EMU_EPI(_simdemu_cmpeq_epi32
, _mm_cmpeq_epi32
)
304 SIMD_EMU_EPI(_simdemu_cmplt_epi32
, _mm_cmplt_epi32
)
305 SIMD_EMU_EPI(_simdemu_cmpgt_epi32
, _mm_cmpgt_epi32
)
306 SIMD_EMU_EPI(_simdemu_or_si
, _mm_or_si128
)
307 SIMD_EMU_EPI(_simdemu_xor_si
, _mm_xor_si128
)
308 SIMD_EMU_EPI(_simdemu_adds_epu8
, _mm_adds_epu8
)
309 SIMD_EMU_EPI(_simdemu_subs_epu8
, _mm_subs_epu8
)
310 SIMD_EMU_EPI(_simdemu_add_epi8
, _mm_add_epi8
)
311 SIMD_EMU_EPI(_simdemu_cmpeq_epi64
, _mm_cmpeq_epi64
)
312 SIMD_EMU_EPI(_simdemu_cmpgt_epi64
, _mm_cmpgt_epi64
)
313 SIMD_EMU_EPI(_simdemu_cmpgt_epi8
, _mm_cmpgt_epi8
)
314 SIMD_EMU_EPI(_simdemu_cmpeq_epi8
, _mm_cmpeq_epi8
)
315 SIMD_EMU_EPI(_simdemu_cmpgt_epi16
, _mm_cmpgt_epi16
)
316 SIMD_EMU_EPI(_simdemu_cmpeq_epi16
, _mm_cmpeq_epi16
)
317 SIMD_EMU_EPI(_simdemu_unpacklo_epi8
, _mm_unpacklo_epi8
)
318 SIMD_EMU_EPI(_simdemu_unpackhi_epi8
, _mm_unpackhi_epi8
)
319 SIMD_EMU_EPI(_simdemu_unpacklo_epi16
, _mm_unpacklo_epi16
)
320 SIMD_EMU_EPI(_simdemu_unpackhi_epi16
, _mm_unpackhi_epi16
)
322 #define _simd_unpacklo_epi8 _simdemu_unpacklo_epi8
323 #define _simd_unpackhi_epi8 _simdemu_unpackhi_epi8
324 #define _simd_unpacklo_epi16 _simdemu_unpacklo_epi16
325 #define _simd_unpackhi_epi16 _simdemu_unpackhi_epi16
326 #define _simd_unpacklo_epi32(a, b) _mm256_castps_si256(_mm256_unpacklo_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)))
327 #define _simd_unpackhi_epi32(a, b) _mm256_castps_si256(_mm256_unpackhi_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)))
328 #define _simd_unpacklo_epi64(a, b) _mm256_castpd_si256(_mm256_unpacklo_pd(_mm256_castsi256_pd(a), _mm256_castsi256_pd(b)))
329 #define _simd_unpackhi_epi64(a, b) _mm256_castpd_si256(_mm256_unpackhi_pd(_mm256_castsi256_pd(a), _mm256_castsi256_pd(b)))
331 #define _simd_slli_epi32(a,i) _simdemu_slli_epi32(a,i)
332 #define _simd_srai_epi32(a,i) _simdemu_srai_epi32(a,i)
333 #define _simd_srli_epi32(a,i) _simdemu_srli_epi32(a,i)
334 #define _simd_srlisi_ps(a,i) _mm256_castsi256_ps(_simdemu_srli_si128<i>(_mm256_castps_si256(a)))
336 #define _simd128_fmadd_ps _mm_fmaddemu_ps
337 #define _simd_fmadd_ps _mm_fmaddemu256_ps
338 #define _simd_fmsub_ps _mm_fmsubemu256_ps
339 #define _simd_shuffle_epi8 _simdemu_shuffle_epi8
340 SIMD_EMU_EPI(_simdemu_shuffle_epi8
, _mm_shuffle_epi8
)
343 __m128
_mm_fmaddemu_ps(__m128 a
, __m128 b
, __m128 c
)
345 __m128 res
= _mm_mul_ps(a
, b
);
346 res
= _mm_add_ps(res
, c
);
351 __m256
_mm_fmaddemu256_ps(__m256 a
, __m256 b
, __m256 c
)
353 __m256 res
= _mm256_mul_ps(a
, b
);
354 res
= _mm256_add_ps(res
, c
);
359 __m256
_mm_fmsubemu256_ps(__m256 a
, __m256 b
, __m256 c
)
361 __m256 res
= _mm256_mul_ps(a
, b
);
362 res
= _mm256_sub_ps(res
, c
);
367 __m256
_simd_i32gather_ps(const float* pBase
, __m256i vOffsets
, const int scale
)
369 uint32_t *pOffsets
= (uint32_t*)&vOffsets
;
371 float* pResult
= (float*)&vResult
;
372 for (uint32_t i
= 0; i
< KNOB_SIMD_WIDTH
; ++i
)
374 uint32_t offset
= pOffsets
[i
];
375 offset
= offset
* scale
;
376 pResult
[i
] = *(float*)(((const uint8_t*)pBase
+ offset
));
383 __m256
_simd_mask_i32gather_ps(__m256 vSrc
, const float* pBase
, __m256i vOffsets
, __m256 vMask
, const int scale
)
385 uint32_t *pOffsets
= (uint32_t*)&vOffsets
;
386 simdscalar vResult
= vSrc
;
387 float* pResult
= (float*)&vResult
;
389 uint32_t mask
= _simd_movemask_ps(vMask
);
390 while (_BitScanForward(&index
, mask
))
392 mask
&= ~(1 << index
);
393 uint32_t offset
= pOffsets
[index
];
394 offset
= offset
* scale
;
395 pResult
[index
] = *(float*)(((const uint8_t*)pBase
+ offset
));
402 __m256i
_simd_abs_epi32(__m256i a
)
404 __m128i aHi
= _mm256_extractf128_si256(a
, 1);
405 __m128i aLo
= _mm256_castsi256_si128(a
);
406 __m128i absLo
= _mm_abs_epi32(aLo
);
407 __m128i absHi
= _mm_abs_epi32(aHi
);
408 __m256i result
= _mm256_castsi128_si256(absLo
);
409 result
= _mm256_insertf128_si256(result
, absHi
, 1);
414 int _simdemu_movemask_epi8(__m256i a
)
416 __m128i aHi
= _mm256_extractf128_si256(a
, 1);
417 __m128i aLo
= _mm256_castsi256_si128(a
);
419 int resHi
= _mm_movemask_epi8(aHi
);
420 int resLo
= _mm_movemask_epi8(aLo
);
422 return (resHi
<< 16) | resLo
;
426 __m256i
_simd_cvtepu8_epi16(__m128i a
)
428 __m128i resultlo
= _mm_cvtepu8_epi16(a
);
429 __m128i resulthi
= _mm_cvtepu8_epi16(_mm_srli_si128(a
, 8));
431 __m256i result
= _mm256_castsi128_si256(resultlo
);
433 return _mm256_insertf128_si256(result
, resulthi
, 1);
437 __m256i
_simd_cvtepu8_epi32(__m128i a
)
439 __m128i resultlo
= _mm_cvtepu8_epi32(a
);
440 __m128i resulthi
= _mm_cvtepu8_epi32(_mm_srli_si128(a
, 4));
442 __m256i result
= _mm256_castsi128_si256(resultlo
);
444 return _mm256_insertf128_si256(result
, resulthi
, 1);
448 __m256i
_simd_cvtepu16_epi32(__m128i a
)
450 __m128i resultlo
= _mm_cvtepu16_epi32(a
);
451 __m128i resulthi
= _mm_cvtepu16_epi32(_mm_srli_si128(a
, 8));
453 __m256i result
= _mm256_castsi128_si256(resultlo
);
455 return _mm256_insertf128_si256(result
, resulthi
, 1);
459 __m256i
_simd_packus_epi16(__m256i a
, __m256i b
)
461 __m128i alo
= _mm256_extractf128_si256(a
, 0);
462 __m128i ahi
= _mm256_extractf128_si256(a
, 1);
464 __m128i blo
= _mm256_extractf128_si256(b
, 0);
465 __m128i bhi
= _mm256_extractf128_si256(b
, 1);
467 __m128i resultlo
= _mm_packus_epi16(alo
, blo
);
468 __m128i resulthi
= _mm_packus_epi16(ahi
, bhi
);
470 __m256i result
= _mm256_castsi128_si256(resultlo
);
472 return _mm256_insertf128_si256(result
, resulthi
, 1);
476 __m256i
_simd_packs_epi16(__m256i a
, __m256i b
)
478 __m128i alo
= _mm256_extractf128_si256(a
, 0);
479 __m128i ahi
= _mm256_extractf128_si256(a
, 1);
481 __m128i blo
= _mm256_extractf128_si256(b
, 0);
482 __m128i bhi
= _mm256_extractf128_si256(b
, 1);
484 __m128i resultlo
= _mm_packs_epi16(alo
, blo
);
485 __m128i resulthi
= _mm_packs_epi16(ahi
, bhi
);
487 __m256i result
= _mm256_castsi128_si256(resultlo
);
489 return _mm256_insertf128_si256(result
, resulthi
, 1);
493 __m256i
_simd_packus_epi32(__m256i a
, __m256i b
)
495 __m128i alo
= _mm256_extractf128_si256(a
, 0);
496 __m128i ahi
= _mm256_extractf128_si256(a
, 1);
498 __m128i blo
= _mm256_extractf128_si256(b
, 0);
499 __m128i bhi
= _mm256_extractf128_si256(b
, 1);
501 __m128i resultlo
= _mm_packus_epi32(alo
, blo
);
502 __m128i resulthi
= _mm_packus_epi32(ahi
, bhi
);
504 __m256i result
= _mm256_castsi128_si256(resultlo
);
506 return _mm256_insertf128_si256(result
, resulthi
, 1);
510 __m256i
_simd_packs_epi32(__m256i a
, __m256i b
)
512 __m128i alo
= _mm256_extractf128_si256(a
, 0);
513 __m128i ahi
= _mm256_extractf128_si256(a
, 1);
515 __m128i blo
= _mm256_extractf128_si256(b
, 0);
516 __m128i bhi
= _mm256_extractf128_si256(b
, 1);
518 __m128i resultlo
= _mm_packs_epi32(alo
, blo
);
519 __m128i resulthi
= _mm_packs_epi32(ahi
, bhi
);
521 __m256i result
= _mm256_castsi128_si256(resultlo
);
523 return _mm256_insertf128_si256(result
, resulthi
, 1);
528 #define _simd_mul_epi32 _mm256_mul_epi32
529 #define _simd_mullo_epi32 _mm256_mullo_epi32
530 #define _simd_sub_epi32 _mm256_sub_epi32
531 #define _simd_sub_epi64 _mm256_sub_epi64
532 #define _simd_min_epi32 _mm256_min_epi32
533 #define _simd_max_epi32 _mm256_max_epi32
534 #define _simd_min_epu32 _mm256_min_epu32
535 #define _simd_max_epu32 _mm256_max_epu32
536 #define _simd_add_epi32 _mm256_add_epi32
537 #define _simd_and_si _mm256_and_si256
538 #define _simd_andnot_si _mm256_andnot_si256
539 #define _simd_cmpeq_epi32 _mm256_cmpeq_epi32
540 #define _simd_cmplt_epi32(a,b) _mm256_cmpgt_epi32(b,a)
541 #define _simd_cmpgt_epi32(a,b) _mm256_cmpgt_epi32(a,b)
542 #define _simd_or_si _mm256_or_si256
543 #define _simd_xor_si _mm256_xor_si256
544 #define _simd_castps_si _mm256_castps_si256
546 #define _simd_unpacklo_epi8 _mm256_unpacklo_epi8
547 #define _simd_unpackhi_epi8 _mm256_unpackhi_epi8
548 #define _simd_unpacklo_epi16 _mm256_unpacklo_epi16
549 #define _simd_unpackhi_epi16 _mm256_unpackhi_epi16
550 #define _simd_unpacklo_epi32 _mm256_unpacklo_epi32
551 #define _simd_unpackhi_epi32 _mm256_unpackhi_epi32
552 #define _simd_unpacklo_epi64 _mm256_unpacklo_epi64
553 #define _simd_unpackhi_epi64 _mm256_unpackhi_epi64
555 #define _simd_srli_si(a,i) _simdemu_srli_si128<i>(a)
556 #define _simd_slli_epi32 _mm256_slli_epi32
557 #define _simd_srai_epi32 _mm256_srai_epi32
558 #define _simd_srli_epi32 _mm256_srli_epi32
559 #define _simd_srlisi_ps(a,i) _mm256_castsi256_ps(_simdemu_srli_si128<i>(_mm256_castps_si256(a)))
560 #define _simd128_fmadd_ps _mm_fmadd_ps
561 #define _simd_fmadd_ps _mm256_fmadd_ps
562 #define _simd_fmsub_ps _mm256_fmsub_ps
563 #define _simd_shuffle_epi8 _mm256_shuffle_epi8
564 #define _simd_adds_epu8 _mm256_adds_epu8
565 #define _simd_subs_epu8 _mm256_subs_epu8
566 #define _simd_add_epi8 _mm256_add_epi8
567 #define _simd_i32gather_ps _mm256_i32gather_ps
568 #define _simd_mask_i32gather_ps _mm256_mask_i32gather_ps
569 #define _simd_abs_epi32 _mm256_abs_epi32
571 #define _simd_cmpeq_epi64 _mm256_cmpeq_epi64
572 #define _simd_cmpgt_epi64 _mm256_cmpgt_epi64
573 #define _simd_cmpgt_epi8 _mm256_cmpgt_epi8
574 #define _simd_cmpeq_epi8 _mm256_cmpeq_epi8
575 #define _simd_cmpgt_epi16 _mm256_cmpgt_epi16
576 #define _simd_cmpeq_epi16 _mm256_cmpeq_epi16
577 #define _simd_movemask_epi8 _mm256_movemask_epi8
578 #define _simd_permute_ps _mm256_permutevar8x32_ps
579 #define _simd_permute_epi32 _mm256_permutevar8x32_epi32
580 #define _simd_srlv_epi32 _mm256_srlv_epi32
581 #define _simd_sllv_epi32 _mm256_sllv_epi32
582 #define _simd_cvtepu8_epi16 _mm256_cvtepu8_epi16
583 #define _simd_cvtepu8_epi32 _mm256_cvtepu8_epi32
584 #define _simd_cvtepu16_epi32 _mm256_cvtepu16_epi32
585 #define _simd_packus_epi16 _mm256_packus_epi16
586 #define _simd_packs_epi16 _mm256_packs_epi16
587 #define _simd_packus_epi32 _mm256_packus_epi32
588 #define _simd_packs_epi32 _mm256_packs_epi32
592 #define _simd_unpacklo_ps _mm256_unpacklo_ps
593 #define _simd_unpackhi_ps _mm256_unpackhi_ps
594 #define _simd_unpacklo_pd _mm256_unpacklo_pd
595 #define _simd_unpackhi_pd _mm256_unpackhi_pd
596 #define _simd_insertf128_ps _mm256_insertf128_ps
597 #define _simd_insertf128_pd _mm256_insertf128_pd
598 #define _simd_insertf128_si _mm256_insertf128_si256
599 #define _simd_extractf128_ps _mm256_extractf128_ps
600 #define _simd_extractf128_pd _mm256_extractf128_pd
601 #define _simd_extractf128_si _mm256_extractf128_si256
602 #define _simd_permute2f128_ps _mm256_permute2f128_ps
603 #define _simd_permute2f128_pd _mm256_permute2f128_pd
604 #define _simd_permute2f128_si _mm256_permute2f128_si256
605 #define _simd_shuffle_ps _mm256_shuffle_ps
606 #define _simd_shuffle_pd _mm256_shuffle_pd
607 #define _simd_shuffle_epi32(a, b, imm8) _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b), imm8))
608 #define _simd_shuffle_epi64(a, b, imm8) _mm256_castps_si256(_mm256_shuffle_pd(_mm256_castsi256_pd(a), _mm256_castsi256_pd(b), imm8))
609 #define _simd_set1_epi32 _mm256_set1_epi32
610 #define _simd_set_epi32 _mm256_set_epi32
611 #define _simd_set1_epi8 _mm256_set1_epi8
612 #define _simd_setzero_si _mm256_setzero_si256
613 #define _simd_cvttps_epi32 _mm256_cvttps_epi32
614 #define _simd_store_si _mm256_store_si256
615 #define _simd_broadcast_ss _mm256_broadcast_ss
616 #define _simd_maskstore_ps _mm256_maskstore_ps
617 #define _simd_load_si _mm256_load_si256
618 #define _simd_loadu_si _mm256_loadu_si256
619 #define _simd_sub_ps _mm256_sub_ps
620 #define _simd_testz_ps _mm256_testz_ps
621 #define _simd_testz_si _mm256_testz_si256
622 #define _simd_xor_ps _mm256_xor_ps
625 simdscalari
_simd_loadu2_si(const __m128i
*hiaddr
, const __m128i
*loaddr
)
627 __m128i lo
= _mm_loadu_si128(loaddr
);
628 __m128i hi
= _mm_loadu_si128(hiaddr
);
630 return _mm256_insertf128_si256(_mm256_castsi128_si256(lo
), (hi
), 1);
634 void _simd_storeu2_si(__m128i
*hiaddr
, __m128i
*loaddr
, simdscalari a
)
636 _mm_storeu_si128(loaddr
, _mm256_castsi256_si128(a
));
637 _mm_storeu_si128(hiaddr
, _mm256_extractf128_si256(a
, 1));
641 simdscalari
_simd_blendv_epi32(simdscalari a
, simdscalari b
, simdscalar mask
)
643 return _simd_castps_si(_simd_blendv_ps(_simd_castsi_ps(a
), _simd_castsi_ps(b
), mask
));
647 simdscalari
_simd_blendv_epi32(simdscalari a
, simdscalari b
, simdscalari mask
)
649 return _simd_castps_si(_simd_blendv_ps(_simd_castsi_ps(a
), _simd_castsi_ps(b
), _simd_castsi_ps(mask
)));
654 __m128i
_simd_blend4_epi32(__m128i a
, __m128i b
)
656 return _mm_castps_si128(_mm_blend_ps(_mm_castsi128_ps(a
), _mm_castsi128_ps(b
), mask
));
659 // convert bitmask to vector mask
661 simdscalar
vMask(int32_t mask
)
663 __m256i vec
= _mm256_set1_epi32(mask
);
664 const __m256i bit
= _mm256_set_epi32(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01);
665 vec
= _simd_and_si(vec
, bit
);
666 vec
= _simd_cmplt_epi32(_mm256_setzero_si256(), vec
);
667 return _simd_castsi_ps(vec
);
671 simdscalari
vMaski(int32_t mask
)
673 __m256i vec
= _mm256_set1_epi32(mask
);
674 const __m256i bit
= _mm256_set_epi32(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01);
675 vec
= _simd_and_si(vec
, bit
);
676 return _simd_cmplt_epi32(_mm256_setzero_si256(), vec
);
680 void _simd_mov(simdscalar
&r
, unsigned int rlane
, simdscalar
& s
, unsigned int slane
)
682 OSALIGNSIMD(float) rArray
[KNOB_SIMD_WIDTH
], sArray
[KNOB_SIMD_WIDTH
];
683 _mm256_store_ps(rArray
, r
);
684 _mm256_store_ps(sArray
, s
);
685 rArray
[rlane
] = sArray
[slane
];
686 r
= _mm256_load_ps(rArray
);
689 INLINE __m256i
_simdemu_slli_epi32(__m256i a
, uint32_t i
)
691 __m128i aHi
= _mm256_extractf128_si256(a
, 1);
692 __m128i aLo
= _mm256_castsi256_si128(a
);
694 __m128i resHi
= _mm_slli_epi32(aHi
, i
);
695 __m128i resLo
= _mm_slli_epi32(aLo
, i
);
697 __m256i result
= _mm256_castsi128_si256(resLo
);
698 result
= _mm256_insertf128_si256(result
, resHi
, 1);
703 INLINE __m256i
_simdemu_srai_epi32(__m256i a
, uint32_t i
)
705 __m128i aHi
= _mm256_extractf128_si256(a
, 1);
706 __m128i aLo
= _mm256_castsi256_si128(a
);
708 __m128i resHi
= _mm_srai_epi32(aHi
, i
);
709 __m128i resLo
= _mm_srai_epi32(aLo
, i
);
711 __m256i result
= _mm256_castsi128_si256(resLo
);
712 result
= _mm256_insertf128_si256(result
, resHi
, 1);
717 INLINE __m256i
_simdemu_srli_epi32(__m256i a
, uint32_t i
)
719 __m128i aHi
= _mm256_extractf128_si256(a
, 1);
720 __m128i aLo
= _mm256_castsi256_si128(a
);
722 __m128i resHi
= _mm_srli_epi32(aHi
, i
);
723 __m128i resLo
= _mm_srli_epi32(aLo
, i
);
725 __m256i result
= _mm256_castsi128_si256(resLo
);
726 result
= _mm256_insertf128_si256(result
, resHi
, 1);
732 void _simdvec_transpose(simdvector
&v
)
734 SWR_INVALID("Need to implement 8 wide version");
738 #error Unsupported vector width
741 // Populates a simdvector from a vector. So p = xyzw becomes xxxx yyyy zzzz wwww.
743 void _simdvec_load_ps(simdvector
& r
, const float *p
)
745 r
[0] = _simd_set1_ps(p
[0]);
746 r
[1] = _simd_set1_ps(p
[1]);
747 r
[2] = _simd_set1_ps(p
[2]);
748 r
[3] = _simd_set1_ps(p
[3]);
752 void _simdvec_mov(simdvector
& r
, const simdscalar
& s
)
761 void _simdvec_mov(simdvector
& r
, const simdvector
& v
)
770 // just move a lane from the source simdvector to dest simdvector
772 void _simdvec_mov(simdvector
&r
, unsigned int rlane
, simdvector
& s
, unsigned int slane
)
774 _simd_mov(r
[0], rlane
, s
[0], slane
);
775 _simd_mov(r
[1], rlane
, s
[1], slane
);
776 _simd_mov(r
[2], rlane
, s
[2], slane
);
777 _simd_mov(r
[3], rlane
, s
[3], slane
);
782 void _simdvec_dp3_ps(simdscalar
& r
, const simdvector
& v0
, const simdvector
& v1
)
785 r
= _simd_mul_ps(v0
[0], v1
[0]); // (v0.x*v1.x)
787 tmp
= _simd_mul_ps(v0
[1], v1
[1]); // (v0.y*v1.y)
788 r
= _simd_add_ps(r
, tmp
); // (v0.x*v1.x) + (v0.y*v1.y)
790 tmp
= _simd_mul_ps(v0
[2], v1
[2]); // (v0.z*v1.z)
791 r
= _simd_add_ps(r
, tmp
); // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z)
795 void _simdvec_dp4_ps(simdscalar
& r
, const simdvector
& v0
, const simdvector
& v1
)
798 r
= _simd_mul_ps(v0
[0], v1
[0]); // (v0.x*v1.x)
800 tmp
= _simd_mul_ps(v0
[1], v1
[1]); // (v0.y*v1.y)
801 r
= _simd_add_ps(r
, tmp
); // (v0.x*v1.x) + (v0.y*v1.y)
803 tmp
= _simd_mul_ps(v0
[2], v1
[2]); // (v0.z*v1.z)
804 r
= _simd_add_ps(r
, tmp
); // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z)
806 tmp
= _simd_mul_ps(v0
[3], v1
[3]); // (v0.w*v1.w)
807 r
= _simd_add_ps(r
, tmp
); // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z)
811 simdscalar
_simdvec_rcp_length_ps(const simdvector
& v
)
814 _simdvec_dp4_ps(length
, v
, v
);
815 return _simd_rsqrt_ps(length
);
819 void _simdvec_normalize_ps(simdvector
& r
, const simdvector
& v
)
821 simdscalar vecLength
;
822 vecLength
= _simdvec_rcp_length_ps(v
);
824 r
[0] = _simd_mul_ps(v
[0], vecLength
);
825 r
[1] = _simd_mul_ps(v
[1], vecLength
);
826 r
[2] = _simd_mul_ps(v
[2], vecLength
);
827 r
[3] = _simd_mul_ps(v
[3], vecLength
);
831 void _simdvec_mul_ps(simdvector
& r
, const simdvector
& v
, const simdscalar
& s
)
833 r
[0] = _simd_mul_ps(v
[0], s
);
834 r
[1] = _simd_mul_ps(v
[1], s
);
835 r
[2] = _simd_mul_ps(v
[2], s
);
836 r
[3] = _simd_mul_ps(v
[3], s
);
840 void _simdvec_mul_ps(simdvector
& r
, const simdvector
& v0
, const simdvector
& v1
)
842 r
[0] = _simd_mul_ps(v0
[0], v1
[0]);
843 r
[1] = _simd_mul_ps(v0
[1], v1
[1]);
844 r
[2] = _simd_mul_ps(v0
[2], v1
[2]);
845 r
[3] = _simd_mul_ps(v0
[3], v1
[3]);
849 void _simdvec_add_ps(simdvector
& r
, const simdvector
& v0
, const simdvector
& v1
)
851 r
[0] = _simd_add_ps(v0
[0], v1
[0]);
852 r
[1] = _simd_add_ps(v0
[1], v1
[1]);
853 r
[2] = _simd_add_ps(v0
[2], v1
[2]);
854 r
[3] = _simd_add_ps(v0
[3], v1
[3]);
858 void _simdvec_min_ps(simdvector
& r
, const simdvector
& v0
, const simdscalar
& s
)
860 r
[0] = _simd_min_ps(v0
[0], s
);
861 r
[1] = _simd_min_ps(v0
[1], s
);
862 r
[2] = _simd_min_ps(v0
[2], s
);
863 r
[3] = _simd_min_ps(v0
[3], s
);
867 void _simdvec_max_ps(simdvector
& r
, const simdvector
& v0
, const simdscalar
& s
)
869 r
[0] = _simd_max_ps(v0
[0], s
);
870 r
[1] = _simd_max_ps(v0
[1], s
);
871 r
[2] = _simd_max_ps(v0
[2], s
);
872 r
[3] = _simd_max_ps(v0
[3], s
);
875 // Matrix4x4 * Vector4
876 // outVec.x = (m00 * v.x) + (m01 * v.y) + (m02 * v.z) + (m03 * v.w)
877 // outVec.y = (m10 * v.x) + (m11 * v.y) + (m12 * v.z) + (m13 * v.w)
878 // outVec.z = (m20 * v.x) + (m21 * v.y) + (m22 * v.z) + (m23 * v.w)
879 // outVec.w = (m30 * v.x) + (m31 * v.y) + (m32 * v.z) + (m33 * v.w)
881 void _simd_mat4x4_vec4_multiply(
883 const float *pMatrix
,
890 m
= _simd_load1_ps(pMatrix
+ 0*4 + 0); // m[row][0]
891 r0
= _simd_mul_ps(m
, v
[0]); // (m00 * v.x)
892 m
= _simd_load1_ps(pMatrix
+ 0*4 + 1); // m[row][1]
893 r1
= _simd_mul_ps(m
, v
[1]); // (m1 * v.y)
894 r0
= _simd_add_ps(r0
, r1
); // (m0 * v.x) + (m1 * v.y)
895 m
= _simd_load1_ps(pMatrix
+ 0*4 + 2); // m[row][2]
896 r1
= _simd_mul_ps(m
, v
[2]); // (m2 * v.z)
897 r0
= _simd_add_ps(r0
, r1
); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
898 m
= _simd_load1_ps(pMatrix
+ 0*4 + 3); // m[row][3]
899 r1
= _simd_mul_ps(m
, v
[3]); // (m3 * v.z)
900 r0
= _simd_add_ps(r0
, r1
); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * v.w)
903 m
= _simd_load1_ps(pMatrix
+ 1*4 + 0); // m[row][0]
904 r0
= _simd_mul_ps(m
, v
[0]); // (m00 * v.x)
905 m
= _simd_load1_ps(pMatrix
+ 1*4 + 1); // m[row][1]
906 r1
= _simd_mul_ps(m
, v
[1]); // (m1 * v.y)
907 r0
= _simd_add_ps(r0
, r1
); // (m0 * v.x) + (m1 * v.y)
908 m
= _simd_load1_ps(pMatrix
+ 1*4 + 2); // m[row][2]
909 r1
= _simd_mul_ps(m
, v
[2]); // (m2 * v.z)
910 r0
= _simd_add_ps(r0
, r1
); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
911 m
= _simd_load1_ps(pMatrix
+ 1*4 + 3); // m[row][3]
912 r1
= _simd_mul_ps(m
, v
[3]); // (m3 * v.z)
913 r0
= _simd_add_ps(r0
, r1
); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * v.w)
916 m
= _simd_load1_ps(pMatrix
+ 2*4 + 0); // m[row][0]
917 r0
= _simd_mul_ps(m
, v
[0]); // (m00 * v.x)
918 m
= _simd_load1_ps(pMatrix
+ 2*4 + 1); // m[row][1]
919 r1
= _simd_mul_ps(m
, v
[1]); // (m1 * v.y)
920 r0
= _simd_add_ps(r0
, r1
); // (m0 * v.x) + (m1 * v.y)
921 m
= _simd_load1_ps(pMatrix
+ 2*4 + 2); // m[row][2]
922 r1
= _simd_mul_ps(m
, v
[2]); // (m2 * v.z)
923 r0
= _simd_add_ps(r0
, r1
); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
924 m
= _simd_load1_ps(pMatrix
+ 2*4 + 3); // m[row][3]
925 r1
= _simd_mul_ps(m
, v
[3]); // (m3 * v.z)
926 r0
= _simd_add_ps(r0
, r1
); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * v.w)
929 m
= _simd_load1_ps(pMatrix
+ 3*4 + 0); // m[row][0]
930 r0
= _simd_mul_ps(m
, v
[0]); // (m00 * v.x)
931 m
= _simd_load1_ps(pMatrix
+ 3*4 + 1); // m[row][1]
932 r1
= _simd_mul_ps(m
, v
[1]); // (m1 * v.y)
933 r0
= _simd_add_ps(r0
, r1
); // (m0 * v.x) + (m1 * v.y)
934 m
= _simd_load1_ps(pMatrix
+ 3*4 + 2); // m[row][2]
935 r1
= _simd_mul_ps(m
, v
[2]); // (m2 * v.z)
936 r0
= _simd_add_ps(r0
, r1
); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
937 m
= _simd_load1_ps(pMatrix
+ 3*4 + 3); // m[row][3]
938 r1
= _simd_mul_ps(m
, v
[3]); // (m3 * v.z)
939 r0
= _simd_add_ps(r0
, r1
); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * v.w)
943 // Matrix4x4 * Vector3 - Direction Vector where w = 0.
944 // outVec.x = (m00 * v.x) + (m01 * v.y) + (m02 * v.z) + (m03 * 0)
945 // outVec.y = (m10 * v.x) + (m11 * v.y) + (m12 * v.z) + (m13 * 0)
946 // outVec.z = (m20 * v.x) + (m21 * v.y) + (m22 * v.z) + (m23 * 0)
947 // outVec.w = (m30 * v.x) + (m31 * v.y) + (m32 * v.z) + (m33 * 0)
949 void _simd_mat3x3_vec3_w0_multiply(
951 const float *pMatrix
,
958 m
= _simd_load1_ps(pMatrix
+ 0*4 + 0); // m[row][0]
959 r0
= _simd_mul_ps(m
, v
[0]); // (m00 * v.x)
960 m
= _simd_load1_ps(pMatrix
+ 0*4 + 1); // m[row][1]
961 r1
= _simd_mul_ps(m
, v
[1]); // (m1 * v.y)
962 r0
= _simd_add_ps(r0
, r1
); // (m0 * v.x) + (m1 * v.y)
963 m
= _simd_load1_ps(pMatrix
+ 0*4 + 2); // m[row][2]
964 r1
= _simd_mul_ps(m
, v
[2]); // (m2 * v.z)
965 r0
= _simd_add_ps(r0
, r1
); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
968 m
= _simd_load1_ps(pMatrix
+ 1*4 + 0); // m[row][0]
969 r0
= _simd_mul_ps(m
, v
[0]); // (m00 * v.x)
970 m
= _simd_load1_ps(pMatrix
+ 1*4 + 1); // m[row][1]
971 r1
= _simd_mul_ps(m
, v
[1]); // (m1 * v.y)
972 r0
= _simd_add_ps(r0
, r1
); // (m0 * v.x) + (m1 * v.y)
973 m
= _simd_load1_ps(pMatrix
+ 1*4 + 2); // m[row][2]
974 r1
= _simd_mul_ps(m
, v
[2]); // (m2 * v.z)
975 r0
= _simd_add_ps(r0
, r1
); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
978 m
= _simd_load1_ps(pMatrix
+ 2*4 + 0); // m[row][0]
979 r0
= _simd_mul_ps(m
, v
[0]); // (m00 * v.x)
980 m
= _simd_load1_ps(pMatrix
+ 2*4 + 1); // m[row][1]
981 r1
= _simd_mul_ps(m
, v
[1]); // (m1 * v.y)
982 r0
= _simd_add_ps(r0
, r1
); // (m0 * v.x) + (m1 * v.y)
983 m
= _simd_load1_ps(pMatrix
+ 2*4 + 2); // m[row][2]
984 r1
= _simd_mul_ps(m
, v
[2]); // (m2 * v.z)
985 r0
= _simd_add_ps(r0
, r1
); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
988 result
[3] = _simd_setzero_ps();
991 // Matrix4x4 * Vector3 - Position vector where w = 1.
992 // outVec.x = (m00 * v.x) + (m01 * v.y) + (m02 * v.z) + (m03 * 1)
993 // outVec.y = (m10 * v.x) + (m11 * v.y) + (m12 * v.z) + (m13 * 1)
994 // outVec.z = (m20 * v.x) + (m21 * v.y) + (m22 * v.z) + (m23 * 1)
995 // outVec.w = (m30 * v.x) + (m31 * v.y) + (m32 * v.z) + (m33 * 1)
997 void _simd_mat4x4_vec3_w1_multiply(
999 const float *pMatrix
,
1000 const simdvector
& v
)
1006 m
= _simd_load1_ps(pMatrix
+ 0*4 + 0); // m[row][0]
1007 r0
= _simd_mul_ps(m
, v
[0]); // (m00 * v.x)
1008 m
= _simd_load1_ps(pMatrix
+ 0*4 + 1); // m[row][1]
1009 r1
= _simd_mul_ps(m
, v
[1]); // (m1 * v.y)
1010 r0
= _simd_add_ps(r0
, r1
); // (m0 * v.x) + (m1 * v.y)
1011 m
= _simd_load1_ps(pMatrix
+ 0*4 + 2); // m[row][2]
1012 r1
= _simd_mul_ps(m
, v
[2]); // (m2 * v.z)
1013 r0
= _simd_add_ps(r0
, r1
); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
1014 m
= _simd_load1_ps(pMatrix
+ 0*4 + 3); // m[row][3]
1015 r0
= _simd_add_ps(r0
, m
); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
1018 m
= _simd_load1_ps(pMatrix
+ 1*4 + 0); // m[row][0]
1019 r0
= _simd_mul_ps(m
, v
[0]); // (m00 * v.x)
1020 m
= _simd_load1_ps(pMatrix
+ 1*4 + 1); // m[row][1]
1021 r1
= _simd_mul_ps(m
, v
[1]); // (m1 * v.y)
1022 r0
= _simd_add_ps(r0
, r1
); // (m0 * v.x) + (m1 * v.y)
1023 m
= _simd_load1_ps(pMatrix
+ 1*4 + 2); // m[row][2]
1024 r1
= _simd_mul_ps(m
, v
[2]); // (m2 * v.z)
1025 r0
= _simd_add_ps(r0
, r1
); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
1026 m
= _simd_load1_ps(pMatrix
+ 1*4 + 3); // m[row][3]
1027 r0
= _simd_add_ps(r0
, m
); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
1030 m
= _simd_load1_ps(pMatrix
+ 2*4 + 0); // m[row][0]
1031 r0
= _simd_mul_ps(m
, v
[0]); // (m00 * v.x)
1032 m
= _simd_load1_ps(pMatrix
+ 2*4 + 1); // m[row][1]
1033 r1
= _simd_mul_ps(m
, v
[1]); // (m1 * v.y)
1034 r0
= _simd_add_ps(r0
, r1
); // (m0 * v.x) + (m1 * v.y)
1035 m
= _simd_load1_ps(pMatrix
+ 2*4 + 2); // m[row][2]
1036 r1
= _simd_mul_ps(m
, v
[2]); // (m2 * v.z)
1037 r0
= _simd_add_ps(r0
, r1
); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
1038 m
= _simd_load1_ps(pMatrix
+ 2*4 + 3); // m[row][3]
1039 r0
= _simd_add_ps(r0
, m
); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
1042 m
= _simd_load1_ps(pMatrix
+ 3*4 + 0); // m[row][0]
1043 r0
= _simd_mul_ps(m
, v
[0]); // (m00 * v.x)
1044 m
= _simd_load1_ps(pMatrix
+ 3*4 + 1); // m[row][1]
1045 r1
= _simd_mul_ps(m
, v
[1]); // (m1 * v.y)
1046 r0
= _simd_add_ps(r0
, r1
); // (m0 * v.x) + (m1 * v.y)
1047 m
= _simd_load1_ps(pMatrix
+ 3*4 + 2); // m[row][2]
1048 r1
= _simd_mul_ps(m
, v
[2]); // (m2 * v.z)
1049 r0
= _simd_add_ps(r0
, r1
); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
1050 m
= _simd_load1_ps(pMatrix
+ 3*4 + 3); // m[row][3]
1051 result
[3] = _simd_add_ps(r0
, m
); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
1055 void _simd_mat4x3_vec3_w1_multiply(
1057 const float *pMatrix
,
1058 const simdvector
& v
)
1064 m
= _simd_load1_ps(pMatrix
+ 0*4 + 0); // m[row][0]
1065 r0
= _simd_mul_ps(m
, v
[0]); // (m00 * v.x)
1066 m
= _simd_load1_ps(pMatrix
+ 0*4 + 1); // m[row][1]
1067 r1
= _simd_mul_ps(m
, v
[1]); // (m1 * v.y)
1068 r0
= _simd_add_ps(r0
, r1
); // (m0 * v.x) + (m1 * v.y)
1069 m
= _simd_load1_ps(pMatrix
+ 0*4 + 2); // m[row][2]
1070 r1
= _simd_mul_ps(m
, v
[2]); // (m2 * v.z)
1071 r0
= _simd_add_ps(r0
, r1
); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
1072 m
= _simd_load1_ps(pMatrix
+ 0*4 + 3); // m[row][3]
1073 r0
= _simd_add_ps(r0
, m
); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
1076 m
= _simd_load1_ps(pMatrix
+ 1*4 + 0); // m[row][0]
1077 r0
= _simd_mul_ps(m
, v
[0]); // (m00 * v.x)
1078 m
= _simd_load1_ps(pMatrix
+ 1*4 + 1); // m[row][1]
1079 r1
= _simd_mul_ps(m
, v
[1]); // (m1 * v.y)
1080 r0
= _simd_add_ps(r0
, r1
); // (m0 * v.x) + (m1 * v.y)
1081 m
= _simd_load1_ps(pMatrix
+ 1*4 + 2); // m[row][2]
1082 r1
= _simd_mul_ps(m
, v
[2]); // (m2 * v.z)
1083 r0
= _simd_add_ps(r0
, r1
); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
1084 m
= _simd_load1_ps(pMatrix
+ 1*4 + 3); // m[row][3]
1085 r0
= _simd_add_ps(r0
, m
); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
1088 m
= _simd_load1_ps(pMatrix
+ 2*4 + 0); // m[row][0]
1089 r0
= _simd_mul_ps(m
, v
[0]); // (m00 * v.x)
1090 m
= _simd_load1_ps(pMatrix
+ 2*4 + 1); // m[row][1]
1091 r1
= _simd_mul_ps(m
, v
[1]); // (m1 * v.y)
1092 r0
= _simd_add_ps(r0
, r1
); // (m0 * v.x) + (m1 * v.y)
1093 m
= _simd_load1_ps(pMatrix
+ 2*4 + 2); // m[row][2]
1094 r1
= _simd_mul_ps(m
, v
[2]); // (m2 * v.z)
1095 r0
= _simd_add_ps(r0
, r1
); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
1096 m
= _simd_load1_ps(pMatrix
+ 2*4 + 3); // m[row][3]
1097 r0
= _simd_add_ps(r0
, m
); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
1099 result
[3] = _simd_set1_ps(1.0f
);
1102 //////////////////////////////////////////////////////////////////////////
1103 /// @brief Compute plane equation vA * vX + vB * vY + vC
1104 INLINE simdscalar
vplaneps(simdscalar vA
, simdscalar vB
, simdscalar vC
, simdscalar
&vX
, simdscalar
&vY
)
1106 simdscalar vOut
= _simd_fmadd_ps(vA
, vX
, vC
);
1107 vOut
= _simd_fmadd_ps(vB
, vY
, vOut
);
1111 //////////////////////////////////////////////////////////////////////////
1112 /// @brief Compute plane equation vA * vX + vB * vY + vC
1113 INLINE __m128
vplaneps128(__m128 vA
, __m128 vB
, __m128 vC
, __m128
&vX
, __m128
&vY
)
1115 __m128 vOut
= _simd128_fmadd_ps(vA
, vX
, vC
);
1116 vOut
= _simd128_fmadd_ps(vB
, vY
, vOut
);
1120 //////////////////////////////////////////////////////////////////////////
1121 /// @brief Interpolates a single component.
1122 /// @param vI - barycentric I
1123 /// @param vJ - barycentric J
1124 /// @param pInterpBuffer - pointer to attribute barycentric coeffs
1125 template<UINT Attrib
, UINT Comp
, UINT numComponents
= 4>
1126 static INLINE simdscalar
InterpolateComponent(simdscalar vI
, simdscalar vJ
, const float *pInterpBuffer
)
1128 const float *pInterpA
= &pInterpBuffer
[Attrib
* 3 * numComponents
+ 0 + Comp
];
1129 const float *pInterpB
= &pInterpBuffer
[Attrib
* 3 * numComponents
+ numComponents
+ Comp
];
1130 const float *pInterpC
= &pInterpBuffer
[Attrib
* 3 * numComponents
+ numComponents
* 2 + Comp
];
1132 simdscalar vA
= _simd_broadcast_ss(pInterpA
);
1133 simdscalar vB
= _simd_broadcast_ss(pInterpB
);
1134 simdscalar vC
= _simd_broadcast_ss(pInterpC
);
1136 simdscalar vk
= _simd_sub_ps(_simd_sub_ps(_simd_set1_ps(1.0f
), vI
), vJ
);
1137 vC
= _simd_mul_ps(vk
, vC
);
1139 return vplaneps(vA
, vB
, vC
, vI
, vJ
);
1142 //////////////////////////////////////////////////////////////////////////
1143 /// @brief Interpolates a single component (flat shade).
1144 /// @param pInterpBuffer - pointer to attribute barycentric coeffs
1145 template<UINT Attrib
, UINT Comp
, UINT numComponents
= 4>
1146 static INLINE simdscalar
InterpolateComponentFlat(const float *pInterpBuffer
)
1148 const float *pInterpA
= &pInterpBuffer
[Attrib
* 3 * numComponents
+ 0 + Comp
];
1150 simdscalar vA
= _simd_broadcast_ss(pInterpA
);
1155 //////////////////////////////////////////////////////////////////////////
1156 /// @brief Interpolates a single component.
1157 /// @param vI - barycentric I
1158 /// @param vJ - barycentric J
1159 /// @param pInterpBuffer - pointer to attribute barycentric coeffs
1160 template<UINT Attrib
, UINT Comp
, UINT numComponents
= 4>
1161 static INLINE __m128
InterpolateComponent(__m128 vI
, __m128 vJ
, const float *pInterpBuffer
)
1163 const float *pInterpA
= &pInterpBuffer
[Attrib
* 3 * numComponents
+ 0 + Comp
];
1164 const float *pInterpB
= &pInterpBuffer
[Attrib
* 3 * numComponents
+ numComponents
+ Comp
];
1165 const float *pInterpC
= &pInterpBuffer
[Attrib
* 3 * numComponents
+ numComponents
* 2 + Comp
];
1167 __m128 vA
= _mm_broadcast_ss(pInterpA
);
1168 __m128 vB
= _mm_broadcast_ss(pInterpB
);
1169 __m128 vC
= _mm_broadcast_ss(pInterpC
);
1171 __m128 vk
= _mm_sub_ps(_mm_sub_ps(_mm_set1_ps(1.0f
), vI
), vJ
);
1172 vC
= _mm_mul_ps(vk
, vC
);
1174 return vplaneps128(vA
, vB
, vC
, vI
, vJ
);
1177 static INLINE __m128
_simd128_abs_ps(__m128 a
)
1179 __m128i ai
= _mm_castps_si128(a
);
1180 return _mm_castsi128_ps(_mm_and_si128(ai
, _mm_set1_epi32(0x7fffffff)));
1183 static INLINE simdscalar
_simd_abs_ps(simdscalar a
)
1185 simdscalari ai
= _simd_castps_si(a
);
1186 return _simd_castsi_ps(_simd_and_si(ai
, _simd_set1_epi32(0x7fffffff)));
1190 UINT
pdep_u32(UINT a
, UINT mask
)
1192 #if KNOB_ARCH >= KNOB_ARCH_AVX2
1193 return _pdep_u32(a
, mask
);
1197 // copied from http://wm.ite.pl/articles/pdep-soft-emu.html
1198 // using bsf instead of funky loop
1200 while (_BitScanForward(&maskIndex
, mask
))
1202 // 1. isolate lowest set bit of mask
1203 const UINT lowest
= 1 << maskIndex
;
1205 // 2. populate LSB from src
1206 const UINT LSB
= (UINT
)((int)(a
<< 31) >> 31);
1208 // 3. copy bit from mask
1209 result
|= LSB
& lowest
;
1211 // 4. clear lowest bit
1214 // 5. prepare for next iteration
1223 UINT
pext_u32(UINT a
, UINT mask
)
1225 #if KNOB_ARCH >= KNOB_ARCH_AVX2
1226 return _pext_u32(a
, mask
);
1230 uint32_t currentBit
= 0;
1231 while (_BitScanForward(&maskIndex
, mask
))
1233 // 1. isolate lowest set bit of mask
1234 const UINT lowest
= 1 << maskIndex
;
1236 // 2. copy bit from mask
1237 result
|= ((a
& lowest
) > 0) << currentBit
++;
1239 // 3. clear lowest bit
1246 #if ENABLE_AVX512_SIMD16
1247 #include "simd16intrin.h"
1248 #endif//ENABLE_AVX512_SIMD16
1250 #endif//__SWR_SIMDINTRIN_H__