/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/

#ifndef __SWR_SIMDINTRIN_H__
#define __SWR_SIMDINTRIN_H__

#include "os.h"

#include <cassert>

#include <emmintrin.h>
#include <immintrin.h>
#include <xmmintrin.h>

#if KNOB_SIMD_WIDTH == 8
typedef __m256 simdscalar;
typedef __m256i simdscalari;
typedef uint8_t simdmask;
#else
#error Unsupported vector width
#endif

// simd vector
OSALIGNSIMD(union) simdvector
{
    simdscalar v[4];
    struct
    {
        simdscalar x, y, z, w;
    };

    simdscalar& operator[] (const int i) { return v[i]; }
    const simdscalar& operator[] (const int i) const { return v[i]; }
};

#if KNOB_SIMD_WIDTH == 8
#define _simd128_maskstore_ps _mm_maskstore_ps
#define _simd_load_ps _mm256_load_ps
#define _simd_load1_ps _mm256_broadcast_ss
#define _simd_loadu_ps _mm256_loadu_ps
#define _simd_setzero_ps _mm256_setzero_ps
#define _simd_set1_ps _mm256_set1_ps
#define _simd_blend_ps _mm256_blend_ps
#define _simd_blendv_ps _mm256_blendv_ps
#define _simd_store_ps _mm256_store_ps
#define _simd_mul_ps _mm256_mul_ps
#define _simd_add_ps _mm256_add_ps
#define _simd_sub_ps _mm256_sub_ps
#define _simd_rsqrt_ps _mm256_rsqrt_ps
#define _simd_min_ps _mm256_min_ps
#define _simd_max_ps _mm256_max_ps
#define _simd_movemask_ps _mm256_movemask_ps
#define _simd_cvtps_epi32 _mm256_cvtps_epi32
#define _simd_cvttps_epi32 _mm256_cvttps_epi32
#define _simd_cvtepi32_ps _mm256_cvtepi32_ps
#define _simd_cmplt_ps(a, b) _mm256_cmp_ps(a, b, _CMP_LT_OQ)
#define _simd_cmpgt_ps(a, b) _mm256_cmp_ps(a, b, _CMP_GT_OQ)
#define _simd_cmpneq_ps(a, b) _mm256_cmp_ps(a, b, _CMP_NEQ_OQ)
#define _simd_cmpeq_ps(a, b) _mm256_cmp_ps(a, b, _CMP_EQ_OQ)
#define _simd_cmpge_ps(a, b) _mm256_cmp_ps(a, b, _CMP_GE_OQ)
#define _simd_cmple_ps(a, b) _mm256_cmp_ps(a, b, _CMP_LE_OQ)
#define _simd_cmp_ps(a, b, imm) _mm256_cmp_ps(a, b, imm)
#define _simd_and_ps _mm256_and_ps
#define _simd_or_ps _mm256_or_ps

#define _simd_rcp_ps _mm256_rcp_ps
#define _simd_div_ps _mm256_div_ps
#define _simd_castsi_ps _mm256_castsi256_ps
#define _simd_andnot_ps _mm256_andnot_ps
#define _simd_round_ps _mm256_round_ps
#define _simd_castpd_ps _mm256_castpd_ps
#define _simd_broadcast_ps(a) _mm256_broadcast_ps((const __m128*)(a))

#define _simd_load_sd _mm256_load_sd
#define _simd_movemask_pd _mm256_movemask_pd
#define _simd_castsi_pd _mm256_castsi256_pd

// emulated integer simd
#define SIMD_EMU_EPI(func, intrin) \
INLINE \
__m256i func(__m256i a, __m256i b)\
{\
    __m128i aHi = _mm256_extractf128_si256(a, 1);\
    __m128i bHi = _mm256_extractf128_si256(b, 1);\
    __m128i aLo = _mm256_castsi256_si128(a);\
    __m128i bLo = _mm256_castsi256_si128(b);\
\
    __m128i subLo = intrin(aLo, bLo);\
    __m128i subHi = intrin(aHi, bHi);\
\
    __m256i result = _mm256_castsi128_si256(subLo);\
    result = _mm256_insertf128_si256(result, subHi, 1);\
\
    return result;\
}

#if (KNOB_ARCH == KNOB_ARCH_AVX)
INLINE
__m256 _simdemu_permute_ps(__m256 a, __m256i b)
{
    __m128 aHi = _mm256_extractf128_ps(a, 1);
    __m128i bHi = _mm256_extractf128_si256(b, 1);
    __m128 aLo = _mm256_castps256_ps128(a);
    __m128i bLo = _mm256_castsi256_si128(b);

    __m128i indexHi = _mm_cmpgt_epi32(bLo, _mm_set1_epi32(3));
    __m128 resLow = _mm_permutevar_ps(aLo, _mm_and_si128(bLo, _mm_set1_epi32(0x3)));
    __m128 resHi = _mm_permutevar_ps(aHi, _mm_and_si128(bLo, _mm_set1_epi32(0x3)));
    __m128 blendLowRes = _mm_blendv_ps(resLow, resHi, _mm_castsi128_ps(indexHi));

    indexHi = _mm_cmpgt_epi32(bHi, _mm_set1_epi32(3));
    resLow = _mm_permutevar_ps(aLo, _mm_and_si128(bHi, _mm_set1_epi32(0x3)));
    resHi = _mm_permutevar_ps(aHi, _mm_and_si128(bHi, _mm_set1_epi32(0x3)));
    __m128 blendHiRes = _mm_blendv_ps(resLow, resHi, _mm_castsi128_ps(indexHi));

    __m256 result = _mm256_castps128_ps256(blendLowRes);
    result = _mm256_insertf128_ps(result, blendHiRes, 1);

    return result;
}

#define _simd_mul_epi32 _simdemu_mul_epi32
#define _simd_mullo_epi32 _simdemu_mullo_epi32
#define _simd_sub_epi32 _simdemu_sub_epi32
#define _simd_sub_epi64 _simdemu_sub_epi64
#define _simd_min_epi32 _simdemu_min_epi32
#define _simd_min_epu32 _simdemu_min_epu32
#define _simd_max_epi32 _simdemu_max_epi32
#define _simd_max_epu32 _simdemu_max_epu32
#define _simd_add_epi32 _simdemu_add_epi32
#define _simd_and_si _simdemu_and_si
#define _simd_andnot_si _simdemu_andnot_si
#define _simd_cmpeq_epi32 _simdemu_cmpeq_epi32
#define _simd_cmplt_epi32 _simdemu_cmplt_epi32
#define _simd_cmpgt_epi32 _simdemu_cmpgt_epi32
#define _simd_or_si _simdemu_or_si
#define _simd_castps_si _mm256_castps_si256
#define _simd_adds_epu8 _simdemu_adds_epu8
#define _simd_subs_epu8 _simdemu_subs_epu8
#define _simd_add_epi8 _simdemu_add_epi8
#define _simd_cmpeq_epi64 _simdemu_cmpeq_epi64
#define _simd_cmpgt_epi64 _simdemu_cmpgt_epi64
#define _simd_cmpgt_epi8 _simdemu_cmpgt_epi8
#define _simd_cmpeq_epi8 _simdemu_cmpeq_epi8
#define _simd_cmpgt_epi16 _simdemu_cmpgt_epi16
#define _simd_cmpeq_epi16 _simdemu_cmpeq_epi16
#define _simd_movemask_epi8 _simdemu_movemask_epi8
#define _simd_permute_ps _simdemu_permute_ps

SIMD_EMU_EPI(_simdemu_mul_epi32, _mm_mul_epi32)
SIMD_EMU_EPI(_simdemu_mullo_epi32, _mm_mullo_epi32)
SIMD_EMU_EPI(_simdemu_sub_epi32, _mm_sub_epi32)
SIMD_EMU_EPI(_simdemu_sub_epi64, _mm_sub_epi64)
SIMD_EMU_EPI(_simdemu_min_epi32, _mm_min_epi32)
SIMD_EMU_EPI(_simdemu_min_epu32, _mm_min_epu32)
SIMD_EMU_EPI(_simdemu_max_epi32, _mm_max_epi32)
SIMD_EMU_EPI(_simdemu_max_epu32, _mm_max_epu32)
SIMD_EMU_EPI(_simdemu_add_epi32, _mm_add_epi32)
SIMD_EMU_EPI(_simdemu_and_si, _mm_and_si128)
SIMD_EMU_EPI(_simdemu_andnot_si, _mm_andnot_si128)
SIMD_EMU_EPI(_simdemu_cmpeq_epi32, _mm_cmpeq_epi32)
SIMD_EMU_EPI(_simdemu_cmplt_epi32, _mm_cmplt_epi32)
SIMD_EMU_EPI(_simdemu_cmpgt_epi32, _mm_cmpgt_epi32)
SIMD_EMU_EPI(_simdemu_or_si, _mm_or_si128)
SIMD_EMU_EPI(_simdemu_adds_epu8, _mm_adds_epu8)
SIMD_EMU_EPI(_simdemu_subs_epu8, _mm_subs_epu8)
SIMD_EMU_EPI(_simdemu_add_epi8, _mm_add_epi8)
SIMD_EMU_EPI(_simdemu_cmpeq_epi64, _mm_cmpeq_epi64)
SIMD_EMU_EPI(_simdemu_cmpgt_epi64, _mm_cmpgt_epi64)
SIMD_EMU_EPI(_simdemu_cmpgt_epi8, _mm_cmpgt_epi8)
SIMD_EMU_EPI(_simdemu_cmpeq_epi8, _mm_cmpeq_epi8)
SIMD_EMU_EPI(_simdemu_cmpgt_epi16, _mm_cmpgt_epi16)
SIMD_EMU_EPI(_simdemu_cmpeq_epi16, _mm_cmpeq_epi16)

#define _simd_unpacklo_epi32(a, b) _mm256_castps_si256(_mm256_unpacklo_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)))
#define _simd_unpackhi_epi32(a, b) _mm256_castps_si256(_mm256_unpackhi_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)))

#define _simd_slli_epi32(a,i) _simdemu_slli_epi32(a,i)
#define _simd_srai_epi32(a,i) _simdemu_srai_epi32(a,i)
#define _simd_srli_epi32(a,i) _simdemu_srli_epi32(a,i)
#define _simd_srlisi_ps(a,i) _mm256_castsi256_ps(_simdemu_srli_si128<i>(_mm256_castps_si256(a)))

#define _simd128_fmadd_ps _mm_fmaddemu_ps
#define _simd_fmadd_ps _mm_fmaddemu256_ps
#define _simd_fmsub_ps _mm_fmsubemu256_ps
#define _simd_shuffle_epi8 _simdemu_shuffle_epi8
SIMD_EMU_EPI(_simdemu_shuffle_epi8, _mm_shuffle_epi8)

INLINE
__m128 _mm_fmaddemu_ps(__m128 a, __m128 b, __m128 c)
{
    __m128 res = _mm_mul_ps(a, b);
    res = _mm_add_ps(res, c);
    return res;
}

INLINE
__m256 _mm_fmaddemu256_ps(__m256 a, __m256 b, __m256 c)
{
    __m256 res = _mm256_mul_ps(a, b);
    res = _mm256_add_ps(res, c);
    return res;
}

INLINE
__m256 _mm_fmsubemu256_ps(__m256 a, __m256 b, __m256 c)
{
    __m256 res = _mm256_mul_ps(a, b);
    res = _mm256_sub_ps(res, c);
    return res;
}

INLINE
__m256 _simd_i32gather_ps(const float* pBase, __m256i vOffsets, const int scale)
{
    uint32_t *pOffsets = (uint32_t*)&vOffsets;
    simdscalar vResult;
    float* pResult = (float*)&vResult;
    for (uint32_t i = 0; i < KNOB_SIMD_WIDTH; ++i)
    {
        uint32_t offset = pOffsets[i];
        offset = offset * scale;
        pResult[i] = *(float*)(((const uint8_t*)pBase + offset));
    }

    return vResult;
}

INLINE
__m256 _simd_mask_i32gather_ps(__m256 vSrc, const float* pBase, __m256i vOffsets, __m256 vMask, const int scale)
{
    uint32_t *pOffsets = (uint32_t*)&vOffsets;
    simdscalar vResult = vSrc;
    float* pResult = (float*)&vResult;
    DWORD index;
    uint32_t mask = _simd_movemask_ps(vMask);
    while (_BitScanForward(&index, mask))
    {
        mask &= ~(1 << index);
        uint32_t offset = pOffsets[index];
        offset = offset * scale;
        pResult[index] = *(float*)(((const uint8_t*)pBase + offset));
    }

    return vResult;
}

INLINE
__m256i _simd_abs_epi32(__m256i a)
{
    __m128i aHi = _mm256_extractf128_si256(a, 1);
    __m128i aLo = _mm256_castsi256_si128(a);
    __m128i absLo = _mm_abs_epi32(aLo);
    __m128i absHi = _mm_abs_epi32(aHi);
    __m256i result = _mm256_castsi128_si256(absLo);
    result = _mm256_insertf128_si256(result, absHi, 1);
    return result;
}

INLINE
int _simdemu_movemask_epi8(__m256i a)
{
    __m128i aHi = _mm256_extractf128_si256(a, 1);
    __m128i aLo = _mm256_castsi256_si128(a);

    int resHi = _mm_movemask_epi8(aHi);
    int resLo = _mm_movemask_epi8(aLo);

    return (resHi << 16) | resLo;
}
#else

#define _simd_mul_epi32 _mm256_mul_epi32
#define _simd_mullo_epi32 _mm256_mullo_epi32
#define _simd_sub_epi32 _mm256_sub_epi32
#define _simd_sub_epi64 _mm256_sub_epi64
#define _simd_min_epi32 _mm256_min_epi32
#define _simd_max_epi32 _mm256_max_epi32
#define _simd_min_epu32 _mm256_min_epu32
#define _simd_max_epu32 _mm256_max_epu32
#define _simd_add_epi32 _mm256_add_epi32
#define _simd_and_si _mm256_and_si256
#define _simd_andnot_si _mm256_andnot_si256
#define _simd_cmpeq_epi32 _mm256_cmpeq_epi32
#define _simd_cmplt_epi32(a,b) _mm256_cmpgt_epi32(b,a)
#define _simd_cmpgt_epi32(a,b) _mm256_cmpgt_epi32(a,b)
#define _simd_or_si _mm256_or_si256
#define _simd_castps_si _mm256_castps_si256

#define _simd_unpacklo_epi32 _mm256_unpacklo_epi32
#define _simd_unpackhi_epi32 _mm256_unpackhi_epi32

#define _simd_srli_si(a,i) _simdemu_srli_si128<i>(a)
#define _simd_slli_epi32 _mm256_slli_epi32
#define _simd_srai_epi32 _mm256_srai_epi32
#define _simd_srli_epi32 _mm256_srli_epi32
#define _simd_srlisi_ps(a,i) _mm256_castsi256_ps(_simdemu_srli_si128<i>(_mm256_castps_si256(a)))
#define _simd128_fmadd_ps _mm_fmadd_ps
#define _simd_fmadd_ps _mm256_fmadd_ps
#define _simd_fmsub_ps _mm256_fmsub_ps
#define _simd_shuffle_epi8 _mm256_shuffle_epi8
#define _simd_adds_epu8 _mm256_adds_epu8
#define _simd_subs_epu8 _mm256_subs_epu8
#define _simd_add_epi8 _mm256_add_epi8
#define _simd_i32gather_ps _mm256_i32gather_ps
#define _simd_mask_i32gather_ps _mm256_mask_i32gather_ps
#define _simd_abs_epi32 _mm256_abs_epi32

#define _simd_cmpeq_epi64 _mm256_cmpeq_epi64
#define _simd_cmpgt_epi64 _mm256_cmpgt_epi64
#define _simd_cmpgt_epi8 _mm256_cmpgt_epi8
#define _simd_cmpeq_epi8 _mm256_cmpeq_epi8
#define _simd_cmpgt_epi16 _mm256_cmpgt_epi16
#define _simd_cmpeq_epi16 _mm256_cmpeq_epi16
#define _simd_movemask_epi8 _mm256_movemask_epi8
#define _simd_permute_ps _mm256_permutevar8x32_ps
#endif
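
// Illustrative usage sketch (hypothetical helper, not part of the SWR API):
// gather one float per lane from a lookup table. With a scale of 4 bytes per
// element the offsets act as plain array indices, so lane i of the result is
// pTable[lane i of vIndices] on both the AVX emulation path and AVX2.
INLINE simdscalar GatherTableExample(const float* pTable, simdscalari vIndices)
{
    return _simd_i32gather_ps(pTable, vIndices, 4);
}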

#define _simd_shuffleps_epi32(vA, vB, imm) _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(vA), _mm256_castsi256_ps(vB), imm))
#define _simd_shuffle_ps _mm256_shuffle_ps
#define _simd_set1_epi32 _mm256_set1_epi32
#define _simd_set1_epi8 _mm256_set1_epi8
#define _simd_setzero_si _mm256_setzero_si256
#define _simd_cvttps_epi32 _mm256_cvttps_epi32
#define _simd_store_si _mm256_store_si256
#define _simd_broadcast_ss _mm256_broadcast_ss
#define _simd_maskstore_ps _mm256_maskstore_ps
#define _simd_load_si _mm256_load_si256
#define _simd_loadu_si _mm256_loadu_si256
#define _simd_sub_ps _mm256_sub_ps
#define _simd_testz_ps _mm256_testz_ps
#define _simd_xor_ps _mm256_xor_ps


INLINE
simdscalari _simd_blendv_epi32(simdscalari a, simdscalari b, simdscalar mask)
{
    return _simd_castps_si(_simd_blendv_ps(_simd_castsi_ps(a), _simd_castsi_ps(b), mask));
}

// convert bitmask to vector mask
INLINE
simdscalar vMask(int32_t mask)
{
    __m256i vec = _mm256_set1_epi32(mask);
    const __m256i bit = _mm256_set_epi32(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01);
    vec = _simd_and_si(vec, bit);
    vec = _simd_cmplt_epi32(_mm256_setzero_si256(), vec);
    return _simd_castsi_ps(vec);
}
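
// Illustrative usage sketch (hypothetical, not part of the SWR API): expand an
// 8-bit lane mask with vMask() and use it to select between two registers.
INLINE simdscalar SelectByLaneMaskExample(int32_t laneMask, simdscalar a, simdscalar b)
{
    simdscalar vm = vMask(laneMask);    // bit i of laneMask -> all ones in lane i
    return _simd_blendv_ps(b, a, vm);   // lane i = (laneMask & (1 << i)) ? a : b
}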

INLINE
void _simd_mov(simdscalar &r, unsigned int rlane, simdscalar& s, unsigned int slane)
{
    OSALIGNSIMD(float) rArray[KNOB_SIMD_WIDTH], sArray[KNOB_SIMD_WIDTH];
    _mm256_store_ps(rArray, r);
    _mm256_store_ps(sArray, s);
    rArray[rlane] = sArray[slane];
    r = _mm256_load_ps(rArray);
}

INLINE __m256i _simdemu_slli_epi32(__m256i a, uint32_t i)
{
    __m128i aHi = _mm256_extractf128_si256(a, 1);
    __m128i aLo = _mm256_castsi256_si128(a);

    __m128i resHi = _mm_slli_epi32(aHi, i);
    __m128i resLo = _mm_slli_epi32(aLo, i);

    __m256i result = _mm256_castsi128_si256(resLo);
    result = _mm256_insertf128_si256(result, resHi, 1);

    return result;
}

INLINE __m256i _simdemu_srai_epi32(__m256i a, uint32_t i)
{
    __m128i aHi = _mm256_extractf128_si256(a, 1);
    __m128i aLo = _mm256_castsi256_si128(a);

    __m128i resHi = _mm_srai_epi32(aHi, i);
    __m128i resLo = _mm_srai_epi32(aLo, i);

    __m256i result = _mm256_castsi128_si256(resLo);
    result = _mm256_insertf128_si256(result, resHi, 1);

    return result;
}

INLINE __m256i _simdemu_srli_epi32(__m256i a, uint32_t i)
{
    __m128i aHi = _mm256_extractf128_si256(a, 1);
    __m128i aLo = _mm256_castsi256_si128(a);

    __m128i resHi = _mm_srli_epi32(aHi, i);
    __m128i resLo = _mm_srli_epi32(aLo, i);

    __m256i result = _mm256_castsi128_si256(resLo);
    result = _mm256_insertf128_si256(result, resHi, 1);

    return result;
}

INLINE
void _simdvec_transpose(simdvector &v)
{
    SWR_ASSERT(false, "Need to implement 8 wide version");
}

#else
#error Unsupported vector width
#endif

// Populates a simdvector from a vector. So p = xyzw becomes xxxx yyyy zzzz wwww.
INLINE
void _simdvec_load_ps(simdvector& r, const float *p)
{
    r[0] = _simd_set1_ps(p[0]);
    r[1] = _simd_set1_ps(p[1]);
    r[2] = _simd_set1_ps(p[2]);
    r[3] = _simd_set1_ps(p[3]);
}
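
// Illustrative usage sketch (hypothetical, not part of the SWR API): broadcast
// a single RGBA color into a simdvector, so each component register holds one
// channel replicated across all KNOB_SIMD_WIDTH lanes.
INLINE void BroadcastColorExample(simdvector& vColor, const float rgba[4])
{
    _simdvec_load_ps(vColor, rgba);   // vColor.x = rrrr..., vColor.y = gggg..., etc.
}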

INLINE
void _simdvec_mov(simdvector& r, const simdscalar& s)
{
    r[0] = s;
    r[1] = s;
    r[2] = s;
    r[3] = s;
}

INLINE
void _simdvec_mov(simdvector& r, const simdvector& v)
{
    r[0] = v[0];
    r[1] = v[1];
    r[2] = v[2];
    r[3] = v[3];
}

// just move a lane from the source simdvector to dest simdvector
INLINE
void _simdvec_mov(simdvector &r, unsigned int rlane, simdvector& s, unsigned int slane)
{
    _simd_mov(r[0], rlane, s[0], slane);
    _simd_mov(r[1], rlane, s[1], slane);
    _simd_mov(r[2], rlane, s[2], slane);
    _simd_mov(r[3], rlane, s[3], slane);
}

INLINE
void _simdvec_dp3_ps(simdscalar& r, const simdvector& v0, const simdvector& v1)
{
    simdscalar tmp;
    r = _simd_mul_ps(v0[0], v1[0]);    // (v0.x*v1.x)

    tmp = _simd_mul_ps(v0[1], v1[1]);  // (v0.y*v1.y)
    r = _simd_add_ps(r, tmp);          // (v0.x*v1.x) + (v0.y*v1.y)

    tmp = _simd_mul_ps(v0[2], v1[2]);  // (v0.z*v1.z)
    r = _simd_add_ps(r, tmp);          // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z)
}

INLINE
void _simdvec_dp4_ps(simdscalar& r, const simdvector& v0, const simdvector& v1)
{
    simdscalar tmp;
    r = _simd_mul_ps(v0[0], v1[0]);    // (v0.x*v1.x)

    tmp = _simd_mul_ps(v0[1], v1[1]);  // (v0.y*v1.y)
    r = _simd_add_ps(r, tmp);          // (v0.x*v1.x) + (v0.y*v1.y)

    tmp = _simd_mul_ps(v0[2], v1[2]);  // (v0.z*v1.z)
    r = _simd_add_ps(r, tmp);          // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z)

    tmp = _simd_mul_ps(v0[3], v1[3]);  // (v0.w*v1.w)
    r = _simd_add_ps(r, tmp);          // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z) + (v0.w*v1.w)
}

INLINE
simdscalar _simdvec_rcp_length_ps(const simdvector& v)
{
    simdscalar length;
    _simdvec_dp4_ps(length, v, v);
    return _simd_rsqrt_ps(length);
}

INLINE
void _simdvec_normalize_ps(simdvector& r, const simdvector& v)
{
    simdscalar vecLength;
    vecLength = _simdvec_rcp_length_ps(v);

    r[0] = _simd_mul_ps(v[0], vecLength);
    r[1] = _simd_mul_ps(v[1], vecLength);
    r[2] = _simd_mul_ps(v[2], vecLength);
    r[3] = _simd_mul_ps(v[3], vecLength);
}

INLINE
void _simdvec_mul_ps(simdvector& r, const simdvector& v, const simdscalar& s)
{
    r[0] = _simd_mul_ps(v[0], s);
    r[1] = _simd_mul_ps(v[1], s);
    r[2] = _simd_mul_ps(v[2], s);
    r[3] = _simd_mul_ps(v[3], s);
}

INLINE
void _simdvec_mul_ps(simdvector& r, const simdvector& v0, const simdvector& v1)
{
    r[0] = _simd_mul_ps(v0[0], v1[0]);
    r[1] = _simd_mul_ps(v0[1], v1[1]);
    r[2] = _simd_mul_ps(v0[2], v1[2]);
    r[3] = _simd_mul_ps(v0[3], v1[3]);
}

INLINE
void _simdvec_add_ps(simdvector& r, const simdvector& v0, const simdvector& v1)
{
    r[0] = _simd_add_ps(v0[0], v1[0]);
    r[1] = _simd_add_ps(v0[1], v1[1]);
    r[2] = _simd_add_ps(v0[2], v1[2]);
    r[3] = _simd_add_ps(v0[3], v1[3]);
}

INLINE
void _simdvec_min_ps(simdvector& r, const simdvector& v0, const simdscalar& s)
{
    r[0] = _simd_min_ps(v0[0], s);
    r[1] = _simd_min_ps(v0[1], s);
    r[2] = _simd_min_ps(v0[2], s);
    r[3] = _simd_min_ps(v0[3], s);
}

INLINE
void _simdvec_max_ps(simdvector& r, const simdvector& v0, const simdscalar& s)
{
    r[0] = _simd_max_ps(v0[0], s);
    r[1] = _simd_max_ps(v0[1], s);
    r[2] = _simd_max_ps(v0[2], s);
    r[3] = _simd_max_ps(v0[3], s);
}

// Matrix4x4 * Vector4
// outVec.x = (m00 * v.x) + (m01 * v.y) + (m02 * v.z) + (m03 * v.w)
// outVec.y = (m10 * v.x) + (m11 * v.y) + (m12 * v.z) + (m13 * v.w)
// outVec.z = (m20 * v.x) + (m21 * v.y) + (m22 * v.z) + (m23 * v.w)
// outVec.w = (m30 * v.x) + (m31 * v.y) + (m32 * v.z) + (m33 * v.w)
INLINE
void _simd_mat4x4_vec4_multiply(
    simdvector& result,
    const float *pMatrix,
    const simdvector& v)
{
    simdscalar m;
    simdscalar r0;
    simdscalar r1;

    m = _simd_load1_ps(pMatrix + 0*4 + 0);  // m[row][0]
    r0 = _simd_mul_ps(m, v[0]);             // (m0 * v.x)
    m = _simd_load1_ps(pMatrix + 0*4 + 1);  // m[row][1]
    r1 = _simd_mul_ps(m, v[1]);             // (m1 * v.y)
    r0 = _simd_add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y)
    m = _simd_load1_ps(pMatrix + 0*4 + 2);  // m[row][2]
    r1 = _simd_mul_ps(m, v[2]);             // (m2 * v.z)
    r0 = _simd_add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
    m = _simd_load1_ps(pMatrix + 0*4 + 3);  // m[row][3]
    r1 = _simd_mul_ps(m, v[3]);             // (m3 * v.w)
    r0 = _simd_add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * v.w)
    result[0] = r0;

    m = _simd_load1_ps(pMatrix + 1*4 + 0);  // m[row][0]
    r0 = _simd_mul_ps(m, v[0]);             // (m0 * v.x)
    m = _simd_load1_ps(pMatrix + 1*4 + 1);  // m[row][1]
    r1 = _simd_mul_ps(m, v[1]);             // (m1 * v.y)
    r0 = _simd_add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y)
    m = _simd_load1_ps(pMatrix + 1*4 + 2);  // m[row][2]
    r1 = _simd_mul_ps(m, v[2]);             // (m2 * v.z)
    r0 = _simd_add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
    m = _simd_load1_ps(pMatrix + 1*4 + 3);  // m[row][3]
    r1 = _simd_mul_ps(m, v[3]);             // (m3 * v.w)
    r0 = _simd_add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * v.w)
    result[1] = r0;

    m = _simd_load1_ps(pMatrix + 2*4 + 0);  // m[row][0]
    r0 = _simd_mul_ps(m, v[0]);             // (m0 * v.x)
    m = _simd_load1_ps(pMatrix + 2*4 + 1);  // m[row][1]
    r1 = _simd_mul_ps(m, v[1]);             // (m1 * v.y)
    r0 = _simd_add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y)
    m = _simd_load1_ps(pMatrix + 2*4 + 2);  // m[row][2]
    r1 = _simd_mul_ps(m, v[2]);             // (m2 * v.z)
    r0 = _simd_add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
    m = _simd_load1_ps(pMatrix + 2*4 + 3);  // m[row][3]
    r1 = _simd_mul_ps(m, v[3]);             // (m3 * v.w)
    r0 = _simd_add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * v.w)
    result[2] = r0;

    m = _simd_load1_ps(pMatrix + 3*4 + 0);  // m[row][0]
    r0 = _simd_mul_ps(m, v[0]);             // (m0 * v.x)
    m = _simd_load1_ps(pMatrix + 3*4 + 1);  // m[row][1]
    r1 = _simd_mul_ps(m, v[1]);             // (m1 * v.y)
    r0 = _simd_add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y)
    m = _simd_load1_ps(pMatrix + 3*4 + 2);  // m[row][2]
    r1 = _simd_mul_ps(m, v[2]);             // (m2 * v.z)
    r0 = _simd_add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
    m = _simd_load1_ps(pMatrix + 3*4 + 3);  // m[row][3]
    r1 = _simd_mul_ps(m, v[3]);             // (m3 * v.w)
    r0 = _simd_add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * v.w)
    result[3] = r0;
}
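
// Illustrative usage sketch (hypothetical, not part of the SWR API): transform
// 8 positions stored SoA (separate x/y/z arrays) by a row-major 4x4 matrix
// held as 16 contiguous floats, producing homogeneous SoA output.
INLINE void TransformPositionsExample(simdvector& result, const float* pMatrix,
                                      const float* pX, const float* pY, const float* pZ)
{
    simdvector pos;
    pos[0] = _simd_loadu_ps(pX);    // 8 x coordinates
    pos[1] = _simd_loadu_ps(pY);    // 8 y coordinates
    pos[2] = _simd_loadu_ps(pZ);    // 8 z coordinates
    pos[3] = _simd_set1_ps(1.0f);   // w = 1 for positions
    _simd_mat4x4_vec4_multiply(result, pMatrix, pos);
}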

// Matrix4x4 * Vector3 - Direction Vector where w = 0. Only the upper-left 3x3
// of the matrix is applied.
// outVec.x = (m00 * v.x) + (m01 * v.y) + (m02 * v.z) + (m03 * 0)
// outVec.y = (m10 * v.x) + (m11 * v.y) + (m12 * v.z) + (m13 * 0)
// outVec.z = (m20 * v.x) + (m21 * v.y) + (m22 * v.z) + (m23 * 0)
// outVec.w = 0
INLINE
void _simd_mat3x3_vec3_w0_multiply(
    simdvector& result,
    const float *pMatrix,
    const simdvector& v)
{
    simdscalar m;
    simdscalar r0;
    simdscalar r1;

    m = _simd_load1_ps(pMatrix + 0*4 + 0);  // m[row][0]
    r0 = _simd_mul_ps(m, v[0]);             // (m0 * v.x)
    m = _simd_load1_ps(pMatrix + 0*4 + 1);  // m[row][1]
    r1 = _simd_mul_ps(m, v[1]);             // (m1 * v.y)
    r0 = _simd_add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y)
    m = _simd_load1_ps(pMatrix + 0*4 + 2);  // m[row][2]
    r1 = _simd_mul_ps(m, v[2]);             // (m2 * v.z)
    r0 = _simd_add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
    result[0] = r0;

    m = _simd_load1_ps(pMatrix + 1*4 + 0);  // m[row][0]
    r0 = _simd_mul_ps(m, v[0]);             // (m0 * v.x)
    m = _simd_load1_ps(pMatrix + 1*4 + 1);  // m[row][1]
    r1 = _simd_mul_ps(m, v[1]);             // (m1 * v.y)
    r0 = _simd_add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y)
    m = _simd_load1_ps(pMatrix + 1*4 + 2);  // m[row][2]
    r1 = _simd_mul_ps(m, v[2]);             // (m2 * v.z)
    r0 = _simd_add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
    result[1] = r0;

    m = _simd_load1_ps(pMatrix + 2*4 + 0);  // m[row][0]
    r0 = _simd_mul_ps(m, v[0]);             // (m0 * v.x)
    m = _simd_load1_ps(pMatrix + 2*4 + 1);  // m[row][1]
    r1 = _simd_mul_ps(m, v[1]);             // (m1 * v.y)
    r0 = _simd_add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y)
    m = _simd_load1_ps(pMatrix + 2*4 + 2);  // m[row][2]
    r1 = _simd_mul_ps(m, v[2]);             // (m2 * v.z)
    r0 = _simd_add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
    result[2] = r0;

    result[3] = _simd_setzero_ps();
}

// Matrix4x4 * Vector3 - Position vector where w = 1.
// outVec.x = (m00 * v.x) + (m01 * v.y) + (m02 * v.z) + (m03 * 1)
// outVec.y = (m10 * v.x) + (m11 * v.y) + (m12 * v.z) + (m13 * 1)
// outVec.z = (m20 * v.x) + (m21 * v.y) + (m22 * v.z) + (m23 * 1)
// outVec.w = (m30 * v.x) + (m31 * v.y) + (m32 * v.z) + (m33 * 1)
INLINE
void _simd_mat4x4_vec3_w1_multiply(
    simdvector& result,
    const float *pMatrix,
    const simdvector& v)
{
    simdscalar m;
    simdscalar r0;
    simdscalar r1;

    m = _simd_load1_ps(pMatrix + 0*4 + 0);  // m[row][0]
    r0 = _simd_mul_ps(m, v[0]);             // (m0 * v.x)
    m = _simd_load1_ps(pMatrix + 0*4 + 1);  // m[row][1]
    r1 = _simd_mul_ps(m, v[1]);             // (m1 * v.y)
    r0 = _simd_add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y)
    m = _simd_load1_ps(pMatrix + 0*4 + 2);  // m[row][2]
    r1 = _simd_mul_ps(m, v[2]);             // (m2 * v.z)
    r0 = _simd_add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
    m = _simd_load1_ps(pMatrix + 0*4 + 3);  // m[row][3]
    r0 = _simd_add_ps(r0, m);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * 1)
    result[0] = r0;

    m = _simd_load1_ps(pMatrix + 1*4 + 0);  // m[row][0]
    r0 = _simd_mul_ps(m, v[0]);             // (m0 * v.x)
    m = _simd_load1_ps(pMatrix + 1*4 + 1);  // m[row][1]
    r1 = _simd_mul_ps(m, v[1]);             // (m1 * v.y)
    r0 = _simd_add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y)
    m = _simd_load1_ps(pMatrix + 1*4 + 2);  // m[row][2]
    r1 = _simd_mul_ps(m, v[2]);             // (m2 * v.z)
    r0 = _simd_add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
    m = _simd_load1_ps(pMatrix + 1*4 + 3);  // m[row][3]
    r0 = _simd_add_ps(r0, m);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * 1)
    result[1] = r0;

    m = _simd_load1_ps(pMatrix + 2*4 + 0);  // m[row][0]
    r0 = _simd_mul_ps(m, v[0]);             // (m0 * v.x)
    m = _simd_load1_ps(pMatrix + 2*4 + 1);  // m[row][1]
    r1 = _simd_mul_ps(m, v[1]);             // (m1 * v.y)
    r0 = _simd_add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y)
    m = _simd_load1_ps(pMatrix + 2*4 + 2);  // m[row][2]
    r1 = _simd_mul_ps(m, v[2]);             // (m2 * v.z)
    r0 = _simd_add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
    m = _simd_load1_ps(pMatrix + 2*4 + 3);  // m[row][3]
    r0 = _simd_add_ps(r0, m);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * 1)
    result[2] = r0;

    m = _simd_load1_ps(pMatrix + 3*4 + 0);  // m[row][0]
    r0 = _simd_mul_ps(m, v[0]);             // (m0 * v.x)
    m = _simd_load1_ps(pMatrix + 3*4 + 1);  // m[row][1]
    r1 = _simd_mul_ps(m, v[1]);             // (m1 * v.y)
    r0 = _simd_add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y)
    m = _simd_load1_ps(pMatrix + 3*4 + 2);  // m[row][2]
    r1 = _simd_mul_ps(m, v[2]);             // (m2 * v.z)
    r0 = _simd_add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
    m = _simd_load1_ps(pMatrix + 3*4 + 3);  // m[row][3]
    result[3] = _simd_add_ps(r0, m);        // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * 1)
}

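// Matrix4x3 * Vector3 - Position vector where w = 1.
// Descriptive note (added): only the first three rows of the 4-float-stride
// matrix are applied; the translation column is added as (m_3 * 1) and
// outVec.w is simply forced to 1 rather than computed from a fourth row.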
INLINE
void _simd_mat4x3_vec3_w1_multiply(
    simdvector& result,
    const float *pMatrix,
    const simdvector& v)
{
    simdscalar m;
    simdscalar r0;
    simdscalar r1;

    m = _simd_load1_ps(pMatrix + 0*4 + 0);  // m[row][0]
    r0 = _simd_mul_ps(m, v[0]);             // (m0 * v.x)
    m = _simd_load1_ps(pMatrix + 0*4 + 1);  // m[row][1]
    r1 = _simd_mul_ps(m, v[1]);             // (m1 * v.y)
    r0 = _simd_add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y)
    m = _simd_load1_ps(pMatrix + 0*4 + 2);  // m[row][2]
    r1 = _simd_mul_ps(m, v[2]);             // (m2 * v.z)
    r0 = _simd_add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
    m = _simd_load1_ps(pMatrix + 0*4 + 3);  // m[row][3]
    r0 = _simd_add_ps(r0, m);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * 1)
    result[0] = r0;

    m = _simd_load1_ps(pMatrix + 1*4 + 0);  // m[row][0]
    r0 = _simd_mul_ps(m, v[0]);             // (m0 * v.x)
    m = _simd_load1_ps(pMatrix + 1*4 + 1);  // m[row][1]
    r1 = _simd_mul_ps(m, v[1]);             // (m1 * v.y)
    r0 = _simd_add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y)
    m = _simd_load1_ps(pMatrix + 1*4 + 2);  // m[row][2]
    r1 = _simd_mul_ps(m, v[2]);             // (m2 * v.z)
    r0 = _simd_add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
    m = _simd_load1_ps(pMatrix + 1*4 + 3);  // m[row][3]
    r0 = _simd_add_ps(r0, m);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * 1)
    result[1] = r0;

    m = _simd_load1_ps(pMatrix + 2*4 + 0);  // m[row][0]
    r0 = _simd_mul_ps(m, v[0]);             // (m0 * v.x)
    m = _simd_load1_ps(pMatrix + 2*4 + 1);  // m[row][1]
    r1 = _simd_mul_ps(m, v[1]);             // (m1 * v.y)
    r0 = _simd_add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y)
    m = _simd_load1_ps(pMatrix + 2*4 + 2);  // m[row][2]
    r1 = _simd_mul_ps(m, v[2]);             // (m2 * v.z)
    r0 = _simd_add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
    m = _simd_load1_ps(pMatrix + 2*4 + 3);  // m[row][3]
    r0 = _simd_add_ps(r0, m);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * 1)
    result[2] = r0;
    result[3] = _simd_set1_ps(1.0f);
}

//////////////////////////////////////////////////////////////////////////
/// @brief Compute plane equation vA * vX + vB * vY + vC
INLINE simdscalar vplaneps(simdscalar vA, simdscalar vB, simdscalar vC, simdscalar &vX, simdscalar &vY)
{
    simdscalar vOut = _simd_fmadd_ps(vA, vX, vC);
    vOut = _simd_fmadd_ps(vB, vY, vOut);
    return vOut;
}

//////////////////////////////////////////////////////////////////////////
/// @brief Interpolates a single component.
/// @param vI - barycentric I
/// @param vJ - barycentric J
/// @param pInterpBuffer - pointer to attribute barycentric coeffs
template<UINT Attrib, UINT Comp>
static INLINE simdscalar InterpolateComponent(simdscalar vI, simdscalar vJ, const float *pInterpBuffer)
{
    const float *pInterpA = &pInterpBuffer[Attrib * 12 + 0 + Comp];
    const float *pInterpB = &pInterpBuffer[Attrib * 12 + 4 + Comp];
    const float *pInterpC = &pInterpBuffer[Attrib * 12 + 8 + Comp];

    simdscalar vA = _simd_broadcast_ss(pInterpA);
    simdscalar vB = _simd_broadcast_ss(pInterpB);
    simdscalar vC = _simd_broadcast_ss(pInterpC);

    simdscalar vk = _simd_sub_ps(_simd_sub_ps(_simd_set1_ps(1.0f), vI), vJ);
    vC = _simd_mul_ps(vk, vC);

    return vplaneps(vA, vB, vC, vI, vJ);
}
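
// Illustrative usage sketch (hypothetical, not part of the SWR API): evaluate
// the x and y components of attribute slot 0 at 8 sample locations, assuming
// pInterpBuffer packs per-attribute {A, B, C} coefficient vec4s as the
// 12-float stride above implies.
INLINE void InterpolateAttrib0XYExample(simdscalar vI, simdscalar vJ, const float* pInterpBuffer,
                                        simdscalar& vX, simdscalar& vY)
{
    vX = InterpolateComponent<0, 0>(vI, vJ, pInterpBuffer);
    vY = InterpolateComponent<0, 1>(vI, vJ, pInterpBuffer);
}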

INLINE
UINT pdep_u32(UINT a, UINT mask)
{
#if KNOB_ARCH==KNOB_ARCH_AVX2
    return _pdep_u32(a, mask);
#else
    UINT result = 0;

    // copied from http://wm.ite.pl/articles/pdep-soft-emu.html
    // using bsf instead of funky loop
    DWORD maskIndex;
    while (_BitScanForward(&maskIndex, mask))
    {
        // 1. isolate lowest set bit of mask
        const UINT lowest = 1 << maskIndex;

        // 2. populate LSB from src
        const UINT LSB = (UINT)((int)(a << 31) >> 31);

        // 3. copy bit from mask
        result |= LSB & lowest;

        // 4. clear lowest bit
        mask &= ~lowest;

        // 5. prepare for next iteration
        a >>= 1;
    }

    return result;
#endif
}

INLINE
UINT pext_u32(UINT a, UINT mask)
{
#if KNOB_ARCH==KNOB_ARCH_AVX2
    return _pext_u32(a, mask);
#else
    UINT result = 0;
    DWORD maskIndex;
    uint32_t currentBit = 0;
    while (_BitScanForward(&maskIndex, mask))
    {
        // 1. isolate lowest set bit of mask
        const UINT lowest = 1 << maskIndex;

        // 2. copy bit from mask
        result |= ((a & lowest) > 0) << currentBit++;

        // 3. clear lowest bit
        mask &= ~lowest;
    }
    return result;
#endif
}
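
// Illustrative sanity check (hypothetical, not part of the SWR API): pdep_u32
// scatters the low bits of 'a' into the set-bit positions of 'mask', and
// pext_u32 gathers them back, whether the BMI2 instruction or the loop above
// is used.
INLINE void PdepPextExample()
{
    assert(pdep_u32(0x5, 0x1A) == 0x12);   // 0b101 scattered into 0b11010 -> 0b10010
    assert(pext_u32(0x12, 0x1A) == 0x5);   // pext inverts the scatter
}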

#endif//__SWR_SIMDINTRIN_H__