1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 ****************************************************************************/
23
24 #ifndef __SWR_SIMDINTRIN_H__
25 #define __SWR_SIMDINTRIN_H__
26
27 #include "os.h"
28
29 #include <cassert>
30
31 #include <emmintrin.h>
32 #include <immintrin.h>
33 #include <xmmintrin.h>
34
35 #if KNOB_SIMD_WIDTH == 8
36 typedef __m256 simdscalar;
37 typedef __m256i simdscalari;
38 typedef uint8_t simdmask;
39 #else
40 #error Unsupported vector width
41 #endif
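// With KNOB_SIMD_WIDTH == 8 (the AVX build), simdscalar/simdscalari are one
// 256-bit register holding 8 floats / 8 32-bit integers, and simdmask carries
// one bit per SIMD lane.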
42
43 // simd vector
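// Laid out structure-of-arrays: one full SIMD register per component, so x
// holds the x values of all 8 lanes, y the y values, and so on.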
44 OSALIGNSIMD(union) simdvector
45 {
46 simdscalar v[4];
47 struct
48 {
49 simdscalar x, y, z, w;
50 };
51
52 simdscalar& operator[] (const int i) { return v[i]; }
53 const simdscalar& operator[] (const int i) const { return v[i]; }
54 };
55
56 #if KNOB_SIMD_WIDTH == 8
57 #define _simd128_maskstore_ps _mm_maskstore_ps
58 #define _simd_load_ps _mm256_load_ps
59 #define _simd_load1_ps _mm256_broadcast_ss
60 #define _simd_loadu_ps _mm256_loadu_ps
61 #define _simd_setzero_ps _mm256_setzero_ps
62 #define _simd_set1_ps _mm256_set1_ps
63 #define _simd_blend_ps _mm256_blend_ps
64 #define _simd_blendv_ps _mm256_blendv_ps
65 #define _simd_store_ps _mm256_store_ps
66 #define _simd_mul_ps _mm256_mul_ps
67 #define _simd_add_ps _mm256_add_ps
68 #define _simd_sub_ps _mm256_sub_ps
69 #define _simd_rsqrt_ps _mm256_rsqrt_ps
70 #define _simd_min_ps _mm256_min_ps
71 #define _simd_max_ps _mm256_max_ps
72 #define _simd_movemask_ps _mm256_movemask_ps
73 #define _simd_cvtps_epi32 _mm256_cvtps_epi32
74 #define _simd_cvttps_epi32 _mm256_cvttps_epi32
75 #define _simd_cvtepi32_ps _mm256_cvtepi32_ps
76 #define _simd_cmplt_ps(a, b) _mm256_cmp_ps(a, b, _CMP_LT_OQ)
77 #define _simd_cmpgt_ps(a, b) _mm256_cmp_ps(a, b, _CMP_GT_OQ)
78 #define _simd_cmpneq_ps(a, b) _mm256_cmp_ps(a, b, _CMP_NEQ_OQ)
79 #define _simd_cmpeq_ps(a, b) _mm256_cmp_ps(a, b, _CMP_EQ_OQ)
80 #define _simd_cmpge_ps(a, b) _mm256_cmp_ps(a, b, _CMP_GE_OQ)
81 #define _simd_cmple_ps(a, b) _mm256_cmp_ps(a, b, _CMP_LE_OQ)
82 #define _simd_cmp_ps(a, b, imm) _mm256_cmp_ps(a, b, imm)
83 #define _simd_and_ps _mm256_and_ps
84 #define _simd_or_ps _mm256_or_ps
85
86 #define _simd_rcp_ps _mm256_rcp_ps
87 #define _simd_div_ps _mm256_div_ps
88 #define _simd_castsi_ps _mm256_castsi256_ps
89 #define _simd_andnot_ps _mm256_andnot_ps
90 #define _simd_round_ps _mm256_round_ps
91 #define _simd_castpd_ps _mm256_castpd_ps
92 #define _simd_broadcast_ps(a) _mm256_broadcast_ps((const __m128*)(a))
93 #define _simd_stream_ps _mm256_stream_ps
94
95 #define _simd_load_sd _mm256_load_sd
96 #define _simd_movemask_pd _mm256_movemask_pd
97 #define _simd_castsi_pd _mm256_castsi256_pd
98
99 // emulated integer simd
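// SIMD_EMU_EPI builds a 256-bit integer operation from a 128-bit SSE
// intrinsic: it splits both operands into low/high halves, applies the
// intrinsic to each half, and reassembles the 256-bit result. This is the
// fallback for AVX-only targets, where 256-bit integer ops require AVX2.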
100 #define SIMD_EMU_EPI(func, intrin) \
101 INLINE \
102 __m256i func(__m256i a, __m256i b)\
103 {\
104 __m128i aHi = _mm256_extractf128_si256(a, 1);\
105 __m128i bHi = _mm256_extractf128_si256(b, 1);\
106 __m128i aLo = _mm256_castsi256_si128(a);\
107 __m128i bLo = _mm256_castsi256_si128(b);\
108 \
109 __m128i subLo = intrin(aLo, bLo);\
110 __m128i subHi = intrin(aHi, bHi);\
111 \
112 __m256i result = _mm256_castsi128_si256(subLo);\
113 result = _mm256_insertf128_si256(result, subHi, 1);\
114 \
115 return result;\
116 }
117
118 #if (KNOB_ARCH == KNOB_ARCH_AVX)
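// The block below provides software fallbacks for instructions that only
// exist on AVX2+. _simdemu_permute_ps emulates the cross-lane
// _mm256_permutevar8x32_ps: each 128-bit half is permuted in-lane with
// _mm_permutevar_ps, then the two candidates are blended per element based
// on whether its index selects the upper half (index > 3).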
119 INLINE
120 __m256 _simdemu_permute_ps(__m256 a, __m256i b)
121 {
122 __m128 aHi = _mm256_extractf128_ps(a, 1);
123 __m128i bHi = _mm256_extractf128_si256(b, 1);
124 __m128 aLo = _mm256_castps256_ps128(a);
125 __m128i bLo = _mm256_castsi256_si128(b);
126
127 __m128i indexHi = _mm_cmpgt_epi32(bLo, _mm_set1_epi32(3));
128 __m128 resLow = _mm_permutevar_ps(aLo, _mm_and_si128(bLo, _mm_set1_epi32(0x3)));
129 __m128 resHi = _mm_permutevar_ps(aHi, _mm_and_si128(bLo, _mm_set1_epi32(0x3)));
130 __m128 blendLowRes = _mm_blendv_ps(resLow, resHi, _mm_castsi128_ps(indexHi));
131
132 indexHi = _mm_cmpgt_epi32(bHi, _mm_set1_epi32(3));
133 resLow = _mm_permutevar_ps(aLo, _mm_and_si128(bHi, _mm_set1_epi32(0x3)));
134 resHi = _mm_permutevar_ps(aHi, _mm_and_si128(bHi, _mm_set1_epi32(0x3)));
135 __m128 blendHiRes = _mm_blendv_ps(resLow, resHi, _mm_castsi128_ps(indexHi));
136
137 __m256 result = _mm256_castps128_ps256(blendLowRes);
138 result = _mm256_insertf128_ps(result, blendHiRes, 1);
139
140 return result;
141 }
142
143 INLINE
144 __m256i _simdemu_permute_epi32(__m256i a, __m256i b)
145 {
146 return _mm256_castps_si256(_simdemu_permute_ps(_mm256_castsi256_ps(a), b));
147 }
148
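// Per-lane variable right shift (_mm256_srlv_epi32 on AVX2), emulated by
// extracting each of the 8 value/count pairs, shifting in scalar code, and
// inserting the results back.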
149 INLINE
150 __m256i _simdemu_srlv_epi32(__m256i vA, __m256i vCount)
151 {
152 int32_t aHi, aLow, countHi, countLow;
153 __m128i vAHi = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA), 1));
154 __m128i vALow = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA), 0));
155 __m128i vCountHi = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vCount), 1));
156 __m128i vCountLow = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vCount), 0));
157
158 aHi = _mm_extract_epi32(vAHi, 0);
159 countHi = _mm_extract_epi32(vCountHi, 0);
160 aHi >>= countHi;
161 vAHi = _mm_insert_epi32(vAHi, aHi, 0);
162
163 aLow = _mm_extract_epi32(vALow, 0);
164 countLow = _mm_extract_epi32(vCountLow, 0);
165 aLow >>= countLow;
166 vALow = _mm_insert_epi32(vALow, aLow, 0);
167
168 aHi = _mm_extract_epi32(vAHi, 1);
169 countHi = _mm_extract_epi32(vCountHi, 1);
170 aHi >>= countHi;
171 vAHi = _mm_insert_epi32(vAHi, aHi, 1);
172
173 aLow = _mm_extract_epi32(vALow, 1);
174 countLow = _mm_extract_epi32(vCountLow, 1);
175 aLow >>= countLow;
176 vALow = _mm_insert_epi32(vALow, aLow, 1);
177
178 aHi = _mm_extract_epi32(vAHi, 2);
179 countHi = _mm_extract_epi32(vCountHi, 2);
180 aHi >>= countHi;
181 vAHi = _mm_insert_epi32(vAHi, aHi, 2);
182
183 aLow = _mm_extract_epi32(vALow, 2);
184 countLow = _mm_extract_epi32(vCountLow, 2);
185 aLow >>= countLow;
186 vALow = _mm_insert_epi32(vALow, aLow, 2);
187
188 aHi = _mm_extract_epi32(vAHi, 3);
189 countHi = _mm_extract_epi32(vCountHi, 3);
190 aHi >>= countHi;
191 vAHi = _mm_insert_epi32(vAHi, aHi, 3);
192
193 aLow = _mm_extract_epi32(vALow, 3);
194 countLow = _mm_extract_epi32(vCountLow, 3);
195 aLow >>= countLow;
196 vALow = _mm_insert_epi32(vALow, aLow, 3);
197
198 __m256i ret = _mm256_set1_epi32(0);
199 ret = _mm256_insertf128_si256(ret, vAHi, 1);
200 ret = _mm256_insertf128_si256(ret, vALow, 0);
201 return ret;
202 }
203
204
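// Per-lane variable left shift (_mm256_sllv_epi32 on AVX2), emulated the same
// way as _simdemu_srlv_epi32 above.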
205 INLINE
206 __m256i _simdemu_sllv_epi32(__m256i vA, __m256i vCount)
207 {
208 int32_t aHi, aLow, countHi, countLow;
209 __m128i vAHi = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA), 1));
210 __m128i vALow = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA), 0));
211 __m128i vCountHi = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vCount), 1));
212 __m128i vCountLow = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vCount), 0));
213
214 aHi = _mm_extract_epi32(vAHi, 0);
215 countHi = _mm_extract_epi32(vCountHi, 0);
216 aHi <<= countHi;
217 vAHi = _mm_insert_epi32(vAHi, aHi, 0);
218
219 aLow = _mm_extract_epi32(vALow, 0);
220 countLow = _mm_extract_epi32(vCountLow, 0);
221 aLow <<= countLow;
222 vALow = _mm_insert_epi32(vALow, aLow, 0);
223
224 aHi = _mm_extract_epi32(vAHi, 1);
225 countHi = _mm_extract_epi32(vCountHi, 1);
226 aHi <<= countHi;
227 vAHi = _mm_insert_epi32(vAHi, aHi, 1);
228
229 aLow = _mm_extract_epi32(vALow, 1);
230 countLow = _mm_extract_epi32(vCountLow, 1);
231 aLow <<= countLow;
232 vALow = _mm_insert_epi32(vALow, aLow, 1);
233
234 aHi = _mm_extract_epi32(vAHi, 2);
235 countHi = _mm_extract_epi32(vCountHi, 2);
236 aHi <<= countHi;
237 vAHi = _mm_insert_epi32(vAHi, aHi, 2);
238
239 aLow = _mm_extract_epi32(vALow, 2);
240 countLow = _mm_extract_epi32(vCountLow, 2);
241 aLow <<= countLow;
242 vALow = _mm_insert_epi32(vALow, aLow, 2);
243
244 aHi = _mm_extract_epi32(vAHi, 3);
245 countHi = _mm_extract_epi32(vCountHi, 3);
246 aHi <<= countHi;
247 vAHi = _mm_insert_epi32(vAHi, aHi, 3);
248
249 aLow = _mm_extract_epi32(vALow, 3);
250 countLow = _mm_extract_epi32(vCountLow, 3);
251 aLow <<= countLow;
252 vALow = _mm_insert_epi32(vALow, aLow, 3);
253
254 __m256i ret = _mm256_set1_epi32(0);
255 ret = _mm256_insertf128_si256(ret, vAHi, 1);
256 ret = _mm256_insertf128_si256(ret, vALow, 0);
257 return ret;
258 }
259
260 #define _simd_mul_epi32 _simdemu_mul_epi32
261 #define _simd_mullo_epi32 _simdemu_mullo_epi32
262 #define _simd_sub_epi32 _simdemu_sub_epi32
263 #define _simd_sub_epi64 _simdemu_sub_epi64
264 #define _simd_min_epi32 _simdemu_min_epi32
265 #define _simd_min_epu32 _simdemu_min_epu32
266 #define _simd_max_epi32 _simdemu_max_epi32
267 #define _simd_max_epu32 _simdemu_max_epu32
268 #define _simd_add_epi32 _simdemu_add_epi32
269 #define _simd_and_si _simdemu_and_si
270 #define _simd_andnot_si _simdemu_andnot_si
271 #define _simd_cmpeq_epi32 _simdemu_cmpeq_epi32
272 #define _simd_cmplt_epi32 _simdemu_cmplt_epi32
273 #define _simd_cmpgt_epi32 _simdemu_cmpgt_epi32
274 #define _simd_or_si _simdemu_or_si
275 #define _simd_xor_si _simdemu_xor_si
276 #define _simd_castps_si _mm256_castps_si256
277 #define _simd_adds_epu8 _simdemu_adds_epu8
278 #define _simd_subs_epu8 _simdemu_subs_epu8
279 #define _simd_add_epi8 _simdemu_add_epi8
280 #define _simd_cmpeq_epi64 _simdemu_cmpeq_epi64
281 #define _simd_cmpgt_epi64 _simdemu_cmpgt_epi64
282 #define _simd_cmpgt_epi8 _simdemu_cmpgt_epi8
283 #define _simd_cmpeq_epi8 _simdemu_cmpeq_epi8
284 #define _simd_cmpgt_epi16 _simdemu_cmpgt_epi16
285 #define _simd_cmpeq_epi16 _simdemu_cmpeq_epi16
286 #define _simd_movemask_epi8 _simdemu_movemask_epi8
287 #define _simd_permute_ps _simdemu_permute_ps
288 #define _simd_permute_epi32 _simdemu_permute_epi32
289 #define _simd_srlv_epi32 _simdemu_srlv_epi32
290 #define _simd_sllv_epi32 _simdemu_sllv_epi32
291
292 SIMD_EMU_EPI(_simdemu_mul_epi32, _mm_mul_epi32)
293 SIMD_EMU_EPI(_simdemu_mullo_epi32, _mm_mullo_epi32)
294 SIMD_EMU_EPI(_simdemu_sub_epi32, _mm_sub_epi32)
295 SIMD_EMU_EPI(_simdemu_sub_epi64, _mm_sub_epi64)
296 SIMD_EMU_EPI(_simdemu_min_epi32, _mm_min_epi32)
297 SIMD_EMU_EPI(_simdemu_min_epu32, _mm_min_epu32)
298 SIMD_EMU_EPI(_simdemu_max_epi32, _mm_max_epi32)
299 SIMD_EMU_EPI(_simdemu_max_epu32, _mm_max_epu32)
300 SIMD_EMU_EPI(_simdemu_add_epi32, _mm_add_epi32)
301 SIMD_EMU_EPI(_simdemu_and_si, _mm_and_si128)
302 SIMD_EMU_EPI(_simdemu_andnot_si, _mm_andnot_si128)
303 SIMD_EMU_EPI(_simdemu_cmpeq_epi32, _mm_cmpeq_epi32)
304 SIMD_EMU_EPI(_simdemu_cmplt_epi32, _mm_cmplt_epi32)
305 SIMD_EMU_EPI(_simdemu_cmpgt_epi32, _mm_cmpgt_epi32)
306 SIMD_EMU_EPI(_simdemu_or_si, _mm_or_si128)
307 SIMD_EMU_EPI(_simdemu_xor_si, _mm_xor_si128)
308 SIMD_EMU_EPI(_simdemu_adds_epu8, _mm_adds_epu8)
309 SIMD_EMU_EPI(_simdemu_subs_epu8, _mm_subs_epu8)
310 SIMD_EMU_EPI(_simdemu_add_epi8, _mm_add_epi8)
311 SIMD_EMU_EPI(_simdemu_cmpeq_epi64, _mm_cmpeq_epi64)
312 SIMD_EMU_EPI(_simdemu_cmpgt_epi64, _mm_cmpgt_epi64)
313 SIMD_EMU_EPI(_simdemu_cmpgt_epi8, _mm_cmpgt_epi8)
314 SIMD_EMU_EPI(_simdemu_cmpeq_epi8, _mm_cmpeq_epi8)
315 SIMD_EMU_EPI(_simdemu_cmpgt_epi16, _mm_cmpgt_epi16)
316 SIMD_EMU_EPI(_simdemu_cmpeq_epi16, _mm_cmpeq_epi16)
317 SIMD_EMU_EPI(_simdemu_unpacklo_epi8, _mm_unpacklo_epi8)
318 SIMD_EMU_EPI(_simdemu_unpackhi_epi8, _mm_unpackhi_epi8)
319 SIMD_EMU_EPI(_simdemu_unpacklo_epi16, _mm_unpacklo_epi16)
320 SIMD_EMU_EPI(_simdemu_unpackhi_epi16, _mm_unpackhi_epi16)
321
322 #define _simd_unpacklo_epi8 _simdemu_unpacklo_epi8
323 #define _simd_unpackhi_epi8 _simdemu_unpackhi_epi8
324 #define _simd_unpacklo_epi16 _simdemu_unpacklo_epi16
325 #define _simd_unpackhi_epi16 _simdemu_unpackhi_epi16
326 #define _simd_unpacklo_epi32(a, b) _mm256_castps_si256(_mm256_unpacklo_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)))
327 #define _simd_unpackhi_epi32(a, b) _mm256_castps_si256(_mm256_unpackhi_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)))
328 #define _simd_unpacklo_epi64(a, b) _mm256_castpd_si256(_mm256_unpacklo_pd(_mm256_castsi256_pd(a), _mm256_castsi256_pd(b)))
329 #define _simd_unpackhi_epi64(a, b) _mm256_castpd_si256(_mm256_unpackhi_pd(_mm256_castsi256_pd(a), _mm256_castsi256_pd(b)))
330
331 #define _simd_slli_epi32(a,i) _simdemu_slli_epi32(a,i)
332 #define _simd_srai_epi32(a,i) _simdemu_srai_epi32(a,i)
333 #define _simd_srli_epi32(a,i) _simdemu_srli_epi32(a,i)
334 #define _simd_srlisi_ps(a,i) _mm256_castsi256_ps(_simdemu_srli_si128<i>(_mm256_castps_si256(a)))
335
336 #define _simd128_fmadd_ps _mm_fmaddemu_ps
337 #define _simd_fmadd_ps _mm_fmaddemu256_ps
338 #define _simd_fmsub_ps _mm_fmsubemu256_ps
339 #define _simd_shuffle_epi8 _simdemu_shuffle_epi8
340 SIMD_EMU_EPI(_simdemu_shuffle_epi8, _mm_shuffle_epi8)
341
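// FMA fallbacks: a separate multiply followed by an add. The result is
// rounded twice, so it can differ in the last ULP from a true fused
// multiply-add on FMA-capable hardware.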
342 INLINE
343 __m128 _mm_fmaddemu_ps(__m128 a, __m128 b, __m128 c)
344 {
345 __m128 res = _mm_mul_ps(a, b);
346 res = _mm_add_ps(res, c);
347 return res;
348 }
349
350 INLINE
351 __m256 _mm_fmaddemu256_ps(__m256 a, __m256 b, __m256 c)
352 {
353 __m256 res = _mm256_mul_ps(a, b);
354 res = _mm256_add_ps(res, c);
355 return res;
356 }
357
358 INLINE
359 __m256 _mm_fmsubemu256_ps(__m256 a, __m256 b, __m256 c)
360 {
361 __m256 res = _mm256_mul_ps(a, b);
362 res = _mm256_sub_ps(res, c);
363 return res;
364 }
365
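// Gather fallback: loads the 8 elements one at a time. Each 32-bit offset is
// scaled by 'scale' bytes and added to pBase, mirroring _mm256_i32gather_ps.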
366 INLINE
367 __m256 _simd_i32gather_ps(const float* pBase, __m256i vOffsets, const int scale)
368 {
369 uint32_t *pOffsets = (uint32_t*)&vOffsets;
370 simdscalar vResult;
371 float* pResult = (float*)&vResult;
372 for (uint32_t i = 0; i < KNOB_SIMD_WIDTH; ++i)
373 {
374 uint32_t offset = pOffsets[i];
375 offset = offset * scale;
376 pResult[i] = *(float*)(((const uint8_t*)pBase + offset));
377 }
378
379 return vResult;
380 }
381
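// Masked gather fallback: only lanes whose mask sign bit is set are loaded;
// all other lanes keep the corresponding value from vSrc, mirroring
// _mm256_mask_i32gather_ps.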
382 INLINE
383 __m256 _simd_mask_i32gather_ps(__m256 vSrc, const float* pBase, __m256i vOffsets, __m256 vMask, const int scale)
384 {
385 uint32_t *pOffsets = (uint32_t*)&vOffsets;
386 simdscalar vResult = vSrc;
387 float* pResult = (float*)&vResult;
388 DWORD index;
389 uint32_t mask = _simd_movemask_ps(vMask);
390 while (_BitScanForward(&index, mask))
391 {
392 mask &= ~(1 << index);
393 uint32_t offset = pOffsets[index];
394 offset = offset * scale;
395 pResult[index] = *(float*)(((const uint8_t*)pBase + offset));
396 }
397
398 return vResult;
399 }
400
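// _mm256_abs_epi32 fallback: absolute value computed per 128-bit half.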
401 INLINE
402 __m256i _simd_abs_epi32(__m256i a)
403 {
404 __m128i aHi = _mm256_extractf128_si256(a, 1);
405 __m128i aLo = _mm256_castsi256_si128(a);
406 __m128i absLo = _mm_abs_epi32(aLo);
407 __m128i absHi = _mm_abs_epi32(aHi);
408 __m256i result = _mm256_castsi128_si256(absLo);
409 result = _mm256_insertf128_si256(result, absHi, 1);
410 return result;
411 }
412
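// _mm256_movemask_epi8 fallback: the per-byte masks of the two halves are
// combined, with the upper half occupying bits 16..31.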
413 INLINE
414 int _simdemu_movemask_epi8(__m256i a)
415 {
416 __m128i aHi = _mm256_extractf128_si256(a, 1);
417 __m128i aLo = _mm256_castsi256_si128(a);
418
419 int resHi = _mm_movemask_epi8(aHi);
420 int resLo = _mm_movemask_epi8(aLo);
421
422 return (resHi << 16) | resLo;
423 }
424
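// Widening conversions (u8->u16, u8->u32, u16->u32): convert the low source
// elements directly, convert the remainder after a byte shift, and join the
// two halves into one 256-bit result.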
425 INLINE
426 __m256i _simd_cvtepu8_epi16(__m128i a)
427 {
428 __m128i resultlo = _mm_cvtepu8_epi16(a);
429 __m128i resulthi = _mm_cvtepu8_epi16(_mm_srli_si128(a, 8));
430
431 __m256i result = _mm256_castsi128_si256(resultlo);
432
433 return _mm256_insertf128_si256(result, resulthi, 1);
434 }
435
436 INLINE
437 __m256i _simd_cvtepu8_epi32(__m128i a)
438 {
439 __m128i resultlo = _mm_cvtepu8_epi32(a);
440 __m128i resulthi = _mm_cvtepu8_epi32(_mm_srli_si128(a, 4));
441
442 __m256i result = _mm256_castsi128_si256(resultlo);
443
444 return _mm256_insertf128_si256(result, resulthi, 1);
445 }
446
447 INLINE
448 __m256i _simd_cvtepu16_epi32(__m128i a)
449 {
450 __m128i resultlo = _mm_cvtepu16_epi32(a);
451 __m128i resulthi = _mm_cvtepu16_epi32(_mm_srli_si128(a, 8));
452
453 __m256i result = _mm256_castsi128_si256(resultlo);
454
455 return _mm256_insertf128_si256(result, resulthi, 1);
456 }
457
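// Saturating pack fallbacks. Packing is done independently on the low and
// high 128-bit halves, matching the in-lane behavior of the AVX2 pack
// instructions (each output lane interleaves the corresponding lanes of a
// and b).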
458 INLINE
459 __m256i _simd_packus_epi16(__m256i a, __m256i b)
460 {
461 __m128i alo = _mm256_extractf128_si256(a, 0);
462 __m128i ahi = _mm256_extractf128_si256(a, 1);
463
464 __m128i blo = _mm256_extractf128_si256(b, 0);
465 __m128i bhi = _mm256_extractf128_si256(b, 1);
466
467 __m128i resultlo = _mm_packus_epi16(alo, blo);
468 __m128i resulthi = _mm_packus_epi16(ahi, bhi);
469
470 __m256i result = _mm256_castsi128_si256(resultlo);
471
472 return _mm256_insertf128_si256(result, resulthi, 1);
473 }
474
475 INLINE
476 __m256i _simd_packs_epi16(__m256i a, __m256i b)
477 {
478 __m128i alo = _mm256_extractf128_si256(a, 0);
479 __m128i ahi = _mm256_extractf128_si256(a, 1);
480
481 __m128i blo = _mm256_extractf128_si256(b, 0);
482 __m128i bhi = _mm256_extractf128_si256(b, 1);
483
484 __m128i resultlo = _mm_packs_epi16(alo, blo);
485 __m128i resulthi = _mm_packs_epi16(ahi, bhi);
486
487 __m256i result = _mm256_castsi128_si256(resultlo);
488
489 return _mm256_insertf128_si256(result, resulthi, 1);
490 }
491
492 INLINE
493 __m256i _simd_packus_epi32(__m256i a, __m256i b)
494 {
495 __m128i alo = _mm256_extractf128_si256(a, 0);
496 __m128i ahi = _mm256_extractf128_si256(a, 1);
497
498 __m128i blo = _mm256_extractf128_si256(b, 0);
499 __m128i bhi = _mm256_extractf128_si256(b, 1);
500
501 __m128i resultlo = _mm_packus_epi32(alo, blo);
502 __m128i resulthi = _mm_packus_epi32(ahi, bhi);
503
504 __m256i result = _mm256_castsi128_si256(resultlo);
505
506 return _mm256_insertf128_si256(result, resulthi, 1);
507 }
508
509 INLINE
510 __m256i _simd_packs_epi32(__m256i a, __m256i b)
511 {
512 __m128i alo = _mm256_extractf128_si256(a, 0);
513 __m128i ahi = _mm256_extractf128_si256(a, 1);
514
515 __m128i blo = _mm256_extractf128_si256(b, 0);
516 __m128i bhi = _mm256_extractf128_si256(b, 1);
517
518 __m128i resultlo = _mm_packs_epi32(alo, blo);
519 __m128i resulthi = _mm_packs_epi32(ahi, bhi);
520
521 __m256i result = _mm256_castsi128_si256(resultlo);
522
523 return _mm256_insertf128_si256(result, resulthi, 1);
524 }
525
526 #else
527
528 #define _simd_mul_epi32 _mm256_mul_epi32
529 #define _simd_mullo_epi32 _mm256_mullo_epi32
530 #define _simd_sub_epi32 _mm256_sub_epi32
531 #define _simd_sub_epi64 _mm256_sub_epi64
532 #define _simd_min_epi32 _mm256_min_epi32
533 #define _simd_max_epi32 _mm256_max_epi32
534 #define _simd_min_epu32 _mm256_min_epu32
535 #define _simd_max_epu32 _mm256_max_epu32
536 #define _simd_add_epi32 _mm256_add_epi32
537 #define _simd_and_si _mm256_and_si256
538 #define _simd_andnot_si _mm256_andnot_si256
539 #define _simd_cmpeq_epi32 _mm256_cmpeq_epi32
540 #define _simd_cmplt_epi32(a,b) _mm256_cmpgt_epi32(b,a)
541 #define _simd_cmpgt_epi32(a,b) _mm256_cmpgt_epi32(a,b)
542 #define _simd_or_si _mm256_or_si256
543 #define _simd_xor_si _mm256_xor_si256
544 #define _simd_castps_si _mm256_castps_si256
545
546 #define _simd_unpacklo_epi8 _mm256_unpacklo_epi8
547 #define _simd_unpackhi_epi8 _mm256_unpackhi_epi8
548 #define _simd_unpacklo_epi16 _mm256_unpacklo_epi16
549 #define _simd_unpackhi_epi16 _mm256_unpackhi_epi16
550 #define _simd_unpacklo_epi32 _mm256_unpacklo_epi32
551 #define _simd_unpackhi_epi32 _mm256_unpackhi_epi32
552 #define _simd_unpacklo_epi64 _mm256_unpacklo_epi64
553 #define _simd_unpackhi_epi64 _mm256_unpackhi_epi64
554
555 #define _simd_srli_si(a,i) _simdemu_srli_si128<i>(a)
556 #define _simd_slli_epi32 _mm256_slli_epi32
557 #define _simd_srai_epi32 _mm256_srai_epi32
558 #define _simd_srli_epi32 _mm256_srli_epi32
559 #define _simd_srlisi_ps(a,i) _mm256_castsi256_ps(_simdemu_srli_si128<i>(_mm256_castps_si256(a)))
560 #define _simd128_fmadd_ps _mm_fmadd_ps
561 #define _simd_fmadd_ps _mm256_fmadd_ps
562 #define _simd_fmsub_ps _mm256_fmsub_ps
563 #define _simd_shuffle_epi8 _mm256_shuffle_epi8
564 #define _simd_adds_epu8 _mm256_adds_epu8
565 #define _simd_subs_epu8 _mm256_subs_epu8
566 #define _simd_add_epi8 _mm256_add_epi8
567 #define _simd_i32gather_ps _mm256_i32gather_ps
568 #define _simd_mask_i32gather_ps _mm256_mask_i32gather_ps
569 #define _simd_abs_epi32 _mm256_abs_epi32
570
571 #define _simd_cmpeq_epi64 _mm256_cmpeq_epi64
572 #define _simd_cmpgt_epi64 _mm256_cmpgt_epi64
573 #define _simd_cmpgt_epi8 _mm256_cmpgt_epi8
574 #define _simd_cmpeq_epi8 _mm256_cmpeq_epi8
575 #define _simd_cmpgt_epi16 _mm256_cmpgt_epi16
576 #define _simd_cmpeq_epi16 _mm256_cmpeq_epi16
577 #define _simd_movemask_epi8 _mm256_movemask_epi8
578 #define _simd_permute_ps _mm256_permutevar8x32_ps
579 #define _simd_permute_epi32 _mm256_permutevar8x32_epi32
580 #define _simd_srlv_epi32 _mm256_srlv_epi32
581 #define _simd_sllv_epi32 _mm256_sllv_epi32
582 #define _simd_cvtepu8_epi16 _mm256_cvtepu8_epi16
583 #define _simd_cvtepu8_epi32 _mm256_cvtepu8_epi32
584 #define _simd_cvtepu16_epi32 _mm256_cvtepu16_epi32
585 #define _simd_packus_epi16 _mm256_packus_epi16
586 #define _simd_packs_epi16 _mm256_packs_epi16
587 #define _simd_packus_epi32 _mm256_packus_epi32
588 #define _simd_packs_epi32 _mm256_packs_epi32
589
590 #endif
591
592 #define _simd_unpacklo_ps _mm256_unpacklo_ps
593 #define _simd_unpackhi_ps _mm256_unpackhi_ps
594 #define _simd_unpacklo_pd _mm256_unpacklo_pd
595 #define _simd_unpackhi_pd _mm256_unpackhi_pd
596 #define _simd_insertf128_ps _mm256_insertf128_ps
597 #define _simd_insertf128_pd _mm256_insertf128_pd
598 #define _simd_insertf128_si _mm256_insertf128_si256
599 #define _simd_extractf128_ps _mm256_extractf128_ps
600 #define _simd_extractf128_pd _mm256_extractf128_pd
601 #define _simd_extractf128_si _mm256_extractf128_si256
602 #define _simd_permute2f128_ps _mm256_permute2f128_ps
603 #define _simd_permute2f128_pd _mm256_permute2f128_pd
604 #define _simd_permute2f128_si _mm256_permute2f128_si256
605 #define _simd_shuffle_ps _mm256_shuffle_ps
606 #define _simd_shuffle_pd _mm256_shuffle_pd
607 #define _simd_shuffle_epi32(a, b, imm8) _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b), imm8))
608 #define _simd_shuffle_epi64(a, b, imm8) _mm256_castps_si256(_mm256_shuffle_pd(_mm256_castsi256_pd(a), _mm256_castsi256_pd(b), imm8))
609 #define _simd_set1_epi32 _mm256_set1_epi32
610 #define _simd_set_epi32 _mm256_set_epi32
611 #define _simd_set1_epi8 _mm256_set1_epi8
612 #define _simd_setzero_si _mm256_setzero_si256
613 #define _simd_cvttps_epi32 _mm256_cvttps_epi32
614 #define _simd_store_si _mm256_store_si256
615 #define _simd_broadcast_ss _mm256_broadcast_ss
616 #define _simd_maskstore_ps _mm256_maskstore_ps
617 #define _simd_load_si _mm256_load_si256
618 #define _simd_loadu_si _mm256_loadu_si256
619 #define _simd_sub_ps _mm256_sub_ps
620 #define _simd_testz_ps _mm256_testz_ps
621 #define _simd_testz_si _mm256_testz_si256
622 #define _simd_xor_ps _mm256_xor_ps
623
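// Load/store a 256-bit register as two independent, possibly unaligned
// 128-bit halves (loaddr <-> low half, hiaddr <-> high half).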
624 INLINE
625 simdscalari _simd_loadu2_si(const __m128i *hiaddr, const __m128i *loaddr)
626 {
627 __m128i lo = _mm_loadu_si128(loaddr);
628 __m128i hi = _mm_loadu_si128(hiaddr);
629
630 return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
631 }
632
633 INLINE
634 void _simd_storeu2_si(__m128i *hiaddr, __m128i *loaddr, simdscalari a)
635 {
636 _mm_storeu_si128(loaddr, _mm256_castsi256_si128(a));
637 _mm_storeu_si128(hiaddr, _mm256_extractf128_si256(a, 1));
638 }
639
640 INLINE
641 simdscalari _simd_blendv_epi32(simdscalari a, simdscalari b, simdscalar mask)
642 {
643 return _simd_castps_si(_simd_blendv_ps(_simd_castsi_ps(a), _simd_castsi_ps(b), mask));
644 }
645
646 INLINE
647 simdscalari _simd_blendv_epi32(simdscalari a, simdscalari b, simdscalari mask)
648 {
649 return _simd_castps_si(_simd_blendv_ps(_simd_castsi_ps(a), _simd_castsi_ps(b), _simd_castsi_ps(mask)));
650 }
651
652 template<int mask>
653 INLINE
654 __m128i _simd_blend4_epi32(__m128i a, __m128i b)
655 {
656 return _mm_castps_si128(_mm_blend_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), mask));
657 }
658
659 // convert bitmask to vector mask
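// Bit i of 'mask' becomes an all-ones or all-zeros 32-bit lane i; for
// example, vMask(0x5) enables lanes 0 and 2.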
660 INLINE
661 simdscalar vMask(int32_t mask)
662 {
663 __m256i vec = _mm256_set1_epi32(mask);
664 const __m256i bit = _mm256_set_epi32(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01);
665 vec = _simd_and_si(vec, bit);
666 vec = _simd_cmplt_epi32(_mm256_setzero_si256(), vec);
667 return _simd_castsi_ps(vec);
668 }
669
670 INLINE
671 simdscalari vMaski(int32_t mask)
672 {
673 __m256i vec = _mm256_set1_epi32(mask);
674 const __m256i bit = _mm256_set_epi32(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01);
675 vec = _simd_and_si(vec, bit);
676 return _simd_cmplt_epi32(_mm256_setzero_si256(), vec);
677 }
678
679 INLINE
680 void _simd_mov(simdscalar &r, unsigned int rlane, simdscalar& s, unsigned int slane)
681 {
682 OSALIGNSIMD(float) rArray[KNOB_SIMD_WIDTH], sArray[KNOB_SIMD_WIDTH];
683 _mm256_store_ps(rArray, r);
684 _mm256_store_ps(sArray, s);
685 rArray[rlane] = sArray[slane];
686 r = _mm256_load_ps(rArray);
687 }
688
689 INLINE __m256i _simdemu_slli_epi32(__m256i a, uint32_t i)
690 {
691 __m128i aHi = _mm256_extractf128_si256(a, 1);
692 __m128i aLo = _mm256_castsi256_si128(a);
693
694 __m128i resHi = _mm_slli_epi32(aHi, i);
695 __m128i resLo = _mm_slli_epi32(aLo, i);
696
697 __m256i result = _mm256_castsi128_si256(resLo);
698 result = _mm256_insertf128_si256(result, resHi, 1);
699
700 return result;
701 }
702
703 INLINE __m256i _simdemu_srai_epi32(__m256i a, uint32_t i)
704 {
705 __m128i aHi = _mm256_extractf128_si256(a, 1);
706 __m128i aLo = _mm256_castsi256_si128(a);
707
708 __m128i resHi = _mm_srai_epi32(aHi, i);
709 __m128i resLo = _mm_srai_epi32(aLo, i);
710
711 __m256i result = _mm256_castsi128_si256(resLo);
712 result = _mm256_insertf128_si256(result, resHi, 1);
713
714 return result;
715 }
716
717 INLINE __m256i _simdemu_srli_epi32(__m256i a, uint32_t i)
718 {
719 __m128i aHi = _mm256_extractf128_si256(a, 1);
720 __m128i aLo = _mm256_castsi256_si128(a);
721
722 __m128i resHi = _mm_srli_epi32(aHi, i);
723 __m128i resLo = _mm_srli_epi32(aLo, i);
724
725 __m256i result = _mm256_castsi128_si256(resLo);
726 result = _mm256_insertf128_si256(result, resHi, 1);
727
728 return result;
729 }
730
731 INLINE
732 void _simdvec_transpose(simdvector &v)
733 {
734 SWR_INVALID("Need to implement 8 wide version");
735 }
736
737 #else
738 #error Unsupported vector width
739 #endif
740
741 // Populates a simdvector from a vector. So p = xyzw becomes xxxx yyyy zzzz wwww.
742 INLINE
743 void _simdvec_load_ps(simdvector& r, const float *p)
744 {
745 r[0] = _simd_set1_ps(p[0]);
746 r[1] = _simd_set1_ps(p[1]);
747 r[2] = _simd_set1_ps(p[2]);
748 r[3] = _simd_set1_ps(p[3]);
749 }
750
751 INLINE
752 void _simdvec_mov(simdvector& r, const simdscalar& s)
753 {
754 r[0] = s;
755 r[1] = s;
756 r[2] = s;
757 r[3] = s;
758 }
759
760 INLINE
761 void _simdvec_mov(simdvector& r, const simdvector& v)
762 {
763 r[0] = v[0];
764 r[1] = v[1];
765 r[2] = v[2];
766 r[3] = v[3];
767 }
768
769 #if 0
770 // just move a lane from the source simdvector to dest simdvector
771 INLINE
772 void _simdvec_mov(simdvector &r, unsigned int rlane, simdvector& s, unsigned int slane)
773 {
774 _simd_mov(r[0], rlane, s[0], slane);
775 _simd_mov(r[1], rlane, s[1], slane);
776 _simd_mov(r[2], rlane, s[2], slane);
777 _simd_mov(r[3], rlane, s[3], slane);
778 }
779
780 #endif
781 INLINE
782 void _simdvec_dp3_ps(simdscalar& r, const simdvector& v0, const simdvector& v1)
783 {
784 simdscalar tmp;
785 r = _simd_mul_ps(v0[0], v1[0]); // (v0.x*v1.x)
786
787 tmp = _simd_mul_ps(v0[1], v1[1]); // (v0.y*v1.y)
788 r = _simd_add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y)
789
790 tmp = _simd_mul_ps(v0[2], v1[2]); // (v0.z*v1.z)
791 r = _simd_add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z)
792 }
793
794 INLINE
795 void _simdvec_dp4_ps(simdscalar& r, const simdvector& v0, const simdvector& v1)
796 {
797 simdscalar tmp;
798 r = _simd_mul_ps(v0[0], v1[0]); // (v0.x*v1.x)
799
800 tmp = _simd_mul_ps(v0[1], v1[1]); // (v0.y*v1.y)
801 r = _simd_add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y)
802
803 tmp = _simd_mul_ps(v0[2], v1[2]); // (v0.z*v1.z)
804 r = _simd_add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z)
805
806 tmp = _simd_mul_ps(v0[3], v1[3]); // (v0.w*v1.w)
807 r = _simd_add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z) + (v0.w*v1.w)
808 }
809
810 INLINE
811 simdscalar _simdvec_rcp_length_ps(const simdvector& v)
812 {
813 simdscalar length;
814 _simdvec_dp4_ps(length, v, v);
815 return _simd_rsqrt_ps(length);
816 }
817
818 INLINE
819 void _simdvec_normalize_ps(simdvector& r, const simdvector& v)
820 {
821 simdscalar vecLength;
822 vecLength = _simdvec_rcp_length_ps(v);
823
824 r[0] = _simd_mul_ps(v[0], vecLength);
825 r[1] = _simd_mul_ps(v[1], vecLength);
826 r[2] = _simd_mul_ps(v[2], vecLength);
827 r[3] = _simd_mul_ps(v[3], vecLength);
828 }
829
830 INLINE
831 void _simdvec_mul_ps(simdvector& r, const simdvector& v, const simdscalar& s)
832 {
833 r[0] = _simd_mul_ps(v[0], s);
834 r[1] = _simd_mul_ps(v[1], s);
835 r[2] = _simd_mul_ps(v[2], s);
836 r[3] = _simd_mul_ps(v[3], s);
837 }
838
839 INLINE
840 void _simdvec_mul_ps(simdvector& r, const simdvector& v0, const simdvector& v1)
841 {
842 r[0] = _simd_mul_ps(v0[0], v1[0]);
843 r[1] = _simd_mul_ps(v0[1], v1[1]);
844 r[2] = _simd_mul_ps(v0[2], v1[2]);
845 r[3] = _simd_mul_ps(v0[3], v1[3]);
846 }
847
848 INLINE
849 void _simdvec_add_ps(simdvector& r, const simdvector& v0, const simdvector& v1)
850 {
851 r[0] = _simd_add_ps(v0[0], v1[0]);
852 r[1] = _simd_add_ps(v0[1], v1[1]);
853 r[2] = _simd_add_ps(v0[2], v1[2]);
854 r[3] = _simd_add_ps(v0[3], v1[3]);
855 }
856
857 INLINE
858 void _simdvec_min_ps(simdvector& r, const simdvector& v0, const simdscalar& s)
859 {
860 r[0] = _simd_min_ps(v0[0], s);
861 r[1] = _simd_min_ps(v0[1], s);
862 r[2] = _simd_min_ps(v0[2], s);
863 r[3] = _simd_min_ps(v0[3], s);
864 }
865
866 INLINE
867 void _simdvec_max_ps(simdvector& r, const simdvector& v0, const simdscalar& s)
868 {
869 r[0] = _simd_max_ps(v0[0], s);
870 r[1] = _simd_max_ps(v0[1], s);
871 r[2] = _simd_max_ps(v0[2], s);
872 r[3] = _simd_max_ps(v0[3], s);
873 }
874
875 // Matrix4x4 * Vector4
876 // outVec.x = (m00 * v.x) + (m01 * v.y) + (m02 * v.z) + (m03 * v.w)
877 // outVec.y = (m10 * v.x) + (m11 * v.y) + (m12 * v.z) + (m13 * v.w)
878 // outVec.z = (m20 * v.x) + (m21 * v.y) + (m22 * v.z) + (m23 * v.w)
879 // outVec.w = (m30 * v.x) + (m31 * v.y) + (m32 * v.z) + (m33 * v.w)
880 INLINE
881 void _simd_mat4x4_vec4_multiply(
882 simdvector& result,
883 const float *pMatrix,
884 const simdvector& v)
885 {
886 simdscalar m;
887 simdscalar r0;
888 simdscalar r1;
889
890 m = _simd_load1_ps(pMatrix + 0*4 + 0); // m[row][0]
891 r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x)
892 m = _simd_load1_ps(pMatrix + 0*4 + 1); // m[row][1]
893 r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y)
894 r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
895 m = _simd_load1_ps(pMatrix + 0*4 + 2); // m[row][2]
896 r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z)
897 r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
898 m = _simd_load1_ps(pMatrix + 0*4 + 3); // m[row][3]
899 r1 = _simd_mul_ps(m, v[3]); // (m3 * v.w)
900 r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * v.w)
901 result[0] = r0;
902
903 m = _simd_load1_ps(pMatrix + 1*4 + 0); // m[row][0]
904 r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x)
905 m = _simd_load1_ps(pMatrix + 1*4 + 1); // m[row][1]
906 r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y)
907 r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
908 m = _simd_load1_ps(pMatrix + 1*4 + 2); // m[row][2]
909 r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z)
910 r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
911 m = _simd_load1_ps(pMatrix + 1*4 + 3); // m[row][3]
912 r1 = _simd_mul_ps(m, v[3]); // (m3 * v.w)
913 r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * v.w)
914 result[1] = r0;
915
916 m = _simd_load1_ps(pMatrix + 2*4 + 0); // m[row][0]
917 r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x)
918 m = _simd_load1_ps(pMatrix + 2*4 + 1); // m[row][1]
919 r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y)
920 r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
921 m = _simd_load1_ps(pMatrix + 2*4 + 2); // m[row][2]
922 r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z)
923 r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
924 m = _simd_load1_ps(pMatrix + 2*4 + 3); // m[row][3]
925 r1 = _simd_mul_ps(m, v[3]); // (m3 * v.w)
926 r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * v.w)
927 result[2] = r0;
928
929 m = _simd_load1_ps(pMatrix + 3*4 + 0); // m[row][0]
930 r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x)
931 m = _simd_load1_ps(pMatrix + 3*4 + 1); // m[row][1]
932 r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y)
933 r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
934 m = _simd_load1_ps(pMatrix + 3*4 + 2); // m[row][2]
935 r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z)
936 r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
937 m = _simd_load1_ps(pMatrix + 3*4 + 3); // m[row][3]
938 r1 = _simd_mul_ps(m, v[3]); // (m3 * v.w)
939 r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * v.w)
940 result[3] = r0;
941 }
942
943 // Matrix4x4 * Vector3 - Direction Vector where w = 0.
944 // outVec.x = (m00 * v.x) + (m01 * v.y) + (m02 * v.z) + (m03 * 0)
945 // outVec.y = (m10 * v.x) + (m11 * v.y) + (m12 * v.z) + (m13 * 0)
946 // outVec.z = (m20 * v.x) + (m21 * v.y) + (m22 * v.z) + (m23 * 0)
947 // outVec.w = 0 (the last matrix row is skipped; w is forced to zero)
948 INLINE
949 void _simd_mat3x3_vec3_w0_multiply(
950 simdvector& result,
951 const float *pMatrix,
952 const simdvector& v)
953 {
954 simdscalar m;
955 simdscalar r0;
956 simdscalar r1;
957
958 m = _simd_load1_ps(pMatrix + 0*4 + 0); // m[row][0]
959 r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x)
960 m = _simd_load1_ps(pMatrix + 0*4 + 1); // m[row][1]
961 r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y)
962 r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
963 m = _simd_load1_ps(pMatrix + 0*4 + 2); // m[row][2]
964 r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z)
965 r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
966 result[0] = r0;
967
968 m = _simd_load1_ps(pMatrix + 1*4 + 0); // m[row][0]
969 r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x)
970 m = _simd_load1_ps(pMatrix + 1*4 + 1); // m[row][1]
971 r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y)
972 r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
973 m = _simd_load1_ps(pMatrix + 1*4 + 2); // m[row][2]
974 r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z)
975 r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
976 result[1] = r0;
977
978 m = _simd_load1_ps(pMatrix + 2*4 + 0); // m[row][0]
979 r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x)
980 m = _simd_load1_ps(pMatrix + 2*4 + 1); // m[row][1]
981 r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y)
982 r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
983 m = _simd_load1_ps(pMatrix + 2*4 + 2); // m[row][2]
984 r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z)
985 r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
986 result[2] = r0;
987
988 result[3] = _simd_setzero_ps();
989 }
990
991 // Matrix4x4 * Vector3 - Position vector where w = 1.
992 // outVec.x = (m00 * v.x) + (m01 * v.y) + (m02 * v.z) + (m03 * 1)
993 // outVec.y = (m10 * v.x) + (m11 * v.y) + (m12 * v.z) + (m13 * 1)
994 // outVec.z = (m20 * v.x) + (m21 * v.y) + (m22 * v.z) + (m23 * 1)
995 // outVec.w = (m30 * v.x) + (m31 * v.y) + (m32 * v.z) + (m33 * 1)
996 INLINE
997 void _simd_mat4x4_vec3_w1_multiply(
998 simdvector& result,
999 const float *pMatrix,
1000 const simdvector& v)
1001 {
1002 simdscalar m;
1003 simdscalar r0;
1004 simdscalar r1;
1005
1006 m = _simd_load1_ps(pMatrix + 0*4 + 0); // m[row][0]
1007 r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x)
1008 m = _simd_load1_ps(pMatrix + 0*4 + 1); // m[row][1]
1009 r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y)
1010 r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
1011 m = _simd_load1_ps(pMatrix + 0*4 + 2); // m[row][2]
1012 r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z)
1013 r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
1014 m = _simd_load1_ps(pMatrix + 0*4 + 3); // m[row][3]
1015 r0 = _simd_add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * 1)
1016 result[0] = r0;
1017
1018 m = _simd_load1_ps(pMatrix + 1*4 + 0); // m[row][0]
1019 r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x)
1020 m = _simd_load1_ps(pMatrix + 1*4 + 1); // m[row][1]
1021 r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y)
1022 r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
1023 m = _simd_load1_ps(pMatrix + 1*4 + 2); // m[row][2]
1024 r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z)
1025 r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
1026 m = _simd_load1_ps(pMatrix + 1*4 + 3); // m[row][3]
1027 r0 = _simd_add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * 1)
1028 result[1] = r0;
1029
1030 m = _simd_load1_ps(pMatrix + 2*4 + 0); // m[row][0]
1031 r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x)
1032 m = _simd_load1_ps(pMatrix + 2*4 + 1); // m[row][1]
1033 r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y)
1034 r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
1035 m = _simd_load1_ps(pMatrix + 2*4 + 2); // m[row][2]
1036 r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z)
1037 r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
1038 m = _simd_load1_ps(pMatrix + 2*4 + 3); // m[row][3]
1039 r0 = _simd_add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * 1)
1040 result[2] = r0;
1041
1042 m = _simd_load1_ps(pMatrix + 3*4 + 0); // m[row][0]
1043 r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x)
1044 m = _simd_load1_ps(pMatrix + 3*4 + 1); // m[row][1]
1045 r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y)
1046 r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
1047 m = _simd_load1_ps(pMatrix + 3*4 + 2); // m[row][2]
1048 r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z)
1049 r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
1050 m = _simd_load1_ps(pMatrix + 3*4 + 3); // m[row][3]
1051 result[3] = _simd_add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * 1)
1052 }
1053
1054 INLINE
1055 void _simd_mat4x3_vec3_w1_multiply(
1056 simdvector& result,
1057 const float *pMatrix,
1058 const simdvector& v)
1059 {
1060 simdscalar m;
1061 simdscalar r0;
1062 simdscalar r1;
1063
1064 m = _simd_load1_ps(pMatrix + 0*4 + 0); // m[row][0]
1065 r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x)
1066 m = _simd_load1_ps(pMatrix + 0*4 + 1); // m[row][1]
1067 r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y)
1068 r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
1069 m = _simd_load1_ps(pMatrix + 0*4 + 2); // m[row][2]
1070 r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z)
1071 r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
1072 m = _simd_load1_ps(pMatrix + 0*4 + 3); // m[row][3]
1073 r0 = _simd_add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * 1)
1074 result[0] = r0;
1075
1076 m = _simd_load1_ps(pMatrix + 1*4 + 0); // m[row][0]
1077 r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x)
1078 m = _simd_load1_ps(pMatrix + 1*4 + 1); // m[row][1]
1079 r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y)
1080 r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
1081 m = _simd_load1_ps(pMatrix + 1*4 + 2); // m[row][2]
1082 r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z)
1083 r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
1084 m = _simd_load1_ps(pMatrix + 1*4 + 3); // m[row][3]
1085 r0 = _simd_add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * 1)
1086 result[1] = r0;
1087
1088 m = _simd_load1_ps(pMatrix + 2*4 + 0); // m[row][0]
1089 r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x)
1090 m = _simd_load1_ps(pMatrix + 2*4 + 1); // m[row][1]
1091 r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y)
1092 r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
1093 m = _simd_load1_ps(pMatrix + 2*4 + 2); // m[row][2]
1094 r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z)
1095 r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
1096 m = _simd_load1_ps(pMatrix + 2*4 + 3); // m[row][3]
1097 r0 = _simd_add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * 1)
1098 result[2] = r0;
1099 result[3] = _simd_set1_ps(1.0f);
1100 }
1101
1102 //////////////////////////////////////////////////////////////////////////
1103 /// @brief Compute plane equation vA * vX + vB * vY + vC
1104 INLINE simdscalar vplaneps(simdscalar vA, simdscalar vB, simdscalar vC, simdscalar &vX, simdscalar &vY)
1105 {
1106 simdscalar vOut = _simd_fmadd_ps(vA, vX, vC);
1107 vOut = _simd_fmadd_ps(vB, vY, vOut);
1108 return vOut;
1109 }
1110
1111 //////////////////////////////////////////////////////////////////////////
1112 /// @brief Compute plane equation vA * vX + vB * vY + vC
1113 INLINE __m128 vplaneps128(__m128 vA, __m128 vB, __m128 vC, __m128 &vX, __m128 &vY)
1114 {
1115 __m128 vOut = _simd128_fmadd_ps(vA, vX, vC);
1116 vOut = _simd128_fmadd_ps(vB, vY, vOut);
1117 return vOut;
1118 }
1119
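// The interpolators below evaluate a barycentric plane equation per attribute
// component: value(i, j) = A*i + B*j + C*(1 - i - j). The (1 - i - j) factor
// is folded into C before calling vplaneps.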
1120 //////////////////////////////////////////////////////////////////////////
1121 /// @brief Interpolates a single component.
1122 /// @param vI - barycentric I
1123 /// @param vJ - barycentric J
1124 /// @param pInterpBuffer - pointer to attribute barycentric coeffs
1125 template<UINT Attrib, UINT Comp, UINT numComponents = 4>
1126 static INLINE simdscalar InterpolateComponent(simdscalar vI, simdscalar vJ, const float *pInterpBuffer)
1127 {
1128 const float *pInterpA = &pInterpBuffer[Attrib * 3 * numComponents + 0 + Comp];
1129 const float *pInterpB = &pInterpBuffer[Attrib * 3 * numComponents + numComponents + Comp];
1130 const float *pInterpC = &pInterpBuffer[Attrib * 3 * numComponents + numComponents * 2 + Comp];
1131
1132 simdscalar vA = _simd_broadcast_ss(pInterpA);
1133 simdscalar vB = _simd_broadcast_ss(pInterpB);
1134 simdscalar vC = _simd_broadcast_ss(pInterpC);
1135
1136 simdscalar vk = _simd_sub_ps(_simd_sub_ps(_simd_set1_ps(1.0f), vI), vJ);
1137 vC = _simd_mul_ps(vk, vC);
1138
1139 return vplaneps(vA, vB, vC, vI, vJ);
1140 }
1141
1142 //////////////////////////////////////////////////////////////////////////
1143 /// @brief Interpolates a single component (flat shade).
1144 /// @param pInterpBuffer - pointer to attribute barycentric coeffs
1145 template<UINT Attrib, UINT Comp, UINT numComponents = 4>
1146 static INLINE simdscalar InterpolateComponentFlat(const float *pInterpBuffer)
1147 {
1148 const float *pInterpA = &pInterpBuffer[Attrib * 3 * numComponents + 0 + Comp];
1149
1150 simdscalar vA = _simd_broadcast_ss(pInterpA);
1151
1152 return vA;
1153 }
1154
1155 //////////////////////////////////////////////////////////////////////////
1156 /// @brief Interpolates a single component.
1157 /// @param vI - barycentric I
1158 /// @param vJ - barycentric J
1159 /// @param pInterpBuffer - pointer to attribute barycentric coeffs
1160 template<UINT Attrib, UINT Comp, UINT numComponents = 4>
1161 static INLINE __m128 InterpolateComponent(__m128 vI, __m128 vJ, const float *pInterpBuffer)
1162 {
1163 const float *pInterpA = &pInterpBuffer[Attrib * 3 * numComponents + 0 + Comp];
1164 const float *pInterpB = &pInterpBuffer[Attrib * 3 * numComponents + numComponents + Comp];
1165 const float *pInterpC = &pInterpBuffer[Attrib * 3 * numComponents + numComponents * 2 + Comp];
1166
1167 __m128 vA = _mm_broadcast_ss(pInterpA);
1168 __m128 vB = _mm_broadcast_ss(pInterpB);
1169 __m128 vC = _mm_broadcast_ss(pInterpC);
1170
1171 __m128 vk = _mm_sub_ps(_mm_sub_ps(_mm_set1_ps(1.0f), vI), vJ);
1172 vC = _mm_mul_ps(vk, vC);
1173
1174 return vplaneps128(vA, vB, vC, vI, vJ);
1175 }
1176
1177 static INLINE __m128 _simd128_abs_ps(__m128 a)
1178 {
1179 __m128i ai = _mm_castps_si128(a);
1180 return _mm_castsi128_ps(_mm_and_si128(ai, _mm_set1_epi32(0x7fffffff)));
1181 }
1182
1183 static INLINE simdscalar _simd_abs_ps(simdscalar a)
1184 {
1185 simdscalari ai = _simd_castps_si(a);
1186 return _simd_castsi_ps(_simd_and_si(ai, _simd_set1_epi32(0x7fffffff)));
1187 }
1188
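// pdep_u32 deposits the low bits of 'a' into the positions of the set bits of
// 'mask'. For example, pdep_u32(0b101, 0b110100) == 0b100100: bit 0 of 'a'
// lands in mask bit 2, bit 1 in mask bit 4, bit 2 in mask bit 5.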
1189 INLINE
1190 UINT pdep_u32(UINT a, UINT mask)
1191 {
1192 #if KNOB_ARCH >= KNOB_ARCH_AVX2
1193 return _pdep_u32(a, mask);
1194 #else
1195 UINT result = 0;
1196
1197 // copied from http://wm.ite.pl/articles/pdep-soft-emu.html
1198 // using bsf instead of funky loop
1199 DWORD maskIndex;
1200 while (_BitScanForward(&maskIndex, mask))
1201 {
1202 // 1. isolate lowest set bit of mask
1203 const UINT lowest = 1 << maskIndex;
1204
1205 // 2. populate LSB from src
1206 const UINT LSB = (UINT)((int)(a << 31) >> 31);
1207
1208 // 3. copy bit from mask
1209 result |= LSB & lowest;
1210
1211 // 4. clear lowest bit
1212 mask &= ~lowest;
1213
1214 // 5. prepare for next iteration
1215 a >>= 1;
1216 }
1217
1218 return result;
1219 #endif
1220 }
1221
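// pext_u32 gathers the bits of 'a' selected by 'mask' into the low bits of
// the result. For example, pext_u32(0b101100, 0b110100) == 0b101 (the bits of
// 'a' at mask positions 2, 4, 5 are 1, 0, 1).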
1222 INLINE
1223 UINT pext_u32(UINT a, UINT mask)
1224 {
1225 #if KNOB_ARCH >= KNOB_ARCH_AVX2
1226 return _pext_u32(a, mask);
1227 #else
1228 UINT result = 0;
1229 DWORD maskIndex;
1230 uint32_t currentBit = 0;
1231 while (_BitScanForward(&maskIndex, mask))
1232 {
1233 // 1. isolate lowest set bit of mask
1234 const UINT lowest = 1 << maskIndex;
1235
1236 // 2. copy bit from mask
1237 result |= ((a & lowest) > 0) << currentBit++;
1238
1239 // 3. clear lowest bit
1240 mask &= ~lowest;
1241 }
1242 return result;
1243 #endif
1244 }
1245
1246 #if ENABLE_AVX512_SIMD16
1247 #include "simd16intrin.h"
1248 #endif//ENABLE_AVX512_SIMD16
1249
1250 #endif//__SWR_SIMDINTRIN_H__