swr: [rasterizer core] per-primitive viewports/scissors
[mesa.git] / src / gallium / drivers / swr / rasterizer / core / utils.h
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file utils.h
24 *
25 * @brief Utilities used by SWR core.
26 *
27 ******************************************************************************/
28 #pragma once
29
30 #include <string.h>
31 #include <type_traits>
32 #include <algorithm>
33 #include "common/os.h"
34 #include "common/simdintrin.h"
35 #include "common/swr_assert.h"
36 #include "core/api.h"
37
38 #if defined(_WIN64) || defined(__x86_64__)
39 #define _MM_INSERT_EPI64 _mm_insert_epi64
40 #define _MM_EXTRACT_EPI64 _mm_extract_epi64
41 #else
//////////////////////////////////////////////////////////////////////////
/// @brief Emulation of _mm_extract_epi64 for targets without the native
///        intrinsic (32-bit builds).
/// @param a - source vector
/// @param ndx - 0 selects the low 64-bit lane; any other value the high lane
INLINE int64_t _MM_EXTRACT_EPI64(__m128i a, const int32_t ndx)
{
    // Spill the vector to aligned memory and reassemble the requested lane
    // from its two 32-bit halves.
    OSALIGNLINE(uint32_t) lanes[4];
    _mm_store_si128((__m128i*)lanes, a);
    const uint32_t base = (ndx == 0) ? 0 : 2;
    uint64_t result = lanes[base];
    result |= (uint64_t)lanes[base + 1] << 32;
    return result;
}
59
//////////////////////////////////////////////////////////////////////////
/// @brief Emulation of _mm_insert_epi64 for targets without the native
///        intrinsic (32-bit builds).
/// @param a - source vector
/// @param b - 64-bit value to insert
/// @param ndx - 0 replaces the low 64-bit lane; any other value the high lane
INLINE __m128i _MM_INSERT_EPI64(__m128i a, int64_t b, const int32_t ndx)
{
    // Spill to aligned memory, patch the selected lane, reload the vector.
    OSALIGNLINE(int64_t) lanes[2];
    _mm_store_si128((__m128i*)lanes, a);
    lanes[(ndx == 0) ? 0 : 1] = b;
    return _mm_load_si128((const __m128i*)lanes);
}
76 #endif
77
//////////////////////////////////////////////////////////////////////////
/// simdBBox - one integer bounding box per SIMD lane, stored SOA-style
/// (each member holds the same extent for every lane's box).
//////////////////////////////////////////////////////////////////////////
struct simdBBox
{
    simdscalari ymin;   // minimum y extent per lane
    simdscalari ymax;   // maximum y extent per lane
    simdscalari xmin;   // minimum x extent per lane
    simdscalari xmax;   // maximum x extent per lane
};
85
INLINE
void vTranspose(__m128 &row0, __m128 &row1, __m128 &row2, __m128 &row3)
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief In-place 4x4 transpose of 32-bit elements: after the call,
    ///        row N holds element N of each of the four input rows.
    /// Pure data movement, so results are bit-exact for any input patterns;
    /// the standard xmmintrin transpose macro does exactly this.
    _MM_TRANSPOSE4_PS(row0, row1, row2, row3);
}
115
INLINE
void vTranspose(__m128i &row0, __m128i &row1, __m128i &row2, __m128i &row3)
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief In-place 4x4 transpose of 32-bit integer elements.
    // Stage 1: interleave 32-bit elements of adjacent row pairs.
    __m128i t0 = _mm_unpacklo_epi32(row0, row1); // 00 10 01 11
    __m128i t1 = _mm_unpackhi_epi32(row0, row1); // 02 12 03 13
    __m128i t2 = _mm_unpacklo_epi32(row2, row3); // 20 30 21 31
    __m128i t3 = _mm_unpackhi_epi32(row2, row3); // 22 32 23 33

    // Stage 2: interleave 64-bit halves to form the transposed rows.
    row0 = _mm_unpacklo_epi64(t0, t2); // 00 10 20 30
    row1 = _mm_unpackhi_epi64(t0, t2); // 01 11 21 31
    row2 = _mm_unpacklo_epi64(t1, t3); // 02 12 22 32
    row3 = _mm_unpackhi_epi64(t1, t3); // 03 13 23 33
}
135
// Compose GCC's version macros into one comparable integer (4.9.0 -> 40900).
// Safe to reference in #if even when __GNUC__ is undefined: the preprocessor
// evaluates unknown identifiers as 0 in conditional expressions.
#define GCC_VERSION (__GNUC__ * 10000 \
                     + __GNUC_MINOR__ * 100 \
                     + __GNUC_PATCHLEVEL__)

// clang and GCC older than 4.9 lack the _mm*_undefined_* intrinsics; fall
// back to zero-initialization. Slightly more work, but callers treat the
// contents as don't-care so correctness is unaffected.
#if defined(__clang__) || (defined(__GNUC__) && (GCC_VERSION < 40900))
#define _mm_undefined_ps _mm_setzero_ps
#define _mm_undefined_si128 _mm_setzero_si128
#if KNOB_SIMD_WIDTH == 8
#define _mm256_undefined_ps _mm256_setzero_ps
#endif
#endif
147
148 #if KNOB_SIMD_WIDTH == 8 || KNOB_SIMD_WIDTH == 16
INLINE
void vTranspose3x8(__m128 (&vDst)[8], __m256 &vSrc0, __m256 &vSrc1, __m256 &vSrc2)
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Transposes 3 SOA component registers (x, y, z) into 8 AOS
    ///        4-float vertices. The missing 4th component is taken from an
    ///        undefined register, so the w lane of each output is garbage.
    __m256 r0r2 = _mm256_unpacklo_ps(vSrc0, vSrc2);                 //x0z0x1z1 x4z4x5z5
    __m256 r1rx = _mm256_unpacklo_ps(vSrc1, _mm256_undefined_ps()); //y0w0y1w1 y4w4y5w5 (w = undefined)
    __m256 r02r1xlolo = _mm256_unpacklo_ps(r0r2, r1rx);             //x0y0z0w0 x4y4z4w4
    __m256 r02r1xlohi = _mm256_unpackhi_ps(r0r2, r1rx);             //x1y1z1w1 x5y5z5w5

    r0r2 = _mm256_unpackhi_ps(vSrc0, vSrc2);                        //x2z2x3z3 x6z6x7z7
    r1rx = _mm256_unpackhi_ps(vSrc1, _mm256_undefined_ps());        //y2w2y3w3 y6w6y7w7
    __m256 r02r1xhilo = _mm256_unpacklo_ps(r0r2, r1rx);             //x2y2z2w2 x6y6z6w6
    __m256 r02r1xhihi = _mm256_unpackhi_ps(r0r2, r1rx);             //x3y3z3w3 x7y7z7w7

    // Vertices 0-3 come from the low 128-bit lanes...
    vDst[0] = _mm256_castps256_ps128(r02r1xlolo);
    vDst[1] = _mm256_castps256_ps128(r02r1xlohi);
    vDst[2] = _mm256_castps256_ps128(r02r1xhilo);
    vDst[3] = _mm256_castps256_ps128(r02r1xhihi);

    // ...and vertices 4-7 from the high lanes.
    vDst[4] = _mm256_extractf128_ps(r02r1xlolo, 1);
    vDst[5] = _mm256_extractf128_ps(r02r1xlohi, 1);
    vDst[6] = _mm256_extractf128_ps(r02r1xhilo, 1);
    vDst[7] = _mm256_extractf128_ps(r02r1xhihi, 1);
}
172
INLINE
void vTranspose4x8(__m128 (&vDst)[8], __m256 &vSrc0, __m256 &vSrc1, __m256 &vSrc2, __m256 &vSrc3)
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Transposes 4 SOA component registers (x, y, z, w) into 8 AOS
    ///        4-float vertices.
    // Interleave x/z then y/w pairs (low elements of each 128-bit lane),
    // then merge to full xyzw vertices.
    __m256 xz_lo = _mm256_unpacklo_ps(vSrc0, vSrc2);  //x0z0x1z1 x4z4x5z5
    __m256 yw_lo = _mm256_unpacklo_ps(vSrc1, vSrc3);  //y0w0y1w1 y4w4y5w5
    __m256 vert0 = _mm256_unpacklo_ps(xz_lo, yw_lo);  //x0y0z0w0 x4y4z4w4
    __m256 vert1 = _mm256_unpackhi_ps(xz_lo, yw_lo);  //x1y1z1w1 x5y5z5w5

    // Same again for the high elements of each lane.
    __m256 xz_hi = _mm256_unpackhi_ps(vSrc0, vSrc2);  //x2z2x3z3 x6z6x7z7
    __m256 yw_hi = _mm256_unpackhi_ps(vSrc1, vSrc3);  //y2w2y3w3 y6w6y7w7
    __m256 vert2 = _mm256_unpacklo_ps(xz_hi, yw_hi);  //x2y2z2w2 x6y6z6w6
    __m256 vert3 = _mm256_unpackhi_ps(xz_hi, yw_hi);  //x3y3z3w3 x7y7z7w7

    // Vertices 0-3 come from the low 128-bit lanes...
    vDst[0] = _mm256_castps256_ps128(vert0);
    vDst[1] = _mm256_castps256_ps128(vert1);
    vDst[2] = _mm256_castps256_ps128(vert2);
    vDst[3] = _mm256_castps256_ps128(vert3);

    // ...and vertices 4-7 from the high lanes.
    vDst[4] = _mm256_extractf128_ps(vert0, 1);
    vDst[5] = _mm256_extractf128_ps(vert1, 1);
    vDst[6] = _mm256_extractf128_ps(vert2, 1);
    vDst[7] = _mm256_extractf128_ps(vert3, 1);
}
196
INLINE
void vTranspose8x8(__m256 (&vDst)[8], const __m256 &vMask0, const __m256 &vMask1, const __m256 &vMask2, const __m256 &vMask3, const __m256 &vMask4, const __m256 &vMask5, const __m256 &vMask6, const __m256 &vMask7)
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Full 8x8 transpose of 32-bit elements across eight registers.
    /// (Temporaries renamed from the original __t*/__tt* forms: identifiers
    /// with leading double underscores are reserved to the implementation.)
    // Stage 1: interleave 32-bit elements of adjacent row pairs, per lane.
    __m256 t0 = _mm256_unpacklo_ps(vMask0, vMask1);
    __m256 t1 = _mm256_unpackhi_ps(vMask0, vMask1);
    __m256 t2 = _mm256_unpacklo_ps(vMask2, vMask3);
    __m256 t3 = _mm256_unpackhi_ps(vMask2, vMask3);
    __m256 t4 = _mm256_unpacklo_ps(vMask4, vMask5);
    __m256 t5 = _mm256_unpackhi_ps(vMask4, vMask5);
    __m256 t6 = _mm256_unpacklo_ps(vMask6, vMask7);
    __m256 t7 = _mm256_unpackhi_ps(vMask6, vMask7);

    // Stage 2: gather 64-bit pairs into 4-wide column groups, still per lane.
    __m256 s0 = _mm256_shuffle_ps(t0, t2, _MM_SHUFFLE(1, 0, 1, 0));
    __m256 s1 = _mm256_shuffle_ps(t0, t2, _MM_SHUFFLE(3, 2, 3, 2));
    __m256 s2 = _mm256_shuffle_ps(t1, t3, _MM_SHUFFLE(1, 0, 1, 0));
    __m256 s3 = _mm256_shuffle_ps(t1, t3, _MM_SHUFFLE(3, 2, 3, 2));
    __m256 s4 = _mm256_shuffle_ps(t4, t6, _MM_SHUFFLE(1, 0, 1, 0));
    __m256 s5 = _mm256_shuffle_ps(t4, t6, _MM_SHUFFLE(3, 2, 3, 2));
    __m256 s6 = _mm256_shuffle_ps(t5, t7, _MM_SHUFFLE(1, 0, 1, 0));
    __m256 s7 = _mm256_shuffle_ps(t5, t7, _MM_SHUFFLE(3, 2, 3, 2));

    // Stage 3: combine matching low/high 128-bit lanes across the groups.
    vDst[0] = _mm256_permute2f128_ps(s0, s4, 0x20);
    vDst[1] = _mm256_permute2f128_ps(s1, s5, 0x20);
    vDst[2] = _mm256_permute2f128_ps(s2, s6, 0x20);
    vDst[3] = _mm256_permute2f128_ps(s3, s7, 0x20);
    vDst[4] = _mm256_permute2f128_ps(s0, s4, 0x31);
    vDst[5] = _mm256_permute2f128_ps(s1, s5, 0x31);
    vDst[6] = _mm256_permute2f128_ps(s2, s6, 0x31);
    vDst[7] = _mm256_permute2f128_ps(s3, s7, 0x31);
}
225
INLINE
void vTranspose8x8(__m256 (&vDst)[8], const __m256i &vMask0, const __m256i &vMask1, const __m256i &vMask2, const __m256i &vMask3, const __m256i &vMask4, const __m256i &vMask5, const __m256i &vMask6, const __m256i &vMask7)
{
    // Integer-typed convenience overload: the casts are bit-pattern
    // preserving reinterprets, so this just reuses the float 8x8 transpose.
    vTranspose8x8(vDst, _mm256_castsi256_ps(vMask0), _mm256_castsi256_ps(vMask1), _mm256_castsi256_ps(vMask2), _mm256_castsi256_ps(vMask3),
        _mm256_castsi256_ps(vMask4), _mm256_castsi256_ps(vMask5), _mm256_castsi256_ps(vMask6), _mm256_castsi256_ps(vMask7));
}
232 #endif
233
//////////////////////////////////////////////////////////////////////////
/// TranposeSingleComponent
//////////////////////////////////////////////////////////////////////////
template<uint32_t bpp>
struct TransposeSingleComponent
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Pass-thru for single component. With only one component the
    ///        SOA and AOS layouts coincide, so conversion is a straight copy
    ///        of one SIMD width of elements.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
    {
        const size_t copyBytes = (bpp * KNOB_SIMD_WIDTH) / 8;
        memcpy(pDst, pSrc, copyBytes);
    }
};
249
//////////////////////////////////////////////////////////////////////////
/// Transpose8_8_8_8
//////////////////////////////////////////////////////////////////////////
struct Transpose8_8_8_8
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 8_8_8_8 data.
    ///        Input is KNOB_SIMD_WIDTH bytes each of r, g, b, a back to back;
    ///        output is KNOB_SIMD_WIDTH interleaved rgba byte quads.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
    {
        simdscalari src = _simd_load_si((const simdscalari*)pSrc);

#if KNOB_SIMD_WIDTH == 8
#if KNOB_ARCH == KNOB_ARCH_AVX
        // AVX1 lacks a 256-bit byte shuffle, so split into 128-bit halves
        // and interleave the four channels with SSE unpacks.
        __m128i c0c1 = _mm256_castsi256_si128(src);                                          // rrrrrrrrgggggggg
        __m128i c2c3 = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(src), 1)); // bbbbbbbbaaaaaaaa
        __m128i c0c2 = _mm_unpacklo_epi64(c0c1, c2c3);                                       // rrrrrrrrbbbbbbbb
        __m128i c1c3 = _mm_unpackhi_epi64(c0c1, c2c3);                                       // ggggggggaaaaaaaa
        __m128i c01 = _mm_unpacklo_epi8(c0c2, c1c3);                                         // rgrgrgrgrgrgrgrg
        __m128i c23 = _mm_unpackhi_epi8(c0c2, c1c3);                                         // babababababababa
        __m128i c0123lo = _mm_unpacklo_epi16(c01, c23);                                      // rgbargbargbargba
        __m128i c0123hi = _mm_unpackhi_epi16(c01, c23);                                      // rgbargbargbargba
        _mm_store_si128((__m128i*)pDst, c0123lo);
        _mm_store_si128((__m128i*)(pDst + 16), c0123hi);
#elif KNOB_ARCH == KNOB_ARCH_AVX2
        // AVX2: pshufb each half into its destination bytes (0x80 selectors
        // produce zero), then OR the two lane-swapped shuffles together.
        simdscalari dst01 = _mm256_shuffle_epi8(src,
            _mm256_set_epi32(0x0f078080, 0x0e068080, 0x0d058080, 0x0c048080, 0x80800b03, 0x80800a02, 0x80800901, 0x80800800));
        simdscalari dst23 = _mm256_permute2x128_si256(src, src, 0x01);
        dst23 = _mm256_shuffle_epi8(dst23,
            _mm256_set_epi32(0x80800f07, 0x80800e06, 0x80800d05, 0x80800c04, 0x0b038080, 0x0a028080, 0x09018080, 0x08008080));
        simdscalari dst = _mm256_or_si256(dst01, dst23);
        _simd_store_si((simdscalari*)pDst, dst);
#endif
#elif KNOB_SIMD_WIDTH == 16
        // SIMD16: same shuffle / lane-permute / OR scheme, expressed through
        // the _simd_* wrappers.
        simdscalari mask0 = _simd_set_epi32(0x0f078080, 0x0e068080, 0x0d058080, 0x0c048080, 0x80800b03, 0x80800a02, 0x80800901, 0x80800800);

        simdscalari dst01 = _simd_shuffle_epi8(src, mask0);

        simdscalari perm1 = _simd_permute_128(src, src, 1);

        simdscalari mask1 = _simd_set_epi32(0x80800f07, 0x80800e06, 0x80800d05, 0x80800c04, 0x0b038080, 0x0a028080, 0x09018080, 0x08008080);

        simdscalari dst23 = _simd_shuffle_epi8(perm1, mask1);

        simdscalari dst = _simd_or_si(dst01, dst23);

        _simd_store_si(reinterpret_cast<simdscalari *>(pDst), dst);
#else
#error Unsupported vector width
#endif
    }
};
303
//////////////////////////////////////////////////////////////////////////
/// Transpose8_8_8
//////////////////////////////////////////////////////////////////////////
struct Transpose8_8_8
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 8_8_8 data.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    /// @note Deleted: no conversion is implemented for this format, so any
    ///       attempted call fails at compile time.
    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
315
//////////////////////////////////////////////////////////////////////////
/// Transpose8_8
//////////////////////////////////////////////////////////////////////////
struct Transpose8_8
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 8_8 data.
    ///        Input is KNOB_SIMD_WIDTH bytes of r followed by g; output is
    ///        interleaved rg byte pairs.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
    {
#if KNOB_SIMD_WIDTH == 8
        simdscalari src = _simd_load_si((const simdscalari*)pSrc);

        // Both components fit in the low 128 bits: pull g into the low
        // 8 bytes and interleave with r.
        __m128i rg = _mm256_castsi256_si128(src); // rrrrrrrr gggggggg
        __m128i g = _mm_unpackhi_epi64(rg, rg);   // gggggggg gggggggg
        rg = _mm_unpacklo_epi8(rg, g);
        _mm_store_si128((__m128i*)pDst, rg);
#elif KNOB_SIMD_WIDTH == 16
        __m256i src = _mm256_load_si256(reinterpret_cast<const __m256i *>(pSrc)); // rrrrrrrrrrrrrrrrgggggggggggggggg

        // Spread the two 8-byte halves of r (and of g) across the two
        // 128-bit lanes so the in-lane byte unpack yields a linear sequence.
        __m256i r = _mm256_permute4x64_epi64(src, 0x50); // 0x50 = 01010000b // rrrrrrrrxxxxxxxxrrrrrrrrxxxxxxxx

        __m256i g = _mm256_permute4x64_epi64(src, 0xFA); // 0xFA = 11111010b // ggggggggxxxxxxxxggggggggxxxxxxxx

        __m256i dst = _mm256_unpacklo_epi8(r, g); // rgrgrgrgrgrgrgrgrgrgrgrgrgrgrgrg

        _mm256_store_si256(reinterpret_cast<__m256i *>(pDst), dst);
#else
#error Unsupported vector width
#endif
    }
};
349
//////////////////////////////////////////////////////////////////////////
/// Transpose32_32_32_32
//////////////////////////////////////////////////////////////////////////
struct Transpose32_32_32_32
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 32_32_32_32 data.
    ///        Reads one SIMD register per component (x, y, z, w) and writes
    ///        KNOB_SIMD_WIDTH 4-float AOS vertices.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
    {
#if KNOB_SIMD_WIDTH == 8
        simdscalar src0 = _simd_load_ps((const float*)pSrc);
        simdscalar src1 = _simd_load_ps((const float*)pSrc + 8);
        simdscalar src2 = _simd_load_ps((const float*)pSrc + 16);
        simdscalar src3 = _simd_load_ps((const float*)pSrc + 24);

        // Each resulting __m128 is one complete xyzw vertex.
        __m128 vDst[8];
        vTranspose4x8(vDst, src0, src1, src2, src3);
        _mm_store_ps((float*)pDst, vDst[0]);
        _mm_store_ps((float*)pDst+4, vDst[1]);
        _mm_store_ps((float*)pDst+8, vDst[2]);
        _mm_store_ps((float*)pDst+12, vDst[3]);
        _mm_store_ps((float*)pDst+16, vDst[4]);
        _mm_store_ps((float*)pDst+20, vDst[5]);
        _mm_store_ps((float*)pDst+24, vDst[6]);
        _mm_store_ps((float*)pDst+28, vDst[7]);
#elif KNOB_SIMD_WIDTH == 16
#if ENABLE_AVX512_EMULATION
        // Emulated AVX-512: each simdscalar is a {lo, hi} pair of __m256, so
        // transpose each half with the 8-wide helper.
        simdscalar src0 = _simd_load_ps(reinterpret_cast<const float*>(pSrc));
        simdscalar src1 = _simd_load_ps(reinterpret_cast<const float*>(pSrc) + 16);
        simdscalar src2 = _simd_load_ps(reinterpret_cast<const float*>(pSrc) + 32);
        simdscalar src3 = _simd_load_ps(reinterpret_cast<const float*>(pSrc) + 48);

        __m128 vDst[8];

        vTranspose4x8(vDst, src0.lo, src1.lo, src2.lo, src3.lo);

        _mm_store_ps(reinterpret_cast<float*>(pDst), vDst[0]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 4, vDst[1]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 8, vDst[2]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 12, vDst[3]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 16, vDst[4]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 20, vDst[5]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 24, vDst[6]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 28, vDst[7]);

        vTranspose4x8(vDst, src0.hi, src1.hi, src2.hi, src3.hi);

        _mm_store_ps(reinterpret_cast<float*>(pDst) + 32, vDst[0]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 36, vDst[1]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 40, vDst[2]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 44, vDst[3]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 48, vDst[4]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 52, vDst[5]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 56, vDst[6]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 60, vDst[7]);
#endif
        // NOTE(review): with KNOB_SIMD_WIDTH == 16 and emulation disabled
        // this function compiles to a no-op — presumably a native AVX-512
        // path exists elsewhere; confirm before relying on this header alone.
#else
#error Unsupported vector width
#endif
    }
};
413
//////////////////////////////////////////////////////////////////////////
/// Transpose32_32_32
//////////////////////////////////////////////////////////////////////////
struct Transpose32_32_32
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 32_32_32 data.
    ///        Three component registers in; the 4th (w) slot of each output
    ///        vertex is undefined (see vTranspose3x8).
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
    {
#if KNOB_SIMD_WIDTH == 8
        simdscalar src0 = _simd_load_ps((const float*)pSrc);
        simdscalar src1 = _simd_load_ps((const float*)pSrc + 8);
        simdscalar src2 = _simd_load_ps((const float*)pSrc + 16);

        // Each resulting __m128 is one xyz? vertex (w undefined).
        __m128 vDst[8];
        vTranspose3x8(vDst, src0, src1, src2);
        _mm_store_ps((float*)pDst, vDst[0]);
        _mm_store_ps((float*)pDst + 4, vDst[1]);
        _mm_store_ps((float*)pDst + 8, vDst[2]);
        _mm_store_ps((float*)pDst + 12, vDst[3]);
        _mm_store_ps((float*)pDst + 16, vDst[4]);
        _mm_store_ps((float*)pDst + 20, vDst[5]);
        _mm_store_ps((float*)pDst + 24, vDst[6]);
        _mm_store_ps((float*)pDst + 28, vDst[7]);
#elif KNOB_SIMD_WIDTH == 16
#if ENABLE_AVX512_EMULATION
        // Emulated AVX-512: transpose the {lo, hi} __m256 halves separately.
        simdscalar src0 = _simd_load_ps(reinterpret_cast<const float*>(pSrc));
        simdscalar src1 = _simd_load_ps(reinterpret_cast<const float*>(pSrc) + 16);
        simdscalar src2 = _simd_load_ps(reinterpret_cast<const float*>(pSrc) + 32);

        __m128 vDst[8];

        vTranspose3x8(vDst, src0.lo, src1.lo, src2.lo);

        _mm_store_ps(reinterpret_cast<float*>(pDst), vDst[0]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 4, vDst[1]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 8, vDst[2]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 12, vDst[3]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 16, vDst[4]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 20, vDst[5]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 24, vDst[6]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 28, vDst[7]);

        vTranspose3x8(vDst, src0.hi, src1.hi, src2.hi);

        _mm_store_ps(reinterpret_cast<float*>(pDst) + 32, vDst[0]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 36, vDst[1]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 40, vDst[2]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 44, vDst[3]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 48, vDst[4]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 52, vDst[5]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 56, vDst[6]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 60, vDst[7]);
#endif
        // NOTE(review): no-op when emulation is disabled — see
        // Transpose32_32_32_32 for the same caveat.
#else
#error Unsupported vector width
#endif
    }
};
475
//////////////////////////////////////////////////////////////////////////
/// Transpose32_32
//////////////////////////////////////////////////////////////////////////
struct Transpose32_32
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 32_32 data.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
    {
#if KNOB_SIMD_WIDTH == 8
        // 128-bit unpacks produce a fully linear r0g0r1g1... output.
        const float* pfSrc = (const float*)pSrc;
        __m128 src_r0 = _mm_load_ps(pfSrc + 0);
        __m128 src_r1 = _mm_load_ps(pfSrc + 4);
        __m128 src_g0 = _mm_load_ps(pfSrc + 8);
        __m128 src_g1 = _mm_load_ps(pfSrc + 12);

        __m128 dst0 = _mm_unpacklo_ps(src_r0, src_g0);
        __m128 dst1 = _mm_unpackhi_ps(src_r0, src_g0);
        __m128 dst2 = _mm_unpacklo_ps(src_r1, src_g1);
        __m128 dst3 = _mm_unpackhi_ps(src_r1, src_g1);

        float* pfDst = (float*)pDst;
        _mm_store_ps(pfDst + 0, dst0);
        _mm_store_ps(pfDst + 4, dst1);
        _mm_store_ps(pfDst + 8, dst2);
        _mm_store_ps(pfDst + 12, dst3);
#elif KNOB_SIMD_WIDTH == 16
        // NOTE(review): 256-bit unpacks interleave within each 128-bit lane,
        // so each stored register holds r0g0r1g1|r4g4r5g5-style ordering
        // rather than a fully linear rg sequence — presumably the SIMD16
        // consumer expects this lane-swizzled layout; confirm against the
        // tile addressing code before reuse.
        const float* pfSrc = (const float*)pSrc;
        __m256 src_r0 = _mm256_load_ps(pfSrc + 0);
        __m256 src_r1 = _mm256_load_ps(pfSrc + 8);
        __m256 src_g0 = _mm256_load_ps(pfSrc + 16);
        __m256 src_g1 = _mm256_load_ps(pfSrc + 24);

        __m256 dst0 = _mm256_unpacklo_ps(src_r0, src_g0);
        __m256 dst1 = _mm256_unpackhi_ps(src_r0, src_g0);
        __m256 dst2 = _mm256_unpacklo_ps(src_r1, src_g1);
        __m256 dst3 = _mm256_unpackhi_ps(src_r1, src_g1);

        float* pfDst = (float*)pDst;
        _mm256_store_ps(pfDst + 0, dst0);
        _mm256_store_ps(pfDst + 8, dst1);
        _mm256_store_ps(pfDst + 16, dst2);
        _mm256_store_ps(pfDst + 24, dst3);
#else
#error Unsupported vector width
#endif
    }
};
526
//////////////////////////////////////////////////////////////////////////
/// Transpose16_16_16_16
//////////////////////////////////////////////////////////////////////////
struct Transpose16_16_16_16
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 16_16_16_16 data.
    ///        The first SIMD register holds r then g, the second b then a;
    ///        output is interleaved rgba 16-bit quads.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
    {
#if KNOB_SIMD_WIDTH == 8
        simdscalari src_rg = _simd_load_si((const simdscalari*)pSrc);
        simdscalari src_ba = _simd_load_si((const simdscalari*)(pSrc + sizeof(simdscalari)));

        // Split each register into its two 8x16-bit component halves.
        __m128i src_r = _mm256_extractf128_si256(src_rg, 0);
        __m128i src_g = _mm256_extractf128_si256(src_rg, 1);
        __m128i src_b = _mm256_extractf128_si256(src_ba, 0);
        __m128i src_a = _mm256_extractf128_si256(src_ba, 1);

        // Interleave 16-bit rg and ba pairs, then 32-bit rg/ba into rgba.
        __m128i rg0 = _mm_unpacklo_epi16(src_r, src_g);
        __m128i rg1 = _mm_unpackhi_epi16(src_r, src_g);
        __m128i ba0 = _mm_unpacklo_epi16(src_b, src_a);
        __m128i ba1 = _mm_unpackhi_epi16(src_b, src_a);

        __m128i dst0 = _mm_unpacklo_epi32(rg0, ba0);
        __m128i dst1 = _mm_unpackhi_epi32(rg0, ba0);
        __m128i dst2 = _mm_unpacklo_epi32(rg1, ba1);
        __m128i dst3 = _mm_unpackhi_epi32(rg1, ba1);

        _mm_store_si128(((__m128i*)pDst) + 0, dst0);
        _mm_store_si128(((__m128i*)pDst) + 1, dst1);
        _mm_store_si128(((__m128i*)pDst) + 2, dst2);
        _mm_store_si128(((__m128i*)pDst) + 3, dst3);
#elif KNOB_SIMD_WIDTH == 16
#if ENABLE_AVX512_EMULATION
        // Emulated AVX-512: the {lo, hi} halves of each emulated register
        // already separate the components; same interleave scheme at 256-bit.
        simdscalari src_rg = _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc));
        simdscalari src_ba = _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc + sizeof(simdscalari)));

        __m256i src_r = src_rg.lo;
        __m256i src_g = src_rg.hi;
        __m256i src_b = src_ba.lo;
        __m256i src_a = src_ba.hi;

        __m256i rg0 = _mm256_unpacklo_epi16(src_r, src_g);
        __m256i rg1 = _mm256_unpackhi_epi16(src_r, src_g);
        __m256i ba0 = _mm256_unpacklo_epi16(src_b, src_a);
        __m256i ba1 = _mm256_unpackhi_epi16(src_b, src_a);

        __m256i dst0 = _mm256_unpacklo_epi32(rg0, ba0);
        __m256i dst1 = _mm256_unpackhi_epi32(rg0, ba0);
        __m256i dst2 = _mm256_unpacklo_epi32(rg1, ba1);
        __m256i dst3 = _mm256_unpackhi_epi32(rg1, ba1);

        _mm256_store_si256(reinterpret_cast<__m256i*>(pDst) + 0, dst0);
        _mm256_store_si256(reinterpret_cast<__m256i*>(pDst) + 1, dst1);
        _mm256_store_si256(reinterpret_cast<__m256i*>(pDst) + 2, dst2);
        _mm256_store_si256(reinterpret_cast<__m256i*>(pDst) + 3, dst3);
#endif
#else
#error Unsupported vector width
#endif
    }
};
591
//////////////////////////////////////////////////////////////////////////
/// Transpose16_16_16
//////////////////////////////////////////////////////////////////////////
struct Transpose16_16_16
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 16_16_16 data.
    ///        The alpha component comes from an undefined register, so the
    ///        4th 16-bit slot of each output element is garbage.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
    {
#if KNOB_SIMD_WIDTH == 8
        simdscalari src_rg = _simd_load_si((const simdscalari*)pSrc);

        __m128i src_r = _mm256_extractf128_si256(src_rg, 0);
        __m128i src_g = _mm256_extractf128_si256(src_rg, 1);
        __m128i src_b = _mm_load_si128((const __m128i*)(pSrc + sizeof(simdscalari)));
        __m128i src_a = _mm_undefined_si128();   // alpha lanes are don't-care

        // Interleave 16-bit rg and b? pairs, then 32-bit into rgb? quads.
        __m128i rg0 = _mm_unpacklo_epi16(src_r, src_g);
        __m128i rg1 = _mm_unpackhi_epi16(src_r, src_g);
        __m128i ba0 = _mm_unpacklo_epi16(src_b, src_a);
        __m128i ba1 = _mm_unpackhi_epi16(src_b, src_a);

        __m128i dst0 = _mm_unpacklo_epi32(rg0, ba0);
        __m128i dst1 = _mm_unpackhi_epi32(rg0, ba0);
        __m128i dst2 = _mm_unpacklo_epi32(rg1, ba1);
        __m128i dst3 = _mm_unpackhi_epi32(rg1, ba1);

        _mm_store_si128(((__m128i*)pDst) + 0, dst0);
        _mm_store_si128(((__m128i*)pDst) + 1, dst1);
        _mm_store_si128(((__m128i*)pDst) + 2, dst2);
        _mm_store_si128(((__m128i*)pDst) + 3, dst3);
#elif KNOB_SIMD_WIDTH == 16
#if ENABLE_AVX512_EMULATION
        // Emulated AVX-512 variant of the same scheme at 256-bit width.
        simdscalari src_rg = _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc));

        __m256i src_r = src_rg.lo;
        __m256i src_g = src_rg.hi;
        __m256i src_b = _mm256_load_si256(reinterpret_cast<const __m256i*>(pSrc + sizeof(simdscalari)));
        __m256i src_a = _mm256_undefined_si256();   // alpha lanes are don't-care

        __m256i rg0 = _mm256_unpacklo_epi16(src_r, src_g);
        __m256i rg1 = _mm256_unpackhi_epi16(src_r, src_g);
        __m256i ba0 = _mm256_unpacklo_epi16(src_b, src_a);
        __m256i ba1 = _mm256_unpackhi_epi16(src_b, src_a);

        __m256i dst0 = _mm256_unpacklo_epi32(rg0, ba0);
        __m256i dst1 = _mm256_unpackhi_epi32(rg0, ba0);
        __m256i dst2 = _mm256_unpacklo_epi32(rg1, ba1);
        __m256i dst3 = _mm256_unpackhi_epi32(rg1, ba1);

        _mm256_store_si256(reinterpret_cast<__m256i*>(pDst) + 0, dst0);
        _mm256_store_si256(reinterpret_cast<__m256i*>(pDst) + 1, dst1);
        _mm256_store_si256(reinterpret_cast<__m256i*>(pDst) + 2, dst2);
        _mm256_store_si256(reinterpret_cast<__m256i*>(pDst) + 3, dst3);
#endif
#else
#error Unsupported vector width
#endif
    }
};
654
//////////////////////////////////////////////////////////////////////////
/// Transpose16_16
//////////////////////////////////////////////////////////////////////////
struct Transpose16_16
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 16_16 data.
    ///        One SIMD register holds r then g; output is interleaved rg
    ///        16-bit pairs.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
    {
#if KNOB_SIMD_WIDTH == 8
        simdscalar src = _simd_load_ps((const float*)pSrc);

        // Low 128 bits hold r, high 128 hold g; interleave at 16-bit width.
        __m128 comp0 = _mm256_castps256_ps128(src);
        __m128 comp1 = _mm256_extractf128_ps(src, 1);

        __m128i comp0i = _mm_castps_si128(comp0);
        __m128i comp1i = _mm_castps_si128(comp1);

        __m128i resLo = _mm_unpacklo_epi16(comp0i, comp1i);
        __m128i resHi = _mm_unpackhi_epi16(comp0i, comp1i);

        _mm_store_si128((__m128i*)pDst, resLo);
        _mm_store_si128((__m128i*)pDst + 1, resHi);
#elif KNOB_SIMD_WIDTH == 16
#if ENABLE_AVX512_EMULATION
        // Emulated AVX-512: lo half holds r, hi half holds g.
        simdscalari src = _simd_castps_si(_simd_load_ps(reinterpret_cast<const float*>(pSrc)));

        simdscalari result;

        result.lo = _mm256_unpacklo_epi16(src.lo, src.hi);
        result.hi = _mm256_unpackhi_epi16(src.lo, src.hi);

        _simd_store_si(reinterpret_cast<simdscalari *>(pDst), result);
#endif
#else
#error Unsupported vector width
#endif
    }
};
696
//////////////////////////////////////////////////////////////////////////
/// Transpose24_8
//////////////////////////////////////////////////////////////////////////
struct Transpose24_8
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 24_8 data.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    /// @note Deleted: no conversion is implemented for this format.
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
708
//////////////////////////////////////////////////////////////////////////
/// Transpose32_8_24
//////////////////////////////////////////////////////////////////////////
struct Transpose32_8_24
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 32_8_24 data.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    /// @note Deleted: no conversion is implemented for this format.
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
720
721
722
//////////////////////////////////////////////////////////////////////////
/// Transpose4_4_4_4
//////////////////////////////////////////////////////////////////////////
struct Transpose4_4_4_4
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 4_4_4_4 data.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    /// @note Deleted: no conversion is implemented for this format.
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
734
//////////////////////////////////////////////////////////////////////////
/// Transpose5_6_5
//////////////////////////////////////////////////////////////////////////
struct Transpose5_6_5
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 5_6_5 data.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    /// @note Deleted: no conversion is implemented for this format.
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
746
//////////////////////////////////////////////////////////////////////////
/// Transpose9_9_9_5
//////////////////////////////////////////////////////////////////////////
struct Transpose9_9_9_5
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 9_9_9_5 data.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    /// @note Deleted: no conversion is implemented for this format.
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
758
//////////////////////////////////////////////////////////////////////////
/// Transpose5_5_5_1
//////////////////////////////////////////////////////////////////////////
struct Transpose5_5_5_1
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 5_5_5_1 data.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    /// @note Deleted: no conversion is implemented for this format.
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
770
//////////////////////////////////////////////////////////////////////////
/// Transpose10_10_10_2
//////////////////////////////////////////////////////////////////////////
struct Transpose10_10_10_2
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 10_10_10_2 data.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    /// @note Deleted: no conversion is implemented for this format.
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
782
//////////////////////////////////////////////////////////////////////////
/// Transpose11_11_10
//////////////////////////////////////////////////////////////////////////
struct Transpose11_11_10
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 11_11_10 data.
    /// @note Deleted: no transpose implementation exists for this packed
    ///       format, so any attempt to instantiate it fails at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
794
// helper function to unroll loops
// UnrollerL<Begin, End, Step>::step(f) expands to f(Begin), f(Begin + Step),
// ... for every index in [Begin, End); the unrolling happens at compile time.
template<int Begin, int End, int Step = 1>
struct UnrollerL {
    template<typename Lambda>
    INLINE static void step(Lambda& body) {
        body(Begin);
        UnrollerL<Begin + Step, End, Step>::step(body);
    }
};

// Terminating partial specialization: recursion stops once the counter
// reaches End; no call is emitted for the End index itself.
template<int End, int Step>
struct UnrollerL<End, End, Step> {
    template<typename Lambda>
    static void step(Lambda&) {
    }
};
811
// helper function to unroll loops, with mask to skip specific iterations
// UnrollerLMask<Begin, End, Step, Mask>::step(f) calls f(i) for each
// i in [Begin, End) stepping by Step, but only when bit i of Mask is set.
template<int Begin, int End, int Step = 1, int Mask = 0x7f>
struct UnrollerLMask {
    template<typename Lambda>
    INLINE static void step(Lambda& func) {
        if(Mask & (1 << Begin))
        {
            func(Begin);
        }
        // Recurse through UnrollerLMask and forward Mask explicitly.
        // (Previously this recursed into UnrollerL, which ignored the mask
        // for every iteration after the first.)
        UnrollerLMask<Begin + Step, End, Step, Mask>::step(func);
    }
};

template<int End, int Step, int Mask>
struct UnrollerLMask<End, End, Step, Mask> {
    template<typename Lambda>
    static void step(Lambda& func) {
    }
};
831
// general CRC compute
// Accumulates a CRC32-C (Castagnoli polynomial, SSE4.2 crc32 instruction)
// over 'size' bytes at pData, starting from the caller-supplied seed 'crc'.
INLINE
uint32_t ComputeCRC(uint32_t crc, const void *pData, uint32_t size)
{
#if defined(_WIN64) || defined(__x86_64__)
    // 64-bit build: consume the bulk of the buffer 8 bytes at a time.
    const uint64_t* pWide = (const uint64_t*)pData;
    uint32_t numWide = size / sizeof(uint64_t);
    uint32_t numTailBytes = size % sizeof(uint64_t);
    while (numWide--)
    {
        crc = (uint32_t)_mm_crc32_u64(crc, *pWide++);
    }
#else
    // 32-bit build: consume the bulk of the buffer 4 bytes at a time.
    const uint32_t* pWide = (const uint32_t*)pData;
    uint32_t numWide = size / sizeof(uint32_t);
    uint32_t numTailBytes = size % sizeof(uint32_t);
    while (numWide--)
    {
        crc = _mm_crc32_u32(crc, *pWide++);
    }
#endif

    // Fold any trailing bytes in one at a time.
    const uint8_t* pTail = (const uint8_t*)pWide;
    while (numTailBytes--)
    {
        crc = _mm_crc32_u8(crc, *pTail++);
    }

    return crc;
}
862
//////////////////////////////////////////////////////////////////////////
/// Add byte offset to any-type pointer
//////////////////////////////////////////////////////////////////////////
template <typename T>
INLINE
static T* PtrAdd(T* p, intptr_t offset)
{
    // Round-trip through an integer so the offset is applied in raw bytes,
    // independent of sizeof(T).
    return reinterpret_cast<T*>(reinterpret_cast<intptr_t>(p) + offset);
}
873
//////////////////////////////////////////////////////////////////////////
/// Is a power-of-2?
/// Note: also returns true for 0 (no bits set).
//////////////////////////////////////////////////////////////////////////
template <typename T>
INLINE
static bool IsPow2(T value)
{
    // Two's-complement trick: isolating the lowest set bit reproduces the
    // original value only when at most one bit is set.
    const auto lowestBit = value & (0 - value);
    return lowestBit == value;
}
883
//////////////////////////////////////////////////////////////////////////
/// Align down to specified alignment
/// Note: IsPow2(alignment) MUST be true
//////////////////////////////////////////////////////////////////////////
template <typename T1, typename T2>
INLINE
static T1 AlignDownPow2(T1 value, T2 alignment)
{
    // Power-of-two contract is checked in debug builds only.
    SWR_ASSERT(alignment == (alignment & (0 - alignment)));
    // Clearing the low bits truncates to the nearest boundary at or below.
    const T1 mask = T1(alignment - 1);
    return value & ~mask;
}
895
//////////////////////////////////////////////////////////////////////////
/// Align up to specified alignment
/// Note: IsPow2(alignment) MUST be true
//////////////////////////////////////////////////////////////////////////
template <typename T1, typename T2>
INLINE
static T1 AlignUpPow2(T1 value, T2 alignment)
{
    // Power-of-two contract is checked in debug builds only.
    SWR_ASSERT(alignment == (alignment & (0 - alignment)));
    // Bias up by (alignment - 1), then mask down to the boundary.
    const T1 mask = T1(alignment - 1);
    return (value + mask) & ~mask;
}
906
//////////////////////////////////////////////////////////////////////////
/// Align up ptr to specified alignment
/// Note: IsPow2(alignment) MUST be true
//////////////////////////////////////////////////////////////////////////
template <typename T1, typename T2>
INLINE
static T1* AlignUpPow2(T1* value, T2 alignment)
{
    // Power-of-two contract is checked in debug builds only.
    SWR_ASSERT(alignment == (alignment & (0 - alignment)));
    // Work on the raw address: bias up, then mask down to the boundary.
    const uintptr_t mask = uintptr_t(alignment - 1);
    const uintptr_t addr = reinterpret_cast<uintptr_t>(value) + mask;
    return reinterpret_cast<T1*>(addr & ~mask);
}
918
//////////////////////////////////////////////////////////////////////////
/// Align down to specified alignment
/// Handles both power-of-two and arbitrary alignments.
//////////////////////////////////////////////////////////////////////////
template <typename T1, typename T2>
INLINE
static T1 AlignDown(T1 value, T2 alignment)
{
    // Fast path: a power-of-two alignment can be applied with a mask.
    if (alignment == (alignment & (0 - alignment)))
    {
        return value & ~T1(alignment - 1);
    }
    // Generic path: subtract the remainder.
    return value - T1(value % alignment);
}
929
//////////////////////////////////////////////////////////////////////////
/// Align down ptr to specified alignment
/// Handles both power-of-two and arbitrary alignments.
//////////////////////////////////////////////////////////////////////////
template <typename T1, typename T2>
INLINE
static T1* AlignDown(T1* value, T2 alignment)
{
    uintptr_t addr = uintptr_t(value);
    if (alignment == (alignment & (0 - alignment)))
    {
        // Power-of-two: mask off the low bits of the address.
        addr &= ~uintptr_t(alignment - 1);
    }
    else
    {
        // Arbitrary alignment: subtract the remainder.
        addr -= uintptr_t(addr % alignment);
    }
    return (T1*)addr;
}
939
//////////////////////////////////////////////////////////////////////////
/// Align up to specified alignment
/// Handles both power-of-two and arbitrary alignments.
//////////////////////////////////////////////////////////////////////////
template <typename T1, typename T2>
INLINE
static T1 AlignUp(T1 value, T2 alignment)
{
    // Bias up by (alignment - 1), then truncate down to the boundary.
    const T1 biased = value + T1(alignment - 1);
    if (alignment == (alignment & (0 - alignment)))
    {
        return biased & ~T1(alignment - 1);
    }
    return biased - T1(biased % alignment);
}
950
//////////////////////////////////////////////////////////////////////////
/// Align up ptr to specified alignment
/// Handles both power-of-two and arbitrary alignments.
//////////////////////////////////////////////////////////////////////////
template <typename T1, typename T2>
INLINE
static T1* AlignUp(T1* value, T2 alignment)
{
    // Bias the raw address up by (alignment - 1) bytes, then truncate down.
    uintptr_t addr = uintptr_t(value) + uintptr_t(alignment - 1);
    if (alignment == (alignment & (0 - alignment)))
    {
        addr &= ~uintptr_t(alignment - 1);
    }
    else
    {
        addr -= uintptr_t(addr % alignment);
    }
    return (T1*)addr;
}
961
//////////////////////////////////////////////////////////////////////////
/// Helper structure used to access an array of elements that don't
/// correspond to a typical word size. Elements are packed LSB-first into
/// machine words; an element never straddles a word boundary.
//////////////////////////////////////////////////////////////////////////
template<typename T, size_t BitsPerElementT, size_t ArrayLenT>
class BitsArray
{
private:
    static const size_t BITS_PER_WORD = sizeof(size_t) * 8;
    static const size_t ELEMENTS_PER_WORD = BITS_PER_WORD / BitsPerElementT;
    static const size_t NUM_WORDS = (ArrayLenT + ELEMENTS_PER_WORD - 1) / ELEMENTS_PER_WORD;
    static const size_t ELEMENT_MASK = (size_t(1) << BitsPerElementT) - 1;

    static_assert(ELEMENTS_PER_WORD * BitsPerElementT == BITS_PER_WORD,
        "Element size must an integral fraction of pointer size");

    // Backing storage, zero-initialized.
    size_t m_words[NUM_WORDS] = {};

public:

    /// @brief Read-only element access: locate the containing word,
    ///        shift the element down, and mask off neighboring elements.
    T operator[] (size_t elementIndex) const
    {
        const size_t wordIdx = elementIndex / ELEMENTS_PER_WORD;
        const size_t shift = (elementIndex % ELEMENTS_PER_WORD) * BitsPerElementT;
        return T((m_words[wordIdx] >> shift) & ELEMENT_MASK);
    }
};
989
// Ranged integer argument for TemplateArgUnroller.
// TMin/TMax bound the values the unroller will expand into static template
// arguments; the unroller asserts/assumes that val lies within [TMin, TMax].
template <uint32_t TMin, uint32_t TMax>
struct IntArg
{
    uint32_t val;   // runtime value to be converted to a template argument
};
996
// Recursive template used to auto-nest conditionals. Converts dynamic boolean function
// arguments to static template arguments.
//
// TermT is the terminating policy: it must expose a FuncType typedef and a
// static GetFunc<...>() returning the concrete function once every runtime
// argument has been converted. ArgsB accumulates the static arguments
// produced so far across the recursion.
template <typename TermT, typename... ArgsB>
struct TemplateArgUnroller
{
    //-----------------------------------------
    // Boolean value
    //-----------------------------------------

    // Last Arg Terminator
    // Converts the final runtime bool into std::true_type/std::false_type
    // and asks TermT for the fully-specialized function.
    static typename TermT::FuncType GetFunc(bool bArg)
    {
        if (bArg)
        {
            return TermT::template GetFunc<ArgsB..., std::true_type>();
        }

        return TermT::template GetFunc<ArgsB..., std::false_type>();
    }

    // Recursively parse args
    // Peels one bool off the argument list, appends its static equivalent
    // to ArgsB, and recurses on the remaining runtime arguments.
    template <typename... TArgsT>
    static typename TermT::FuncType GetFunc(bool bArg, TArgsT... remainingArgs)
    {
        if (bArg)
        {
            return TemplateArgUnroller<TermT, ArgsB..., std::true_type>::GetFunc(remainingArgs...);
        }

        return TemplateArgUnroller<TermT, ArgsB..., std::false_type>::GetFunc(remainingArgs...);
    }

    //-----------------------------------------
    // Integer value (within specified range)
    //-----------------------------------------

    // Last Arg Terminator
    // Linear search from TMax down to TMin: each non-matching value narrows
    // the range by one until iArg.val is found, then TMax is baked in as an
    // integral_constant. Falling off the bottom means val was out of range.
    template <uint32_t TMin, uint32_t TMax>
    static typename TermT::FuncType GetFunc(IntArg<TMin, TMax> iArg)
    {
        if (iArg.val == TMax)
        {
            return TermT::template GetFunc<ArgsB..., std::integral_constant<uint32_t, TMax>>();
        }
        if (TMax > TMin)
        {
            return TemplateArgUnroller<TermT, ArgsB...>::GetFunc(IntArg<TMin, TMax-1>{iArg.val});
        }
        SWR_ASSUME(false); return nullptr;
    }
    // More-specialized overload chosen when the range has collapsed to a
    // single value (TMin == TMax); ends the downward search.
    template <uint32_t TVal>
    static typename TermT::FuncType GetFunc(IntArg<TVal, TVal> iArg)
    {
        SWR_ASSERT(iArg.val == TVal);
        return TermT::template GetFunc<ArgsB..., std::integral_constant<uint32_t, TVal>>();
    }

    // Recursively parse args
    // Same downward search as above, but with further runtime arguments
    // still to be converted after this one.
    template <uint32_t TMin, uint32_t TMax, typename... TArgsT>
    static typename TermT::FuncType GetFunc(IntArg<TMin, TMax> iArg, TArgsT... remainingArgs)
    {
        if (iArg.val == TMax)
        {
            return TemplateArgUnroller<TermT, ArgsB..., std::integral_constant<uint32_t, TMax>>::GetFunc(remainingArgs...);
        }
        if (TMax > TMin)
        {
            return TemplateArgUnroller<TermT, ArgsB...>::GetFunc(IntArg<TMin, TMax - 1>{iArg.val}, remainingArgs...);
        }
        SWR_ASSUME(false); return nullptr;
    }
    // Single-value range terminator for the multi-argument case.
    template <uint32_t TVal, typename... TArgsT>
    static typename TermT::FuncType GetFunc(IntArg<TVal, TVal> iArg, TArgsT... remainingArgs)
    {
        SWR_ASSERT(iArg.val == TVal);
        return TemplateArgUnroller<TermT, ArgsB..., std::integral_constant<uint32_t, TVal>>::GetFunc(remainingArgs...);
    }
};
1075
1076