96f061a78aff562301595e472cb87b04424d47d7
[mesa.git] / src / gallium / drivers / swr / rasterizer / core / utils.h
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file utils.h
24 *
25 * @brief Utilities used by SWR core.
26 *
27 ******************************************************************************/
28 #pragma once
29
30 #include <string.h>
31 #include <type_traits>
32 #include <algorithm>
33 #include "common/os.h"
34 #include "common/simdintrin.h"
35 #include "common/swr_assert.h"
36 #include "core/api.h"
37
38 #if defined(_WIN64) || defined(__x86_64__)
39 #define _MM_INSERT_EPI64 _mm_insert_epi64
40 #define _MM_EXTRACT_EPI64 _mm_extract_epi64
41 #else
42 INLINE int64_t _MM_EXTRACT_EPI64(__m128i a, const int32_t ndx)
43 {
44 OSALIGNLINE(uint32_t) elems[4];
45 _mm_store_si128((__m128i*)elems, a);
46 if (ndx == 0)
47 {
48 uint64_t foo = elems[0];
49 foo |= (uint64_t)elems[1] << 32;
50 return foo;
51 }
52 else
53 {
54 uint64_t foo = elems[2];
55 foo |= (uint64_t)elems[3] << 32;
56 return foo;
57 }
58 }
59
60 INLINE __m128i _MM_INSERT_EPI64(__m128i a, int64_t b, const int32_t ndx)
61 {
62 OSALIGNLINE(int64_t) elems[2];
63 _mm_store_si128((__m128i*)elems, a);
64 if (ndx == 0)
65 {
66 elems[0] = b;
67 }
68 else
69 {
70 elems[1] = b;
71 }
72 __m128i out;
73 out = _mm_load_si128((const __m128i*)elems);
74 return out;
75 }
76 #endif
77
78 struct simdBBox
79 {
80 simdscalari ymin;
81 simdscalari ymax;
82 simdscalari xmin;
83 simdscalari xmax;
84 };
85
86 INLINE
87 void vTranspose(__m128 &row0, __m128 &row1, __m128 &row2, __m128 &row3)
88 {
89 __m128i row0i = _mm_castps_si128(row0);
90 __m128i row1i = _mm_castps_si128(row1);
91 __m128i row2i = _mm_castps_si128(row2);
92 __m128i row3i = _mm_castps_si128(row3);
93
94 __m128i vTemp = row2i;
95 row2i = _mm_unpacklo_epi32(row2i, row3i);
96 vTemp = _mm_unpackhi_epi32(vTemp, row3i);
97
98 row3i = row0i;
99 row0i = _mm_unpacklo_epi32(row0i, row1i);
100 row3i = _mm_unpackhi_epi32(row3i, row1i);
101
102 row1i = row0i;
103 row0i = _mm_unpacklo_epi64(row0i, row2i);
104 row1i = _mm_unpackhi_epi64(row1i, row2i);
105
106 row2i = row3i;
107 row2i = _mm_unpacklo_epi64(row2i, vTemp);
108 row3i = _mm_unpackhi_epi64(row3i, vTemp);
109
110 row0 = _mm_castsi128_ps(row0i);
111 row1 = _mm_castsi128_ps(row1i);
112 row2 = _mm_castsi128_ps(row2i);
113 row3 = _mm_castsi128_ps(row3i);
114 }
115
116 INLINE
117 void vTranspose(__m128i &row0, __m128i &row1, __m128i &row2, __m128i &row3)
118 {
119 __m128i vTemp = row2;
120 row2 = _mm_unpacklo_epi32(row2, row3);
121 vTemp = _mm_unpackhi_epi32(vTemp, row3);
122
123 row3 = row0;
124 row0 = _mm_unpacklo_epi32(row0, row1);
125 row3 = _mm_unpackhi_epi32(row3, row1);
126
127 row1 = row0;
128 row0 = _mm_unpacklo_epi64(row0, row2);
129 row1 = _mm_unpackhi_epi64(row1, row2);
130
131 row2 = row3;
132 row2 = _mm_unpacklo_epi64(row2, vTemp);
133 row3 = _mm_unpackhi_epi64(row3, vTemp);
134 }
135
// Compiler version folded into a single comparable integer,
// e.g. 4.9.0 -> 40900.
#define GCC_VERSION (__GNUC__ * 10000 \
                     + __GNUC_MINOR__ * 100 \
                     + __GNUC_PATCHLEVEL__)

// For clang and for GCC older than 4.9 the *_undefined_* intrinsics are
// remapped to their *_setzero_* counterparts. A zeroed register is a valid
// substitute for an "undefined" one.
#if defined(__clang__) || (defined(__GNUC__) && (GCC_VERSION < 40900))
#define _mm_undefined_ps _mm_setzero_ps
#define _mm_undefined_si128 _mm_setzero_si128
// The 256-bit forms are used by code built for both the 8-wide and the
// 16-wide configurations (vTranspose3x8 and the 16-wide Transpose16_16_16
// path), so the fallback must cover both widths.
#if KNOB_SIMD_WIDTH == 8 || KNOB_SIMD_WIDTH == 16
#define _mm256_undefined_ps _mm256_setzero_ps
#define _mm256_undefined_si256 _mm256_setzero_si256
#endif
#endif
147
148 #if KNOB_SIMD_WIDTH == 8 || KNOB_SIMD_WIDTH == 16
149 INLINE
150 void vTranspose3x8(__m128 (&vDst)[8], __m256 &vSrc0, __m256 &vSrc1, __m256 &vSrc2)
151 {
152 __m256 r0r2 = _mm256_unpacklo_ps(vSrc0, vSrc2); //x0z0x1z1 x4z4x5z5
153 __m256 r1rx = _mm256_unpacklo_ps(vSrc1, _mm256_undefined_ps()); //y0w0y1w1 y4w4y5w5
154 __m256 r02r1xlolo = _mm256_unpacklo_ps(r0r2, r1rx); //x0y0z0w0 x4y4z4w4
155 __m256 r02r1xlohi = _mm256_unpackhi_ps(r0r2, r1rx); //x1y1z1w1 x5y5z5w5
156
157 r0r2 = _mm256_unpackhi_ps(vSrc0, vSrc2); //x2z2x3z3 x6z6x7z7
158 r1rx = _mm256_unpackhi_ps(vSrc1, _mm256_undefined_ps()); //y2w2y3w3 y6w6yw77
159 __m256 r02r1xhilo = _mm256_unpacklo_ps(r0r2, r1rx); //x2y2z2w2 x6y6z6w6
160 __m256 r02r1xhihi = _mm256_unpackhi_ps(r0r2, r1rx); //x3y3z3w3 x7y7z7w7
161
162 vDst[0] = _mm256_castps256_ps128(r02r1xlolo);
163 vDst[1] = _mm256_castps256_ps128(r02r1xlohi);
164 vDst[2] = _mm256_castps256_ps128(r02r1xhilo);
165 vDst[3] = _mm256_castps256_ps128(r02r1xhihi);
166
167 vDst[4] = _mm256_extractf128_ps(r02r1xlolo, 1);
168 vDst[5] = _mm256_extractf128_ps(r02r1xlohi, 1);
169 vDst[6] = _mm256_extractf128_ps(r02r1xhilo, 1);
170 vDst[7] = _mm256_extractf128_ps(r02r1xhihi, 1);
171 }
172
173 INLINE
174 void vTranspose4x8(__m128 (&vDst)[8], __m256 &vSrc0, __m256 &vSrc1, __m256 &vSrc2, __m256 &vSrc3)
175 {
176 __m256 r0r2 = _mm256_unpacklo_ps(vSrc0, vSrc2); //x0z0x1z1 x4z4x5z5
177 __m256 r1rx = _mm256_unpacklo_ps(vSrc1, vSrc3); //y0w0y1w1 y4w4y5w5
178 __m256 r02r1xlolo = _mm256_unpacklo_ps(r0r2, r1rx); //x0y0z0w0 x4y4z4w4
179 __m256 r02r1xlohi = _mm256_unpackhi_ps(r0r2, r1rx); //x1y1z1w1 x5y5z5w5
180
181 r0r2 = _mm256_unpackhi_ps(vSrc0, vSrc2); //x2z2x3z3 x6z6x7z7
182 r1rx = _mm256_unpackhi_ps(vSrc1, vSrc3) ; //y2w2y3w3 y6w6yw77
183 __m256 r02r1xhilo = _mm256_unpacklo_ps(r0r2, r1rx); //x2y2z2w2 x6y6z6w6
184 __m256 r02r1xhihi = _mm256_unpackhi_ps(r0r2, r1rx); //x3y3z3w3 x7y7z7w7
185
186 vDst[0] = _mm256_castps256_ps128(r02r1xlolo);
187 vDst[1] = _mm256_castps256_ps128(r02r1xlohi);
188 vDst[2] = _mm256_castps256_ps128(r02r1xhilo);
189 vDst[3] = _mm256_castps256_ps128(r02r1xhihi);
190
191 vDst[4] = _mm256_extractf128_ps(r02r1xlolo, 1);
192 vDst[5] = _mm256_extractf128_ps(r02r1xlohi, 1);
193 vDst[6] = _mm256_extractf128_ps(r02r1xhilo, 1);
194 vDst[7] = _mm256_extractf128_ps(r02r1xhihi, 1);
195 }
196
197 INLINE
198 void vTranspose8x8(__m256 (&vDst)[8], const __m256 &vMask0, const __m256 &vMask1, const __m256 &vMask2, const __m256 &vMask3, const __m256 &vMask4, const __m256 &vMask5, const __m256 &vMask6, const __m256 &vMask7)
199 {
200 __m256 __t0 = _mm256_unpacklo_ps(vMask0, vMask1);
201 __m256 __t1 = _mm256_unpackhi_ps(vMask0, vMask1);
202 __m256 __t2 = _mm256_unpacklo_ps(vMask2, vMask3);
203 __m256 __t3 = _mm256_unpackhi_ps(vMask2, vMask3);
204 __m256 __t4 = _mm256_unpacklo_ps(vMask4, vMask5);
205 __m256 __t5 = _mm256_unpackhi_ps(vMask4, vMask5);
206 __m256 __t6 = _mm256_unpacklo_ps(vMask6, vMask7);
207 __m256 __t7 = _mm256_unpackhi_ps(vMask6, vMask7);
208 __m256 __tt0 = _mm256_shuffle_ps(__t0,__t2,_MM_SHUFFLE(1,0,1,0));
209 __m256 __tt1 = _mm256_shuffle_ps(__t0,__t2,_MM_SHUFFLE(3,2,3,2));
210 __m256 __tt2 = _mm256_shuffle_ps(__t1,__t3,_MM_SHUFFLE(1,0,1,0));
211 __m256 __tt3 = _mm256_shuffle_ps(__t1,__t3,_MM_SHUFFLE(3,2,3,2));
212 __m256 __tt4 = _mm256_shuffle_ps(__t4,__t6,_MM_SHUFFLE(1,0,1,0));
213 __m256 __tt5 = _mm256_shuffle_ps(__t4,__t6,_MM_SHUFFLE(3,2,3,2));
214 __m256 __tt6 = _mm256_shuffle_ps(__t5,__t7,_MM_SHUFFLE(1,0,1,0));
215 __m256 __tt7 = _mm256_shuffle_ps(__t5,__t7,_MM_SHUFFLE(3,2,3,2));
216 vDst[0] = _mm256_permute2f128_ps(__tt0, __tt4, 0x20);
217 vDst[1] = _mm256_permute2f128_ps(__tt1, __tt5, 0x20);
218 vDst[2] = _mm256_permute2f128_ps(__tt2, __tt6, 0x20);
219 vDst[3] = _mm256_permute2f128_ps(__tt3, __tt7, 0x20);
220 vDst[4] = _mm256_permute2f128_ps(__tt0, __tt4, 0x31);
221 vDst[5] = _mm256_permute2f128_ps(__tt1, __tt5, 0x31);
222 vDst[6] = _mm256_permute2f128_ps(__tt2, __tt6, 0x31);
223 vDst[7] = _mm256_permute2f128_ps(__tt3, __tt7, 0x31);
224 }
225
226 INLINE
227 void vTranspose8x8(__m256 (&vDst)[8], const __m256i &vMask0, const __m256i &vMask1, const __m256i &vMask2, const __m256i &vMask3, const __m256i &vMask4, const __m256i &vMask5, const __m256i &vMask6, const __m256i &vMask7)
228 {
229 vTranspose8x8(vDst, _mm256_castsi256_ps(vMask0), _mm256_castsi256_ps(vMask1), _mm256_castsi256_ps(vMask2), _mm256_castsi256_ps(vMask3),
230 _mm256_castsi256_ps(vMask4), _mm256_castsi256_ps(vMask5), _mm256_castsi256_ps(vMask6), _mm256_castsi256_ps(vMask7));
231 }
232 #endif
233
234 //////////////////////////////////////////////////////////////////////////
235 /// TranposeSingleComponent
236 //////////////////////////////////////////////////////////////////////////
237 template<uint32_t bpp>
238 struct TransposeSingleComponent
239 {
240 //////////////////////////////////////////////////////////////////////////
241 /// @brief Pass-thru for single component.
242 /// @param pSrc - source data in SOA form
243 /// @param pDst - output data in AOS form
244 INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
245 {
246 memcpy(pDst, pSrc, (bpp * KNOB_SIMD_WIDTH) / 8);
247 }
248 };
249
250 //////////////////////////////////////////////////////////////////////////
251 /// Transpose8_8_8_8
252 //////////////////////////////////////////////////////////////////////////
253 struct Transpose8_8_8_8
254 {
255 //////////////////////////////////////////////////////////////////////////
256 /// @brief Performs an SOA to AOS conversion for packed 8_8_8_8 data.
257 /// @param pSrc - source data in SOA form
258 /// @param pDst - output data in AOS form
259 INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
260 {
261 simdscalari src = _simd_load_si((const simdscalari*)pSrc);
262
263 #if KNOB_SIMD_WIDTH == 8
264 #if KNOB_ARCH == KNOB_ARCH_AVX
265 __m128i c0c1 = _mm256_castsi256_si128(src); // rrrrrrrrgggggggg
266 __m128i c2c3 = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(src), 1)); // bbbbbbbbaaaaaaaa
267 __m128i c0c2 = _mm_unpacklo_epi64(c0c1, c2c3); // rrrrrrrrbbbbbbbb
268 __m128i c1c3 = _mm_unpackhi_epi64(c0c1, c2c3); // ggggggggaaaaaaaa
269 __m128i c01 = _mm_unpacklo_epi8(c0c2, c1c3); // rgrgrgrgrgrgrgrg
270 __m128i c23 = _mm_unpackhi_epi8(c0c2, c1c3); // babababababababa
271 __m128i c0123lo = _mm_unpacklo_epi16(c01, c23); // rgbargbargbargba
272 __m128i c0123hi = _mm_unpackhi_epi16(c01, c23); // rgbargbargbargba
273 _mm_store_si128((__m128i*)pDst, c0123lo);
274 _mm_store_si128((__m128i*)(pDst + 16), c0123hi);
275 #elif KNOB_ARCH == KNOB_ARCH_AVX2
276 simdscalari dst01 = _mm256_shuffle_epi8(src,
277 _mm256_set_epi32(0x0f078080, 0x0e068080, 0x0d058080, 0x0c048080, 0x80800b03, 0x80800a02, 0x80800901, 0x80800800));
278 simdscalari dst23 = _mm256_permute2x128_si256(src, src, 0x01);
279 dst23 = _mm256_shuffle_epi8(dst23,
280 _mm256_set_epi32(0x80800f07, 0x80800e06, 0x80800d05, 0x80800c04, 0x0b038080, 0x0a028080, 0x09018080, 0x08008080));
281 simdscalari dst = _mm256_or_si256(dst01, dst23);
282 _simd_store_si((simdscalari*)pDst, dst);
283 #endif
284 #elif KNOB_SIMD_WIDTH == 16
285 simdscalari mask0 = _simd_set_epi32(0x0f078080, 0x0e068080, 0x0d058080, 0x0c048080, 0x80800b03, 0x80800a02, 0x80800901, 0x80800800);
286
287 simdscalari dst01 = _simd_shuffle_epi8(src, mask0);
288
289 simdscalari perm1 = _simd_permute_128(src, src, 1);
290
291 simdscalari mask1 = _simd_set_epi32(0x80800f07, 0x80800e06, 0x80800d05, 0x80800c04, 0x0b038080, 0x0a028080, 0x09018080, 0x08008080);
292
293 simdscalari dst23 = _simd_shuffle_epi8(perm1, mask1);
294
295 simdscalari dst = _simd_or_si(dst01, dst23);
296
297 _simd_store_si(reinterpret_cast<simdscalari *>(pDst), dst);
298 #else
299 #error Unsupported vector width
300 #endif
301 }
302 };
303
//////////////////////////////////////////////////////////////////////////
/// Transpose8_8_8
//////////////////////////////////////////////////////////////////////////
struct Transpose8_8_8
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion for packed 8_8_8 data. The function is
    ///        deleted, so any attempt to call it is a compile-time error.
    ///        Declared without INLINE, like the other deleted Transpose
    ///        stubs in this file; an inlining hint has no meaning on a
    ///        function that can never be called.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
315
316 //////////////////////////////////////////////////////////////////////////
317 /// Transpose8_8
318 //////////////////////////////////////////////////////////////////////////
319 struct Transpose8_8
320 {
321 //////////////////////////////////////////////////////////////////////////
322 /// @brief Performs an SOA to AOS conversion for packed 8_8 data.
323 /// @param pSrc - source data in SOA form
324 /// @param pDst - output data in AOS form
325 INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
326 {
327 #if KNOB_SIMD_WIDTH == 8
328 simdscalari src = _simd_load_si((const simdscalari*)pSrc);
329
330 __m128i rg = _mm256_castsi256_si128(src); // rrrrrrrr gggggggg
331 __m128i g = _mm_unpackhi_epi64(rg, rg); // gggggggg gggggggg
332 rg = _mm_unpacklo_epi8(rg, g);
333 _mm_store_si128((__m128i*)pDst, rg);
334 #elif KNOB_SIMD_WIDTH == 16
335 __m256i src = _mm256_load_si256(reinterpret_cast<const __m256i *>(pSrc)); // rrrrrrrrrrrrrrrrgggggggggggggggg
336
337 __m256i r = _mm256_permute4x64_epi64(src, 0x50); // 0x50 = 01010000b // rrrrrrrrxxxxxxxxrrrrrrrrxxxxxxxx
338
339 __m256i g = _mm256_permute4x64_epi64(src, 0xFA); // 0xFA = 11111010b // ggggggggxxxxxxxxggggggggxxxxxxxx
340
341 __m256i dst = _mm256_unpacklo_epi8(r, g); // rgrgrgrgrgrgrgrgrgrgrgrgrgrgrgrg
342
343 _mm256_store_si256(reinterpret_cast<__m256i *>(pDst), dst);
344 #else
345 #error Unsupported vector width
346 #endif
347 }
348 };
349
350 //////////////////////////////////////////////////////////////////////////
351 /// Transpose32_32_32_32
352 //////////////////////////////////////////////////////////////////////////
353 struct Transpose32_32_32_32
354 {
355 //////////////////////////////////////////////////////////////////////////
356 /// @brief Performs an SOA to AOS conversion for packed 32_32_32_32 data.
357 /// @param pSrc - source data in SOA form
358 /// @param pDst - output data in AOS form
359 INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
360 {
361 #if KNOB_SIMD_WIDTH == 8
362 simdscalar src0 = _simd_load_ps((const float*)pSrc);
363 simdscalar src1 = _simd_load_ps((const float*)pSrc + 8);
364 simdscalar src2 = _simd_load_ps((const float*)pSrc + 16);
365 simdscalar src3 = _simd_load_ps((const float*)pSrc + 24);
366
367 __m128 vDst[8];
368 vTranspose4x8(vDst, src0, src1, src2, src3);
369 _mm_store_ps((float*)pDst, vDst[0]);
370 _mm_store_ps((float*)pDst+4, vDst[1]);
371 _mm_store_ps((float*)pDst+8, vDst[2]);
372 _mm_store_ps((float*)pDst+12, vDst[3]);
373 _mm_store_ps((float*)pDst+16, vDst[4]);
374 _mm_store_ps((float*)pDst+20, vDst[5]);
375 _mm_store_ps((float*)pDst+24, vDst[6]);
376 _mm_store_ps((float*)pDst+28, vDst[7]);
377 #elif KNOB_SIMD_WIDTH == 16
378 #if ENABLE_AVX512_EMULATION
379 simdscalar src0 = _simd_load_ps(reinterpret_cast<const float*>(pSrc));
380 simdscalar src1 = _simd_load_ps(reinterpret_cast<const float*>(pSrc) + 16);
381 simdscalar src2 = _simd_load_ps(reinterpret_cast<const float*>(pSrc) + 32);
382 simdscalar src3 = _simd_load_ps(reinterpret_cast<const float*>(pSrc) + 48);
383
384 __m128 vDst[8];
385
386 vTranspose4x8(vDst, src0.lo, src1.lo, src2.lo, src3.lo);
387
388 _mm_store_ps(reinterpret_cast<float*>(pDst), vDst[0]);
389 _mm_store_ps(reinterpret_cast<float*>(pDst) + 4, vDst[1]);
390 _mm_store_ps(reinterpret_cast<float*>(pDst) + 8, vDst[2]);
391 _mm_store_ps(reinterpret_cast<float*>(pDst) + 12, vDst[3]);
392 _mm_store_ps(reinterpret_cast<float*>(pDst) + 16, vDst[4]);
393 _mm_store_ps(reinterpret_cast<float*>(pDst) + 20, vDst[5]);
394 _mm_store_ps(reinterpret_cast<float*>(pDst) + 24, vDst[6]);
395 _mm_store_ps(reinterpret_cast<float*>(pDst) + 28, vDst[7]);
396
397 vTranspose4x8(vDst, src0.hi, src1.hi, src2.hi, src3.hi);
398
399 _mm_store_ps(reinterpret_cast<float*>(pDst) + 32, vDst[0]);
400 _mm_store_ps(reinterpret_cast<float*>(pDst) + 36, vDst[1]);
401 _mm_store_ps(reinterpret_cast<float*>(pDst) + 40, vDst[2]);
402 _mm_store_ps(reinterpret_cast<float*>(pDst) + 44, vDst[3]);
403 _mm_store_ps(reinterpret_cast<float*>(pDst) + 48, vDst[4]);
404 _mm_store_ps(reinterpret_cast<float*>(pDst) + 52, vDst[5]);
405 _mm_store_ps(reinterpret_cast<float*>(pDst) + 56, vDst[6]);
406 _mm_store_ps(reinterpret_cast<float*>(pDst) + 60, vDst[7]);
407 #endif
408 #else
409 #error Unsupported vector width
410 #endif
411 }
412 };
413
414 //////////////////////////////////////////////////////////////////////////
415 /// Transpose32_32_32
416 //////////////////////////////////////////////////////////////////////////
417 struct Transpose32_32_32
418 {
419 //////////////////////////////////////////////////////////////////////////
420 /// @brief Performs an SOA to AOS conversion for packed 32_32_32 data.
421 /// @param pSrc - source data in SOA form
422 /// @param pDst - output data in AOS form
423 INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
424 {
425 #if KNOB_SIMD_WIDTH == 8
426 simdscalar src0 = _simd_load_ps((const float*)pSrc);
427 simdscalar src1 = _simd_load_ps((const float*)pSrc + 8);
428 simdscalar src2 = _simd_load_ps((const float*)pSrc + 16);
429
430 __m128 vDst[8];
431 vTranspose3x8(vDst, src0, src1, src2);
432 _mm_store_ps((float*)pDst, vDst[0]);
433 _mm_store_ps((float*)pDst + 4, vDst[1]);
434 _mm_store_ps((float*)pDst + 8, vDst[2]);
435 _mm_store_ps((float*)pDst + 12, vDst[3]);
436 _mm_store_ps((float*)pDst + 16, vDst[4]);
437 _mm_store_ps((float*)pDst + 20, vDst[5]);
438 _mm_store_ps((float*)pDst + 24, vDst[6]);
439 _mm_store_ps((float*)pDst + 28, vDst[7]);
440 #elif KNOB_SIMD_WIDTH == 16
441 #if ENABLE_AVX512_EMULATION
442 simdscalar src0 = _simd_load_ps(reinterpret_cast<const float*>(pSrc));
443 simdscalar src1 = _simd_load_ps(reinterpret_cast<const float*>(pSrc) + 16);
444 simdscalar src2 = _simd_load_ps(reinterpret_cast<const float*>(pSrc) + 32);
445
446 __m128 vDst[8];
447
448 vTranspose3x8(vDst, src0.lo, src1.lo, src2.lo);
449
450 _mm_store_ps(reinterpret_cast<float*>(pDst), vDst[0]);
451 _mm_store_ps(reinterpret_cast<float*>(pDst) + 4, vDst[1]);
452 _mm_store_ps(reinterpret_cast<float*>(pDst) + 8, vDst[2]);
453 _mm_store_ps(reinterpret_cast<float*>(pDst) + 12, vDst[3]);
454 _mm_store_ps(reinterpret_cast<float*>(pDst) + 16, vDst[4]);
455 _mm_store_ps(reinterpret_cast<float*>(pDst) + 20, vDst[5]);
456 _mm_store_ps(reinterpret_cast<float*>(pDst) + 24, vDst[6]);
457 _mm_store_ps(reinterpret_cast<float*>(pDst) + 28, vDst[7]);
458
459 vTranspose3x8(vDst, src0.hi, src1.hi, src2.hi);
460
461 _mm_store_ps(reinterpret_cast<float*>(pDst) + 32, vDst[0]);
462 _mm_store_ps(reinterpret_cast<float*>(pDst) + 36, vDst[1]);
463 _mm_store_ps(reinterpret_cast<float*>(pDst) + 40, vDst[2]);
464 _mm_store_ps(reinterpret_cast<float*>(pDst) + 44, vDst[3]);
465 _mm_store_ps(reinterpret_cast<float*>(pDst) + 48, vDst[4]);
466 _mm_store_ps(reinterpret_cast<float*>(pDst) + 52, vDst[5]);
467 _mm_store_ps(reinterpret_cast<float*>(pDst) + 56, vDst[6]);
468 _mm_store_ps(reinterpret_cast<float*>(pDst) + 60, vDst[7]);
469 #endif
470 #else
471 #error Unsupported vector width
472 #endif
473 }
474 };
475
476 //////////////////////////////////////////////////////////////////////////
477 /// Transpose32_32
478 //////////////////////////////////////////////////////////////////////////
479 struct Transpose32_32
480 {
481 //////////////////////////////////////////////////////////////////////////
482 /// @brief Performs an SOA to AOS conversion for packed 32_32 data.
483 /// @param pSrc - source data in SOA form
484 /// @param pDst - output data in AOS form
485 INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
486 {
487 #if KNOB_SIMD_WIDTH == 8
488 const float* pfSrc = (const float*)pSrc;
489 __m128 src_r0 = _mm_load_ps(pfSrc + 0);
490 __m128 src_r1 = _mm_load_ps(pfSrc + 4);
491 __m128 src_g0 = _mm_load_ps(pfSrc + 8);
492 __m128 src_g1 = _mm_load_ps(pfSrc + 12);
493
494 __m128 dst0 = _mm_unpacklo_ps(src_r0, src_g0);
495 __m128 dst1 = _mm_unpackhi_ps(src_r0, src_g0);
496 __m128 dst2 = _mm_unpacklo_ps(src_r1, src_g1);
497 __m128 dst3 = _mm_unpackhi_ps(src_r1, src_g1);
498
499 float* pfDst = (float*)pDst;
500 _mm_store_ps(pfDst + 0, dst0);
501 _mm_store_ps(pfDst + 4, dst1);
502 _mm_store_ps(pfDst + 8, dst2);
503 _mm_store_ps(pfDst + 12, dst3);
504 #elif KNOB_SIMD_WIDTH == 16
505 const float* pfSrc = (const float*)pSrc;
506 __m256 src_r0 = _mm256_load_ps(pfSrc + 0);
507 __m256 src_r1 = _mm256_load_ps(pfSrc + 8);
508 __m256 src_g0 = _mm256_load_ps(pfSrc + 16);
509 __m256 src_g1 = _mm256_load_ps(pfSrc + 24);
510
511 __m256 dst0 = _mm256_unpacklo_ps(src_r0, src_g0);
512 __m256 dst1 = _mm256_unpackhi_ps(src_r0, src_g0);
513 __m256 dst2 = _mm256_unpacklo_ps(src_r1, src_g1);
514 __m256 dst3 = _mm256_unpackhi_ps(src_r1, src_g1);
515
516 float* pfDst = (float*)pDst;
517 _mm256_store_ps(pfDst + 0, dst0);
518 _mm256_store_ps(pfDst + 8, dst1);
519 _mm256_store_ps(pfDst + 16, dst2);
520 _mm256_store_ps(pfDst + 24, dst3);
521 #else
522 #error Unsupported vector width
523 #endif
524 }
525 };
526
527 //////////////////////////////////////////////////////////////////////////
528 /// Transpose16_16_16_16
529 //////////////////////////////////////////////////////////////////////////
530 struct Transpose16_16_16_16
531 {
532 //////////////////////////////////////////////////////////////////////////
533 /// @brief Performs an SOA to AOS conversion for packed 16_16_16_16 data.
534 /// @param pSrc - source data in SOA form
535 /// @param pDst - output data in AOS form
536 INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
537 {
538 #if KNOB_SIMD_WIDTH == 8
539 simdscalari src_rg = _simd_load_si((const simdscalari*)pSrc);
540 simdscalari src_ba = _simd_load_si((const simdscalari*)(pSrc + sizeof(simdscalari)));
541
542 __m128i src_r = _mm256_extractf128_si256(src_rg, 0);
543 __m128i src_g = _mm256_extractf128_si256(src_rg, 1);
544 __m128i src_b = _mm256_extractf128_si256(src_ba, 0);
545 __m128i src_a = _mm256_extractf128_si256(src_ba, 1);
546
547 __m128i rg0 = _mm_unpacklo_epi16(src_r, src_g);
548 __m128i rg1 = _mm_unpackhi_epi16(src_r, src_g);
549 __m128i ba0 = _mm_unpacklo_epi16(src_b, src_a);
550 __m128i ba1 = _mm_unpackhi_epi16(src_b, src_a);
551
552 __m128i dst0 = _mm_unpacklo_epi32(rg0, ba0);
553 __m128i dst1 = _mm_unpackhi_epi32(rg0, ba0);
554 __m128i dst2 = _mm_unpacklo_epi32(rg1, ba1);
555 __m128i dst3 = _mm_unpackhi_epi32(rg1, ba1);
556
557 _mm_store_si128(((__m128i*)pDst) + 0, dst0);
558 _mm_store_si128(((__m128i*)pDst) + 1, dst1);
559 _mm_store_si128(((__m128i*)pDst) + 2, dst2);
560 _mm_store_si128(((__m128i*)pDst) + 3, dst3);
561 #elif KNOB_SIMD_WIDTH == 16
562 #if ENABLE_AVX512_EMULATION
563 simdscalari src_rg = _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc));
564 simdscalari src_ba = _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc + sizeof(simdscalari)));
565
566 __m256i src_r = src_rg.lo;
567 __m256i src_g = src_rg.hi;
568 __m256i src_b = src_ba.lo;
569 __m256i src_a = src_ba.hi;
570
571 __m256i rg0 = _mm256_unpacklo_epi16(src_r, src_g);
572 __m256i rg1 = _mm256_unpackhi_epi16(src_r, src_g);
573 __m256i ba0 = _mm256_unpacklo_epi16(src_b, src_a);
574 __m256i ba1 = _mm256_unpackhi_epi16(src_b, src_a);
575
576 __m256i dst0 = _mm256_unpacklo_epi32(rg0, ba0);
577 __m256i dst1 = _mm256_unpackhi_epi32(rg0, ba0);
578 __m256i dst2 = _mm256_unpacklo_epi32(rg1, ba1);
579 __m256i dst3 = _mm256_unpackhi_epi32(rg1, ba1);
580
581 _mm256_store_si256(reinterpret_cast<__m256i*>(pDst) + 0, dst0);
582 _mm256_store_si256(reinterpret_cast<__m256i*>(pDst) + 1, dst1);
583 _mm256_store_si256(reinterpret_cast<__m256i*>(pDst) + 2, dst2);
584 _mm256_store_si256(reinterpret_cast<__m256i*>(pDst) + 3, dst3);
585 #endif
586 #else
587 #error Unsupported vector width
588 #endif
589 }
590 };
591
592 //////////////////////////////////////////////////////////////////////////
593 /// Transpose16_16_16
594 //////////////////////////////////////////////////////////////////////////
595 struct Transpose16_16_16
596 {
597 //////////////////////////////////////////////////////////////////////////
598 /// @brief Performs an SOA to AOS conversion for packed 16_16_16 data.
599 /// @param pSrc - source data in SOA form
600 /// @param pDst - output data in AOS form
601 INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
602 {
603 #if KNOB_SIMD_WIDTH == 8
604 simdscalari src_rg = _simd_load_si((const simdscalari*)pSrc);
605
606 __m128i src_r = _mm256_extractf128_si256(src_rg, 0);
607 __m128i src_g = _mm256_extractf128_si256(src_rg, 1);
608 __m128i src_b = _mm_load_si128((const __m128i*)(pSrc + sizeof(simdscalari)));
609 __m128i src_a = _mm_undefined_si128();
610
611 __m128i rg0 = _mm_unpacklo_epi16(src_r, src_g);
612 __m128i rg1 = _mm_unpackhi_epi16(src_r, src_g);
613 __m128i ba0 = _mm_unpacklo_epi16(src_b, src_a);
614 __m128i ba1 = _mm_unpackhi_epi16(src_b, src_a);
615
616 __m128i dst0 = _mm_unpacklo_epi32(rg0, ba0);
617 __m128i dst1 = _mm_unpackhi_epi32(rg0, ba0);
618 __m128i dst2 = _mm_unpacklo_epi32(rg1, ba1);
619 __m128i dst3 = _mm_unpackhi_epi32(rg1, ba1);
620
621 _mm_store_si128(((__m128i*)pDst) + 0, dst0);
622 _mm_store_si128(((__m128i*)pDst) + 1, dst1);
623 _mm_store_si128(((__m128i*)pDst) + 2, dst2);
624 _mm_store_si128(((__m128i*)pDst) + 3, dst3);
625 #elif KNOB_SIMD_WIDTH == 16
626 #if ENABLE_AVX512_EMULATION
627 simdscalari src_rg = _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc));
628
629 __m256i src_r = src_rg.lo;
630 __m256i src_g = src_rg.hi;
631 __m256i src_b = _mm256_load_si256(reinterpret_cast<const __m256i*>(pSrc + sizeof(simdscalari)));
632 __m256i src_a = _mm256_undefined_si256();
633
634 __m256i rg0 = _mm256_unpacklo_epi16(src_r, src_g);
635 __m256i rg1 = _mm256_unpackhi_epi16(src_r, src_g);
636 __m256i ba0 = _mm256_unpacklo_epi16(src_b, src_a);
637 __m256i ba1 = _mm256_unpackhi_epi16(src_b, src_a);
638
639 __m256i dst0 = _mm256_unpacklo_epi32(rg0, ba0);
640 __m256i dst1 = _mm256_unpackhi_epi32(rg0, ba0);
641 __m256i dst2 = _mm256_unpacklo_epi32(rg1, ba1);
642 __m256i dst3 = _mm256_unpackhi_epi32(rg1, ba1);
643
644 _mm256_store_si256(reinterpret_cast<__m256i*>(pDst) + 0, dst0);
645 _mm256_store_si256(reinterpret_cast<__m256i*>(pDst) + 1, dst1);
646 _mm256_store_si256(reinterpret_cast<__m256i*>(pDst) + 2, dst2);
647 _mm256_store_si256(reinterpret_cast<__m256i*>(pDst) + 3, dst3);
648 #endif
649 #else
650 #error Unsupported vector width
651 #endif
652 }
653 };
654
655 //////////////////////////////////////////////////////////////////////////
656 /// Transpose16_16
657 //////////////////////////////////////////////////////////////////////////
658 struct Transpose16_16
659 {
660 //////////////////////////////////////////////////////////////////////////
661 /// @brief Performs an SOA to AOS conversion for packed 16_16 data.
662 /// @param pSrc - source data in SOA form
663 /// @param pDst - output data in AOS form
664 INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
665 {
666 #if KNOB_SIMD_WIDTH == 8
667 simdscalar src = _simd_load_ps((const float*)pSrc);
668
669 __m128 comp0 = _mm256_castps256_ps128(src);
670 __m128 comp1 = _mm256_extractf128_ps(src, 1);
671
672 __m128i comp0i = _mm_castps_si128(comp0);
673 __m128i comp1i = _mm_castps_si128(comp1);
674
675 __m128i resLo = _mm_unpacklo_epi16(comp0i, comp1i);
676 __m128i resHi = _mm_unpackhi_epi16(comp0i, comp1i);
677
678 _mm_store_si128((__m128i*)pDst, resLo);
679 _mm_store_si128((__m128i*)pDst + 1, resHi);
680 #elif KNOB_SIMD_WIDTH == 16
681 #if ENABLE_AVX512_EMULATION
682 simdscalari src = _simd_castps_si(_simd_load_ps(reinterpret_cast<const float*>(pSrc)));
683
684 simdscalari result;
685
686 result.lo = _mm256_unpacklo_epi16(src.lo, src.hi);
687 result.hi = _mm256_unpackhi_epi16(src.lo, src.hi);
688
689 _simd_store_si(reinterpret_cast<simdscalari *>(pDst), result);
690 #endif
691 #else
692 #error Unsupported vector width
693 #endif
694 }
695 };
696
//////////////////////////////////////////////////////////////////////////
/// Transpose24_8
//////////////////////////////////////////////////////////////////////////
struct Transpose24_8
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion for packed 24_8 data. The function is
    ///        deleted, so any attempt to call it is a compile-time error.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
708
//////////////////////////////////////////////////////////////////////////
/// Transpose32_8_24
//////////////////////////////////////////////////////////////////////////
struct Transpose32_8_24
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion for packed 32_8_24 data. The function
    ///        is deleted, so any attempt to call it is a compile-time error.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
720
721
722
//////////////////////////////////////////////////////////////////////////
/// Transpose4_4_4_4
//////////////////////////////////////////////////////////////////////////
struct Transpose4_4_4_4
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion for packed 4_4_4_4 data. The function
    ///        is deleted, so any attempt to call it is a compile-time error.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
734
//////////////////////////////////////////////////////////////////////////
/// Transpose5_6_5
//////////////////////////////////////////////////////////////////////////
struct Transpose5_6_5
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion for packed 5_6_5 data. The function is
    ///        deleted, so any attempt to call it is a compile-time error.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
746
//////////////////////////////////////////////////////////////////////////
/// Transpose9_9_9_5
//////////////////////////////////////////////////////////////////////////
struct Transpose9_9_9_5
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion for packed 9_9_9_5 data. The function
    ///        is deleted, so any attempt to call it is a compile-time error.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
758
//////////////////////////////////////////////////////////////////////////
/// Transpose5_5_5_1
//////////////////////////////////////////////////////////////////////////
struct Transpose5_5_5_1
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion for packed 5_5_5_1 data. The function
    ///        is deleted, so any attempt to call it is a compile-time error.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
770
//////////////////////////////////////////////////////////////////////////
/// Transpose1_5_5_5
//////////////////////////////////////////////////////////////////////////
struct Transpose1_5_5_5
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion for packed 1_5_5_5 data. The function
    ///        is deleted, so any attempt to call it is a compile-time error.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
782
//////////////////////////////////////////////////////////////////////////
/// Transpose10_10_10_2
///
/// Transpose entry point for packed 10_10_10_2 data. No implementation is
/// provided: the function is declared deleted, so any call is rejected
/// at compile time.
//////////////////////////////////////////////////////////////////////////
struct Transpose10_10_10_2
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion for packed 10_10_10_2 data (deleted).
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t *pSrc, uint8_t *pDst) = delete;
};
794
//////////////////////////////////////////////////////////////////////////
/// Transpose11_11_10
///
/// Transpose entry point for packed 11_11_10 data. No implementation is
/// provided: the function is declared deleted, so any call is rejected
/// at compile time.
//////////////////////////////////////////////////////////////////////////
struct Transpose11_11_10
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion for packed 11_11_10 data (deleted).
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t *pSrc, uint8_t *pDst) = delete;
};
806
807 // helper function to unroll loops
808 template<int Begin, int End, int Step = 1>
809 struct UnrollerL {
810 template<typename Lambda>
811 INLINE static void step(Lambda& func) {
812 func(Begin);
813 UnrollerL<Begin + Step, End, Step>::step(func);
814 }
815 };
816
817 template<int End, int Step>
818 struct UnrollerL<End, End, Step> {
819 template<typename Lambda>
820 static void step(Lambda& func) {
821 }
822 };
823
824 // helper function to unroll loops, with mask to skip specific iterations
825 template<int Begin, int End, int Step = 1, int Mask = 0x7f>
826 struct UnrollerLMask {
827 template<typename Lambda>
828 INLINE static void step(Lambda& func) {
829 if(Mask & (1 << Begin))
830 {
831 func(Begin);
832 }
833 UnrollerL<Begin + Step, End, Step>::step(func);
834 }
835 };
836
837 template<int End, int Step, int Mask>
838 struct UnrollerLMask<End, End, Step, Mask> {
839 template<typename Lambda>
840 static void step(Lambda& func) {
841 }
842 };
843
844 // general CRC compute
845 INLINE
846 uint32_t ComputeCRC(uint32_t crc, const void *pData, uint32_t size)
847 {
848 #if defined(_WIN64) || defined(__x86_64__)
849 uint32_t sizeInQwords = size / sizeof(uint64_t);
850 uint32_t sizeRemainderBytes = size % sizeof(uint64_t);
851 uint64_t* pDataWords = (uint64_t*)pData;
852 for (uint32_t i = 0; i < sizeInQwords; ++i)
853 {
854 crc = (uint32_t)_mm_crc32_u64(crc, *pDataWords++);
855 }
856 #else
857 uint32_t sizeInDwords = size / sizeof(uint32_t);
858 uint32_t sizeRemainderBytes = size % sizeof(uint32_t);
859 uint32_t* pDataWords = (uint32_t*)pData;
860 for (uint32_t i = 0; i < sizeInDwords; ++i)
861 {
862 crc = _mm_crc32_u32(crc, *pDataWords++);
863 }
864 #endif
865
866 uint8_t* pRemainderBytes = (uint8_t*)pDataWords;
867 for (uint32_t i = 0; i < sizeRemainderBytes; ++i)
868 {
869 crc = _mm_crc32_u8(crc, *pRemainderBytes++);
870 }
871
872 return crc;
873 }
874
875 //////////////////////////////////////////////////////////////////////////
876 /// Add byte offset to any-type pointer
877 //////////////////////////////////////////////////////////////////////////
878 template <typename T>
879 INLINE
880 static T* PtrAdd(T* p, intptr_t offset)
881 {
882 intptr_t intp = reinterpret_cast<intptr_t>(p);
883 return reinterpret_cast<T*>(intp + offset);
884 }
885
886 //////////////////////////////////////////////////////////////////////////
887 /// Is a power-of-2?
888 //////////////////////////////////////////////////////////////////////////
889 template <typename T>
890 INLINE
891 static bool IsPow2(T value)
892 {
893 return value == (value & (0 - value));
894 }
895
896 //////////////////////////////////////////////////////////////////////////
897 /// Align down to specified alignment
898 /// Note: IsPow2(alignment) MUST be true
899 //////////////////////////////////////////////////////////////////////////
900 template <typename T1, typename T2>
901 INLINE
902 static T1 AlignDownPow2(T1 value, T2 alignment)
903 {
904 SWR_ASSERT(IsPow2(alignment));
905 return value & ~T1(alignment - 1);
906 }
907
908 //////////////////////////////////////////////////////////////////////////
909 /// Align up to specified alignment
910 /// Note: IsPow2(alignment) MUST be true
911 //////////////////////////////////////////////////////////////////////////
912 template <typename T1, typename T2>
913 INLINE
914 static T1 AlignUpPow2(T1 value, T2 alignment)
915 {
916 return AlignDownPow2(value + T1(alignment - 1), alignment);
917 }
918
919 //////////////////////////////////////////////////////////////////////////
920 /// Align up ptr to specified alignment
921 /// Note: IsPow2(alignment) MUST be true
922 //////////////////////////////////////////////////////////////////////////
923 template <typename T1, typename T2>
924 INLINE
925 static T1* AlignUpPow2(T1* value, T2 alignment)
926 {
927 return reinterpret_cast<T1*>(
928 AlignDownPow2(reinterpret_cast<uintptr_t>(value) + uintptr_t(alignment - 1), alignment));
929 }
930
931 //////////////////////////////////////////////////////////////////////////
932 /// Align down to specified alignment
933 //////////////////////////////////////////////////////////////////////////
934 template <typename T1, typename T2>
935 INLINE
936 static T1 AlignDown(T1 value, T2 alignment)
937 {
938 if (IsPow2(alignment)) { return AlignDownPow2(value, alignment); }
939 return value - T1(value % alignment);
940 }
941
942 //////////////////////////////////////////////////////////////////////////
943 /// Align down to specified alignment
944 //////////////////////////////////////////////////////////////////////////
945 template <typename T1, typename T2>
946 INLINE
947 static T1* AlignDown(T1* value, T2 alignment)
948 {
949 return (T1*)AlignDown(uintptr_t(value), alignment);
950 }
951
952 //////////////////////////////////////////////////////////////////////////
953 /// Align up to specified alignment
954 /// Note: IsPow2(alignment) MUST be true
955 //////////////////////////////////////////////////////////////////////////
956 template <typename T1, typename T2>
957 INLINE
958 static T1 AlignUp(T1 value, T2 alignment)
959 {
960 return AlignDown(value + T1(alignment - 1), alignment);
961 }
962
963 //////////////////////////////////////////////////////////////////////////
964 /// Align up to specified alignment
965 /// Note: IsPow2(alignment) MUST be true
966 //////////////////////////////////////////////////////////////////////////
967 template <typename T1, typename T2>
968 INLINE
969 static T1* AlignUp(T1* value, T2 alignment)
970 {
971 return AlignDown(PtrAdd(value, alignment - 1), alignment);
972 }
973
//////////////////////////////////////////////////////////////////////////
/// Helper structure used to access an array of elements that don't
/// correspond to a typical word size.
///
/// Elements of BitsPerElementT bits each are packed into size_t words,
/// lowest element in the lowest bits of a word. The storage is
/// zero-initialized.
///
/// @tparam T               - type returned when an element is read
/// @tparam BitsPerElementT - width of one element in bits; must divide the
///                           number of bits in size_t evenly (a full word
///                           per element is allowed)
/// @tparam ArrayLenT       - number of elements in the array
//////////////////////////////////////////////////////////////////////////
template<typename T, size_t BitsPerElementT, size_t ArrayLenT>
class BitsArray
{
private:
    static const size_t BITS_PER_WORD = sizeof(size_t) * 8;

    static_assert(BitsPerElementT > 0 && BitsPerElementT <= BITS_PER_WORD,
        "Element size must be between 1 bit and the size of a word");

    static const size_t ELEMENTS_PER_WORD = BITS_PER_WORD / BitsPerElementT;
    static const size_t NUM_WORDS = (ArrayLenT + ELEMENTS_PER_WORD - 1) / ELEMENTS_PER_WORD;

    // Built by shifting all-ones right rather than shifting 1 left, so that an
    // element as wide as a whole word does not shift by the full word width.
    static const size_t ELEMENT_MASK = ~size_t(0) >> (BITS_PER_WORD - BitsPerElementT);

    static_assert(ELEMENTS_PER_WORD * BitsPerElementT == BITS_PER_WORD,
        "Element size must an integral fraction of pointer size");

    size_t m_words[NUM_WORDS] = {};

public:

    /// @brief Reads one element.
    /// @param elementIndex - index of the element; not range-checked here, so
    ///                       the caller must keep it below ArrayLenT
    /// @return the element's bits converted to T
    T operator[] (size_t elementIndex) const
    {
        // Locate the word that holds the element, shift the element down to
        // bit 0 and strip the neighbouring elements.
        size_t word = m_words[elementIndex / ELEMENTS_PER_WORD];
        word >>= ((elementIndex % ELEMENTS_PER_WORD) * BitsPerElementT);
        return T(word & ELEMENT_MASK);
    }
};
1001
// Ranged integer argument for TemplateArgUnroller.
// The range [MinT, MaxT] is carried in the type; `val` holds the runtime value.
// The struct itself performs no range check on `val`.
template <uint32_t MinT, uint32_t MaxT>
struct IntArg
{
    uint32_t val;
};
1008
1009 // Recursive template used to auto-nest conditionals. Converts dynamic boolean function
1010 // arguments to static template arguments.
1011 template <typename TermT, typename... ArgsB>
1012 struct TemplateArgUnroller
1013 {
1014 //-----------------------------------------
1015 // Boolean value
1016 //-----------------------------------------
1017
1018 // Last Arg Terminator
1019 static typename TermT::FuncType GetFunc(bool bArg)
1020 {
1021 if (bArg)
1022 {
1023 return TermT::template GetFunc<ArgsB..., std::true_type>();
1024 }
1025
1026 return TermT::template GetFunc<ArgsB..., std::false_type>();
1027 }
1028
1029 // Recursively parse args
1030 template <typename... TArgsT>
1031 static typename TermT::FuncType GetFunc(bool bArg, TArgsT... remainingArgs)
1032 {
1033 if (bArg)
1034 {
1035 return TemplateArgUnroller<TermT, ArgsB..., std::true_type>::GetFunc(remainingArgs...);
1036 }
1037
1038 return TemplateArgUnroller<TermT, ArgsB..., std::false_type>::GetFunc(remainingArgs...);
1039 }
1040
1041 //-----------------------------------------
1042 // Integer value (within specified range)
1043 //-----------------------------------------
1044
1045 // Last Arg Terminator
1046 template <uint32_t TMin, uint32_t TMax>
1047 static typename TermT::FuncType GetFunc(IntArg<TMin, TMax> iArg)
1048 {
1049 if (iArg.val == TMax)
1050 {
1051 return TermT::template GetFunc<ArgsB..., std::integral_constant<uint32_t, TMax>>();
1052 }
1053 if (TMax > TMin)
1054 {
1055 return TemplateArgUnroller<TermT, ArgsB...>::GetFunc(IntArg<TMin, TMax-1>{iArg.val});
1056 }
1057 SWR_ASSUME(false); return nullptr;
1058 }
1059 template <uint32_t TVal>
1060 static typename TermT::FuncType GetFunc(IntArg<TVal, TVal> iArg)
1061 {
1062 SWR_ASSERT(iArg.val == TVal);
1063 return TermT::template GetFunc<ArgsB..., std::integral_constant<uint32_t, TVal>>();
1064 }
1065
1066 // Recursively parse args
1067 template <uint32_t TMin, uint32_t TMax, typename... TArgsT>
1068 static typename TermT::FuncType GetFunc(IntArg<TMin, TMax> iArg, TArgsT... remainingArgs)
1069 {
1070 if (iArg.val == TMax)
1071 {
1072 return TemplateArgUnroller<TermT, ArgsB..., std::integral_constant<uint32_t, TMax>>::GetFunc(remainingArgs...);
1073 }
1074 if (TMax > TMin)
1075 {
1076 return TemplateArgUnroller<TermT, ArgsB...>::GetFunc(IntArg<TMin, TMax - 1>{iArg.val}, remainingArgs...);
1077 }
1078 SWR_ASSUME(false); return nullptr;
1079 }
1080 template <uint32_t TVal, typename... TArgsT>
1081 static typename TermT::FuncType GetFunc(IntArg<TVal, TVal> iArg, TArgsT... remainingArgs)
1082 {
1083 SWR_ASSERT(iArg.val == TVal);
1084 return TemplateArgUnroller<TermT, ArgsB..., std::integral_constant<uint32_t, TVal>>::GetFunc(remainingArgs...);
1085 }
1086 };
1087
1088